1500字范文,内容丰富有趣,写作好帮手!
1500字范文 > POI读取doc docx文件

POI读取doc docx文件

时间:2022-01-26 05:56:16

相关推荐

POI读取doc docx文件

1.明确几个概念:

Range:它表示一个范围,这个范围可以是整个文档,也可以是里面的某一小节(Section),也可以是某一个段落(Paragraph),还可以是拥有共同属性的一段文本(CharacterRun)。

Section:word文档的一个小节,一个word文档可以由多个小节构成。

Paragraph:word文档的一个段落,一个小节可以由多个段落构成。

CharacterRun:具有相同属性的一段文本,一个段落可以由多个CharacterRun组成。

Table:一个表格。

TableRow:表格对应的行。

TableCell:表格对应的单元格。

2.依赖包:

<!-- POI依赖,读取.docx型文档-->
<!-- /artifact/org.apache.poi/poi-ooxml -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>4.1.1</version>
</dependency>

<!-- POI依赖,读取.doc型文档-->
<!-- /artifact/org.apache.poi/poi-scratchpad -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>4.1.1</version>
</dependency>

3.读取doc型文档

1)使用HWPFDocument 读取

import org.apache.poi.hpsf.DocumentSummaryInformation;import org.apache.poi.hpsf.SummaryInformation;import org.apache.poi.hwpf.HWPFDocument;import org.apache.poi.hwpf.extractor.WordExtractor;import org.apache.poi.hwpf.usermodel.*;import org.apache.poi.xwpf.extractor.XWPFWordExtractor;import org.apache.poi.xwpf.usermodel.XWPFDocument;import org.apache.poi.xwpf.usermodel.XWPFPictureData;import org.junit.Test;import java.io.*;import java.util.List;public class WordTest {@Testpublic void testReadByDoc() throws Exception {InputStream is = new FileInputStream("C:\\Users\\阿劼\\Desktop\\11.doc");HWPFDocument doc = new HWPFDocument(is);// 输出文本,这步读取不到????System.out.println("=========================文本信息==========================");System.out.println("-------------使用getDocumentText()获取文本---------------");System.out.println(doc.getDocumentText());System.out.println("-----------------使用getText()获取文本-------------------");System.out.println(doc.getText());// 输出书签信息this.printInfo(doc.getBookmarks());Range range = doc.getRange();// range信息this.printInfo(range);// 读表格this.readTable(range);// 读列表this.readList(range);this.closeStream(is);}/*** 关闭输入流** @param is*/private void closeStream(InputStream is) {if (is != null) {try {is.close();} catch (IOException e) {e.printStackTrace();}}}/*** 输出书签信息** @param bookmarks*/private void printInfo(Bookmarks bookmarks) {int count = bookmarks.getBookmarksCount();System.out.println("=========================书签信息==========================");System.out.println("书签数量:" + count);Bookmark bookmark;for (int i = 0; i < count; i++) {bookmark = bookmarks.getBookmark(i);System.out.println("书签" + (i + 1) + "的名称是:" + bookmark.getName());System.out.println("开始位置:" + bookmark.getStart());System.out.println("结束位置:" + bookmark.getEnd());}}private void readTable(Range range) {System.out.println("=========================表格信息==========================");//遍历range范围内的table。TableIterator tableIter = new TableIterator(range);while (tableIter.hasNext()) {Table table = 
tableIter.next();//开始位置int start = table.getStartOffset();//结束位置int end = table.getEndOffset();System.out.printf("开始位置%d,结束位置%d\r\n", start, end);//获取行的数目int rowNum = table.numRows();for (int j = 0; j < rowNum; j++) {//获取每一行TableRow row = table.getRow(j);int cellNum = row.numCells();for (int k = 0; k < cellNum; k++) {//获取每一列TableCell cell = row.getCell(k);// 输出单元格的文本System.out.printf("第%d行第%d列的内容是: %s", j + 1, k + 1, cell.text().trim());System.out.println();}}}}/*** 读列表** @param range*/private void readList(Range range) {System.out.println("=========================列表信息==========================");int num = range.numParagraphs();for (int i = 0; i < num; i++) {Paragraph paragraph = range.getParagraph(i);if (paragraph.isInList()) {System.out.println("list : " + paragraph.text());}}}/*** 输出Range** @param range*/private void printInfo(Range range) {System.out.println("=========================Range信息==========================");System.out.println("-------------------------段落信息-------------------------");// 获取段落数int paraNum = range.numParagraphs();System.out.println("段落数为 : " + paraNum);for (int i = 0; i < paraNum; i++) {System.out.println("段落" + (i + 1) + "内容为:" + range.getParagraph(i).text());}System.out.println("-------------------------小节信息-------------------------");int secNum = range.numSections();System.out.println("小节数为 : " + paraNum);System.out.println(secNum);Section section;for (int i = 0; i < secNum; i++) {section = range.getSection(i);System.out.println(section.text());}}}

2)使用WordExtractor 读取

import org.apache.poi.hpsf.DocumentSummaryInformation;import org.apache.poi.hpsf.SummaryInformation;import org.apache.poi.hwpf.HWPFDocument;import org.apache.poi.hwpf.extractor.WordExtractor;import org.apache.poi.hwpf.usermodel.*;import org.apache.poi.xwpf.extractor.XWPFWordExtractor;import org.apache.poi.xwpf.usermodel.XWPFDocument;import org.apache.poi.xwpf.usermodel.XWPFPictureData;import org.junit.Test;import java.io.*;import java.util.List;public class WordTest {@Testpublic void readByExtractorTest() throws Exception {InputStream is = new FileInputStream("C:\\Users\\阿劼\\Desktop\\11.doc");WordExtractor extractor = new WordExtractor(is);// word文档所有的文本System.out.println("---------------文档中所有文本----------------");System.out.println(extractor.getText());// 页眉System.out.println("-------------------页眉-----------------");System.out.println(extractor.getHeaderText());// 页脚System.out.println("------------------页脚------------------");System.out.println(extractor.getFooterText());// 输出当前word文档的元数据信息,包括作者、文档的修改时间等。System.out.println("------------------元数据信息-------------------");System.out.println(extractor.getMetadataTextExtractor().getText());// 获取各个段落的文本System.out.println("=======================每个段落信息=========================");String paraTexts[] = extractor.getParagraphText();for (int i = 0; i < paraTexts.length; i++) {System.out.println("------------------段落" + (i + 1) + "----------------");System.out.println("Paragraph " + (i + 1) + " : " + paraTexts[i]);}// 当前word的一些信息printInfo(extractor.getSummaryInformation());// 当前word的一些信息this.printInfo(extractor.getDocSummaryInformation());this.closeStream(is);}/*** 输出SummaryInfomation** @param info*/private void printInfo(SummaryInformation info) {System.out.println("===================从getSummaryInformation中获取信息===============");// 作者System.out.println("---------------------作者----------------------");System.out.println(info.getAuthor());// 
字符统计System.out.println("---------------------字符----------------------");System.out.println(info.getCharCount());// 页数System.out.println("---------------------页数----------------------");System.out.println(info.getPageCount());// 标题System.out.println("---------------------标题----------------------");System.out.println(info.getTitle());// 主题System.out.println("---------------------主题----------------------");System.out.println(info.getSubject());}/*** 输出DocumentSummaryInfomation** @param info*/private void printInfo(DocumentSummaryInformation info) {System.out.println("===================从getDocSummaryInformation中获取信息===============");// 分类System.out.println("---------------------分类----------------------");System.out.println(info.getCategory());// 公司System.out.println("---------------------公司----------------------");System.out.println(info.getCompany());}}

4.读取docx型文件

import org.apache.poi.hpsf.DocumentSummaryInformation;import org.apache.poi.hpsf.SummaryInformation;import org.apache.poi.hwpf.HWPFDocument;import org.apache.poi.hwpf.extractor.WordExtractor;import org.apache.poi.hwpf.usermodel.*;import org.apache.poi.xwpf.extractor.XWPFWordExtractor;import org.apache.poi.xwpf.usermodel.XWPFDocument;import org.apache.poi.xwpf.usermodel.XWPFPictureData;import org.junit.Test;import java.io.*;import java.util.List;public class WordTest {@Testpublic void poiReadDocxTest() {File file = new File("C:\\Users\\阿劼\\Desktop\\0.docx");try {FileInputStream fis = new FileInputStream(file);XWPFDocument xdoc = new XWPFDocument(fis);XWPFWordExtractor extractor = new XWPFWordExtractor(xdoc);String doc1 = extractor.getText();System.out.println(doc1);List<XWPFPictureData> allPictures = xdoc.getAllPictures();fis.close();} catch (Exception e) {e.printStackTrace();}}}

感谢/article/101910.htm

和/Renyi-Fan/p/8147650.html两篇文章

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。