.doc文件
- 代码中的WordParagraph类是自己创建的实体类,用于记录文本信息和图片
- file_word对象为前台上传的MultipartFile对象.
// doc格式
List<WordParagraph> wordParagraphs=new ArrayList<>();
HWPFDocument doc = new HWPFDocument(file_word.getInputStream());
Range range = doc.getRange();
int numP = range.numParagraphs();
//StringBuffer ret = new StringBuffer();
for (int i = 0; i < numP; ++i) {
//从每一段落中获取文字
Paragraph p = range.getParagraph(i);
//ret.append(p.text());
WordParagraph wordParagraph=new WordParagraph(p.text(),i);
wordParagraph.init();
wordParagraphs.add(wordParagraph);
}
// List<Picture> pictsList = new ArrayList();
// 得到文档的数据流
byte[] dataStream = doc.getDataStream();
int numChar = range.numCharacterRuns();
Integer paragraphOrder=0;
PicturesTable pTable = new PicturesTable(doc, dataStream, new byte[1024]);
for (int j = 0; j < numChar; ++j) {
CharacterRun cRun = range.getCharacterRun(j);
boolean has = pTable.hasPicture(cRun);
String[] temp_array=(cRun.toString()+" ").split("\r");
paragraphOrder=paragraphOrder+temp_array.length-1;
if (has) {
Picture picture = pTable.extractPicture(cRun, true);
if(paragraphOrder<wordParagraphs.size()){
wordParagraphs.get(paragraphOrder).addPictures(picture);
}
}
}
- 当前处理方式存在一些局限性
- 无法确定图片在段落的具体位置信息.(该问题可通过对文档流处理的优化解决)
- 如果一个段落有多个图片,可能只解析一个
.docx文件
- 网上流传的通过解析XML标签来获取图片的方式,实测不可行.POI本身已经提供了获取图片的方法(XWPFRun.getEmbeddedPictures()).
- 同样的,无法获取图片在段落的具体位置
- .doc和.docx读取到的图片类不是同一个(.doc为Picture,.docx为XWPFPictureData),做兼容处理时需要注意.
// .docx format: build one WordParagraph per paragraph, carrying its text and its pictures.
// NOTE(review): the XWPFDocument is not closed in this snippet - confirm where it gets
// closed, and that this only happens after the picture data has been consumed.
XWPFDocument document = new XWPFDocument(file_word.getInputStream());
List<XWPFParagraph> docxParagraphs = document.getParagraphs();
int paragraphIndex = 0;
for (XWPFParagraph docxParagraph : docxParagraphs) {
    // Text first, then the pictures embedded in this paragraph, then init().
    WordParagraph wordParagraph =
            new WordParagraph(docxParagraph.getParagraphText(), paragraphIndex);
    wordParagraph.setPictures(readImageInfoInParagraph(docxParagraph));
    wordParagraph.init();
    wordParagraphs.add(wordParagraph);
    paragraphIndex++;
}
/**
 * Collects the picture data of every picture embedded in the runs of one paragraph.
 *
 * @param paragraph the paragraph whose runs are scanned
 * @return a new list holding one entry per embedded picture, in run order;
 *         empty when the paragraph contains no pictures
 */
public static List<XWPFPictureData> readImageInfoInParagraph(XWPFParagraph paragraph) {
    List<XWPFPictureData> pictureData = new ArrayList<>();
    // Walk every run of the paragraph and gather the data of its embedded pictures.
    for (XWPFRun run : paragraph.getRuns()) {
        for (XWPFPicture embedded : run.getEmbeddedPictures()) {
            pictureData.add(embedded.getPictureData());
        }
    }
    return pictureData;
}