由於從第三方取得的 Word 文件多半是由其他格式（例如 HTML）轉存而成，此時直接以 Word 格式讀取可能會失敗。這裏以 HTML 爲例。
依賴
<!-- parse word --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>4.0.1</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-scratchpad</artifactId> <version>4.0.1</version> </dependency> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.10.3</version> </dependency>
代碼片斷
private String parseWord(String path) throws ParseWordException { // inspect if (isEmpty(path)) { throw new ParseWordException(Code.PARAM_EMPTY.getCode(), Code.PARAM_EMPTY.getMessage()); } // reader File file = new File(path); FileInputStream fis = null; try { fis = new FileInputStream(file); } catch (FileNotFoundException e) { throw new ParseWordException(Code.READER_FILE_FAILURE.getCode(), Code.READER_FILE_FAILURE.getMessage()); } // parse try { if (path.toUpperCase().endsWith(FileType.DOC.toString())) { HWPFDocument wordDoc = new HWPFDocument(fis); // 本身讀 } else if (path.toUpperCase().endsWith(FileType.DOCX.toString())) { XWPFDocument wordDocx = new XWPFDocument(fis); // 本身讀 } else { // 文件格式非法 throw new ParseWordException(Code.FILE_TYPE_ILLEGAL.getCode(), Code.FILE_TYPE_ILLEGAL.getMessage()); } } catch (IllegalArgumentException ie) { System.out.println(ie.getMessage()); if (isEmpty(ie.getMessage())) { throw new ParseWordException(Code.PARAM_EMPTY.getCode(), Code.PARAM_EMPTY.getMessage()); } if (ie.getMessage().contains("The document is really a HTML file")) { // 格式轉換 try { String htmlPath = parseHtml(file); Document doc = Jsoup.parse(new File(htmlPath), "GBK"); // 本身定 Elements elements = doc.select("table").select("tbody"); //讀取全部的tbody標籤,視狀況而定 elements.forEach(e -> { //讀取td中全部的span標籤,視狀況而定,可能有圖片,本身處理 e.select("td").select("span").eachText().stream().filter(d -> d != null && d.trim().length() > 0).forEach(System.out::println); }); } catch (IOException e) { throw new ParseWordException(Code.FILE_CONVERT_FAILURE.getCode(), Code.FILE_CONVERT_FAILURE.getMessage()); } } } catch (IOException e) { throw new ParseWordException(Code.PARSE_FAILURE.getCode(), Code.PARSE_FAILURE.getMessage()); } return null; } /** * parse HTML * * @param readerFile * @return * @throws IOException */ private String parseHtml(File readerFile) throws IOException { String tempPath = "d:\\1.html"; // 建立一個零時文件,本身換一下路徑 File outFile = new File(tempPath); if (outFile.exists()) { outFile.delete(); // 
刪掉以前已經存在的文件 } FileInputStream fis = new FileInputStream(readerFile); FileOutputStream fileOutputStream = new FileOutputStream(outFile); int len = 0; byte[] buffer = new byte[1024]; while ((len = fis.read(buffer)) != -1) { fileOutputStream.write(buffer, 0, len); } return tempPath; } public static void main(String[] args) throws IOException, ParseWordException { ParseWorld parse = new ParseWorld(); parse.parseWord("D:\\aaa.doc"); // } private boolean isEmpty(String str) { return str == null || str.trim().length() == 0; }