POI讀取第三方下載的Word文檔

由於從第三方取得的 Word 文檔多是由其他格式（例如：HTML）轉成的，此時直接用 POI 讀取可能會失敗。這裏以 HTML 爲例。

依賴

<!-- parse word -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>4.0.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>4.0.1</version>
        </dependency>

        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>

代碼片段

/**
     * Opens the Word file at {@code path} with Apache POI, falling back to HTML parsing when POI
     * reports that the file is really an HTML document carrying a Word extension.
     *
     * <p>In the HTML case, the non-blank text of every {@code span} inside a {@code td} of each
     * {@code tbody} found inside a {@code table} is printed to standard output.
     *
     * @param path location of the file to parse; must not be blank and must end (ignoring case)
     *             with the string form of {@code FileType.DOC} or {@code FileType.DOCX}
     * @return always {@code null}; extracting the actual document content is left to the reader
     * @throws ParseWordException if {@code path} is blank, the file cannot be opened, the file type
     *                            is not recognised, or parsing / HTML conversion fails
     */
    private String parseWord(String path) throws ParseWordException {
        // Reject blank paths before touching the file system.
        if (isEmpty(path)) {
            throw new ParseWordException(Code.PARAM_EMPTY.getCode(), Code.PARAM_EMPTY.getMessage());
        }

        File file = new File(path);
        String upperPath = path.toUpperCase();

        // try-with-resources closes the stream and the POI document on every exit path, so the
        // file handle is already released by the time the HTML fallback re-opens the same file.
        try (FileInputStream fis = new FileInputStream(file)) {
            if (upperPath.endsWith(FileType.DOC.toString())) {
                try (HWPFDocument wordDoc = new HWPFDocument(fis)) {
                    // Read the .doc content here as required.
                }
            } else if (upperPath.endsWith(FileType.DOCX.toString())) {
                try (XWPFDocument wordDocx = new XWPFDocument(fis)) {
                    // Read the .docx content here as required.
                }
            } else {
                // Unsupported file type.
                throw new ParseWordException(Code.FILE_TYPE_ILLEGAL.getCode(), Code.FILE_TYPE_ILLEGAL.getMessage());
            }
        } catch (FileNotFoundException e) {
            throw new ParseWordException(Code.READER_FILE_FAILURE.getCode(), Code.READER_FILE_FAILURE.getMessage());
        } catch (IllegalArgumentException ie) {
            String reason = ie.getMessage();
            System.out.println(reason);
            if (isEmpty(reason)) {
                throw new ParseWordException(Code.PARAM_EMPTY.getCode(), Code.PARAM_EMPTY.getMessage());
            }
            if (!reason.contains("The document is really a HTML file")) {
                // Any other IllegalArgumentException used to be swallowed, making a failed parse
                // indistinguishable from a successful one; report it like other parse failures.
                throw new ParseWordException(Code.PARSE_FAILURE.getCode(), Code.PARSE_FAILURE.getMessage());
            }
            // The "Word" file is actually HTML: convert and parse it as such.
            printHtmlTableText(file);
        } catch (IOException e) {
            throw new ParseWordException(Code.PARSE_FAILURE.getCode(), Code.PARSE_FAILURE.getMessage());
        }
        return null;
    }

    /**
     * Copies the mislabelled file to a temporary HTML file, parses it with jsoup and prints the
     * non-blank text of the {@code span} elements found in table cells.
     *
     * @param file the file that POI identified as HTML
     * @throws ParseWordException if the copy or the HTML parsing fails with an I/O error
     */
    private void printHtmlTableText(File file) throws ParseWordException {
        try {
            String htmlPath = parseHtml(file);
            Document doc = Jsoup.parse(new File(htmlPath), "GBK"); // choose the charset to match the source
            // Select every tbody inside a table; adjust the selectors to the document at hand.
            Elements bodies = doc.select("table").select("tbody");
            bodies.forEach(body -> {
                // Read every span inside a td; cells may also hold images, which are not handled here.
                body.select("td").select("span").eachText().stream()
                        .filter(text -> text != null && text.trim().length() > 0)
                        .forEach(System.out::println);
            });
        } catch (IOException e) {
            throw new ParseWordException(Code.FILE_CONVERT_FAILURE.getCode(), Code.FILE_CONVERT_FAILURE.getMessage());
        }
    }

    /**
     * Copies {@code readerFile} byte-for-byte to a temporary file with an {@code .html} extension
     * so that it can be parsed as HTML.
     *
     * @param readerFile the file to copy
     * @return the path of the temporary HTML copy
     * @throws IOException if the source cannot be read or the copy cannot be written
     */
    private String parseHtml(File readerFile) throws IOException {
        String tempPath = "d:\\1.html"; // hard-coded temporary file; change the path to suit your environment

        File outFile = new File(tempPath);
        if (outFile.exists()) {
            outFile.delete(); // remove any copy left over from a previous run
        }

        // Both streams are closed even if the copy fails; leaving them open leaks file handles
        // and, on Windows, keeps the temporary file locked so the next delete() cannot succeed.
        try (FileInputStream fis = new FileInputStream(readerFile);
             FileOutputStream fileOutputStream = new FileOutputStream(outFile)) {
            byte[] buffer = new byte[1024];
            int len;
            while ((len = fis.read(buffer)) != -1) {
                fileOutputStream.write(buffer, 0, len);
            }
        }

        return tempPath;
    }

    /**
     * Demo entry point: parses a sample document from a fixed location.
     *
     * @param args unused
     * @throws IOException declared for callers; not raised directly by this method
     * @throws ParseWordException if the sample document cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseWordException {
        new ParseWorld().parseWord("D:\\aaa.doc");
    }

    /**
     * Tells whether a string carries no visible content.
     *
     * @param str the string to inspect; may be {@code null}
     * @return {@code true} if {@code str} is {@code null} or is empty after trimming
     */
    private boolean isEmpty(String str) {
        if (str == null) {
            return true;
        }
        return str.trim().isEmpty();
    }