word文檔處理成富文本生成sql語句導入mysql

時間 2019-11-12

標籤 word 文檔處理文本生成 sql 語句導入 mysql 欄目 Microsoft Office 简体版

原文鏈接

問題：需要將大量已存在的word文檔導入到web項目裏並在網站上展現，不可能通過人工編輯錄入的方式處理，只能通過程序實現。
解決思路：讀取word文檔並處理成html，再獲取html中的富文本內容，拼接成sql，導入數據庫。

要點
1、讀取word文件夾時會遞歸讀取，只要文件夾下有word文檔即可，程序中有過濾word文檔的代碼，可根據需要修改；
2、可同時處理word2003和word2007+版本的word文檔；
3、讀取word2007生成的html文檔內中文是unicode編碼的，放到數據庫或用瀏覽器直接打開，不影響頁面顯示；
4、對word文檔中的圖片做了處理，存儲到單獨的文件夾，導入mysql或其它數據庫後，要正確顯示圖片，需注意路徑處理；
5、只處理word2007文檔時可不生成html而直接獲取富文本內容，但2003版本不可以，所以統一將doc文檔都生成html頁面，再用java讀取html文檔獲取body元素下的富文本內容。

不足之處，歡迎交流和指正。

可能用到的pom依賴如下：

<!--poi處理word文檔-->
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi</artifactId>
			<version>3.15</version>
		</dependency>
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-scratchpad</artifactId>
			<version>3.15</version>
		</dependency>
		<dependency>
			<groupId>fr.opensagres.xdocreport</groupId>
			<artifactId>fr.opensagres.xdocreport.document</artifactId>
			<version>2.0.1</version>
		</dependency>
		<dependency>
			<groupId>fr.opensagres.xdocreport</groupId>
			<artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
			<version>1.0.6</version>
		</dependency>

		<dependency>
			<groupId>fr.opensagres.xdocreport</groupId>
			<artifactId>org.apache.poi.xwpf.converter.core</artifactId>
			<version>1.0.6</version>
		</dependency>

		<dependency>
			<groupId>fr.opensagres.xdocreport</groupId>
			<artifactId>org.apache.poi.xwpf.converter.pdf</artifactId>
			<version>1.0.6</version>
		</dependency>

完整的java代碼如下：

package test;

import org.apache.commons.lang.StringUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.Test;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Batch-converts Word documents to HTML fragments and writes a MySQL script that
 * inserts one {@code article} row per document.
 *
 * <p>{@code .doc} files are converted with POI's {@link WordToHtmlConverter};
 * {@code .docx} files are converted with xdocreport's {@link XHTMLConverter}.
 * All input/output locations are the hard-coded Windows paths declared as constants
 * below.
 *
 * @author ls 2019/3/19 0019 11:21
 */
public class Word2Html {

    /** Folder that is scanned recursively for Word documents. */
    private static final String WORD_ROOT = "F:\\word文檔\\";

    /** Folder under which one image sub-folder per document is created. */
    private static final String IMAGE_ROOT = "F:\\images\\";

    /** Folder that receives the intermediate HTML files. */
    private static final String HTML_ROOT = "F:\\html\\";

    /** Replacement for {@link #IMAGE_ROOT} inside the generated SQL. */
    private static final String IMAGE_URL_ROOT = "/images/";

    /** Target file of the generated SQL script. */
    private static final String SQL_FILE = "F:\\articleSql.sql";

    /**
     * Text inserted in front of the {@code src} value of an {@code <img>} tag by
     * {@link #readHtml(File)}.
     * NOTE(review): the value is concatenated without a path separator and is not
     * the folder the pictures are saved to; confirm the intended image location.
     */
    private static final String IMAGE_SRC_PREFIX = "D:/test";

    /** Id used for the first generated {@code INSERT}; following rows count up. */
    private static final int FIRST_ARTICLE_ID = 161;

    private static final Pattern TABS_AND_NEWLINES = Pattern.compile("\\t|\n");
    private static final Pattern BODY_OPEN = Pattern.compile("<body[^>]*>?", Pattern.CASE_INSENSITIVE);
    private static final Pattern BODY_CLOSE = Pattern.compile("</body", Pattern.CASE_INSENSITIVE);
    private static final Pattern IMG_TAG_START = Pattern.compile("<img\\s+");
    private static final Pattern IMG_LEADING_SRC = Pattern.compile("<img (?i:src)=[\"']");

    /**
     * Entry point: converts every Word document below {@link #WORD_ROOT} and writes
     * the resulting {@code INSERT} statements to {@link #SQL_FILE}.
     *
     * <p>A document that cannot be converted is reported and skipped so that one bad
     * file does not abort the whole batch.
     */
    @Test
    public void handleWordToSql() {
        List<String> fileNames = new ArrayList<>();
        getAllFileName(WORD_ROOT, fileNames);
        Map<String, String> documents = handleFileName(fileNames);

        // Sorted by article name so the generated ids are the same on every run.
        Map<String, String> contentsMap = new TreeMap<>();
        for (Map.Entry<String, String> document : documents.entrySet()) {
            String wordName = document.getKey();
            String wordPath = document.getValue();
            String articleName = wordName.substring(0, wordName.lastIndexOf('.'));
            String htmlFile = HTML_ROOT + articleName + ".html";
            String imageDir = IMAGE_ROOT + generateImageName() + "\\";

            String content = "";
            try {
                if (isDocx(wordName)) {
                    content = word2007ToHtml(imageDir, wordPath, htmlFile);
                } else if (isDoc(wordName)) {
                    content = word2003ToHtml(imageDir, wordPath, htmlFile);
                } else {
                    System.out.println("word文檔格式不正確->" + wordName);
                }
            } catch (TransformerException | IOException | ParserConfigurationException | RuntimeException e) {
                // The converters also signal unreadable/corrupt input with unchecked exceptions.
                e.printStackTrace();
            }

            if (StringUtils.isNotBlank(content)) {
                contentsMap.put(articleName, content);
            }
        }
        System.out.println("文章總條數: " + contentsMap.size());
        handleInsertSql(contentsMap);
    }

    /**
     * Builds one {@code INSERT INTO `article`} statement per entry and writes the
     * script to {@link #SQL_FILE}.
     *
     * <p>Inside the content, {@link #IMAGE_ROOT} is rewritten to
     * {@link #IMAGE_URL_ROOT}. Content and title are emitted as single-quoted MySQL
     * string literals with backslashes and single quotes escaped.
     *
     * @param contentsMap article title mapped to the article's HTML content; a
     *                    {@code null} content is written as an empty string
     * @return the generated script, statements separated by {@code "\r\n"}
     */
    public String handleInsertSql(Map<String, String> contentsMap) {
        StringBuilder sql = new StringBuilder();
        int id = FIRST_ARTICLE_ID;
        for (Map.Entry<String, String> article : contentsMap.entrySet()) {
            String content = article.getValue() == null ? "" : article.getValue();
            // Rewrite the path before escaping; afterwards the backslashes are doubled.
            content = content.replace(IMAGE_ROOT, IMAGE_URL_ROOT);
            sql.append("INSERT INTO `article` VALUES (").append(id)
                    .append(", '2019-03-19 14:24:56', '2019-03-19 10:53:56', '0', '網絡文章', '")
                    .append(escapeSqlLiteral(content))
                    .append("', '0', '', '', null, null, null, '")
                    .append(escapeSqlLiteral(article.getKey()))
                    .append("', '1', null, '0', null)")
                    .append(" ;")
                    .append("\r\n");
            id++;
        }

        String data = sql.toString();
        writeFile(new StringBuffer(data));
        return data;
    }

    /** Escapes backslashes and single quotes for use inside a single-quoted MySQL literal. */
    private static String escapeSqlLiteral(String value) {
        return value.replace("\\", "\\\\").replace("'", "\\'");
    }

    /**
     * Reduces a list of paths to the Word documents it contains.
     *
     * <p>Entries whose file name does not end in {@code .doc} or {@code .docx}
     * (case-insensitive) are removed from {@code list} itself, as before. Only the
     * last path segment is inspected, so a folder whose name merely contains
     * {@code ".doc"} is not mistaken for a document.
     *
     * @param list paths to filter; modified in place
     * @return file name mapped to its full path; when two paths share a file name
     *         the later one wins
     */
    public Map<String, String> handleFileName(List<String> list) {
        if (list.size() == 0) {
            System.out.println("沒有須要處理的文件！");
        }

        list.removeIf(path -> path == null || !isWordFile(fileNameOf(path)));

        Map<String, String> map = new HashMap<>();
        for (String path : list) {
            map.put(fileNameOf(path), path);
        }
        return map;
    }

    /** Returns the text after the last {@code '\'} or {@code '/'} of {@code path}. */
    private static String fileNameOf(String path) {
        int lastSeparator = Math.max(path.lastIndexOf('\\'), path.lastIndexOf('/'));
        return path.substring(lastSeparator + 1);
    }

    /** Tells whether {@code name} ends in {@code .doc} (case-insensitive). */
    private static boolean isDoc(String name) {
        return name.toLowerCase(Locale.ROOT).endsWith(".doc");
    }

    /** Tells whether {@code name} ends in {@code .docx} (case-insensitive). */
    private static boolean isDocx(String name) {
        return name.toLowerCase(Locale.ROOT).endsWith(".docx");
    }

    /** Tells whether {@code name} has one of the two supported Word extensions. */
    private static boolean isWordFile(String name) {
        return isDoc(name) || isDocx(name);
    }

    /**
     * Collects the paths of all files and folders below {@code path}, recursively.
     *
     * <p>The entries of a folder are added before the entries of its sub-folders.
     * Nothing is added when {@code path} is not a readable directory.
     *
     * @param path         folder to scan
     * @param listFileName list the discovered paths are appended to
     */
    public void getAllFileName(String path, List<String> listFileName) {
        File[] children = new File(path).listFiles();
        if (children == null) {
            // listFiles() returns null for a non-directory or on an I/O error.
            return;
        }
        for (File child : children) {
            listFileName.add(child.getPath());
        }
        for (File child : children) {
            if (child.isDirectory()) {
                getAllFileName(child.getPath(), listFileName);
            }
        }
    }

    /**
     * Converts a Word 2003 ({@code .doc}) document to HTML.
     *
     * <p>Embedded pictures are written to {@code imgPath}, the full HTML page is
     * written to {@code outPutFile} as UTF-8, and the body of that page is returned
     * with tabs and newlines removed.
     *
     * @param imgPath    folder the pictures are saved to; created when pictures exist
     * @param fileName   path of the {@code .doc} file
     * @param outPutFile path of the HTML file to write
     * @return the content between {@code <body>} and {@code </body>}
     */
    public String word2003ToHtml(String imgPath, String fileName, String outPutFile)
            throws TransformerException, IOException,
            ParserConfigurationException {
        HWPFDocument wordDocument;
        try (InputStream in = new FileInputStream(fileName)) {
            wordDocument = new HWPFDocument(in);
        }

        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
        // The picture is referenced by its suggested file name only; the bytes are saved below.
        wordToHtmlConverter.setPicturesManager(
                (content, pictureType, suggestedName, widthInches, heightInches) -> suggestedName);
        wordToHtmlConverter.processDocument(wordDocument);

        savePictures(wordDocument, imgPath);

        Document htmlDocument = wordToHtmlConverter.getDocument();
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(new DOMSource(htmlDocument), new StreamResult(out));

        // The serializer produced UTF-8 bytes, so they must be decoded as UTF-8 as well.
        writeFile(new String(out.toByteArray(), StandardCharsets.UTF_8), outPutFile);
        return replaceBreak(readHtml(new File(outPutFile)));
    }

    /**
     * Writes every picture of {@code wordDocument} into {@code imgPath}.
     * A picture whose file cannot be opened is reported and skipped.
     */
    private void savePictures(HWPFDocument wordDocument, String imgPath) throws IOException {
        List<Picture> pictures = wordDocument.getPicturesTable().getAllPictures();
        if (pictures == null || pictures.isEmpty()) {
            return;
        }
        File imageDir = new File(imgPath);
        // Result ignored on purpose: a missing folder surfaces as FileNotFoundException below.
        imageDir.mkdirs();
        for (Picture picture : pictures) {
            File target = new File(imageDir, picture.suggestFullFileName());
            try (OutputStream imageOut = new FileOutputStream(target)) {
                picture.writeImageContent(imageOut);
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Converts a Word 2007+ ({@code .docx}) document to an XHTML fragment.
     *
     * <p>The fragment is written to {@code htmlName} and also returned. Images are
     * extracted below {@code imgPath} by the converter.
     *
     * @param imgPath  folder handed to the converter for image extraction
     * @param fileName path of the {@code .docx} file
     * @param htmlName path of the HTML file to write
     * @return the converted fragment, or an empty string when the file is missing,
     *         empty, not a {@code .docx} file, or converts to nothing
     * @throws IOException if the document cannot be read or the HTML file cannot be written
     */
    public String word2007ToHtml(String imgPath, String fileName, String htmlName) throws IOException {
        File wordFile = new File(fileName);
        if (!wordFile.exists()) {
            System.out.println("Sorry File does not Exists!");
            return "";
        }
        if (!isDocx(wordFile.getName())) {
            System.out.println("Enter only MS Office 2007+ files");
            return "";
        }
        if (wordFile.length() == 0) {
            return "";
        }

        // 1) Load the document.
        XWPFDocument document;
        try (InputStream in = new FileInputStream(wordFile)) {
            document = new XWPFDocument(in);
        }

        // 2) Configure the conversion; the URI resolver and extractor share the image folder.
        File imageFolderFile = new File(imgPath);
        XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));
        options.setExtractor(new FileImageExtractor(imageFolderFile));
        options.setIgnoreStylesIfUnused(false);
        options.setFragment(true);

        // 3) Convert once into memory, then reuse the bytes for the file and the return value.
        ByteArrayOutputStream html = new ByteArrayOutputStream();
        XHTMLConverter.getInstance().convert(document, html, options);

        File htmlFile = new File(htmlName);
        File htmlDir = htmlFile.getParentFile();
        if (htmlDir != null) {
            htmlDir.mkdirs();
        }
        try (OutputStream out = new FileOutputStream(htmlFile)) {
            html.writeTo(out);
        }

        if (html.size() == 0) {
            return "";
        }
        // NOTE(review): decoded with the platform charset exactly as before; confirm
        // which encoding XHTMLConverter writes before switching to an explicit one.
        return html.toString();
    }

    /**
     * Extracts the content between {@code <body ...>} and {@code </body} from a
     * UTF-8 encoded HTML file.
     *
     * <p>Each content line is terminated with {@code "\n"}. In lines containing
     * {@code src}, whitespace after {@code <img} is collapsed to one space and
     * {@link #IMAGE_SRC_PREFIX} is inserted at the start of the {@code src} value
     * when {@code src} is the first attribute of the tag.
     *
     * @param file HTML file to read
     * @return the body content; whatever was read so far when an I/O error occurs
     */
    public String readHtml(File file) {
        StringBuilder body = new StringBuilder();
        try (BufferedReader htmlReader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
            boolean insideBody = false;
            String line;
            while ((line = htmlReader.readLine()) != null) {
                boolean tagLine = false;
                if (!insideBody) {
                    Matcher open = BODY_OPEN.matcher(line);
                    if (!open.find()) {
                        continue;
                    }
                    insideBody = true;
                    tagLine = true;
                    // Keep content that shares the line with the opening tag.
                    line = line.substring(open.end());
                }

                Matcher close = BODY_CLOSE.matcher(line);
                boolean lastLine = close.find();
                if (lastLine) {
                    tagLine = true;
                    // Keep content that precedes the closing tag on the same line.
                    line = line.substring(0, close.start());
                }

                // A tag standing alone on its line contributes nothing.
                if (!(tagLine && line.trim().isEmpty())) {
                    body.append(prefixImageSources(line)).append("\n");
                }
                if (lastLine) {
                    break;
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return body.toString();
    }

    /** Applies the {@code <img src=...>} rewriting described on {@link #readHtml(File)}. */
    private static String prefixImageSources(String line) {
        if (!line.toLowerCase(Locale.ROOT).contains("src")) {
            return line;
        }
        String normalized = IMG_TAG_START.matcher(line).replaceAll("<img ");
        return IMG_LEADING_SRC.matcher(normalized)
                .replaceAll("$0" + Matcher.quoteReplacement(IMAGE_SRC_PREFIX));
    }

    /**
     * Removes every tab and newline character.
     *
     * @param str text to clean, may be {@code null}
     * @return the cleaned text, or an empty string for {@code null}
     */
    public String replaceBreak(String str) {
        return str == null ? "" : TABS_AND_NEWLINES.matcher(str).replaceAll("");
    }

    /**
     * Writes {@code content} to {@code path} as UTF-8, creating missing parent folders.
     * I/O failures are reported on standard error and not propagated.
     *
     * @param content text to write
     * @param path    target file
     */
    public static void writeFile(String content, String path) {
        File file = new File(path);
        File parent = file.getParentFile();
        if (parent != null) {
            parent.mkdirs();
        }
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(file), StandardCharsets.UTF_8))) {
            writer.write(content);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes the SQL script to {@link #SQL_FILE} as UTF-8, replacing an existing file.
     * The completion message is printed only when the write succeeded.
     *
     * @param sb script text
     */
    public static void writeFile(StringBuffer sb) {
        try (BufferedWriter out = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(SQL_FILE), StandardCharsets.UTF_8))) {
            out.write(sb.toString());
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }
        System.out.println("sql文件寫入完成");
    }

    /**
     * Generates a random six-digit number.
     *
     * @return a value between 100000 and 999999 as text
     */
    public static String randomCode() {
        int num = (int) ((Math.random() * 9 + 1) * 100000);
        return String.valueOf(num);
    }

    /**
     * Generates a folder name of the form {@code <current millis>_<six random digits>}.
     *
     * @return the generated name
     */
    public static String generateImageName() {
        return System.currentTimeMillis() + "_" + randomCode();
    }

}

運行handleWordToSql測試方法即可！

效果圖：
原word文檔
apache