問題:需要將大量已存在的word文檔導入到web項目裏並在網站上展現,無法通過人工編輯錄入的方式處理,只能通過程序實現。
解決思路:先讀取word文檔並轉換成html,再獲取html中的富文本內容,拼接成sql,導入數據庫。
要點
1、讀取word文件夾時會遞歸讀取,只要文件夾下有word文檔即可;程序中有過濾word文檔的代碼,可根據需要修改;
2、可同時處理word2003和word2007+版本的word文檔;
3、讀取word2007生成的html文檔內中文是unicode編碼的,放到數據庫或用瀏覽器直接打開,都不影響頁面顯示;
4、對word文檔中的圖片做了處理,存儲到單獨的文件夾;導入mysql或其它數據庫後要正確顯示圖片,需注意路徑處理;
5、只處理word2007文檔時可不生成html而直接獲取富文本內容,但2003版本不可以,因此統一將doc文檔都生成html頁面,再用java讀取html文檔,獲取body元素下的富文本內容。
不足之處,歡迎交流和指正。
可能用到的pom依賴如下:
<!--poi處理word文檔-->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.15</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.15</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>fr.opensagres.xdocreport.document</artifactId>
    <version>2.0.1</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
    <version>1.0.6</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.core</artifactId>
    <version>1.0.6</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.pdf</artifactId>
    <version>1.0.6</version>
</dependency>
完整的java代碼如下:
package test; import org.apache.commons.lang.StringUtils; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.converter.WordToHtmlConverter; import org.apache.poi.hwpf.usermodel.Picture; import org.apache.poi.xwpf.converter.core.FileImageExtractor; import org.apache.poi.xwpf.converter.core.FileURIResolver; import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter; import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.junit.Test; import org.w3c.dom.Document; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import java.io.*; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; /* * @Desc word轉化爲html * @Author ls 2019/3/19 0019 11:21 */ public class Word2Html { @Test public void handleWordToSql() { String path = "F:\\word文檔\\"; List<String> fileNames = new ArrayList<>(); Map<String, String> contentsMap = new HashMap<>(); getAllFileName(path, fileNames); // fileNames.forEach(System.out::println); // System.out.println(fileNames.size()); Map<String, String> map = handleFileName(fileNames); String imagePath = "F:\\images\\"; String htmlPath = "F:\\html\\"; map.forEach((k, v) -> { String content = ""; String articleName = k.substring(0, k.lastIndexOf(".")); String htmlName = articleName + ".html"; if (k.contains(".doc") && !k.contains(".docx")) { try { content = word2003ToHtml(imagePath + generateImageName() + "\\", v, htmlPath + htmlName); } catch (TransformerException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (ParserConfigurationException e) { e.printStackTrace(); } } else if (k.contains(".docx")) { try { 
content = word2007ToHtml(imagePath + generateImageName() + "\\", v, htmlPath + htmlName); } catch (IOException e) { e.printStackTrace(); } } else { System.out.println("word文檔格式不正確->" + k); } if (StringUtils.isNotBlank(content)) { contentsMap.put(articleName, content); } }); System.out.println("文章總條數: " + contentsMap.size()); // contents.forEach(System.out::println); handleInsertSql(contentsMap); } /** * 處理sql * * @param contentsMap * @return */ public String handleInsertSql(Map<String, String> contentsMap) { StringBuffer sb = new StringBuffer(); int id = 161; for (String k : contentsMap.keySet()) { sb.append("INSERT INTO `article` VALUES (" + id + ", '2019-03-19 14:24:56', '2019-03-19 10:53:56', '0', '網絡文章', \"" + contentsMap.get(k).replace("\"", "\\\'") + "\", '0', '', '', null, null, null, '" + k + "', '1', null, '0', null)"); sb.append(" ;"); sb.append("\r\n"); id++; } String data = sb.toString(); data = data.replace("F:\\images\\", "/images/"); writeFile(new StringBuffer(data)); // System.out.println(data); return data; } /** * 路徑處理爲map key-> word文件名 value-> 全路徑 * * @param list * @return */ public Map<String, String> handleFileName(List<String> list) { if (list.size() == 0) { System.out.println("沒有須要處理的文件!"); } // 過濾非word文檔路徑 for (int i = 0; i < list.size(); i++) { String str = list.get(i); if (str.contains(".doc") || str.contains(".docx")) { } else { list.remove(str); i--; } } Map<String, String> map = new HashMap<>(); for (String path : list) { if (StringUtils.isNotBlank(path)) { String[] arr = path.split("\\\\"); for (String value : arr) { if (value.contains(".doc") || value.contains(".docx")) { // 文件名 全路徑 map.put(value, path); } } } } return map; } /** * 獲取全部文件夾及文件 * * @param path * @param listFileName */ public void getAllFileName(String path, List<String> listFileName) { File file = new File(path); File[] files = file.listFiles(); String[] names = file.list(); if (names != null) { String[] completNames = new String[names.length]; for (int i = 0; i < 
names.length; i++) { // if(path.contains(".doc") || path.contains(".docx")) completNames[i] = path + names[i]; } listFileName.addAll(Arrays.asList(completNames)); } for (File a : files) { if (a.isDirectory()) { //若是文件夾下有子文件夾,獲取子文件夾下的全部文件全路徑。 getAllFileName(a.getAbsolutePath() + "\\", listFileName); } } } /** * word2003轉換 * * @param imgPath * @param fileName * @param outPutFile */ public String word2003ToHtml(String imgPath, String fileName, String outPutFile) throws TransformerException, IOException, ParserConfigurationException { HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(fileName)); WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter( DocumentBuilderFactory.newInstance().newDocumentBuilder() .newDocument()); wordToHtmlConverter.setPicturesManager((content, pictureType, suggestedName, widthInches, heightInches) -> suggestedName); wordToHtmlConverter.processDocument(wordDocument); //save pictures List pics = wordDocument.getPicturesTable().getAllPictures(); if (pics != null) { for (int i = 0; i < pics.size(); i++) { Picture pic = (Picture) pics.get(i); System.out.println(); try { pic.writeImageContent(new FileOutputStream(imgPath + pic.suggestFullFileName())); } catch (FileNotFoundException e) { e.printStackTrace(); } } } Document htmlDocument = wordToHtmlConverter.getDocument(); ByteArrayOutputStream out = new ByteArrayOutputStream(); DOMSource domSource = new DOMSource(htmlDocument); StreamResult streamResult = new StreamResult(out); TransformerFactory tf = TransformerFactory.newInstance(); Transformer serializer = tf.newTransformer(); serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8"); serializer.setOutputProperty(OutputKeys.INDENT, "yes"); serializer.setOutputProperty(OutputKeys.METHOD, "html"); serializer.transform(domSource, streamResult); out.close(); writeFile(new String(out.toByteArray()), outPutFile); String body = readHtml(new File((outPutFile))); // System.out.println(out.toString()); body = 
replaceBreak(body); // System.out.println(body); return body; } /** * word2007轉html * * @throws IOException */ public String word2007ToHtml(String imgPath, String fileName, String htmlName) throws IOException { File f = new File(fileName); String content = ""; if (!f.exists()) { System.out.println("Sorry File does not Exists!"); } else { if (f.getName().endsWith(".docx") || f.getName().endsWith(".docx")) { // 1) 加載word文檔生成 XWPFDocument對象 InputStream in = new FileInputStream(f); if (in.available() == 0) { return content; } XWPFDocument document = new XWPFDocument(in); // 2) 解析 XHTML配置 (這裏設置IURIResolver來設置圖片存放的目錄) File imageFolderFile = new File(imgPath); XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile)); options.setExtractor(new FileImageExtractor(imageFolderFile)); options.setIgnoreStylesIfUnused(false); options.setFragment(true); // 3) 將 XWPFDocument轉換成XHTML OutputStream out = new FileOutputStream(new File(htmlName)); XHTMLConverter.getInstance().convert(document, out, options); //也可使用字符數組流獲取解析的內容 ByteArrayOutputStream baos = new ByteArrayOutputStream(); XHTMLConverter.getInstance().convert(document, baos, options); if (baos.size() == 0) { return content; } content = baos.toString(); // 輸出轉化後的文本 // System.out.println(content); baos.close(); } else { System.out.println("Enter only MS Office 2007+ files"); } } return content; } /** * 解析html文件 得到body內容 * * @param file * @return */ public String readHtml(File file) { String body = ""; try { FileInputStream iStream = new FileInputStream(file); Reader reader = new InputStreamReader(iStream); BufferedReader htmlReader = new BufferedReader(reader); String line; boolean found = false; while (!found && (line = htmlReader.readLine()) != null) { if (line.toLowerCase().indexOf("<body") != -1) { // 在<body>的前面可能存在空格 found = true; } } found = false; while (!found && (line = htmlReader.readLine()) != null) { if (line.toLowerCase().indexOf("</body") != -1) { found = true; } else { // 
若是存在圖片,則將相對路徑轉換爲絕對路徑 String lowerCaseLine = line.toLowerCase(); if (lowerCaseLine.contains("src")) { //這裏是定義圖片的訪問路徑 String directory = "D:/test"; // 若是該行存在多個<img>元素,則分行進行替代 String[] splitLines = line.split("<img\\s+"); // <img後帶一個或多個空格 // 由於java中引用的問題不能使用for each for (int i = 0; i < splitLines.length; i++) { if (splitLines[i].toLowerCase().startsWith("src")) { splitLines[i] = splitLines[i].substring(0, splitLines[i].toLowerCase().indexOf("src") + 5) + directory + splitLines[i].substring(splitLines[i].toLowerCase().indexOf("src") + 5); } } // 最後進行拼接 line = ""; for (int i = 0; i < splitLines.length - 1; i++) { // 循環次數要-1,由於最後一個字符串後不須要添加<img line = line + splitLines[i] + "<img "; } line = line + splitLines[splitLines.length - 1]; } body = body + line + "\n"; } } htmlReader.close(); // System.out.println(body); } catch (Exception e) { e.printStackTrace(); } return body; } /** * 去掉換行 * * @param str * @return */ public String replaceBreak(String str) { String dest = ""; if (str != null) { Pattern p = Pattern.compile("\\t|\n"); Matcher m = p.matcher(str); dest = m.replaceAll(""); } return dest; } /** * sql寫入文件 * @param content * @param path */ public static void writeFile(String content, String path) { FileOutputStream fos = null; BufferedWriter bw = null; try { File file = new File(path); fos = new FileOutputStream(file); bw = new BufferedWriter(new OutputStreamWriter(fos, "utf-8")); bw.write(content); } catch (FileNotFoundException fnfe) { fnfe.printStackTrace(); } catch (IOException ioe) { ioe.printStackTrace(); } finally { try { if (bw != null) bw.close(); if (fos != null) fos.close(); } catch (IOException ie) { } } } /** * sql寫入文件 */ public static void writeFile(StringBuffer sb) { String sqlTxtFile = "F:\\articleSql.sql"; try { File writeName = new File(sqlTxtFile); // 相對路徑,若是沒有則要創建一個新的output.txt文件 writeName.createNewFile(); // 建立新文件,有同名的文件的話直接覆蓋 try (FileWriter writer = new FileWriter(writeName); BufferedWriter out = new BufferedWriter(writer) ) { 
out.write(sb.toString()); // \r\n即爲換行 out.flush(); // 把緩存區內容壓入文件 } } catch (IOException e) { e.printStackTrace(); } System.out.println("sql文件寫入完成"); } // 生成6位隨機數 public static String randomCode(){ int num = (int)((Math.random()*9+1)*100000); return String.valueOf(num); } /** * 生成隨機字符串 時間戳_6位隨機數 * @return */ public static String generateImageName(){ String name = String.valueOf(System.currentTimeMillis()) + "_" + randomCode(); return name; } }
運行handleWordToSql測試方法即可!
效果圖:
原word文檔
(截圖略)
處理後的html
(截圖略)
對應的html圖片文件夾
(截圖略)
生成的sql
(截圖略)