word和.txt文件轉html 及pdf文件，使用poi jsoup itext心得

時間 2019-11-18

標籤 word txt 文件 html pdf 使用 poi jsoup itext 心得欄目 Microsoft Office 简体版

原文原文鏈接

 word和.txt文件轉html 及pdf文件， 使用poi jsoup  itext心得
本人第一次寫博客，如有不足或者需要改正的地方，希望大家指出來，一起學習交流討論。
因爲在項目中遇到了這個問題，在網上也找了不少方法，感覺千篇一律，總有一些問題，所以總結出word轉html和pdf文件的使用方法。
雖然poi功能不是很強大，但畢竟不依賴本地office軟件。另外還有一種方式，使用jacob也可以將word轉html，不過這種方式要依賴本地office，而且只能在windows平臺下使用，不支持unix系統。
jacob使用起來還是比較簡單的，如果大家需要jacob的使用方法，我會分享給大家。
關於.txt文件轉html，就是使用io操作將.txt文件讀取出來，然後寫入到html中，不需要額外的jar包。

注意：使用poi需要注意以下幾項。我在做這個功能的時候沒有注意到這些問題的存在，一直找不出原因，還請各位大牛指正一下爲什麼。

    1.使用office生成的文檔時，.doc和.docx格式都沒有問題；但使用wps生成的word文檔時，只能轉.doc格式的文件，.docx格式的文檔轉出後沒有圖片，得不到img標籤。
    2.在使用word文檔轉pdf格式的文件時，生成的pdf沒有中文，對中文顯示不是很支持。
    3.在將word轉成pdf時，需要把生成的html文件轉化成標準的html文件，否則解析後會出現<meta>或者<img>標籤不閉合的情況。
    4.使用的jar包如下，都可以在maven中央倉庫下載得到。

下面就直接附上代碼了，希望大家有什麼問題在下面評論，互相交流和學習，使用時直接調用方法即可。如果大家覺得可以，請點一個贊，謝謝大家。

package com.kqco.tools;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.jsoup.Jsoup;
import org.w3c.dom.Document;
import org.w3c.tidy.Tidy;
import org.xhtmlrenderer.pdf.ITextFontResolver;
import org.xhtmlrenderer.pdf.ITextRenderer;
import com.lowagie.text.pdf.BaseFont;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * Converts Word ({@code .doc} / {@code .docx}) and plain-text files to HTML, and
 * provides helpers for turning that HTML into XHTML and PDF.
 *
 * <p>Every HTML/XHTML file written by this class is encoded as UTF-8, and the
 * XHTML/PDF helpers read their input as UTF-8 accordingly.
 */
public class FileConverter {

	/** Encoding of every HTML/XHTML file this class writes or reads back. */
	private static final String HTML_ENCODING = "UTF-8";

	/** Encoding assumed for the input of the two-argument {@link #txtToHtml(String, String)}. */
	private static final String DEFAULT_TXT_ENCODING = "GBK";

	/** Font file registered with the PDF renderer so CJK text can be rendered. */
	private static final String CJK_FONT_PATH = "C:/Windows/Fonts/simsun.ttc";

	/** Buffer size used when copying files. */
	private static final int COPY_BUFFER_SIZE = 8192;

	/**
	 * Converts a Word file to an HTML file, dispatching on the file extension.
	 * The extension check is case-insensitive.
	 *
	 * @param sourceFilePath     path of the source Word file ({@code .doc} or {@code .docx})
	 * @param targetFilePosition path of the HTML file to generate
	 * @throws RuntimeException if the path does not end in {@code .doc} or {@code .docx}
	 * @throws Exception        if reading, converting or writing fails
	 */
	public void wordToHtml(String sourceFilePath, String targetFilePosition) throws Exception {
		// ".docx" must be tested first; both checks are suffix matches on the whole path.
		if (hasExtension(sourceFilePath, ".docx")) {
			docxToHtml(sourceFilePath, targetFilePosition);
		} else if (hasExtension(sourceFilePath, ".doc")) {
			docToHtml(sourceFilePath, targetFilePosition);
		} else {
			throw new RuntimeException("文件格式不正確");
		}
	}

	/** Returns whether {@code path} ends with {@code extension}, ignoring case. */
	private static boolean hasExtension(String path, String extension) {
		int length = extension.length();
		return path.length() >= length
				&& path.regionMatches(true, path.length() - length, extension, 0, length);
	}

	/**
	 * Returns the directory containing {@code filePath}. The path is made absolute
	 * first so that a bare file name still has a parent.
	 */
	private static Path parentDirectory(String filePath) {
		return Paths.get(filePath).toAbsolutePath().getParent();
	}

	/**
	 * Converts a {@code .doc} file to HTML. Embedded pictures are written to an
	 * {@code image} directory next to the target file.
	 *
	 * @param sourceFilePath     path of the source {@code .doc} file
	 * @param targetFilePosition path of the HTML file to generate
	 * @throws Exception if reading, converting or writing fails
	 */
	private void docToHtml(String sourceFilePath, String targetFilePosition) throws Exception {
		final File imageDir = parentDirectory(targetFilePosition).resolve("image").toFile();
		Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
		WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);
		// Save each picture and return the URL that the generated HTML should reference.
		wordToHtmlConverter.setPicturesManager(new PicturesManager() {
			@Override
			public String savePicture(byte[] content, PictureType pictureType, String name, float width, float height) {
				// Best effort: a picture that cannot be stored must not abort the conversion.
				try {
					// Result deliberately unchecked; a failure surfaces when the file is opened.
					imageDir.mkdirs();
					try (FileOutputStream out = new FileOutputStream(new File(imageDir, name))) {
						out.write(content);
					}
				} catch (Exception e) {
					System.err.println("Failed to save picture " + name + " to " + imageDir);
					e.printStackTrace();
				}
				// NOTE(review): this URL only points at the saved file when the target
				// directory is itself named "tmp" - confirm the intended layout.
				return "../tmp/image/" + name;
			}
		});

		// Close the source stream as soon as the document has been processed.
		try (FileInputStream in = new FileInputStream(sourceFilePath)) {
			wordToHtmlConverter.processDocument(new HWPFDocument(in));
		}

		Transformer serializer = TransformerFactory.newInstance().newTransformer();
		serializer.setOutputProperty(OutputKeys.ENCODING, HTML_ENCODING);
		serializer.setOutputProperty(OutputKeys.INDENT, "yes");
		serializer.setOutputProperty(OutputKeys.METHOD, "html");
		// Hand the transformer a stream we own so it is always closed.
		try (OutputStream out = new FileOutputStream(targetFilePosition)) {
			serializer.transform(new DOMSource(wordToHtmlConverter.getDocument()), new StreamResult(out));
		}
	}

	/**
	 * Converts a {@code .docx} file to HTML. Pictures are extracted below
	 * {@code ../tmp/image/word/media} relative to the target file's directory, and
	 * the generated HTML references them through the same relative path.
	 *
	 * @param sourceFilePath path of the source {@code .docx} file
	 * @param targetFileName path of the HTML file to generate
	 * @throws Exception if reading, converting or writing fails
	 */
	private void docxToHtml(String sourceFilePath, String targetFileName) throws Exception {
		String imagePathStr = parentDirectory(targetFileName).resolve("../tmp/image/word/media").toString();
		XHTMLOptions options = XHTMLOptions.create();
		// Directory the pictures are extracted to.
		options.setExtractor(new FileImageExtractor(new File(imagePathStr)));
		// Path prefix used for pictures inside the generated HTML.
		options.URIResolver(new BasicURIResolver("../tmp/image/word/media"));

		try (FileInputStream in = new FileInputStream(sourceFilePath)) {
			// Parse before opening the target so a broken source does not truncate it.
			XWPFDocument document = new XWPFDocument(in);
			try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(targetFileName), HTML_ENCODING)) {
				XHTMLConverter xhtmlConverter = (XHTMLConverter) XHTMLConverter.getInstance();
				xhtmlConverter.convert(document, writer, options);
			}
		}
	}

	/**
	 * Converts a GBK-encoded text file to HTML. Problems are reported on standard
	 * output instead of being thrown; see
	 * {@link #txtToHtml(String, String, String)} for the throwing variant.
	 *
	 * @param filePath     path of the source text file
	 * @param htmlPosition path of the HTML file to generate
	 */
	public void txtToHtml(String filePath, String htmlPosition) {
		try {
			if (!new File(filePath).isFile()) { // isFile() is false for missing paths as well
				System.out.println("找不到指定的文件");
				return;
			}
			txtToHtml(filePath, htmlPosition, DEFAULT_TXT_ENCODING);
		} catch (Exception e) {
			System.out.println("讀取文件內容出錯");
			e.printStackTrace();
		}
	}

	/**
	 * Converts a text file to HTML: each input line is HTML-escaped and followed
	 * by a {@code <br/>} tag. The output is written as UTF-8.
	 *
	 * @param filePath     path of the source text file
	 * @param htmlPosition path of the HTML file to generate
	 * @param encoding     name of the charset the source file is encoded in
	 * @throws IOException if the source cannot be read, the target cannot be
	 *                     written, or {@code encoding} is not supported
	 */
	public void txtToHtml(String filePath, String htmlPosition, String encoding) throws IOException {
		try (FileInputStream in = new FileInputStream(filePath);
				BufferedReader reader = new BufferedReader(new InputStreamReader(in, encoding));
				FileOutputStream out = new FileOutputStream(htmlPosition);
				BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, HTML_ENCODING))) {
			String line;
			while ((line = reader.readLine()) != null) {
				writer.write(escapeHtml(line));
				writer.write("<br/>");
			}
		}
	}

	/** Escapes the characters that would otherwise be interpreted as HTML markup. */
	private static String escapeHtml(String text) {
		StringBuilder escaped = new StringBuilder(text.length() + 16);
		for (int i = 0; i < text.length(); i++) {
			char c = text.charAt(i);
			switch (c) {
			case '&':
				escaped.append("&amp;");
				break;
			case '<':
				escaped.append("&lt;");
				break;
			case '>':
				escaped.append("&gt;");
				break;
			default:
				escaped.append(c);
			}
		}
		return escaped.toString();
	}

	/**
	 * Copies a file (for example an extracted picture) to another location. The
	 * source file is left in place; an existing target is overwritten.
	 *
	 * @param sourceFilePath     path of the file to copy
	 * @param targetFilePosition path to copy the file to
	 * @throws IOException if the source cannot be read or the target cannot be written
	 */
	public void changeImageUrl(String sourceFilePath, String targetFilePosition) throws IOException {
		try (FileInputStream in = new FileInputStream(sourceFilePath);
				FileOutputStream out = new FileOutputStream(targetFilePosition)) {
			byte[] buffer = new byte[COPY_BUFFER_SIZE];
			int read;
			while ((read = in.read(buffer)) != -1) {
				out.write(buffer, 0, read);
			}
		}
	}

	/**
	 * Runs an HTML file through JTidy and writes the result as XHTML. Input and
	 * output are both treated as UTF-8. Failures are printed and reported through
	 * the return value rather than thrown.
	 *
	 * @param f_in    path of the source HTML file
	 * @param outfile path of the XHTML file to write
	 * @return {@code true} if the XHTML file was written, {@code false} on any error
	 */
	private boolean parseToXhtml(String f_in, String outfile) {
		PrintWriter tidyMessages = new PrintWriter(System.out);
		try {
			// Tidy's output is buffered in memory and written afterwards, so the
			// input file is fully read before the output file is opened.
			ByteArrayOutputStream xhtml = new ByteArrayOutputStream();
			try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(f_in))) {
				Tidy tidy = new Tidy();
				tidy.setInputEncoding(HTML_ENCODING);
				tidy.setOutputEncoding(HTML_ENCODING);
				tidy.setQuiet(true);
				tidy.setShowWarnings(true); // warnings go to tidyMessages
				tidy.setIndentContent(true);
				tidy.setSmartIndent(true);
				tidy.setIndentAttributes(false);
				tidy.setWraplen(1024); // column at which lines are wrapped
				tidy.setXHTML(true); // emit XHTML
				tidy.setErrout(tidyMessages);
				tidy.parse(in, xhtml);
			}
			try (FileOutputStream out = new FileOutputStream(outfile)) {
				xhtml.writeTo(out);
			}
			return true;
		} catch (Exception ex) {
			ex.printStackTrace();
			return false;
		} finally {
			// Without a flush, buffered Tidy messages would never reach System.out.
			tidyMessages.flush();
		}
	}

	/**
	 * Renders an XHTML file to PDF, resolving relative resources such as images
	 * against the XHTML file's own location.
	 *
	 * @param inputFile  path of the source XHTML file
	 * @param outputFile path of the PDF file to write
	 * @return always {@code true}; failures are reported by exception
	 * @throws Exception if the font cannot be loaded or rendering/writing fails
	 */
	private boolean convertHtmlToPdf(String inputFile, String outputFile) throws Exception {
		return convertHtmlToPdf(inputFile, outputFile, null);
	}

	/**
	 * Renders an XHTML file to PDF.
	 *
	 * @param inputFile  path of the source XHTML file
	 * @param outputFile path of the PDF file to write
	 * @param imagePath  base URL for relative resources, e.g. {@code file:/D:/test};
	 *                   {@code null} or empty to resolve them against {@code inputFile}
	 * @return always {@code true}; failures are reported by exception
	 * @throws Exception if the font cannot be loaded or rendering/writing fails
	 */
	private boolean convertHtmlToPdf(String inputFile, String outputFile, String imagePath) throws Exception {
		ITextRenderer renderer = new ITextRenderer();
		String url = new File(inputFile).toURI().toURL().toString();
		renderer.setDocument(url);
		// Register a font with CJK glyphs so Chinese text can be rendered.
		ITextFontResolver fontResolver = renderer.getFontResolver();
		fontResolver.addFont(CJK_FONT_PATH, BaseFont.IDENTITY_H, BaseFont.NOT_EMBEDDED);
		// Override the base URL for relative resources only when one is supplied.
		if (imagePath != null && !imagePath.isEmpty()) {
			renderer.getSharedContext().setBaseURL(imagePath);
		}
		renderer.layout();
		// Open the target only once layout succeeded, so a failure leaves no empty PDF.
		try (OutputStream os = new FileOutputStream(outputFile)) {
			renderer.createPDF(os);
			os.flush();
		}
		return true;
	}

	/**
	 * Rewrites an HTML file in place so that it is suitable for XHTML/PDF
	 * processing: normalises the {@code meta} tags, gives {@code meta} and
	 * {@code img} elements content so they are serialised with a closing tag,
	 * and selects the SimSun font on {@code body}.
	 *
	 * @param targetHtml path of the HTML file to rewrite
	 * @throws IOException if the file cannot be read or written
	 */
	private static void standardHTML(String targetHtml) throws IOException {
		File f = new File(targetHtml);
		org.jsoup.nodes.Document doc = Jsoup.parse(f, HTML_ENCODING);
		doc.select("meta").removeAttr("name");
		doc.select("meta").attr("content", "text/html; charset=UTF-8");
		doc.select("meta").attr("http-equiv", "Content-Type");
		doc.select("meta").html("&nbsp");
		doc.select("img").html("&nbsp");
		doc.select("style").attr("mce_bogus", "1");
		// font-family is a CSS property, so it has to go into the style attribute;
		// any style already present on body is kept.
		for (org.jsoup.nodes.Element body : doc.select("body")) {
			String style = body.attr("style").trim();
			if (!style.isEmpty() && !style.endsWith(";")) {
				style += ";";
			}
			body.attr("style", style + "font-family: SimSun;");
		}
		// NOTE(review): jsoup's HTML parser may serialise this declaration as a
		// comment rather than a processing instruction - confirm.
		doc.select("html").before("<?xml version='1.0' encoding='UTF-8'?>");

		// jsoup only modifies the in-memory tree; write the result back explicitly.
		String html = doc.html();
		try (OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(f, false), HTML_ENCODING)) {
			osw.write(html);
		}
	}
}

相關標籤/搜索

文件

word和.txt文件轉html 及pdf文件， 使用poi jsoup itext心得

word和.txt文件轉html 及pdf文件，使用poi jsoup itext心得