將 PDF 中的文字讀取出來處理時還有一些限制:1. 文檔的安全屬性不能過於嚴格;2. 文檔中不能存在圖片。
有兩種讀取方式(PDFBox 與 iText),Maven 對應的 pom 文件依賴如下:
<dependencies>
    <!-- Apache PDFBox: used by PdfboxUtil (PDFParser / PDFTextStripper). -->
    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>pdfbox</artifactId>
        <version>1.8.8</version>
    </dependency>
    <!-- iText 5: used by ItextpdfUtil (PdfReader / PdfTextExtractor). -->
    <dependency>
        <groupId>com.itextpdf</groupId>
        <artifactId>itextpdf</artifactId>
        <version>5.0.6</version>
    </dependency>
</dependencies>
/**
 * PdfboxUtil.java
 */
package com.hsm.pdfTest;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;

import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

/**
 * Extracts the text of a PDF file with Apache PDFBox and writes it to an output file.
 *
 * @author hsm
 */
public class PdfboxUtil {
    /** PDF file that is read. */
    private static final String PDFPATH = "D:/Maven權威指南中文版.pdf";
    /** File the extracted text is written to. */
    private static final String FILEPATH = "D:/Maven權威指南中文版.doc";

    /**
     * Reads the text of {@link #PDFPATH} and writes it to {@link #FILEPATH}.
     *
     * @param args unused
     * @throws Exception if the PDF cannot be opened, parsed or read
     */
    public static void main(String[] args) throws Exception {
        String content = getPdfContent(PDFPATH);
        toFile(content, FILEPATH);
    }

    /**
     * Returns the text of every page of the given PDF, from page 1 to the last page,
     * without position sorting.
     *
     * @param pdfPath path of the PDF file to read
     * @return the extracted text
     * @throws Exception if the file cannot be opened, parsed or its text extracted
     */
    private static String getPdfContent(String pdfPath) throws Exception {
        InputStream input = new FileInputStream(new File(pdfPath));
        try {
            // Load the PDF document.
            PDFParser parser = new PDFParser(input);
            parser.parse();
            PDDocument document = parser.getPDDocument();
            try {
                // Extract the text of all pages.
                PDFTextStripper pts = new PDFTextStripper();
                pts.setSortByPosition(false);
                int endPage = document.getNumberOfPages();
                System.out.println("Total Page: " + endPage);
                pts.setStartPage(1);
                pts.setEndPage(endPage);
                String content = pts.getText(document);
                System.out.println("Get PDF Content ...");
                return content;
            } finally {
                // Nested finally blocks: a failure while closing one resource
                // can no longer prevent the other from being closed.
                document.close();
            }
        } finally {
            input.close();
        }
    }

    /**
     * Writes {@code content} to {@code filePath}, creating or overwriting the file.
     * I/O failures are reported on standard error and not propagated.
     *
     * @param content  text to write
     * @param filePath path of the output file
     */
    private static void toFile(String content, String filePath) {
        BufferedWriter output = null;
        try {
            System.out.println("Write PDF Content to txt file ...");
            // FileWriter creates the file when it does not exist.
            // NOTE(review): FileWriter encodes with the platform default charset.
            output = new BufferedWriter(new FileWriter(new File(filePath)));
            output.write(content);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (output != null) {
                try {
                    // close() also flushes, so a late write failure surfaces here.
                    output.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
package com.hsm.pdfTest;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;

/**
 * Extracts the text of a PDF file with iText and prints it or writes it to an output file.
 *
 * @author hsm
 */
public class ItextpdfUtil {
    /** PDF file that is read. */
    private static final String PDFPATH = "D:/Maven權威指南中文版.pdf";
    /** File the extracted text is written to. */
    private static final String FILEPATH = "D:/Maven權威指南中文版.doc";

    /**
     * Prints the text of {@link #PDFPATH} to standard output, then extracts it again
     * and writes it to {@link #FILEPATH}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String content = getPdfContent(PDFPATH);
        System.out.println(content);
        toFile(PDFPATH, FILEPATH);
    }

    /**
     * Returns the text of every page of the given PDF, concatenated in page order.
     * On an I/O failure the stack trace is printed and the text collected so far is returned.
     *
     * @param pdfPath path of the PDF file to read
     * @return the extracted text, possibly partial or empty after a failure
     */
    private static String getPdfContent(String pdfPath) {
        StringBuilder buff = new StringBuilder();
        PdfReader reader = null;
        try {
            reader = new PdfReader(pdfPath);
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            int num = reader.getNumberOfPages(); // page count; iText pages are 1-based
            for (int i = 1; i <= num; i++) {
                TextExtractionStrategy strategy =
                        parser.processContent(i, new SimpleTextExtractionStrategy());
                buff.append(strategy.getResultantText());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
        return buff.toString();
    }

    /**
     * Extracts the text of the given PDF and writes it to the given file.
     * The PDF is read completely before the output file is opened, so a failed read
     * does not create or truncate the output file. I/O failures are reported on
     * standard error and not propagated.
     *
     * @param pdfPath  path of the PDF file to read
     * @param filePath path of the output file
     */
    private static void toFile(String pdfPath, String filePath) {
        PdfReader reader = null;
        PrintWriter writer = null;
        try {
            reader = new PdfReader(pdfPath);
            int num = reader.getNumberOfPages(); // page count
            System.out.println("Total Page: " + num);
            StringBuilder content = new StringBuilder(); // accumulated document text
            for (int i = 1; i <= num; i++) {
                // Append the text of page i.
                content.append(PdfTextExtractor.getTextFromPage(reader, i));
            }
            writer = new PrintWriter(new FileOutputStream(filePath));
            writer.write(content.toString());
            // PrintWriter never throws on write; checkError() flushes and reports failures.
            if (writer.checkError()) {
                System.err.println("Failed to write extracted text to " + filePath);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (writer != null) {
                writer.close();
            }
            if (reader != null) {
                reader.close();
            }
        }
    }
}