PDFBox(一個BSD許可下的源碼開放項目)是一個爲開發人員讀取和建立PDF文檔而準備的純Java類庫。
主要功能:
提取文本,包括Unicode字符
和Jakarta Lucene等文本搜索引擎的整合過程十分簡單
加密/解密PDF文檔
從PDF和XFDF格式中導入或導出表單數據
向已有PDF文檔中追加內容
將一個PDF文檔切分爲多個文檔
覆蓋PDF文檔
示例代碼使用的jar
<!-- Apache PDFBox artifact required by the example code -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.14</version>
</dependency>
1.讀取PDF內容
// Load the PDF and print all of its text to standard output.
// try-with-resources closes the document even when creating the stripper or
// extracting the text throws; the original only closed it on the success path.
try (PDDocument helloDocument = PDDocument.load(new File("XXX.pdf"))) {
    PDFTextStripper textStripper = new PDFTextStripper();
    System.out.println(textStripper.getText(helloDocument));
} catch (IOException e) {
    e.printStackTrace();
}
2.讀取PDF內容與位置
public class GetWordsAndPositionFromPDF extends PDFTextStripper{ //用來記錄,PDF中讀到的詞 static List<String> words = new ArrayList<String>(); public GetWordsAndPositionFromPDF() throws IOException { super(); } /** * 解析PDF文檔 * 從PDF中讀取文字以及文字的位置 */ public static void main( String[] args ) throws IOException { PDDocument document = null; String fileName = "XXX.pdf"; // replace with your PDF file name try { document = PDDocument.load( new File(fileName) ); PDFTextStripper stripper = new GetWordsAndPositionFromPDF(); stripper.setSortByPosition( true ); stripper.setStartPage( 0 ); stripper.setEndPage( document.getNumberOfPages() ); Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream()); stripper.writeText(document, dummy); // print words for(String word:words){ System.out.println(word); } } finally { if( document != null ) { document.close(); } } } @Override protected void writeString(String text) throws IOException{ String[] wordsInStream = text.split(getWordSeparator()); if(wordsInStream!=null){ for(String word :wordsInStream){ words.add(word); } } } @Override protected void writeString(String text, List<TextPosition> textPositions) throws IOException{ //讀到的詞,以及位置 for (int i = 0; i < textPositions.size(); i++) { String str = textPositions.get(i).getUnicode(); String x = "x=" + textPositions.get(i).getX(); String y = "y=" + textPositions.get(i).getY(); System.out.println(" textPositions.get(i) = " + str + " x="+x + " y="+y); } writeString(text); } }