Tika 是個功能強大的項目,這裏展示一下如何使用 Tika 將 PDF 轉爲 HTML。
<!-- start of tika -->
<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-core -->
<dependency>
    <groupId>org.apache.tika</groupId>
    <artifactId>tika-core</artifactId>
    <version>1.16</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-parsers -->
<dependency>
    <groupId>org.apache.tika</groupId>
    <artifactId>tika-parsers</artifactId>
    <version>1.16</version>
</dependency>
<!-- end of tika -->
/**
 * Parses the given file with Tika's auto-detecting parser and returns the
 * extracted content serialized as an HTML string.
 *
 * <p>The file is read fully into memory, parsed, and the resulting SAX events
 * are serialized through a JAXP {@code TransformerHandler} configured for
 * indented, UTF-8 encoded HTML output.
 *
 * @param file the document to convert (e.g. a PDF)
 * @return the HTML markup produced for the document
 * @throws IOException if the file cannot be read, the serializer cannot be
 *         created, or parsing fails (SAX, Tika and transformer-configuration
 *         failures are wrapped as the cause)
 */
public static String extractHtml(File file) throws IOException {
    // Read the whole document first; parsing then runs against an in-memory copy.
    byte[] content = Files.toByteArray(file);
    AutoDetectParser parser = new AutoDetectParser();
    ByteArrayOutputStream htmlBuffer = new ByteArrayOutputStream();

    SAXTransformerFactory saxFactory =
            (SAXTransformerFactory) SAXTransformerFactory.newInstance();
    TransformerHandler serializer;
    try {
        serializer = saxFactory.newTransformerHandler();
    } catch (TransformerConfigurationException e) {
        throw new IOException(e);
    }

    // Configure the serializer: HTML output method, pretty-printed, UTF-8 bytes.
    serializer.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
    serializer.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
    serializer.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
    serializer.setResult(new StreamResult(htmlBuffer));

    // Wrap the serializer in Tika's title-handling decorator; presumably this keeps an
    // empty <title> from being emitted in self-closing form -- confirm against Tika docs.
    ExpandedTitleContentHandler titleAwareHandler = new ExpandedTitleContentHandler(serializer);

    ByteArrayInputStream input = new ByteArrayInputStream(content);
    Metadata metadata = new Metadata();
    try {
        parser.parse(input, titleAwareHandler, metadata);
    } catch (SAXException | TikaException e) {
        throw new IOException(e);
    }

    // Decode with the same charset the serializer was told to write.
    return htmlBuffer.toString("UTF-8");
}
效果對比的話,pdfDom 的效果好一點。