一:下載所需要的庫
1:pdfminer 安裝庫命令:pip install pdfminer3k
pdfminer3k是pdfminer的Python 3移植版。PDFMiner是從PDF文檔中提取信息的工具。與其他PDF相關的工具不同,它完全專注於獲取和分析文本數據。PDFMiner允許獲取頁面中文本的確切位置,以及其他信息,如字體或線條。它包含一個PDF轉換器,可以將PDF文件轉換爲其他文本格式(如HTML)。它有一個可擴展的PDF解析器,可用於文本分析以外的其他目的。
2:docx 安裝庫命令:pip install python-docx
Python DocX目前是Python OpenXML的一部分,你可以用它打開Word 2007及之後的文檔,而用它保存的文檔可以在Microsoft Office 2007/2010、Microsoft Mac Office 2008、Google Docs、OpenOffice.org 3和Apple iWork 08中打開。
"""Download a PDF over HTTP, extract its text with pdfminer3k and save it."""
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
from docx import Document

# Not used by this script; kept because the original creates it at import time.
document = Document()

import warnings

# Installed before the remaining imports, so warnings they emit while being
# imported are silenced as well.
warnings.filterwarnings("ignore")

from pdfminer.converter import TextConverter
from io import BytesIO, StringIO
from urllib.request import urlopen
import pandas as pd


def readPDF(pdfFile):
    """Return the text content of a PDF as one string.

    Args:
        pdfFile: Binary file-like object containing the PDF data.

    Returns:
        Everything pdfminer's ``TextConverter`` wrote while processing
        the document.
    """
    rsrcmgr = PDFResourceManager()
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            # Release the converter even when parsing fails.
            device.close()
        # Read the buffer before the ``with`` block closes it.
        return retstr.getvalue()


def save_to_file(file_name, contents):
    """Write ``contents`` to ``file_name``, replacing any existing file.

    The file is written as UTF-8 so that non-ASCII characters extracted
    from a PDF do not fail on platforms whose default encoding is narrower.

    Args:
        file_name: Path of the file to create or overwrite.
        contents: Text to write.
    """
    with open(file_name, 'w', encoding='utf-8') as fh:
        fh.write(contents)


# NOTE: demo call from the original article; it runs at import time and
# writes a placeholder file into the current working directory.
save_to_file('mobiles.txt', 'your contents str')


def main():
    """Fetch the sample chapter PDF and dump its text to ``c.csv``."""
    url = "http://pythonscraping.com/pages/warandpeace/chapter1.pdf"
    # Buffer the whole download so the parser receives a seekable in-memory
    # stream (an HTTP response object cannot seek), and so the connection is
    # closed as soon as the body has been read.
    with urlopen(url) as response:
        pdf_stream = BytesIO(response.read())
    outputString = readPDF(pdf_stream)
    # Original note here read "c.word" - presumably an alternative output name.
    save_to_file('c.csv', outputString)


if __name__ == '__main__':
    main()
使用docx 保存爲word
"""Convert a local PDF into a Word document using pdfminer3k and python-docx."""
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
from docx import Document

# Word document that main() fills with the extracted text.
document = Document()

import warnings

warnings.filterwarnings("ignore")

import os

# Path of the PDF to convert. The original stored an ``os.open`` file
# descriptor (opened read-write at import time) under this name; a plain
# path opened read-only inside main() is sufficient and works for
# read-only files too.
file_name = '/Users/dudu/Desktop/test1/a.pdf'


def main():
    """Copy the text of every page of ``file_name`` into ``a.docx``.

    Each layout object that exposes ``get_text`` becomes one paragraph
    in the output document.
    """
    with open(file_name, 'rb') as fn:
        parser = PDFParser(fn)
        doc = PDFDocument()
        # Parser and document must be linked in both directions.
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialisation step from the pdfminer3k usage docs; an empty
        # string means "no password".
        doc.initialize('')

        resource = PDFResourceManager()
        device = PDFPageAggregator(resource, laparams=LAParams())
        interpreter = PDFPageInterpreter(resource, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for out in layout:
                # Only layout objects that carry text are copied.
                if hasattr(out, "get_text"):
                    # Replace non-breaking spaces with ordinary ones.
                    content = out.get_text().replace(u'\xa0', u' ')
                    # NOTE(review): 'ListBullet' looks like a style id; newer
                    # python-docx versions expect the name 'List Bullet' -
                    # confirm against the installed version.
                    document.add_paragraph(content, style='ListBullet')

    document.save('a.docx')
    print('處理完成')


if __name__ == '__main__':
    main()
加下面的公衆號,我會定期發一些資料。