Python3讀取pdf文檔,輸出內容(txt)
import os
import re
from io import StringIO
from io import open
from urllib.request import urlopen

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, process_pdf


def readPDF(pdfFile):
    """Extract the text content of a PDF.

    Args:
        pdfFile: An open binary file-like object positioned at the start of
            a PDF document; it is handed to pdfminer's ``process_pdf``.

    Returns:
        The text that pdfminer's ``TextConverter`` wrote while processing
        the document, as a single string.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # The buffer and the converter are released even if parsing fails.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            device.close()
        return retstr.getvalue()


def main():
    """Scan a directory of PDF reports and print two figures from each one.

    For every ``*.pdf`` file in the hard-coded directory, the text is
    extracted, newlines are removed, and two regular expressions pull out
    the text following the "生產總值" and "公共預算收入" headings. Files in
    which a pattern does not match are reported as "no data".
    """
    # pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf")
    # Raw string: same runtime value as the original literal, but without
    # the invalid "\政" escape sequence.
    filesdir = r"D:\0.shenma\01.聊城資料\政府工做報告\2019政府工做報告全文"
    os.chdir(filesdir)
    files = os.listdir()
    print(files)

    # Compiled once; each pattern has two groups, and group 2 is the value.
    gdp_pattern = re.compile("生產總值(完成)?(.+?)億元")
    # NOTE(review): this pattern ends at an ASCII comma; if the reports use
    # full-width commas the match may run much further than intended -- confirm.
    revenue_pattern = re.compile("公共預算收入(完成)?(.+?),")

    for file in files:
        if not file.endswith(".pdf"):
            continue

        # The context manager closes the file even if extraction raises.
        with open(file, 'rb') as pdfFile:
            outputString = readPDF(pdfFile)
        # print(outputString)

        # Join lines so that a figure split across a line break still matches.
        outputString2 = outputString.replace("\n", "")
        try:
            gdp = gdp_pattern.findall(outputString2)[0][1]
            print(file, "--", "生產總值完成", "--", gdp)
            ggyssr = revenue_pattern.findall(outputString2)[0][1]
            print(file, "--", "通常公共預算收入完成", "--", ggyssr)
        except IndexError:
            # findall() returned an empty list: the heading is not in this file.
            print(file, "--", "no data")

        # Optional dump of the extracted text, left disabled:
        # with open(file + ".txt", 'w+', encoding="utf-8") as fh:
        #     fh.write(outputString2)


if __name__ == '__main__':
    main()
【轉自】:http://www.javashuo.com/article/p-kcqpmcbm-dt.html
僅作記錄,供查。