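# Module setup: pip install pdfminer.six
# NOTE(review): the original note ended with the word "加密" ("encryption"),
# fused onto the install command; presumably it refers to handling encrypted
# PDFs -- confirm against the original write-up.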
import re

from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.converter import TextConverter, PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfpage import PDFPage

# Extract the text of one PDF with pdfminer and append it to exam3.txt,
# dropping the "(cid:N)" placeholders along the way.

# Directory that holds the input PDF.
root = r'C:\Users\jiaotianhang\Downloads\pdf'

# "(cid:N)" placeholders show up in the extracted text where a glyph could
# not be mapped to a character; compiled once because it is used for every
# text element.
_CID_MARKER = re.compile(r'\(cid:\d+\)')

# Open the PDF in a ``with`` block so the handle is always closed, even when
# parsing fails part-way through (the original never closed it).
with open('%s/%s' % (root, 'ghi.pdf'), 'rb') as fp:
    # Parser bound to the open file.
    parser = PDFParser(fp)
    # Document object, linked to the parser.
    doc = PDFDocument(parser=parser)
    parser.set_document(doc=doc)
    # NOTE(review): for an encrypted PDF a password has to be supplied;
    # presumably via PDFDocument(parser, password=...) -- confirm against the
    # installed pdfminer.six version.

    # Resource manager used by both the device and the interpreter.
    resource = PDFResourceManager()
    # Layout-analysis parameters (library defaults).
    laparam = LAParams()
    # Aggregator device: collects each processed page into a layout object.
    device = PDFPageAggregator(resource, laparams=laparam)
    # Page interpreter that renders pages into the device.
    interpreter = PDFPageInterpreter(resource, device)

    # The output file is opened lazily, on the first non-blank chunk, so that
    # (as in the original) exam3.txt is not created when there is nothing to
    # write. It is then kept open instead of being reopened per chunk.
    fw = None
    try:
        # Walk every page of the document.
        for page in PDFPage.get_pages(fp):
            # Render the page, then fetch the aggregated layout for it.
            interpreter.process_page(page)
            layout = device.get_result()
            for out in layout:
                # Only layout objects exposing get_text() carry text.
                if not hasattr(out, 'get_text'):
                    continue
                ooo = _CID_MARKER.sub('', out.get_text())
                # Skip chunks that are blank once the markers are removed.
                if not ooo.strip():
                    continue
                if fw is None:
                    fw = open('exam3.txt', 'a', encoding='utf-8')
                fw.write(ooo)
    finally:
        if fw is not None:
            fw.close()