哪些是將PDF文件轉換爲文本的最佳Python模塊? html
該PDFMiner包已經改變,由於codeape公佈。 python
編輯(再次): git
PDFMiner已在版本20100213
再次更新 github
您能夠使用如下內容檢查已安裝的版本: 工具
>>> import pdfminer >>> pdfminer.__version__ '20100213'
這是更新版本(包含我更改/添加內容的評論): this
def pdf_to_csv(filename): from cStringIO import StringIO #<-- added so you can copy/paste this to try it from pdfminer.converter import LTTextItem, TextConverter from pdfminer.pdfparser import PDFDocument, PDFParser from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter class CsvConverter(TextConverter): def __init__(self, *args, **kwargs): TextConverter.__init__(self, *args, **kwargs) def end_page(self, i): from collections import defaultdict lines = defaultdict(lambda : {}) for child in self.cur_item.objs: if isinstance(child, LTTextItem): (_,_,x,y) = child.bbox #<-- changed line = lines[int(-y)] line[x] = child.text.encode(self.codec) #<-- changed for y in sorted(lines.keys()): line = lines[y] self.outfp.write(";".join(line[x] for x in sorted(line.keys()))) self.outfp.write("\n") # ... the following part of the code is a remix of the # convert() function in the pdfminer/tools/pdf2text module rsrc = PDFResourceManager() outfp = StringIO() device = CsvConverter(rsrc, outfp, codec="utf-8") #<-- changed # becuase my test documents are utf-8 (note: utf-8 is the default codec) doc = PDFDocument() fp = open(filename, 'rb') parser = PDFParser(fp) #<-- changed parser.set_document(doc) #<-- added doc.set_parser(parser) #<-- added doc.initialize('') interpreter = PDFPageInterpreter(rsrc, device) for i, page in enumerate(doc.get_pages()): outfp.write("START PAGE %d\n" % i) interpreter.process_page(page) outfp.write("END PAGE %d\n" % i) device.close() fp.close() return outfp.getvalue()
編輯(再次): spa
如下是pypi , 20100619p1
最新版本的更新。 總之我換成LTTextItem
與LTChar
並經過LAParams的實例向CsvConverter構造。 unix
def pdf_to_csv(filename): from cStringIO import StringIO from pdfminer.converter import LTChar, TextConverter #<-- changed from pdfminer.layout import LAParams from pdfminer.pdfparser import PDFDocument, PDFParser from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter class CsvConverter(TextConverter): def __init__(self, *args, **kwargs): TextConverter.__init__(self, *args, **kwargs) def end_page(self, i): from collections import defaultdict lines = defaultdict(lambda : {}) for child in self.cur_item.objs: if isinstance(child, LTChar): #<-- changed (_,_,x,y) = child.bbox line = lines[int(-y)] line[x] = child.text.encode(self.codec) for y in sorted(lines.keys()): line = lines[y] self.outfp.write(";".join(line[x] for x in sorted(line.keys()))) self.outfp.write("\n") # ... the following part of the code is a remix of the # convert() function in the pdfminer/tools/pdf2text module rsrc = PDFResourceManager() outfp = StringIO() device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams()) #<-- changed # becuase my test documents are utf-8 (note: utf-8 is the default codec) doc = PDFDocument() fp = open(filename, 'rb') parser = PDFParser(fp) parser.set_document(doc) doc.set_parser(parser) doc.initialize('') interpreter = PDFPageInterpreter(rsrc, device) for i, page in enumerate(doc.get_pages()): outfp.write("START PAGE %d\n" % i) if page is not None: interpreter.process_page(page) outfp.write("END PAGE %d\n" % i) device.close() fp.close() return outfp.getvalue()
編輯(再一次): code
更新版本20110515
(感謝Oeufcoque Penteano!): htm
def pdf_to_csv(filename): from cStringIO import StringIO from pdfminer.converter import LTChar, TextConverter from pdfminer.layout import LAParams from pdfminer.pdfparser import PDFDocument, PDFParser from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter class CsvConverter(TextConverter): def __init__(self, *args, **kwargs): TextConverter.__init__(self, *args, **kwargs) def end_page(self, i): from collections import defaultdict lines = defaultdict(lambda : {}) for child in self.cur_item._objs: #<-- changed if isinstance(child, LTChar): (_,_,x,y) = child.bbox line = lines[int(-y)] line[x] = child._text.encode(self.codec) #<-- changed for y in sorted(lines.keys()): line = lines[y] self.outfp.write(";".join(line[x] for x in sorted(line.keys()))) self.outfp.write("\n") # ... the following part of the code is a remix of the # convert() function in the pdfminer/tools/pdf2text module rsrc = PDFResourceManager() outfp = StringIO() device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams()) # becuase my test documents are utf-8 (note: utf-8 is the default codec) doc = PDFDocument() fp = open(filename, 'rb') parser = PDFParser(fp) parser.set_document(doc) doc.set_parser(parser) doc.initialize('') interpreter = PDFPageInterpreter(rsrc, device) for i, page in enumerate(doc.get_pages()): outfp.write("START PAGE %d\n" % i) if page is not None: interpreter.process_page(page) outfp.write("END PAGE %d\n" % i) device.close() fp.close() return outfp.getvalue()
我須要將特定的PDF轉換爲python模塊中的純文本。 我使用PDFMiner 20110515,在閱讀了他們的pdf2txt.py工具後,我編寫了這個簡單的代碼片斷:
from cStringIO import StringIO from pdfminer.pdfinterp import PDFResourceManager, process_pdf from pdfminer.converter import TextConverter from pdfminer.layout import LAParams def to_txt(pdf_path): input_ = file(pdf_path, 'rb') output = StringIO() manager = PDFResourceManager() converter = TextConverter(manager, output, laparams=LAParams()) process_pdf(manager, converter, input_) return output.getvalue()
因爲這些解決方案都沒有支持最新版本的PDFMiner,所以我編寫了一個簡單的解決方案,它將使用PDFMiner返回pdf文本。 這適用於那些使用process_pdf
獲取導入錯誤的人
import sys from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfminer.pdfpage import PDFPage from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter from pdfminer.layout import LAParams from cStringIO import StringIO def pdfparser(data): fp = file(data, 'rb') rsrcmgr = PDFResourceManager() retstr = StringIO() codec = 'utf-8' laparams = LAParams() device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams) # Create a PDF interpreter object. interpreter = PDFPageInterpreter(rsrcmgr, device) # Process each page contained in the document. for page in PDFPage.get_pages(fp): interpreter.process_page(page) data = retstr.getvalue() print data if __name__ == '__main__': pdfparser(sys.argv[1])
請參閱如下適用於Python 3的代碼:
import sys from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfminer.pdfpage import PDFPage from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter from pdfminer.layout import LAParams import io def pdfparser(data): fp = open(data, 'rb') rsrcmgr = PDFResourceManager() retstr = io.StringIO() codec = 'utf-8' laparams = LAParams() device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams) # Create a PDF interpreter object. interpreter = PDFPageInterpreter(rsrcmgr, device) # Process each page contained in the document. for page in PDFPage.get_pages(fp): interpreter.process_page(page) data = retstr.getvalue() print(data) if __name__ == '__main__': pdfparser(sys.argv[1])
試試PDFMiner 。 它能夠從PDF文件中提取HTML,SGML或「Tagged PDF」格式的文本。
Tagged PDF格式彷佛是最乾淨的,剝離XML標籤只留下裸文本。
Python 3版本可在如下位置得到:
PDFminer在我試過的pdf文件的每一頁上都給了我一行[第1頁,共7頁]。
我到目前爲止最好的答案是pdftoipe,或者它基於Xpdf的c ++代碼。
看看個人問題是pdftoipe的輸出是什麼樣的。