電子發票太多,想統計一下總額卻異常困難,網上的工具又很難用,於是花了 2 個小時自己實現了一份。測試過中石油、京東開具的電子發票,效果還行;部分發票的名稱提取會失敗,但不影響金額統計。有需要的小夥伴自己拿去改吧。Python 程式碼如下:
import cmd
import json
import os
import sys
from pprint import pprint

try:
    import pdfplumber
except ImportError:
    # Keep the shell usable (intro, help) without the PDF backend; `load`
    # reports the missing dependency when it actually has files to parse.
    pdfplumber = None


class FapiaoShell(cmd.Cmd):
    """Interactive shell that extracts fields from e-invoice PDFs.

    The ``load`` command walks a directory, parses the first page of every
    ``.pdf`` file found with pdfplumber, and writes the extracted fields plus
    a grand total to ``結果.json``.

    NOTE(review): every literal matched against PDF text below is written in
    Traditional Chinese. Confirm that this is the script the target PDFs
    actually contain; if they use Simplified Chinese these matches never fire.
    """

    intro = '歡迎使用發票提取工具,輸入?(help)獲取幫助消息和命令列表,CTRL+C退出程序。\n'
    prompt = '\n輸入命令: '
    doc_header = "詳細文檔 (輸入 help <命令>):"
    misc_header = "友情提示:"
    undoc_header = "沒有幫助文檔:"
    nohelp = "*** 沒有命令(%s)的幫助信息 "

    def do_load(self, arg):
        r""" 加載發票 例如:load D:\ """
        # The docstring above is user-facing: cmd.Cmd prints it for
        # `help load`, so it stays in the UI language. It is a raw string so
        # that the trailing backslash of the example path is not an escape.
        #
        # arg: directory to scan (recursively) for PDF files.
        # Side effects: changes the process working directory to the parent
        # of `arg` and writes `結果.json` there.
        if not os.path.isdir(arg):
            print('參數必須是目錄!')
            return

        # Resolve both paths BEFORE changing directory. A relative `arg`
        # would otherwise be re-interpreted against the new cwd, and a bare
        # directory name has an empty dirname, which os.chdir() rejects.
        base = os.path.abspath(os.path.dirname(arg) or os.curdir)
        target = os.path.abspath(arg)

        pdfs = []
        for root, _, files in os.walk(target):
            for fn in files:
                if os.path.splitext(fn)[1].lower() != '.pdf':
                    continue
                fpth = os.path.relpath(os.path.join(root, fn), base)
                print(f'發現pdf文件: {fpth}')
                pdfs.append(fpth)

        if pdfs and pdfplumber is None:
            print('缺少依賴 pdfplumber,請先執行: pip install pdfplumber')
            return

        # Paths in `pdfs` and the output file are relative to `base`.
        os.chdir(base)

        pdf_ctxs = self._parse_pdfs(pdfs)

        # Files that are not invoices (or failed to parse) carry no total;
        # skip them instead of raising KeyError.
        amounts = []
        for fpth, info in pdf_ctxs:
            raw = info.get('總計')
            if raw is None:
                continue
            try:
                amounts.append(float(raw))
            except ValueError:
                print(f'無法識別金額: {fpth} ({raw!r})')

        total = {
            '內容': pdf_ctxs,
            '發票數': len(pdf_ctxs),
            # Amounts have two decimals; rounding hides binary float noise.
            '總計': round(sum(amounts), 2),
        }

        print('\n保存到 結果.json...')
        with open("結果.json", 'w', encoding='utf-8') as json_file:
            json.dump(total, json_file, ensure_ascii=False, sort_keys=True,
                      indent=4, separators=(', ', ': '))
        print('完成!')

    def _parse_pdfs(self, pdfs):
        """Parse each path in *pdfs* and return a list of ``(path, info)``.

        Every input path yields exactly one entry. ``info`` is an empty dict
        when the file is not recognised as an invoice or cannot be parsed.
        """
        result = []
        for fpth in pdfs:
            try:
                info = self._parse_pdf(fpth)
            except Exception as err:
                # Per-file boundary: one unreadable PDF must not abort the
                # whole batch. The failure is reported, not hidden.
                print(f'解析失敗: {fpth} ({err})')
                info = {}
            result.append((fpth, info))
        return result

    def _parse_pdf(self, fpth):
        """Return the fields extracted from the first page of one PDF."""
        with pdfplumber.open(fpth) as pdf:
            if not pdf.pages:
                return {}
            page = pdf.pages[0]

            # Guard against a missing text layer before the substring test.
            text = page.extract_text() or ''
            if '增值稅電子普通發票' not in text:
                return {}

            info = {}
            info.update(self._extrace_from_words(page.extract_words()))
            tables = page.extract_tables()
            if tables:
                info.update(self._extrace_from_table(tables[0]))
            return info

    def _extrace_from_words(self, words):
        """Extract header and footer fields from positioned words.

        *words* is an iterable of dicts with ``'text'``, ``'top'`` and
        ``'bottom'`` keys. Returns a dict holding only the fields that were
        found; incomplete rows are skipped rather than raising.
        """
        info = {}

        # Bucket words by the integer vertical midpoint of their box.
        lines = {}
        for word in words:
            pos = (int(word['top']) + int(word['bottom'])) // 2
            lines.setdefault(pos, []).append(word['text'])

        # Merge buckets lying within 10 units of the FIRST bucket of the
        # current pack (the anchor is deliberately not advanced on merge).
        lines_pack = []
        last_pos = None
        for pos in sorted(lines):
            if lines_pack and pos - last_pos <= 10:
                lines_pack[-1].extend(lines[pos])
                continue
            lines_pack.append(list(lines[pos]))
            last_pos = pos

        for pack in lines_pack:
            for idx, line in enumerate(pack):
                if '電子普通發票' in line:
                    info['標題'] = line
                    continue

                if '發票代碼:' in line:
                    info['發票代碼'] = line.split(':')[1]
                    continue

                if '發票號碼:' in line:
                    info['發票號碼'] = line.split(':')[1]
                    continue

                if '開票日期:' in line:
                    year = line.split(':')[1]
                    # A month is at most two digits; this keeps other
                    # all-digit words in the pack from being picked.
                    month = next(
                        (w for w in pack if w.isdigit() and len(w) <= 2),
                        None)
                    # The label itself contains '日', so exclude it.
                    day = next(
                        (w.split('日')[0] for w in pack
                         if '日' in w and '開票日期:' not in w),
                        None)
                    if month and day:
                        info['開票日期'] = f'{year}-{month}-{day}'
                    continue

                if '機器編號:' in line:
                    number = next(
                        (w for w in pack if w.isdigit() and len(w) > 10),
                        None)
                    if number is not None:
                        info['機器編號'] = number
                    continue

                if '碼:' in line:
                    # Check code: the label word plus up to three following
                    # groups. Slicing tolerates a truncated row.
                    groups = [line.split(':')[1]] + pack[idx + 1:idx + 4]
                    info['校驗碼'] = ' '.join(groups)
                    continue

                if '收款人:' in line:
                    info['收款人'] = line.split(':')[1]
                    continue

                if '開票人:' in line:
                    info['開票人'] = line.split(':')[1]
                    continue

        return info

    def _extrace_from_table(self, table):
        """Extract buyer, item, total and seller fields from the main table.

        *table* is a list of rows, each a list of cell strings (or ``None``).
        Exactly four rows are expected: buyer, details, total, seller.
        Returns an empty dict for any other shape.
        """
        info = {}
        if len(table) != 4:
            return info

        # Buyer
        for cell in table[0]:
            if not cell:
                continue
            for line in cell.splitlines():
                if '名 稱:' in line:
                    info['購買方名稱'] = line.split(':')[1]
                    continue

                if len(line) == 18 and line.isalnum():
                    info['購買方稅號'] = line
                    continue

                if len(line) == 27:
                    info.setdefault('密碼', []).append(line)
                    continue

        # Details
        for cell in table[1]:
            if not cell:
                continue
            lines = cell.splitlines()
            for line in lines:
                if '貨物或應稅勞務、服務名稱' in line:
                    # Everything between the header and the trailing line.
                    info['商品'] = lines[1:-1]
                    break

                if '金 額' in line:
                    # Last line minus its first character.
                    info['總金額'] = lines[-1][1:]
                    break

                if '稅 額' in line:
                    info['總稅額'] = lines[-1][1:]
                    break

        # Grand total
        for cell in table[2]:
            if not cell:
                continue
            for line in cell.splitlines():
                if '¥' in line:
                    # Take what follows the currency sign, so a prefix in
                    # front of it does not end up in the amount.
                    info['總計'] = line.split('¥')[-1].strip()

        # Seller
        for cell in table[3]:
            if not cell:
                continue
            for line in cell.splitlines():
                if '名 稱:' in line:
                    info['銷售方名稱'] = line.split(':')[1]
                    continue

                if len(line) == 18 and line.isalnum():
                    info['銷售方稅號'] = line
                    continue

        return info


if __name__ == '__main__':
    try:
        FapiaoShell().cmdloop()
    except KeyboardInterrupt:
        print('\n\n再見!')