基於Python3.x pandas實現大疫情的查重功能
已在 Win7 32 位及 64 位系統上簡單測試,沒有問題;XP 及 Win10 未測試。感興趣的小夥伴可以參考源碼,自行使用 Python 運行!
查重的文件請使用 CSV 格式,最好是從大疫情系統下載之後、僅刪除開頭兩行空白行的 csv 文件。
初衷
- 每月要對疫情卡片進行查重,而服務器只提供一段時間的查重功能,沒法對指定時間的卡片進行查重!
- 用 Excel 查重效率太低,而且文件太大的話比較吃電腦的配置!
思路及代碼
- 用 Python 自帶的 Tk 做成 GUI,更加通用好用。
- 用pandas包來實現數據的查重功能,具體見代碼註釋!源代碼見文後:
查重的相關規則
- 默認爲「名字拼音+疾病名稱」與「身份證號+疾病名稱」兩種查重的交集(身份證查重的優先級高於名字拼音)。如勾選了「性別」「現住地址國標」,則判斷重卡的依據就是:姓名拼音+性別+現住地址國標+疾病名稱均相同則斷定爲重卡!其它條件類似!
其它注意事項:
- 由於程序是由 Python 打包而來,雙擊之後請稍等(解析速度較慢)!
- 若殺毒軟件彈出提示,請點擊「允許」!
- 若雙擊之後出現找不到某個動態庫的提示(如下界面),請前往 https://www.microsoft.com/zh-... 下載安裝 VC2015 之後再運行。
- 若安裝上述下載文件時出錯,請用殺毒軟件下載系統更新補丁後重試!
- 若在使用過程中有什麼疑問或好的建議,可以發送郵件到 ztwenxing@dingtalk.com(有時間會回覆)。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from pinyin import pinyin
from tkinter import Tk, filedialog, Canvas, messagebox, StringVar, IntVar
from tkinter import Label, Entry, Button, Checkbutton
# from PIL import ImageTk, Image
from pandas import DataFrame, read_csv, Series, to_datetime
# --- Main window and background canvas --------------------------------------
# Root window with a fixed 700x395 geometry.
window = Tk()
window.geometry('700x395')
window.title('大疫情查重用')

# Canvas size in pixels (same numbers as the window geometry); also used by
# the layout code to compute widget positions.
width = 700
height = 395

# Borderless canvas filling the window; every widget is placed on it through
# canvas.create_window().
canvas = Canvas(window, bd=0, highlightthickness=0, width=700, height=395)
# Image item at the canvas centre. No image option is passed (the PIL-based
# background picture is not loaded), so presumably nothing visible is drawn.
canvas.create_image(width / 2, height / 2)
canvas.pack()

# --- Layout rows -------------------------------------------------------------
# Height of one layout row in pixels; the expression evaluates to 39.5.
height_x = height / (height / 39.5)

# Row 1: banner label, horizontally centred.
label_text = Label(window, text="此小程序主要用大疫情網絡的疫情卡片查重!!!")
col_num = 1
canvas.create_window(width * (9 / 18), height_x * col_num, window=label_text)

# col_num holds the current layout row; row 2 is used by the input-file widgets.
col_num = 2
def choiceFileCallBack():
    """Ask the user for the input CSV file and reflect the choice in the UI.

    Opens a file-open dialog filtered to ``*.csv``.  When the first selected
    path ends in ``.csv`` (case-insensitive) it is written to ``en_text`` and
    the OK button (``button3``) is enabled.  Otherwise ``en_text`` is cleared,
    an info dialog is shown and the OK button is disabled.
    """
    filenames = filedialog.askopenfilenames(filetypes=[("csv文件", "*.csv")])

    # Dialog cancelled / nothing selected.
    if not filenames:
        en_text.set("")
        messagebox.showinfo("未選擇", "未選擇文件,請選擇")
        button3.configure(state="disabled")
        return

    # Only the first selected file is used.
    chosen = str(filenames[0])

    # Test the extension rather than looking for "csv" anywhere in the path:
    # a substring test accepts e.g. "C:/csvdata/a.txt" and rejects "A.CSV".
    if not chosen.lower().endswith(".csv"):
        en_text.set("")
        messagebox.showinfo("請選擇csv文件格式", "未選擇csv格式文件,請從新選擇!!")
        button3.configure(state="disabled")
        return

    en_text.set(chosen)
    # "normal" is the enabled state; "active" is the look Tk uses while the
    # pointer hovers over the button.
    button3.configure(state="normal")
# Layout row for the input file: caption, path entry and "choose" button.
label1 = Label(window, text="須要讀取的文件路徑:")
canvas.create_window(width * (2 / 18), height_x * col_num, window=label1, width=170)

# Tk variable backing the entry below; starts out empty.
en_text = StringVar(value='')
entry = Entry(window, textvariable=en_text)
canvas.create_window(width * (9 / 18), height_x * col_num, window=entry, width=360)

# Opens the file-open dialog.
button1 = Button(window, command=choiceFileCallBack, text="選擇")
canvas.create_window(width * (16 / 18), height_x * col_num, window=button1, width=80, height=30)
# Choose where the duplicate report is saved.
def choiceSaveCallBack():
    """Ask the user for the output CSV path and reflect the choice in the UI.

    Opens a save-as dialog filtered to ``*.csv``.  When a path is chosen it is
    written to ``en1_text`` (with ``.csv`` appended only if the name does not
    already end in it) and the OK button (``button3``) is enabled.  When the
    dialog is cancelled ``en1_text`` is cleared, an info dialog is shown and
    the OK button is disabled.
    """
    filename = filedialog.asksaveasfilename(filetypes=[("csv文件", "*.csv")])

    # A cancelled dialog yields an empty value; the truthiness test also covers
    # a non-string empty result such as an empty tuple.
    if not filename:
        en1_text.set("")
        messagebox.showinfo("未選擇", "未選擇保存路徑請選擇!")
        button3.configure(state="disabled")
        return

    save_path = str(filename)
    # Avoid producing "report.csv.csv" when the user typed the extension.
    if not save_path.lower().endswith(".csv"):
        save_path += ".csv"

    en1_text.set(save_path)
    # "normal" is the enabled state; "active" is the look Tk uses while the
    # pointer hovers over the button.
    button3.configure(state="normal")
# Row 3: caption, path entry and "choose" button for the output file.
col_num = 3
label2 = Label(window, text="查重的保存路徑及文件名:")
canvas.create_window(width * (2 / 18), height_x * col_num, window=label2, width=170)

# Tk variable backing the entry below; starts out empty.
en1_text = StringVar(value='')
entry1 = Entry(window, textvariable=en1_text)
canvas.create_window(width * (9 / 18), height_x * col_num, window=entry1, width=360)

# Opens the save-as dialog.
button2 = Button(window, command=choiceSaveCallBack, text="選擇")
canvas.create_window(width * (16 / 18), height_x * col_num, window=button2, width=80, height=30)
### Logic layer
# Handler for the OK button.
def hellook():
    """Run the duplicate check on the chosen CSV and save the report.

    The input path is read from ``entry`` and the output path from ``entry1``.
    Two passes are made, both including the optional columns ticked through
    CheckVar3..CheckVar8:

    * keyed on 有效證件號 (rows whose ID is missing are excluded), and
    * keyed on the pinyin of 患者姓名.

    The two results are concatenated with the ID-based rows first, reduced to
    one row per 卡片編號_last (so ID-based hits take precedence), sorted by
    報告單位地區編碼_last and 報告卡錄入時間_last in descending order and
    written as a GB18030-encoded CSV.  Failures to read the input or to write
    the output are reported with an info dialog.
    """
    # Local import: the module header only imports DataFrame, read_csv,
    # Series and to_datetime.  concat replaces DataFrame.append, which was
    # removed in pandas 2.0.
    from pandas import concat

    def hanzi2pinyin(sr, *args, **kwargs):
        """Return a list with each value of *sr* converted to tone-less pinyin."""
        return [pinyin.get(i, format="strip", delimiter="") for i in sr]

    def shengcheng_chongka(data, checklist, aeslist="報告卡錄入時間"):
        """Return one row per group of duplicate cards found in *data*.

        data      -- DataFrame of cards; must contain a "name" column, the
                     columns listed in *checklist* and the *aeslist* column.
        checklist -- columns whose values are concatenated into the
                     duplicate key (missing values become "_").
        aeslist   -- column used as secondary sort key, so that the "first"
                     and "last" card of a group are the ones with the
                     smallest / largest value in that column.

        Every original column appears twice in the result: suffixed "_last"
        (from the last card of the group) and "_f" (from the first card).
        The column 重複卡片數 holds the number of cards in the group.
        """
        # Renumber the rows before building the key so that the key Series
        # and the data share the same 0..n-1 index; otherwise str.cat would
        # align on labels that no longer match after rows were dropped.
        rows = data.reset_index(drop=True)
        # dtype=object keeps the .str accessor usable when there are no rows.
        key = Series([""] * len(rows), dtype=object).str.cat(rows[checklist], na_rep="_")
        rows = rows.assign(chachong=key)
        rows = rows.sort_values(by=["chachong", aeslist])

        # Keep only rows whose key occurs more than once.
        dups = rows[rows.duplicated(subset="chachong", keep=False)]
        dups = dups.reset_index(drop=True)

        # Group sizes as a one-column frame indexed by key.  The column name
        # is set explicitly because the name value_counts() gives its result
        # differs between pandas versions.
        sizes = dups["chachong"].value_counts()
        chachong_num = DataFrame({"chachong": sizes.values}, index=sizes.index.values)

        first_rows = dups.drop_duplicates(subset="chachong", keep="first")
        last_rows = dups.drop_duplicates(subset="chachong", keep="last")
        # "chachong" exists on both sides, so the suffixes produce
        # chachong_last (the key) and chachong_f (the group size).
        last_rows = last_rows.join(chachong_num, on="chachong", lsuffix='_last', rsuffix='_f')

        # Put the last and the first card of every group side by side.
        zong_sf = last_rows.join(first_rows.set_index("chachong"), on="chachong_last",
                                 lsuffix='_last', rsuffix='_f')
        zong_sf = zong_sf.drop(columns=['name_last', 'chachong_last', 'name_f'])
        zong_sf = zong_sf.rename(columns={"chachong_f": "重複卡片數"})
        return zong_sf

    # Optional criteria chosen by the user; read once and shared by both passes.
    optional_flags = [CheckVar3.get(), CheckVar4.get(), CheckVar5.get(), CheckVar6.get(),
                      CheckVar7.get(), CheckVar8.get()]

    # Debug output: display names of all ticked criteria.
    name_lists = ("患者姓名", "有效證件號", "性別", '聯繫電話', '現住地址國標', '疾病名稱', '病例分類', '病例分類2')
    check_lists = [CheckVar1.get(), CheckVar2.get()] + optional_flags
    check_list = [label for label, flag in zip(name_lists, check_lists) if flag == 1]
    print(check_list)

    try:
        rc_data = read_csv(filepath_or_buffer=entry.get(), encoding="GB18030")
    except Exception:
        messagebox.showinfo(title='unfortunately ', message='打開文件出錯,請檢查!')
        # Nothing was loaded, so there is nothing to check.
        return

    ### Prepare the data
    rc_data1 = rc_data.copy()
    rc_data1["name"] = hanzi2pinyin(rc_data1["患者姓名"])
    rc_data1["現住地址國標"] = rc_data1["現住地址國標"].map(str)
    rc_data1["報告卡錄入時間"] = to_datetime(rc_data1["報告卡錄入時間"])

    ## Column names matching the check boxes; "name" is the pinyin column.
    key_columns = ["name", "有效證件號", "性別", '聯繫電話', '現住地址國標', '疾病名稱', '病例分類', '病例分類2']

    def selected_columns(flags):
        """Return the entries of key_columns whose flag equals 1."""
        return [column for column, flag in zip(key_columns, flags) if flag == 1]

    # Pass 1: ID number plus optional criteria, ignoring rows without an ID.
    check_list_sf = selected_columns([0, 1] + optional_flags)
    chachong_data_sf = rc_data1.dropna(subset=["有效證件號"])
    zong_sf = shengcheng_chongka(data=chachong_data_sf, checklist=check_list_sf)

    # Pass 2: name pinyin plus optional criteria.
    check_list_nm = selected_columns([1, 0] + optional_flags)
    zong_nm = shengcheng_chongka(data=rc_data1, checklist=check_list_nm)

    ## Merge both passes; ID-based rows come first and keep="first" lets them
    ## win when the same card is reported by both passes.
    zong = concat([zong_sf, zong_nm])
    zong = zong.drop_duplicates(subset="卡片編號_last", keep="first")
    zong = zong.sort_values(by=["報告單位地區編碼_last", "報告卡錄入時間_last"], ascending=False)

    try:
        # Write the merged result (not only the ID-based pass).
        zong.to_csv(entry1.get(), index=False, encoding="GB18030")
    except Exception:
        messagebox.showinfo(title='unfortunately ', message='保存文件出錯,請檢查!')
    else:
        infomessage = "查重完畢!文件保存在{}".format(entry1.get())
        messagebox.showinfo(title='unfortunately ', message=infomessage)
# Row 4: the OK button runs the duplicate check.
col_num = 4
button3 = Button(window, command=hellook, text="OK")
canvas.create_window(width * (9 / 18), height_x * col_num, window=button3, width=80, height=30)

# Row 5: caption of the basic-information criteria.
col_num = 5
label3 = Label(window, text="基礎信息:")
canvas.create_window(width * (2 / 18), height_x * col_num, window=label3, width=170)

# Options shared by every check box below.
_check_opts = dict(onvalue=1, offvalue=0, height=5, width=20)

# Row 5.8: check boxes for the basic-information criteria.
col_num = 5.8
CheckVar1, CheckVar2, CheckVar3, CheckVar4, CheckVar5 = (IntVar() for _ in range(5))
# Name and ID number are created disabled and are ticked below.
C1 = Checkbutton(window, text="姓 名", variable=CheckVar1, state="disabled", **_check_opts)
C2 = Checkbutton(window, text="有效證件號", variable=CheckVar2, state="disabled", **_check_opts)
C3 = Checkbutton(window, text="性 別", variable=CheckVar3, **_check_opts)
C4 = Checkbutton(window, text="聯繫電話", variable=CheckVar4, **_check_opts)
C5 = Checkbutton(window, text="現住地址國標", variable=CheckVar5, **_check_opts)
# All five boxes start ticked.
for _box in (C1, C2, C3, C4, C5):
    _box.select()
# (horizontal slot in eighteenths of the width, widget, pixel width)
for _slot, _box, _box_width in ((2, C1, 80), (5, C2, 80), (8, C3, 80), (11, C4, 80), (14, C5, 100)):
    canvas.create_window(width * (_slot / 18), height_x * col_num, width=_box_width, height=30, window=_box)

# Row 6.4: caption of the disease-information criteria.
col_num = 6.4
label4 = Label(window, text="疾病信息:")
canvas.create_window(width * (2 / 18), height_x * col_num, window=label4, width=170)

# Row 7.2: check boxes for the disease-information criteria.
col_num = 7.2
CheckVar6, CheckVar7, CheckVar8 = (IntVar() for _ in range(3))
# Disease name is created disabled and ticked; the two case classifications
# start unticked.
C6 = Checkbutton(window, text="疾病名稱", variable=CheckVar6, state="disabled", **_check_opts)
C6.select()
C7 = Checkbutton(window, text="病例分類", variable=CheckVar7, **_check_opts)
C8 = Checkbutton(window, text="病例分類2", variable=CheckVar8, **_check_opts)
for _slot, _box in ((2, C6), (5, C7), (8, C8)):
    canvas.create_window(width * (_slot / 18), height_x * col_num, width=80, height=30, window=_box)

# Row 8: footer with contact address and source link (rebinds label4).
col_num = 8
label4 = Label(window, text="問題反饋:ztwenxing@dingtalk.com 源碼及說明:https://segmentfault.com/a/1190000018570381")
canvas.create_window(width * (9 / 18), height_x * col_num, window=label4)

# Hand control to the Tk event loop.
window.mainloop()