大疫情数据查重Python

CNZPH 发布于2019-07-31 10:11 / 658人阅读

摘要：用Excel查重效率太低，如果文件太大的话比较吃电脑的配置。思路及代码：用Python自带的TK做成GUI，更加普遍好用。

基于Python3.x和pandas实现大疫情的查重功能。系统在win7 32位或64位下简单测试没有问题，XP及win10未测试；如果感兴趣的小伙伴可以参考源码，自行使用Python运行！！查重的文件请使用CSV格式的文件，最好为大疫情下载之后，仅删除头两行空白的csv文件。

初衷

每个月要对疫情卡片进行查重，而服务器只提供一段时间的查重功能，无法对指定时间的卡片进行查重！

用Excel查重，效率太低。如果文件太大的话比较吃电脑的配置!

思路及代码

用Python自带的TK做成GUI更加普遍好用。

用pandas包来实现数据的查重功能，具体见代码注释！源代码见文后：

查重的相关规则

默认按“名字拼音+疾病名称”与“身份证号+疾病名称”分别查重，再取两者结果的并集（身份证查重的优先级高于名字拼音）；如勾选了性别、现住地址国标，则判断重卡的依据就是姓名拼音+性别+现住地址国标+疾病名称都相同即判定为重卡！！其它类似！

其它注意事项：

因为为python打包而来，所以双击之后请稍等（解析速度较慢）！

如果杀毒软件提示，请点击允许！

如果双击之后出现找不到某个动态库的提示（如下界面），请前往 https://www.microsoft.com/zh-... 下载并安装VC2015之后再运行

如果安装以上下载文件出错时，请用杀毒软件下载系统更新补丁后重试！

如果在使用过程中有什么疑问或好的建议，可以发送邮件到ztwenxing@dingtalk.com（有时间会回复）

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from pinyin import pinyin
from tkinter import Tk, filedialog, Canvas, messagebox, StringVar, IntVar
from tkinter import Label, Entry, Button, Checkbutton
# from PIL import ImageTk, Image
from pandas import DataFrame, read_csv, Series, to_datetime

# --- Main window ------------------------------------------------------------
window = Tk()
window.title("大疫情查重用")
window.geometry("700x395")

# Canvas size; the same numbers as the window geometry above.
width = 700
height = 395

# A borderless canvas covers the window. Every widget in this script is placed
# on it through canvas.create_window().
canvas = Canvas(window, width=width, height=height, bd=0, highlightthickness=0)

# Image item at the canvas centre. No image option is passed here; the
# background-picture code that used to supply one is disabled.
canvas.create_image(width / 2, height / 2)
canvas.pack()

# Banner text shown in the first layout row.
label_text = Label(window, text="此小程序主要用大疫情网络的疫情卡片查重！！！")

# Height of one layout row; widgets are positioned at height_x * col_num.
height_x = height / (height / 39.5)

# Row 1: the banner, horizontally centred.
col_num = 1
canvas.create_window(width / 2, height_x * col_num, window=label_text)

# The next row of widgets (input-file path) is laid out on row 2.
col_num = 2


def choiceFileCallBack():
    """Ask the user for the CSV file to check and show its path in the entry.

    Opens a multi-select file dialog but only uses the first selected file.
    On a valid ``.csv`` choice the path is written to ``en_text`` and the OK
    button (``button3``) is enabled. If nothing is selected, or the selected
    file does not have a ``.csv`` extension, the entry is cleared, an
    information dialog is shown and the OK button is disabled.
    """
    filenames = filedialog.askopenfilenames(filetypes=[("csv文件", "*.csv")])

    # The dialog returns an empty value when the user cancels.
    if not filenames:
        en_text.set("")
        messagebox.showinfo("未选择", "未选择文件,请选择")
        button3.configure(state="disabled")
        return

    chosen_path = str(filenames[0])

    # Check the extension rather than searching the whole path for "csv":
    # a substring test accepts e.g. "C:/csv/data.txt" and rejects "DATA.CSV".
    if not chosen_path.lower().endswith(".csv"):
        en_text.set("")
        messagebox.showinfo("请选择csv文件格式", "未选择csv格式文件，请重新选择！！")
        button3.configure(state="disabled")
        return

    en_text.set(chosen_path)
    button3.configure(state="active")


# Row 2 (col_num is still 2 here): caption, path entry and browse button for
# the CSV file that will be read.
label1 = Label(window, text="需要读取的文件路径：")
canvas.create_window(width * (2 / 18), height_x * col_num, window=label1, width=170)

# en_text backs the entry below; choiceFileCallBack writes the chosen path to it.
en_text = StringVar()
en_text.set("")
entry = Entry(window, textvariable=en_text)
canvas.create_window(width * (9 / 18), height_x * col_num, window=entry, width=360)

# Browse button that opens the file-selection dialog.
button1 = Button(window, command=choiceFileCallBack, text="选择")
canvas.create_window(
    width * (16 / 18),
    height_x * col_num,
    window=button1,
    width=80,
    height=30,
)


# 选择保存路径
def choiceSaveCallBack():
    """Ask the user where to save the result and show the path in the entry.

    On a valid choice the path (with a ``.csv`` extension ensured) is written
    to ``en1_text`` and the OK button (``button3``) is enabled. If the dialog
    is cancelled, the entry is cleared, an information dialog is shown and the
    OK button is disabled.
    """
    filename = filedialog.asksaveasfilename(filetypes=[("csv文件", "*.csv")])

    # A truthiness test covers every "nothing chosen" return value of the
    # dialog (empty string as well as an empty tuple).
    if not filename:
        en1_text.set("")
        messagebox.showinfo("未选择", "未选择保存路径请选择！")
        button3.configure(state="disabled")
        return

    save_path = str(filename)

    # Append the extension only when it is missing, so a name the user already
    # typed as "result.csv" does not become "result.csv.csv".
    if not save_path.lower().endswith(".csv"):
        save_path += ".csv"

    en1_text.set(save_path)
    button3.configure(state="active")




# Row 3: caption, path entry and browse button for the result file.
col_num = 3

label2 = Label(window, text="查重的保存路径及文件名：")
canvas.create_window(width * (2 / 18), height_x * col_num, window=label2, width=170)

# en1_text backs the entry below; choiceSaveCallBack writes the save path to it.
en1_text = StringVar()
en1_text.set("")
entry1 = Entry(window, textvariable=en1_text)
canvas.create_window(width * (9 / 18), height_x * col_num, window=entry1, width=360)

# Browse button that opens the save-as dialog.
button2 = Button(window, command=choiceSaveCallBack, text="选择")
canvas.create_window(
    width * (16 / 18),
    height_x * col_num,
    window=button2,
    width=80,
    height=30,
)

###设置逻辑层
# 点击OK按钮的函数
def _hanzi2pinyin(names):
    """Return the tone-less pinyin spelling for every value of a name Series.

    Missing names (NaN/None) are returned as None instead of being passed to
    the pinyin converter, which cannot handle them.
    """
    return [
        None if missing else pinyin.get(str(value), format="strip", delimiter="")
        for value, missing in zip(names, names.isna())
    ]


def _shengcheng_chongka(data, checklist, aeslist="报告卡录入时间"):
    """Build the duplicate-card table for one set of matching columns.

    Args:
        data: DataFrame of report cards; must contain a ``name`` column, every
            column named in *checklist*, and the *aeslist* column.
        checklist: names of the columns whose concatenated values form the
            duplicate key.
        aeslist: column used to order the cards inside one duplicate group.

    Returns:
        A DataFrame with one row per duplicate group: the columns of the last
        card of the group (suffix ``_last``), the group size in ``重复卡片数``,
        then the columns of the first card of the group (suffix ``_f``).
    """
    # Work on a positional index so the key lines up with its own row even
    # when rows were dropped from *data* beforehand.
    frame = data.reset_index(drop=True)

    # Concatenate the selected columns into one key string. Missing values
    # become "_" and everything else is converted to text, so numeric columns
    # (e.g. phone numbers) can take part in the key.
    key = Series("", index=frame.index, dtype=object)
    for column_name in checklist:
        column = frame[column_name]
        key = key + column.astype(object).where(column.notna(), "_").astype(str)
    frame["chachong"] = key

    # Keep only the rows whose key occurs more than once, ordered inside each
    # group by the *aeslist* column.
    frame = frame.sort_values(by=["chachong", aeslist])
    frame = frame[frame.duplicated(subset="chachong", keep=False)]
    frame = frame.reset_index(drop=True)

    # Number of cards per duplicate group, under an explicit column name so
    # the result does not depend on how pandas names value counts.
    group_sizes = frame.groupby("chachong").size().rename("重复卡片数")

    first_cards = frame.drop_duplicates(subset="chachong", keep="first").set_index("chachong")
    last_cards = frame.drop_duplicates(subset="chachong", keep="last")
    last_cards = last_cards.join(group_sizes, on="chachong")

    # Put the first card of each group next to the last one.
    merged = last_cards.join(first_cards, on="chachong", lsuffix="_last", rsuffix="_f")
    return merged.drop(columns=["name_last", "chachong", "name_f"])


def hellook():
    """Run the duplicate check and write the result; callback of the OK button.

    Reads the CSV named in ``entry`` (GB18030 encoded), looks for duplicate
    cards twice -- once keyed on the ID number and once keyed on the pinyin of
    the patient name, each combined with the criteria ticked in the check
    boxes -- merges both results (ID matches take precedence) and writes the
    merged table to the path in ``entry1``. Problems are reported to the user
    through message boxes; the function returns nothing.
    """
    # concat replaces DataFrame.append, which pandas 2.0 removed. Imported
    # here so this function does not depend on a change to the file's imports.
    from pandas import concat

    try:
        rc_data = read_csv(filepath_or_buffer=entry.get(), encoding="GB18030")
    except (OSError, ValueError):
        # Covers a missing/unreadable file as well as parse and decode errors.
        messagebox.showinfo(title="unfortunately ", message="打开文件出错，请检查！")
        return

    # Criteria that the user can toggle, in the order they enter the key.
    optional_columns = ["性别", "联系电话", "现住地址国标", "疾病名称", "病例分类", "病例分类2"]
    optional_vars = [CheckVar3, CheckVar4, CheckVar5, CheckVar6, CheckVar7, CheckVar8]
    selected = [
        column for column, var in zip(optional_columns, optional_vars) if var.get() == 1
    ]

    # Fail with a readable message instead of a KeyError when the file does
    # not have the expected layout.
    required = ["患者姓名", "有效证件号", "现住地址国标", "报告卡录入时间", "卡片编号", "报告单位地区编码"]
    missing = [
        column
        for column in dict.fromkeys(required + selected)
        if column not in rc_data.columns
    ]
    if missing:
        messagebox.showinfo(
            title="unfortunately ",
            message="文件缺少以下列，请检查：{}".format("、".join(missing)),
        )
        return

    rc_data1 = rc_data.copy()
    rc_data1["name"] = _hanzi2pinyin(rc_data1["患者姓名"])
    rc_data1["现住地址国标"] = rc_data1["现住地址国标"].map(str)
    rc_data1["报告卡录入时间"] = to_datetime(rc_data1["报告卡录入时间"])

    # Pass 1: ID number + selected criteria; cards without an ID are skipped.
    zong_sf = _shengcheng_chongka(
        data=rc_data1.dropna(subset=["有效证件号"]),
        checklist=["有效证件号"] + selected,
    )
    # Pass 2: name pinyin + selected criteria; cards without a name are
    # skipped, mirroring how pass 1 treats a missing ID.
    zong_nm = _shengcheng_chongka(
        data=rc_data1.dropna(subset=["name"]),
        checklist=["name"] + selected,
    )

    # Merge both passes. ID matches come first, so keep="first" gives them
    # precedence when a card is found by both passes.
    zong = concat([zong_sf, zong_nm])
    zong = zong.drop_duplicates(subset="卡片编号_last", keep="first")
    zong = zong.sort_values(by=["报告单位地区编码_last", "报告卡录入时间_last"], ascending=False)

    try:
        # Write the merged result (not only the ID-based pass).
        zong.to_csv(entry1.get(), index=False, encoding="GB18030")
    except (OSError, ValueError):
        messagebox.showinfo(title="unfortunately ", message="保存文件出错，请检查!")
    else:
        infomessage = "查重完毕！文件保存在{}".format(entry1.get())
        messagebox.showinfo(title="unfortunately ", message=infomessage)



# Row 4: the button that starts the duplicate check.
col_num = 4
button3 = Button(window, command=hellook, text="OK")
canvas.create_window(width * (9 / 18), height_x * col_num, window=button3, width=80, height=30)

# Row 5: heading of the basic-information criteria.
col_num = 5
label3 = Label(window, text="基础信息：")
canvas.create_window(width * (2 / 18), height_x * col_num, window=label3, width=170)

# Row 5.8: check boxes for the basic-information criteria. hellook reads
# CheckVar3..CheckVar5 to decide which columns take part in the key.
col_num = 5.8
CheckVar1, CheckVar2, CheckVar3, CheckVar4, CheckVar5 = (IntVar() for _ in range(5))

# Options shared by every check box in this window.
check_opts = {"onvalue": 1, "offvalue": 0, "height": 5, "width": 20}

# Name and ID number are created disabled, so the user cannot toggle them.
C1 = Checkbutton(window, text="姓     名", variable=CheckVar1, state="disabled", **check_opts)
C2 = Checkbutton(window, text="有效证件号", variable=CheckVar2, state="disabled", **check_opts)
C3 = Checkbutton(window, text="性     别", variable=CheckVar3, **check_opts)
C4 = Checkbutton(window, text="联系电话", variable=CheckVar4, **check_opts)
C5 = Checkbutton(window, text="现住地址国标", variable=CheckVar5, **check_opts)

# All five basic criteria start ticked.
for box in (C1, C2, C3, C4, C5):
    box.select()

# (x slot in eighteenths of the width, widget, widget width in pixels)
for slot, box, box_width in ((2, C1, 80), (5, C2, 80), (8, C3, 80), (11, C4, 80), (14, C5, 100)):
    canvas.create_window(
        width * (slot / 18), height_x * col_num, width=box_width, height=30, window=box
    )

# Row 6.4: heading of the disease-information criteria.
col_num = 6.4
label4 = Label(window, text="疾病信息：")
canvas.create_window(width * (2 / 18), height_x * col_num, window=label4, width=170)

# Row 7.2: check boxes for the disease criteria, read by hellook as
# CheckVar6..CheckVar8.
col_num = 7.2
CheckVar6, CheckVar7, CheckVar8 = (IntVar() for _ in range(3))

# Disease name is created disabled and ticked; the two case-classification
# boxes start unticked.
C6 = Checkbutton(window, text="疾病名称", variable=CheckVar6, state="disabled", **check_opts)
C6.select()
C7 = Checkbutton(window, text="病例分类", variable=CheckVar7, **check_opts)
C8 = Checkbutton(window, text="病例分类2", variable=CheckVar8, **check_opts)

for slot, box in ((2, C6), (5, C7), (8, C8)):
    canvas.create_window(width * (slot / 18), height_x * col_num, width=80, height=30, window=box)

# Row 8: feedback address and link to the source; reuses the name label4.
col_num = 8
label4 = Label(window, text="问题反馈：ztwenxing@dingtalk.com             源码及说明：https://segmentfault.com/a/1190000018570381")
canvas.create_window(width * (9 / 18), height_x * col_num, window=label4)

# Hand control to Tk; this call blocks until the window is closed.
window.mainloop()

python 查重 python查重代码疫情大数据 python

文章版权归作者所有，未经允许请勿转载,若此文章存在违规行为，您可以联系管理员删除。

转载请注明本文地址：https://www.ucloud.cn/yun/43414.html

上海疫情清零-UCloud有话说！

U事记1. UCloud优刻得数字力量助力老年人跨越数字鸿沟。2.杨浦区委副书记、区长薛侃走访调研UCloud优刻得疫情防控和复产复工工作。3.UCloud优刻得中标特变电工德阳、新疆两地电缆公司无人值守地磅采购项目。4.UCloud优刻得联合中国移动，打造山东枣庄高新区智慧城市云平台。5.紫鸟浏览器使用GlobalSSH畅联全球，安全高效管理云端跨境店铺。6. UCloud优刻得数字哨兵护卫陆家...

ernest.wang 2022-06-30 18:36 评论0 收藏0
自动化测试工具

摘要：自己写一个程序来自动生成测试数据，因为每个个人作业的要求不一样，自动化框架无法对每种程序都生成测试数据，目前只支持生成按规则生成随机的字符串测试集。作者：Grey...

tuniutech 2021-10-11 10:57 评论0 收藏0
python综合程序设计——做一个可视化大屏

摘要：完成可视化热搜榜和国内疫情新增图，提高学生的编程能力和分析问题解决问题的能力。下图为百度微博知乎三大平台的热搜词频统计图。后续我会补上薄弱项，为争取做一名全栈技术人员而奋斗。 ...

_Dreams 2021-11-11 16:55 评论0 收藏0
为什么这么多应届生要进入互联网行业？

摘要：互联网行业薪资普遍高于其他行业拉勾网数据显示，年以来，互联网行业的年度平均薪资稳步增长，年度薪资较年同比增长，开年薪资同比去年增长。智联招聘的一组数据显示，20...

AZmake 2021-10-11 10:58 评论0 收藏0