提取excel表數據成json格式的以及對圖片重命名

時間 2019-12-04
標籤提取 excel 數據 json 格式以及圖片重命名欄目 Microsoft Office 简体版
原文原文鏈接
開發那邊的需求：
一、功夫熊貓以及阿狸布塔故事集都是屬於劇集的。意思就是有不少集，這裏稱他們爲tv
最下面這幾行第一列沒名字的都是單集的，這裏稱它們爲mv
需要統計全部工作表裏面的數據，把tv放一個大的json裏面，
把mv放一個大的json裏面。
二、須要檢索圖片名列。而後檢測文件夾裏面是否有對應的圖片。
同時把圖片提取首字母並生成新文件名
好比
功夫熊貓.jpg 會變成 tv_gfxm.jpg
必備古詩.jpg 會變成 mv_bbgs.jpg
下面是最終效果圖
這是單個mv的數據和字段
下面是一個tv
功能實現日誌
這裏沒使用到uuid。原本想把它當作一個key的，後面發現不需要。
# -*- coding: utf-8 -*-
import xlrd
import uuid
import re,time,json,shutil,os

# Temporary holder for every single-episode ("mv") row
mv_list=[]
# Temporary holder for series ("tv") rows; format_tv later groups them into tv_dict
tv_list=[]

# tv_dict holds all series: key is the initial-letter abbreviation, value is a list whose elements are the rows of that series
tv_dict={}

# Collected picture [source, target] pairs, used later to copy the pictures to their new names
pic_list=[]

exec_log=open('執行日誌.log','a',encoding='utf-8')# append mode
error_log=open('錯誤日誌.log','a',encoding='utf-8')# append mode
# Open the Excel workbook and return the xlrd workbook handle
def OpenExcel(excel_file):
    """Open ``excel_file`` with xlrd and return the workbook handle.

    On any failure the reason is printed and written to the error log, the
    error log is closed, and the program terminates with exit status 1.

    :param excel_file: path of the workbook to open
    :return: whatever ``xlrd.open_workbook`` returns
    :raises SystemExit: with status 1 when the workbook cannot be opened
    """
    try:
        return xlrd.open_workbook(excel_file)
    except Exception as ex:
        # Script-level boundary: report, record and stop.
        print(str(ex))
        error_log.write('打開excel表失敗，可能---'+excel_file+'---並不存在\n')
        # Keep the real reason too; the line above only guesses at the cause.
        error_log.write('%s\n' % ex)
        error_log.close()
        # ``exit`` is a helper installed by the ``site`` module for interactive
        # use and may be absent (e.g. in frozen executables); raising
        # SystemExit is the equivalent that always works.
        raise SystemExit(1) from ex

# Read the Excel data
def ReadExcel(excel_file):
    """Read every sheet of ``excel_file`` into ``mv_list`` and ``tv_list``.

    Row index 0 of each sheet is skipped (presumably a header row). Every
    other row becomes a dict keyed ``colnum1`` .. ``colnumN``. A cell that is
    empty is looked up again through ``get_merged_cells_value`` in case it
    belongs to a merged range. Rows whose first column is empty are appended
    to ``mv_list``; all others go to ``tv_list``.

    Column 7 holds the picture name. It is stored as a string with all
    spaces removed, because the initials are derived from it later.

    :param excel_file: path of the workbook to read
    :raises SystemExit: with status 1 when a row has no picture name
    """
    workbook = OpenExcel(excel_file)
    exec_log.write("正在讀取excel表內容\n")
    sheet_name_list = workbook.sheet_names()
    print('打印全部sheet:', sheet_name_list)
    sheet_count = len(sheet_name_list)
    print("excel表sheet個數:", sheet_count)
    exec_log.write("準備循環excel每一個工做表..\n")
    for i, sheet_name in enumerate(sheet_name_list):
        current_sheet = workbook.sheet_by_index(i)  # sheet indexes start at 0
        rows_num = current_sheet.nrows
        cols_num = current_sheet.ncols
        print("當前sheet的名字是:%s  行數:%s   列數:%s:"%(sheet_name,rows_num,cols_num))
        print("#####################################################")
        for r in range(1, rows_num):
            # One dict per row
            entity_dict = {}
            for c in range(cols_num):
                cell_value = get_value_and_get_int(current_sheet, r, c)
                # An empty cell may be part of a merged range: use that value
                if cell_value is None or cell_value == '':
                    cell_value = get_merged_cells_value(current_sheet, r, c)
                entity_dict['colnum' + str(c + 1)] = cell_value

            # .get(): a sheet with fewer than 7 columns is reported like an
            # empty picture cell instead of crashing with KeyError.
            pic_name = entity_dict.get("colnum7")
            if pic_name is None or pic_name == '':
                position = "當前sheet的名字是:%s  行數:%s "%(sheet_name,r)
                error_log.write("遇到圖片所在列爲空值的狀況，沒法對空值處理，格式異常位置爲,3秒後退出\n")
                # The message above announces a position, so record it here too
                error_log.write(position + "\n")
                error_log.close()
                exec_log.write(position + "\n")
                exec_log.close()
                print("遇到圖片所在列爲空值的狀況，沒法對空值處理，格式異常位置爲,3秒後退出")
                print(position)
                time.sleep(3)
                raise SystemExit(1)

            # str.replace returns a new string, so the result must be stored.
            # str() guards against numeric picture names, which have no
            # .replace method.
            entity_dict["colnum7"] = str(pic_name).replace(' ', '')

            if entity_dict['colnum1'] is None or entity_dict['colnum1'] == '':
                mv_list.append(entity_dict)
            else:
                tv_list.append(entity_dict)
    exec_log.write("循環全部工做表完畢，已經追加到單集列表和劇集列表..暫未生成圖片列\n")


# xlrd hands integers back as floats; turn whole-number cells back into int
def get_value_and_get_int(sheet,r,c):
    """Return the value of cell (r, c), converting whole-number floats to int.

    :param sheet: object providing ``row_values(r)`` and ``cell(r, c).ctype``
    :param r: row index
    :param c: column index
    :return: ``int`` when the cell's ctype is 2 and the value has no
        fractional part, otherwise the raw value
    """
    raw_value = sheet.row_values(r)[c]
    # ctype 2 marks a numeric cell
    if sheet.cell(r,c).ctype != 2:
        return raw_value
    if raw_value % 1 != 0.0:
        # A genuine fraction stays a float
        return raw_value
    return int(raw_value)

# Coordinates (row/column bounds) of every merged range in a sheet
def get_merged_cells(sheet):
    """
    Return the sheet's ``merged_cells`` attribute unchanged.

    Example of the expected shape:
    [(4, 5, 2, 4), (5, 6, 2, 4), (1, 4, 3, 4)]
    (4, 5, 2, 4) means: rows from index 4 up to (not including) 5 and
    columns from index 2 up to (not including) 4 form one merged cell.
    :param sheet: object exposing ``merged_cells``
    :return: the value of ``sheet.merged_cells``
    """
    merged_ranges = sheet.merged_cells
    return merged_ranges

# Value a cell inherits from the merged range that contains it
def get_merged_cells_value(sheet, row_index, col_index):
    """
    Check whether the given cell lies inside a merged range; if it does,
    return the content of that range's top-left cell.

    :param sheet: sheet passed to ``get_merged_cells`` and queried with
        ``cell_value``
    :param row_index: row index of the cell
    :param col_index: column index of the cell
    :return: the merged range's value, or None when the cell is in no range
    """
    for row_start, row_stop, col_start, col_stop in get_merged_cells(sheet):
        in_rows = row_start <= row_index < row_stop
        in_cols = col_start <= col_index < col_stop
        if in_rows and in_cols:
            # The first matching range wins
            return sheet.cell_value(row_start, col_start)
    return None


def getUUID():
    """Return a freshly generated version-1 UUID as a 32-character hex string."""
    generated = uuid.uuid1()
    return generated.hex

# Strip punctuation
def remove_punctuation(str):
    """Return ``str`` with every run of non-word characters removed.

    Word characters (``\\w``: letters, digits, underscore, including CJK
    ideographs) are kept; punctuation, whitespace and symbols are dropped.

    :param str: text to clean (the name shadows the builtin but is kept so
        keyword callers keep working)
    :return: the cleaned text
    """
    # Raw string: in a normal literal ``\w`` is an invalid escape sequence,
    # which newer Python versions warn about. The ``re`` module understands
    # ``\uXXXX`` itself, so the pattern matches exactly the same characters.
    return re.sub(r'[^\w\u4e00-\u9fff]+', '', str)

# Inclusive upper bound of each initial's band of GBK code values, expressed
# as (first_byte * 256 + second_byte - 65536). The bands are contiguous and
# start at -20319.
_GBK_INITIAL_LOWEST = -20319
_GBK_INITIAL_BANDS = (
    (-20284, 'a'), (-19776, 'b'), (-19219, 'c'), (-18711, 'd'),
    (-18527, 'e'), (-18240, 'f'), (-17923, 'g'), (-17418, 'h'),
    (-16475, 'j'), (-16213, 'k'), (-15641, 'l'), (-15166, 'm'),
    (-14923, 'n'), (-14915, 'o'), (-14631, 'p'), (-14150, 'q'),
    (-14091, 'r'), (-13119, 's'), (-12839, 't'), (-12557, 'w'),
    (-11848, 'x'), (-11056, 'y'), (-10247, 'z'),
)


# Initial letter of a single Chinese character
def single_get_first(unicode1):
    """Return the initial letter for one character.

    A character that encodes to a single GBK byte (ASCII letters, digits) is
    returned unchanged as ``str``, so the results can always be joined into
    one string. A two-byte character is mapped to a lowercase letter via the
    code-value bands above.

    :param unicode1: a one-character string
    :return: the character itself, a letter ``'a'``..``'z'``, or ``''`` when
        the character cannot be encoded as GBK or lies outside every band
    """
    try:
        encoded = unicode1.encode('gbk')
    except UnicodeEncodeError:
        # Not representable in GBK (e.g. emoji): no initial available
        return ''
    if len(encoded) < 2:
        # Single-byte character (or empty input): hand back text, not bytes
        return unicode1
    asc = encoded[0] * 256 + encoded[1] - 65536
    if asc < _GBK_INITIAL_LOWEST:
        return ''
    for upper_bound, letter in _GBK_INITIAL_BANDS:
        if asc <= upper_bound:
            return letter
    return ''

# Initial letters of every character, joined into one string
def getPinyin(str):
    """Return the string of initials for ``str``.

    Each character is passed to ``single_get_first`` and the results are
    concatenated in order.

    :param str: text to abbreviate, or None (the name shadows the builtin but
        is kept so keyword callers keep working)
    :return: the joined initials, or None when ``str`` is None
    """
    if str is None:
        return None
    initials = []
    for char in str:
        initial = single_get_first(char)
        # Guard against a bytes result (as can come back for single-byte
        # characters) so that join() below does not raise TypeError.
        if isinstance(initial, bytes):
            initial = initial.decode('gbk')
        initials.append(initial)
    return ''.join(initials)

# Copy a file
def copy_file(source_file,target_file):
    """Copy ``source_file`` to ``target_file`` unless that is impossible or redundant.

    Nothing is copied when the source is missing or when the target already
    exists; the reason is written to the error log and printed.

    :param source_file: path of the file to copy
    :param target_file: path to copy it to
    """
    if not os.path.exists(source_file):
        problem = "下面路徑文件不存在: %s" % (source_file)
    elif os.path.exists(target_file):
        # This case used to be reported as a missing source file, which sent
        # whoever read the log looking for the wrong problem.
        problem = "目標文件已存在，跳過拷貝: %s" % (target_file)
    else:
        shutil.copy(source_file,target_file)
        return
    error_log.write(problem + "\n")
    print(problem)
    time.sleep(0.1)

# Process the picture list [[pic1,tar1],[pic2,tar2]]: drop duplicates and copy each picture to its new name
def copy_file_from_list(pic_list):
    """Copy every distinct [source, target] pair in ``pic_list`` once.

    Pairs are handled in their original order; a pair equal to an earlier one
    is skipped.

    :param pic_list: iterable of two-element [source, target] sequences
    """
    # Lists are not hashable, but the equivalent tuples are, which turns the
    # duplicate check into a set lookup instead of a scan of all earlier pairs.
    seen = set()
    for item in pic_list:
        pair = tuple(item)
        if pair in seen:
            continue
        seen.add(pair)
        copy_file(item[0],item[1])







# Add a "pic_path" key to every single-episode row: directory plus the initials of the picture column, e.g. "excel_pic_dir/mv_hj.jpg"
def add_pic_col_for_mv(mv_list):
    """Attach the renamed picture path to each single-episode row.

    For every row the picture name (``colnum7``) is stripped of punctuation
    and abbreviated to its initials. The [source, target] pair is queued in
    the module-level ``pic_list`` and the target is stored on the row as
    ``pic_path``.

    :param mv_list: list of row dicts, modified in place
    """
    exec_log.write("給單集列表生成圖片路徑列excel_pic_dir/mv_hj.jpg 這種格式\n")
    for row in mv_list:
        # Picture name without special characters
        cleaned_name = remove_punctuation(row['colnum7'])
        # Its initial-letter abbreviation
        initials = getPinyin(cleaned_name)
        # One copy goes to the rename queue, one onto the row itself
        rename_pair = [
            "excel_pic_dir/"+cleaned_name+".jpg",
            "excel_pic_dir/mv_"+initials+".jpg",
        ]
        pic_list.append(rename_pair)
        row["pic_path"] = rename_pair[1]
    exec_log.write("給單集列表生成圖片路徑列完畢\n")



# Add a "pic_path" key to every series row: directory plus the initials of the picture column, e.g. "excel_pic_dir/tv_hj.jpg"
def add_pic_col_for_tv(tv_list):
    """Attach the renamed picture path to each series row.

    For every row the picture name (``colnum7``) is stripped of punctuation
    and abbreviated to its initials. The [source, target] pair is queued in
    the module-level ``pic_list`` and the target is stored on the row as
    ``pic_path``.

    :param tv_list: list of row dicts, modified in place
    """
    # The log line previously named the mv_ prefix although this function
    # produces tv_ paths.
    exec_log.write("給劇集列表生成圖片路徑列excel_pic_dir/tv_hj.jpg 這種格式\n")
    for item in tv_list:
        # Picture name without special characters
        temp_str = remove_punctuation(item['colnum7'])
        # Its initial-letter abbreviation
        temp_letter = getPinyin(temp_str)
        source_file = "excel_pic_dir/"+temp_str+".jpg"
        target_file = "excel_pic_dir/tv_"+temp_letter+".jpg"
        # One copy goes to the rename queue, one onto the row itself
        pic_list.append([source_file,target_file])
        item["pic_path"] = target_file
    exec_log.write("給劇集列表生成圖片路徑列完畢\n")


# Group all series rows in one dict: key is the series' initials, value is the list of its rows
def format_tv(tv_list):
    """Group the rows of ``tv_list`` into the module-level ``tv_dict``.

    The key is the initial-letter abbreviation of the series name
    (``colnum1``) after punctuation has been removed; rows with the same key
    are collected in one list, in input order.

    :param tv_list: list of series row dicts
    """
    exec_log.write("把劇集的都放在一個大字典中，key是劇集首字母，value是劇集列表\n")
    for row in tv_list:
        # Series name without punctuation, reduced to its initials
        series_key = getPinyin(remove_punctuation(row['colnum1']))
        tv_dict.setdefault(series_key, []).append(row)
    exec_log.write("把劇集的都放在一個大字典中完畢\n")

# Script entry point: load the workbook, derive picture paths, group the
# series and copy the pictures.
# NOTE(review): the JSON export code further down sits outside this guard, so
# it also runs when the module is imported — confirm whether that is intended.
if __name__ == "__main__":
    # Fill mv_list / tv_list from every sheet of the workbook
    ReadExcel("001.xlsx")
    # print(mv_list[0])
    # Add "pic_path" to each row and queue the [source, target] copy pairs
    add_pic_col_for_mv(mv_list)
    add_pic_col_for_tv(tv_list)
    # Group the series rows into tv_dict
    format_tv(tv_list)
    # Copy every queued picture to its new name
    copy_file_from_list(pic_list)
    print("單集個數:",len(mv_list))
    print("劇集套數:",len(tv_dict))

# print(mv_list[0])
# for k,v in tv_dict.items():
#     print(k,v)
#     break


# Build the final list of single-episode dicts
programMovieList=[]
for item in mv_list:
    # Fields without a source column are exported as empty strings / 0
    programMovieList.append({
        "programName": item["colnum3"],
        "programYear": "",
        "programType": "",
        "programDirector": "",
        "programActor": "",
        "programPoster": item["pic_path"],
        "programIntroduce": item["colnum5"],
        "sitnum": 0,
        "code": item["colnum2"],
        "cpCode": item["colnum6"],
    })
exec_log.write("單集最終字典列表獲取完畢\n")

# Write the single-episode list to mv.json (keys sorted, 4-space indent)
exec_log.write("開始dump---單集---數據到json文件中\n")
with open('mv.json', 'w',encoding='utf-8') as mv_json_file:
    json.dump(
        programMovieList,
        mv_json_file,
        ensure_ascii=False,
        sort_keys=True,
        indent=4,
    )
exec_log.write("dump---單集---數據完畢\n")
print("dump---單集---數據完畢..")
time.sleep(2)


def get_seriesList(item_list):
    """Return the episode list of one series.

    :param item_list: rows of the series, each a dict with ``colnum3`` and
        ``colnum2``
    :return: one ``{"programName": ..., "code": ...}`` dict per row, in order
    """
    return [
        {"programName": episode["colnum3"], "code": episode["colnum2"]}
        for episode in item_list
    ]
# Build the final list of series dicts
programTvList=[]
for item, episodes in tv_dict.items():
    # Series-level fields are taken from the first row of the series
    first_row = episodes[0]
    programTvList.append({
        "programName": first_row["colnum1"],
        "programYear": "",
        "programType": "",
        "programDirector": "",
        "programActor": "",
        "programPoster": first_row["pic_path"],
        "programIntroduce": first_row["colnum5"],
        "sitnum": len(episodes),
        "cpCode": first_row["colnum6"],
        "recommend": "",
        "seriesList": get_seriesList(episodes),
    })

# Write the series list to tv.json; indent=4 pretty-prints the stored data
exec_log.write("開始dump---劇集---數據到json文件中\n")
with open('tv.json', 'w',encoding='utf-8') as tv_json_file:
    json.dump(
        programTvList,
        tv_json_file,
        ensure_ascii=False,
        sort_keys=True,
        indent=4,
    )
exec_log.write("dump---劇集---數據完畢\n")
print("dump---劇集---數據完畢..")
print("程序執行完畢，2秒後退出..")
# Finished: release both log files before the final pause
error_log.close()
exec_log.close()
time.sleep(2)
最後打包成exe交給開發那邊，
把使用說明告訴開發便可。
運行結果視頻