Reference code

ChineseClean_demo1.py:

# -*- coding:utf-8 -*-
import xlrd
import xlwt
'''
python3.4
'''
# "file" is the source file; change it here as needed
file = "./data/answer_detail_5_15307860968687.xls"
dirs = "./result"


def read_excel(rows_numb, cols_numb):
    f = xlwt.Workbook()  # create the output workbook
    '''
    Create the two output sheets
    '''
    sheet1 = f.add_sheet(u'sheet1_1', cell_overwrite_ok=True)  # create sheet
    sheet2 = f.add_sheet(u'sheet1_2', cell_overwrite_ok=True)  # create sheet
    row0 = [u'UserNo', u'Name', u'Question', u'Answer', u'Layer', u'Mark', u'Score', u'AbilityID']
    # open the source file
    workbook = xlrd.open_workbook(file)
    sheet0 = workbook.sheet_by_index(0)  # sheet indexes start at 0
    cols = sheet0.col_values(cols_numb)  # read the Score column
    rows_list_1 = []
    rows_list_2 = []
    for i in range(1, len(cols)):
        if cols[i] == '0':
            rows_list_1.append(i)
        else:
            rows_list_2.append(i)
    for i in range(0, len(row0)):
        sheet1.write(0, i, row0[i])
        sheet2.write(0, i, row0[i])
    '''
    sheet1_1 keeps the zero-score rows
    '''
    for j in range(0, len(rows_list_1)):
        rows = sheet0.row_values(rows_list_1[j])  # read the whole row
        for i in range(0, len(rows)):
            sheet1.write(j + 1, i, rows[i])
    '''
    sheet1_2 keeps the non-zero-score rows
    '''
    for j in range(0, len(rows_list_2)):
        rows = sheet0.row_values(rows_list_2[j])  # read the whole row
        for i in range(0, len(rows)):
            sheet2.write(j + 1, i, rows[i])
    f.save('./data/demo1.xls')  # save the output file


if __name__ == '__main__':
    # row and column indexes to read
    rows_numb = 0
    cols_numb = 6
    read_excel(rows_numb, cols_numb)

ChineseClean_demo2.py:

# -*- coding:utf-8 -*-
import xlrd
import xlwt
'''
python3.4
'''
# "file" is the source file; change it here as needed
file = "./data/demo1.xls"


def read_excel(rows_numb, cols_numb):
    f = xlwt.Workbook()  # create the output workbook
    '''
    Create the four output sheets
    '''
    sheet1 = f.add_sheet(u'sheet2_1', cell_overwrite_ok=True)  # create sheet
    sheet2 = f.add_sheet(u'sheet2_2', cell_overwrite_ok=True)  # create sheet
    sheet3 = f.add_sheet(u'sheet2_3', cell_overwrite_ok=True)  # create sheet
    sheet4 = f.add_sheet(u'sheet2_4', cell_overwrite_ok=True)  # create sheet
    row0 = [u'UserNo', u'Name', u'Question', u'Answer', u'Layer', u'Mark', u'Score', u'AbilityID']
    for i in range(0, len(row0)):
        sheet1.write(0, i, row0[i])
        sheet2.write(0, i, row0[i])
        sheet3.write(0, i, row0[i])
        sheet4.write(0, i, row0[i])
    # open the source file
    workbook = xlrd.open_workbook(file)
    sheet0 = workbook.sheet_by_index(0)  # sheet indexes start at 0
    cols = sheet0.col_values(cols_numb)  # read the Score column
    rows_list_1 = []
    rows_list_2 = []
    rows_list_3 = []
    rows_list_4 = []
    for i in range(1, len(cols)):
        if float(cols[i]) < 12.0:
            rows_list_1.append(i)
        if float(cols[i]) >= 12.0 and float(cols[i]) < 16.0:
            rows_list_2.append(i)
        if float(cols[i]) >= 16.0 and float(cols[i]) < 18.0:
            rows_list_3.append(i)
        if float(cols[i]) >= 18.0:
            rows_list_4.append(i)
    '''
    sheet2_1: poor, score below 12
    '''
    for j in range(0, len(rows_list_1)):
        rows = sheet0.row_values(rows_list_1[j])  # read the whole row
        for i in range(0, len(rows)):
            sheet1.write(j + 1, i, rows[i])
    '''
    sheet2_2: fair, score >= 12 and < 16
    '''
    for j in range(0, len(rows_list_2)):
        rows = sheet0.row_values(rows_list_2[j])  # read the whole row
        for i in range(0, len(rows)):
            sheet2.write(j + 1, i, rows[i])
    '''
    sheet2_3: good, score >= 16 and < 18
    '''
    for j in range(0, len(rows_list_3)):
        rows = sheet0.row_values(rows_list_3[j])  # read the whole row
        for i in range(0, len(rows)):
            sheet3.write(j + 1, i, rows[i])
    '''
    sheet2_4: excellent, score >= 18
    '''
    for j in range(0, len(rows_list_4)):
        rows = sheet0.row_values(rows_list_4[j])  # read the whole row
        for i in range(0, len(rows)):
            sheet4.write(j + 1, i, rows[i])
    f.save('./data/demo2.xls')


if __name__ == '__main__':
    # row and column indexes to read
    rows_numb = 0
    cols_numb = 6
    read_excel(rows_numb, cols_numb)
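Note that ChineseClean_demo1.py compares the Score cells against the string '0', while ChineseClean_demo2.py converts them with float(). xlrd returns numeric cells as Python floats, so the string comparison in demo1 only holds if the column is stored as text in the workbook. A minimal type-tolerant check is sketched below; the helper name is illustrative and not part of the original scripts.

def is_zero_score(cell_value):
    # accept the cell whether xlrd returned a float (numeric cell) or a string (text cell)
    try:
        return float(cell_value) == 0.0
    except (TypeError, ValueError):
        return False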
ChineseClean_demo3.py:

# -*- coding:utf-8 -*-
import xlrd
import xlwt
'''
python3.4
'''
file = "./data/answer_detail_5_15307860968687.xls"


def read_excel(rows_numb, cols_numb):
    f = xlwt.Workbook()  # create the output workbook
    '''
    Create the five output sheets, one per AbilityID
    '''
    sheet1 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)  # create sheet
    sheet2 = f.add_sheet(u'sheet2', cell_overwrite_ok=True)  # create sheet
    sheet3 = f.add_sheet(u'sheet3', cell_overwrite_ok=True)  # create sheet
    sheet4 = f.add_sheet(u'sheet4', cell_overwrite_ok=True)  # create sheet
    sheet5 = f.add_sheet(u'sheet5', cell_overwrite_ok=True)
    row0 = [u'UserNo', u'Name', u'Question', u'Answer', u'Layer', u'Mark', u'Score', u'AbilityID']
    for i in range(0, len(row0)):
        sheet1.write(0, i, row0[i])
        sheet2.write(0, i, row0[i])
        sheet3.write(0, i, row0[i])
        sheet4.write(0, i, row0[i])
        sheet5.write(0, i, row0[i])
    # open the source file
    workbook = xlrd.open_workbook(file)
    sheet0 = workbook.sheet_by_index(0)  # sheet indexes start at 0
    cols = sheet0.col_values(cols_numb)  # read the AbilityID column
    rows_list_1 = []
    rows_list_2 = []
    rows_list_3 = []
    rows_list_4 = []
    rows_list_5 = []
    for i in range(1, len(cols)):
        if cols[i] == '100012':
            rows_list_1.append(i)
        if cols[i] == '100014':
            rows_list_2.append(i)
        if cols[i] == '100007':
            rows_list_3.append(i)
        if cols[i] == '100016':
            rows_list_4.append(i)
        if cols[i] == '100017':
            rows_list_5.append(i)
    '''
    sheet1 keeps AbilityID 100012
    '''
    for j in range(0, len(rows_list_1)):
        rows = sheet0.row_values(rows_list_1[j])  # read the whole row
        for i in range(0, len(rows)):
            sheet1.write(j + 1, i, rows[i])
    '''
    sheet2 keeps AbilityID 100014
    '''
    for j in range(0, len(rows_list_2)):
        rows = sheet0.row_values(rows_list_2[j])  # read the whole row
        for i in range(0, len(rows)):
            sheet2.write(j + 1, i, rows[i])
    '''
    sheet3 keeps AbilityID 100007
    '''
    for j in range(0, len(rows_list_3)):
        rows = sheet0.row_values(rows_list_3[j])  # read the whole row
        for i in range(0, len(rows)):
            sheet3.write(j + 1, i, rows[i])
    '''
    sheet4 keeps AbilityID 100016
    '''
    for j in range(0, len(rows_list_4)):
        rows = sheet0.row_values(rows_list_4[j])  # read the whole row
        for i in range(0, len(rows)):
            sheet4.write(j + 1, i, rows[i])
    '''
    sheet5 keeps AbilityID 100017
    '''
    for j in range(0, len(rows_list_5)):
        rows = sheet0.row_values(rows_list_5[j])  # read the whole row
        for i in range(0, len(rows)):
            sheet5.write(j + 1, i, rows[i])
    f.save('./data/demo3.xls')  # save the output file


if __name__ == '__main__':
    # row and column indexes to read
    rows_numb = 0
    cols_numb = 7
    read_excel(rows_numb, cols_numb)

ChineseClean_demo4or5.py: same as ChineseClean_demo3.py.

ChineseClean_answer_QA.py:

# -*- coding:utf-8 -*-
import re
import xlrd

file = "./data/demo5.xls"
dirs = "./result"


def read_excel(rows_numb, cols1_numb):
    number = '1'
    f2 = open(dirs + '/demo5_sheet1_%s.csv' % number, 'a', encoding='utf-8')
    # open the source file
    workbook = xlrd.open_workbook(file)
    sheet0 = workbook.sheet_by_index(int(number) - 1)  # sheet indexes start at 0
    cols1 = sheet0.col_values(cols1_numb[3])[1:]  # read the Answer column (index 3), skipping the header row
    # keep only CJK characters and CJK punctuation, curly quotes, '-' and ':'
    p1 = r"(?:[\u2E80-\uFFFD]|[\u201c-\u201d]|[\u002d]|[\u003a])+"
    pattern1 = re.compile(p1)
    for i in range(len(cols1)):
        matcher1 = re.findall(pattern1, cols1[i])
        str1 = str()
        if matcher1:
            str1 = ' '.join(matcher1)
        f2.write(str1)
        f2.write('\n')
    f2.close()


if __name__ == '__main__':
    # row and column indexes to read
    rows_numb = 0
    cols1_numb = [0, 1, 2, 3, 4, 5, 6, 7]
    read_excel(rows_numb, cols1_numb)

qa_test_clean_word.py:

# -*- coding: utf-8 -*-
import jieba


# build the stopword list
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords


# segment a sentence and drop stopwords
def seg_sentence(sentence):
    sentence_seged = jieba.lcut_for_search(sentence.strip(), HMM=True)
    stopwords = stopwordslist('./test/stopwords.txt')  # path of the stopword list
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    return outstr


inputs = open('./data/demo5_answer_csv/demo5_sheet5_5.csv', 'r', encoding='utf-8')
outputs = open('./test/demo5_sheet5_5_5.csv', 'w', encoding='utf-8')
for line in inputs:
    line_seg = seg_sentence(line)
    try:
        if len(line_seg):
            outputs.write(line_seg + '\n')
    except:
        pass
outputs.close()
inputs.close()
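The cleaning pattern used in ChineseClean_answer_QA.py (and again in gensimTopicTest0803.py below) keeps runs of characters in the \u2E80-\uFFFD range, which covers CJK ideographs, CJK punctuation and fullwidth forms, plus curly quotes, '-' and ':'; everything else, including HTML tags, Latin letters and ASCII digits, is dropped. A quick illustration on a made-up answer string:

import re

p1 = r"(?:[\u2E80-\uFFFD]|[\u201c-\u201d]|[\u002d]|[\u003a])+"
sample = "1. 回答:要先建立模型<p>然后再分析ABC"
print(re.findall(p1, sample))
# -> ['回答:要先建立模型', '然后再分析']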
word_fre.py:

# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
from matplotlib.font_manager import *
import numpy as np


def drawStatBarh():
    '''
    Draw a horizontal bar chart of the word frequencies with a colour
    gradient, keeping only the top N words.
    '''
    fig, ax = plt.subplots()
    myfont = FontProperties(fname='./data/simfang.ttf')
    N = 30
    words = []
    counts = []
    for line in open('./data/word_fre.txt'):
        if line == '\n':
            continue
        line = line.strip('\n')
        words.append(line.split(' ')[0])
        counts.append(int(line.split(' ')[1]))
    y_pos = np.arange(N)
    colors = ['#FA8072']  # base colour; the loop below derives a gradient from it
    for i in range(len(words[:N]) - 1):
        colors.append('#FA' + str(int(colors[-1][3:]) - 1))
    rects = ax.barh(y_pos, counts[:N], align='center', color=colors)
    ax.set_yticks(np.arange(N))
    ax.set_yticklabels(words[:N], fontproperties=myfont)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_title('High-frequency words in the reports', fontproperties=myfont, fontsize=17)
    ax.set_xlabel(u"Occurrences", fontproperties=myfont)
    autolabel(rects, ax)
    plt.show()


def autolabel(rects, ax):
    """
    Add a numeric label to each bar.
    """
    for rect in rects:
        width = rect.get_width()
        ax.text(1.03 * width, rect.get_y() + rect.get_height() / 2.,
                '%d' % int(width), ha='center', va='center')


def wordCount(segment_list):
    '''
    Count word frequencies and save the result locally.
    Not needed for the word cloud itself; it is used when drawing
    the frequency bar chart.
    '''
    word_lst = []
    word_dict = {}
    with open('./data/word_fre.txt', 'w') as wf2:
        word_lst.append(segment_list.split(' '))
        for item in word_lst:
            for item2 in item:
                if item2 not in word_dict:
                    word_dict[item2] = 1
                else:
                    word_dict[item2] += 1
        # sort by frequency, largest first; list() keeps the sorted order
        word_dict_sorted = list(sorted(word_dict.items(), key=lambda jj: jj[1], reverse=True))
        for tup in word_dict_sorted:
            if tup[0] != '':
                wf2.write(tup[0].strip('\n') + ' ' + str(tup[1]) + '\n')
    # the "with" block closes the file automatically


if __name__ == "__main__":
    segment_list_remove_stopwords = open('./data/demo5_sheet5_1_1.csv').read()
    wordCount(segment_list_remove_stopwords)
    drawStatBarh()

wordcloud_test2.py:

# -*- coding: utf-8 -*-
from os import path
from scipy.misc import imread  # removed in newer SciPy; imageio.imread is the usual replacement there
import matplotlib.pyplot as plt
import jieba
# jieba.load_userdict("txt\userdict.txt")  # add a user dictionary as the main dictionary
from wordcloud import WordCloud, ImageColorGenerator

# get the directory of the current file
# __file__ refers to this script; if running the line in an IDE raises an error, use
# d = path.dirname('.')
d = path.dirname(__file__)

stopwords = {}
isCN = 1  # enable Chinese word segmentation by default
back_coloring_path = "data/lz1.jpg"  # background image path
text_path = 'data/demo5_sheet5_1_1.csv'  # text to analyse (converting the source file to ANSI encoding is sufficient)
font_path = 'data/simfang.ttf'  # Chinese font path for matplotlib
stopwords_path = 'data/stopwords.txt'  # stopword list
imgname1 = "data/WordCloudDefautColors.png"  # output image 1 (only follows the shape of the background image)
imgname2 = "data/WordCloudColorsByImg.png"   # output image 2 (colours follow the background image)
# my_words_list = ['CHENGLEI']  # new words to add to the jieba dictionary

back_coloring = imread(path.join(d, back_coloring_path))  # load the background image

# word cloud settings
wc = WordCloud(font_path=font_path,        # font
               background_color="white",   # background colour
               max_words=2000,             # maximum number of words shown
               mask=back_coloring,         # background image mask
               max_font_size=100,          # maximum font size
               random_state=42,
               width=1000, height=860, margin=2,  # default image size; with a mask the saved image follows the mask size; margin is the spacing around words
               )

# add your own words to the jieba dictionary
# def add_word(list):
#     for items in list:
#         jieba.add_word(items)
# add_word(my_words_list)

text = open(path.join(d, text_path)).read()

# def jiebaclearText(text):
#     mywordlist = []
#     seg_list = jieba.cut(text, cut_all=False)
#     liststr = "/ ".join(seg_list)
#     f_stop = open(stopwords_path)
#     try:
#         f_stop_text = f_stop.read()
#         f_stop_text = unicode(f_stop_text, 'utf-8')
#     finally:
#         f_stop.close()
#     f_stop_seg_list = f_stop_text.split('\n')
#     for myword in liststr.split('/'):
#         if not (myword.strip() in f_stop_seg_list) and len(myword.strip()) > 1:
#             mywordlist.append(myword)
#     return ''.join(mywordlist)
#
# if isCN:
#     text = jiebaclearText(text)

# Generate the word cloud. You can feed the whole text to generate()
# (wordcloud's own Chinese handling is poor, so pre-segmenting is recommended),
# or compute the frequencies yourself and use generate_from_frequencies().
wc.generate(text)
# wc.generate_from_frequencies(txt_freq)  # e.g. txt_freq = [('詞a', 100), ('詞b', 90), ('詞c', 80)]

# colour values taken from the background image
image_colors = ImageColorGenerator(back_coloring)

plt.figure()
# show the word cloud
plt.imshow(wc)
plt.axis("off")
plt.show()
# save image 1
wc.to_file(path.join(d, imgname1))

image_colors = ImageColorGenerator(back_coloring)
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis("off")
# show the background image in grey
plt.figure()
plt.imshow(back_coloring, cmap=plt.cm.gray)
plt.axis("off")
plt.show()
# save image 2
wc.to_file(path.join(d, imgname2))
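The comment next to wc.generate(text) mentions generate_from_frequencies as the alternative when the frequencies are computed beforehand. In recent releases of the wordcloud package that method takes a dict mapping each word to its count (older releases accepted a list of tuples, as the original comment shows). A sketch that feeds it the word_fre.txt file written by wordCount() in word_fre.py; the output file name is illustrative:

from wordcloud import WordCloud

# rebuild the {word: count} mapping from the lines written by wordCount()
freq = {}
with open('./data/word_fre.txt') as fh:
    for line in fh:
        parts = line.split()
        if len(parts) == 2:
            freq[parts[0]] = int(parts[1])

wc = WordCloud(font_path='data/simfang.ttf', background_color='white')
wc.generate_from_frequencies(freq)
wc.to_file('data/WordCloudFromFrequencies.png')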
lda_test_ok.py:

# coding=utf-8
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import lda


def doc_topic_word():
    print(doc_topic[:, :3])   # document-topic distribution (first 3 columns)
    print(topic_word[:, :3])  # topic-word distribution (first 3 columns)


def plot_1():
    # word-weight distribution within selected topics
    f, ax = plt.subplots(2, 1, figsize=(6, 6), sharex=True)
    for i, k in enumerate([0, 9]):  # pick any two topics
        ax[i].stem(topic_word[k, :], linefmt='b-', markerfmt='bo', basefmt='w-')
        ax[i].set_xlim(-2, 2000)
        ax[i].set_ylim(0, 1)
        ax[i].set_ylabel("Prob")
        ax[i].set_title("topic {}".format(k))
    ax[1].set_xlabel("word")
    plt.tight_layout()
    plt.show()


def plot_2():
    # which topic each selected document falls into
    f, ax = plt.subplots(2, 1, figsize=(8, 8), sharex=True)
    for i, k in enumerate([0, 9]):  # pick any two documents
        ax[i].stem(doc_topic[k, :], linefmt='r-', markerfmt='ro', basefmt='w-')
        ax[i].set_xlim(-1, 20)   # x range: the topic indexes
        ax[i].set_ylim(0, 1.2)   # y range
        ax[i].set_ylabel("Prob")
        ax[i].set_title("Document {}".format(k))
    ax[1].set_xlabel("Topic")
    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    # read the corpus; one line is one document
    corpus = []
    for line in open('./data/demo5_sheet5_1_1.csv', 'r').readlines():
        corpus.append(line.strip())
    # convert the documents to a term-frequency matrix; a[i][j] is the count of word j in document i
    vectorizer = CountVectorizer()
    print(vectorizer)
    X = vectorizer.fit_transform(corpus)
    analyze = vectorizer.build_analyzer()
    weight = X.toarray()
    print("type(X): {}".format(type(X)))
    print("shape: {}\n".format(X.shape))
    print(len(weight))
    print(weight[:5, :5])
    # LDA
    print('LDA:')
    model = lda.LDA(n_topics=20, n_iter=50, random_state=1)
    model.fit(np.asarray(weight))   # model.fit_transform(X) also works
    topic_word = model.topic_word_  # model.components_ also works
    # document-topic distribution
    doc_topic = model.doc_topic_
    print("type(doc_topic): {}".format(type(doc_topic)))
    print("shape: {}".format(doc_topic.shape))
    # most likely topic for the first 10 documents
    label = []
    for n in range(10):
        topic_most_pr = doc_topic[n].argmax()
        label.append(topic_most_pr)
        print("doc: {} topic: {}".format(n, topic_most_pr))
    # top-N keywords for each topic
    word = vectorizer.get_feature_names()
    n = 6
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(word)[np.argsort(topic_dist)][:-(n + 1):-1]
        print(u'*Topic {}\n- {}'.format(i, ' '.join(topic_words)))
    # doc_topic_word()
    # plot_1()
    plot_2()
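The slice np.array(word)[np.argsort(topic_dist)][:-(n + 1):-1] in lda_test_ok.py is the usual trick for pulling the n highest-probability words out of one topic row: argsort sorts ascending, and the reversed slice takes the last n entries. A small self-contained illustration with made-up numbers:

import numpy as np

words = np.array(['apple', 'banana', 'cherry', 'date', 'elderberry'])
topic_dist = np.array([0.05, 0.40, 0.10, 0.30, 0.15])
n = 3
top_words = words[np.argsort(topic_dist)][:-(n + 1):-1]
print(top_words)  # -> ['banana' 'date' 'elderberry']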
gensimTopicTest0803.py:

# coding=utf-8
import re
import xlrd
import codecs
import jieba
from gensim import corpora, models, similarities

FILE = "demo5"  # file to train on
ID = '1'        # AbilityID sheet to train on

# read the stopword list
stopwords = [line.strip() for line in codecs.open('./data/stopwords.txt', 'r', encoding='utf-8').readlines()]


def cleanAnswer(cols_numb):
    f1 = open('./result/%s_sheet%s.csv' % (FILE, ID), 'a', encoding='utf-8')
    # open the source file
    workbook = xlrd.open_workbook('./data/%s.xls' % FILE)
    # get a sheet by index or by name
    sheet0 = workbook.sheet_by_index(int(ID) - 1)  # sheet indexes start at 0
    cols1 = sheet0.col_values(cols_numb[3])[1:]  # read the Answer column (index 3), skipping the header row
    # (?:) is a non-capturing group; the character classes keep CJK characters (by Unicode range), curly quotes, '-' and ':'
    p1 = r"(?:[\u2E80-\uFFFD]|[\u201c-\u201d]|[\u002d]|[\u003a])+"
    pattern1 = re.compile(p1)
    for i in range(len(cols1)):
        matcher1 = re.findall(pattern1, cols1[i])  # return every matching substring as a list
        str1 = str()
        if matcher1:
            str1 = ''.join(matcher1)
        f1.write(str1.strip())
        f1.write('\n')
    f1.close()


def ldaAnaly():
    print("Building the tokenised corpus -----train-----")
    # remove stopwords and build the tokenised documents
    train = []
    fp = codecs.open('./result/%s_sheet%s.csv' % (FILE, ID), 'r', encoding='utf8')
    for line in fp.readlines():
        line = line.strip()
        if not len(line):  # skip empty lines
            continue
        outstr = ' '
        seg_list = jieba.cut(line, cut_all=False)  # precise mode gives the best results
        for word in seg_list:
            if word not in stopwords:
                if word != '\t':
                    outstr += word
                    outstr += " "
        train.append(outstr.strip().split(" "))  # string to list of tokens
    fp.close()
    print("Building and saving the dictionary ----dict_v1.dict----")
    dic = corpora.Dictionary(train)
    dic.save('./result/dict_v1.dict')
    print("Saving a readable copy of the dictionary ----dic.csv----")
    fd = codecs.open('./result/dic.csv', 'a', encoding='utf-8')
    for word, index in dic.token2id.items():
        fd.write(word + ':' + str(index) + '\n')
    fd.close()
    print("Building and saving the corpus -----corpus.mm-----")
    corpus = [dic.doc2bow(text) for text in train]
    corpora.MmCorpus.serialize('./result/corpus.mm', corpus)
    print("Saving the tf-idf model -----corpus.tfidf_model-----")
    tfidf = models.TfidfModel(corpus)
    tfidf.save('./result/corpus.tfidf_model')
    print("Running LDA and saving the model -----ldaModel.pkl-----")
    # train the LDA model on the tf-idf corpus
    corpus_tfidf = tfidf[corpus]
    # 100 LDA topics, 500 iterations
    lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=100, iterations=500)
    lda.save('./result/ldaModel.pkl')
    print("Topic probabilities of one document, and word importance -----Demo: document 1-----")
    for index, score in sorted(lda[corpus_tfidf[0]], key=lambda tup: -1 * tup[1]):
        print("Score: {}\t Topic: {}".format(score, lda.print_topic(index, 10)))
    # print the 100 topics
    # ldaOut = lda.print_topics(100)
    # print("by default each topic is returned with its 10 most probable words")
    # print(ldaOut[0])
    # print(ldaOut[1])
    # print(ldaOut[2])
    # corpus_lda = lda[corpus_tfidf]
    # print("topic distribution of each document")
    # k = 0
    # for doc in corpus_lda:
    #     print(doc)
    #     k += 1
    #     if k == 3:
    #         break


def questionAnswer(cols_numb, questionNumber):
    lda = models.LdaModel.load('./result/ldaModel.pkl')
    dic = corpora.Dictionary.load('./result/dict_v1.dict')
    corpus = corpora.MmCorpus('./result/corpus.mm')
    tfidf = models.TfidfModel.load('./result/corpus.tfidf_model')
    # print("Read in one question ------------------")
    f1 = open('./result/%s_sheet%s.csv' % (FILE, ID), 'a', encoding='utf-8')
    # open the source file
    workbook = xlrd.open_workbook('./data/%s.xls' % FILE)
    sheet0 = workbook.sheet_by_index(int(ID) - 1)  # sheet indexes start at 0
    cols0 = sheet0.col_values(cols_numb[3])[questionNumber]  # the answer text in row questionNumber
    # strip noise characters from the answer
    p1 = r"(?:[\u2E80-\uFFFD]|[\u201c-\u201d]|[\u002d]|[\u003a])+"
    pattern1 = re.compile(p1)
    matcher1 = re.findall(pattern1, cols0)  # return every matching substring as a list
    query = str()
    if matcher1:
        query = ''.join(matcher1)
    # print("query after cleaning:", query)
    # segment the query
    seg_list = jieba.cut(query, cut_all=False)
    outstr = ' '
    for word in seg_list:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    inputTest = list(outstr.strip().split(" "))
    # print("query after segmentation and stopword removal:", inputTest)
    # turn the query into a bag of words
    query_bow = dic.doc2bow(inputTest)
    # print("bag of words:", query_bow)
    # the query must be converted to tf-idf as well
    query_tfidf = tfidf[query_bow]
    lda_vec_tfidf = lda[query_tfidf]
    # print("topic probabilities of the query (tfidf):", lda_vec_tfidf)
    # print the topic probabilities of the query
    # for index, score in sorted(lda_vec_tfidf, key=lambda tup: -1 * tup[1]):
    #     print("Score: {}\t Topic: {}".format(score, lda.print_topic(index, 20)))
    # print("similar questions in the corpus, ranked by similarity (tfidf) --------------------")
    # similarity search: the query must first go through doc2bow and then tf-idf
    similarity = similarities.MatrixSimilarity(corpus)
    lda_vec = lda[query_bow]
    # sims = similarity[lda_vec]       # similarity against the non-tfidf vector
    sims = similarity[lda_vec_tfidf]   # similarity against the tfidf vector
    # enumerate the results, then sort by similarity
    listSims = enumerate(sims)
    sort_sims = sorted(listSims, key=lambda item: -item[1])
    # print(sort_sims[0:6])  # the top n results work best
    # score prediction -- version 1
    sort_sims_list = sort_sims[0:6]
    cols1 = sheet0.col_values(cols_numb[6])[1:]  # read the Score column (index 6), skipping the header row
    f1.close()
    # weighted average in percentage form: each neighbour's score is weighted by its similarity
    sumCore1 = 0
    sumPro = 0
    for i in range(len(sort_sims_list)):
        sumCore1 += float(cols1[sort_sims_list[i][0] - 1]) * sort_sims_list[i][1]
        sumPro += sort_sims_list[i][1]
    preCore1 = sumCore1 / sumPro
    # print("weighted-average prediction: %s, actual score: %s" % (preCore1, cols1[questionNumber - 1]))
    print("Saving the prediction ----pre.csv----")
    return preCore1, cols1[questionNumber - 1], abs(preCore1 - float(cols1[questionNumber - 1]))


if __name__ == '__main__':
    cols_numb = [0, 1, 2, 3, 4, 5, 6, 7]  # column indexes of the source file
    # questionNumber = 124  # row to test; must not exceed the number of rows; used for testing
    # cleanAnswer(cols_numb)  # extract the answers from the file and strip noise characters
    # ldaAnaly()  # train the topic model
    # questionAnswer(cols_numb, questionNumber)  # predict the score of one answer
    # demo: loop over many rows and report the mean absolute error
    fp = codecs.open('./result/pre_v1.csv', 'a', encoding='utf-8')
    sum = 0
    i = 1
    count = 0
    while i < 8717:
        questionNumber = i
        a = questionAnswer(cols_numb, questionNumber)
        sum += a[2]
        i += 8
        count += 1
        fp.write(str(i) + ":" + str(a) + '\n')
    fp.close()
    ave = sum / count
    print(ave)