gensim Natural Language Processing

Reference code
ChineseClean_demo1.py:
# -*- coding:utf-8 -*-
import xlrd
import xlwt
'''
python3.4
'''
# file is the source file name; change it here as needed
file="./data/answer_detail_5_15307860968687.xls"
dirs="./result"
 
def read_excel(rows_numb,cols_numb):
 
    f = xlwt.Workbook() # create the output workbook
    '''
    Create the two output sheets
    '''
    sheet1 = f.add_sheet(u'sheet1_1',cell_overwrite_ok=True) # create sheet
    sheet2 = f.add_sheet(u'sheet1_2',cell_overwrite_ok=True) # create sheet
    row0 = [u'UserNo',u'Name',u'Question',u'Answer',u'Layer',u'Mark',u'Score',u'AbilityID']
 
    # open the source workbook
    workbook = xlrd.open_workbook(file)
    sheet0 = workbook.sheet_by_index(0) # sheet indices start at 0
    cols = sheet0.col_values(cols_numb) # read the score column
    rows_list_1=[]
    rows_list_2=[]
    for i in range(1,len(cols)):
        if cols[i] == '0':
            rows_list_1.append(i)
        else:
            rows_list_2.append(i)
 
    for i in range(0,len(row0)):
        sheet1.write(0,i,row0[i])
        sheet2.write(0,i,row0[i])
    '''
    sheet1_1 stores the rows with a score of 0
    '''
    for j in range(0,len(rows_list_1)):
        rows = sheet0.row_values(rows_list_1[j]) # get the row contents
        for i in range(0,len(rows)):
            sheet1.write(j+1,i,rows[i])
    '''
    sheet1_2 stores the rows with a non-zero score
    '''
    for j in range(0,len(rows_list_2)):
        rows = sheet0.row_values(rows_list_2[j]) # get the row contents
        for i in range(0,len(rows)):
            sheet2.write(j+1,i,rows[i])
 
    f.save('./data/demo1.xls') # save the output file
 
 
if __name__ == '__main__':
    # row and column indices to read
    rows_numb=0
    cols_numb=6
    read_excel(rows_numb,cols_numb)
 
ChineseClean_demo2.py:
# -*- coding:utf-8 -*-
import xlrd
import xlwt
'''
python3.4
  
'''
# file is the source file name; change it here as needed
file="./data/demo1.xls"
  
 
def read_excel(rows_numb,cols_numb):
 
    f = xlwt.Workbook() # create the output workbook
 
    '''
    Create the four output sheets
    '''
    sheet1 = f.add_sheet(u'sheet2_1',cell_overwrite_ok=True) # create sheet
    sheet2 = f.add_sheet(u'sheet2_2',cell_overwrite_ok=True) # create sheet
    sheet3 = f.add_sheet(u'sheet2_3',cell_overwrite_ok=True) # create sheet
    sheet4 = f.add_sheet(u'sheet2_4',cell_overwrite_ok=True) # create sheet
    row0 = [u'UserNo',u'Name',u'Question',u'Answer',u'Layer',u'Mark',u'Score',u'AbilityID']
 
    for i in range(0,len(row0)):
        sheet1.write(0,i,row0[i])
        sheet2.write(0,i,row0[i])
        sheet3.write(0,i,row0[i])
        sheet4.write(0,i,row0[i])
 
 
    # open the source workbook
    workbook = xlrd.open_workbook(file)
    sheet0 = workbook.sheet_by_index(0) # sheet indices start at 0
    cols = sheet0.col_values(cols_numb) # read the score column
    rows_list_1=[]
    rows_list_2=[]
    rows_list_3=[]
    rows_list_4=[]  
    for i in range(1,len(cols)):
 
        if float(cols[i]) < 12.0:
            rows_list_1.append(i)
        if float(cols[i]) >= 12.0 and float(cols[i]) < 16.0:
            rows_list_2.append(i)
        if float(cols[i]) >= 16.0 and float(cols[i]) < 18.0:
            rows_list_3.append(i)
        if float(cols[i]) >= 18.0:
            rows_list_4.append(i)
 
    '''
    sheet2_1 stores "poor": score < 12
    '''
    for j in range(0,len(rows_list_1)):
        rows = sheet0.row_values(rows_list_1[j]) # get the row contents
        for i in range(0,len(rows)):
            sheet1.write(j+1,i,rows[i])
    '''
    sheet2_2 stores "fair": 12 <= score < 16
    '''
 
    for j in range(0,len(rows_list_2)):
        rows = sheet0.row_values(rows_list_2[j]) # get the row contents
        for i in range(0,len(rows)):
            sheet2.write(j+1,i,rows[i])
 
    '''
    sheet2_3 stores "good": 16 <= score < 18
    '''
    for j in range(0,len(rows_list_3)):
        rows = sheet0.row_values(rows_list_3[j]) # get the row contents
        for i in range(0,len(rows)):
            sheet3.write(j+1,i,rows[i])
    '''
    sheet2_4 stores "excellent": score >= 18
    '''
 
    for j in range(0,len(rows_list_4)):
        rows = sheet0.row_values(rows_list_4[j]) # get the row contents
        for i in range(0,len(rows)):
            sheet4.write(j+1,i,rows[i])
 
    f.save('./data/demo2.xls')
 
if __name__ == '__main__':
    # row and column indices to read
    rows_numb=0
    cols_numb=6
    read_excel(rows_numb,cols_numb)
ChineseClean_demo3.py:
# -*- coding:utf-8 -*-
import xlrd
import xlwt
'''
python3.4
  
'''
file="./data/answer_detail_5_15307860968687.xls"
 
def read_excel(rows_numb,cols_numb):
 
    f = xlwt.Workbook() # create the output workbook
 
    '''
    Create the five output sheets
    '''
    sheet1 = f.add_sheet(u'sheet1',cell_overwrite_ok=True) # create sheet
    sheet2 = f.add_sheet(u'sheet2',cell_overwrite_ok=True) # create sheet
    sheet3 = f.add_sheet(u'sheet3',cell_overwrite_ok=True) # create sheet
    sheet4 = f.add_sheet(u'sheet4',cell_overwrite_ok=True) # create sheet
    sheet5 = f.add_sheet(u'sheet5',cell_overwrite_ok=True)
    row0 = [u'UserNo',u'Name',u'Question',u'Answer',u'Layer',u'Mark',u'Score',u'AbilityID']
 
    for i in range(0,len(row0)):
        sheet1.write(0,i,row0[i])
        sheet2.write(0,i,row0[i])
        sheet3.write(0,i,row0[i])
        sheet4.write(0,i,row0[i])
        sheet5.write(0,i,row0[i])
 
    # open the source workbook
    workbook = xlrd.open_workbook(file)
    sheet0 = workbook.sheet_by_index(0) # sheet indices start at 0
    cols = sheet0.col_values(cols_numb) # read the AbilityID column
    rows_list_1=[]
    rows_list_2=[]
    rows_list_3=[]
    rows_list_4=[]
    rows_list_5=[]  
    for i in range(1,len(cols)):
 
        if cols[i] == '100012':
            rows_list_1.append(i)
        if cols[i] == '100014':
            rows_list_2.append(i)
        if cols[i] == '100007':
            rows_list_3.append(i)
        if cols[i] == '100016':
            rows_list_4.append(i)
        if cols[i] == '100017':
            rows_list_5.append(i)
    '''
    sheet1 stores AbilityID 100012
    '''
    for j in range(0,len(rows_list_1)):
        rows = sheet0.row_values(rows_list_1[j]) # get the row contents
        for i in range(0,len(rows)):
            sheet1.write(j+1,i,rows[i])
    '''
    sheet2 stores AbilityID 100014
    '''
 
    for j in range(0,len(rows_list_2)):
        rows = sheet0.row_values(rows_list_2[j]) # get the row contents
        for i in range(0,len(rows)):
            sheet2.write(j+1,i,rows[i])
 
    '''
    sheet3 stores AbilityID 100007
    '''
    for j in range(0,len(rows_list_3)):
        rows = sheet0.row_values(rows_list_3[j]) # get the row contents
        for i in range(0,len(rows)):
            sheet3.write(j+1,i,rows[i])
    '''
    sheet4 stores AbilityID 100016
    '''
    for j in range(0,len(rows_list_4)):
        rows = sheet0.row_values(rows_list_4[j]) # get the row contents
        for i in range(0,len(rows)):
            sheet4.write(j+1,i,rows[i])
 
    '''
    sheet5 stores AbilityID 100017
    '''
    for j in range(0,len(rows_list_5)):
        rows = sheet0.row_values(rows_list_5[j]) # get the row contents
        for i in range(0,len(rows)):
            sheet5.write(j+1,i,rows[i])
 
    f.save('./data/demo3.xls') # save the output file
 
if __name__ == '__main__':
    # row and column indices to read
    rows_numb=0
    cols_numb=7
    read_excel(rows_numb,cols_numb)
ChineseClean_demo4or5.py:
Same as ChineseClean_demo3.py.
ChineseClean_answer_QA.py:
# -*- coding:utf-8 -*-
import re
import xlrd
file="./data/demo5.xls"
dirs="./result"
 
def read_excel(rows_numb,cols1_numb):
    number='1'
    f2 = open(dirs+'/demo5_sheet1_%s.csv'%number, 'a', encoding='utf-8')
    # open the source workbook
    workbook = xlrd.open_workbook(file)
    sheet0 = workbook.sheet_by_index(int(number)-1) # sheet indices start at 0
    cols1 = sheet0.col_values(cols1_numb[3])[1:] # answer column (index 3), skipping the header row
 
    # keep CJK characters, curly quotes, hyphens and colons; everything else is dropped
    p1 = r"(?:[\u2E80-\uFFFD]|[\u201c-\u201d]|[\u002d]|[\u003a])+"
    pattern1 = re.compile(p1)
    for i in range(len(cols1)):
        matcher1 = re.findall(pattern1, cols1[i])
        str1=str()
        if matcher1:
            str1 = ' '.join(matcher1)
            f2.write(str1)
        f2.write('\n')
 
    f2.close()
 
if __name__ == '__main__':
    # row and column indices to read
    rows_numb=0
    cols1_numb=[0,1,2,3,4,5,6,7]
    read_excel(rows_numb,cols1_numb)
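
The pattern p1 keeps CJK characters plus curly quotes, hyphens and colons, and drops everything else. A quick illustration on a made-up string (the sample text below is invented for this example):

# Illustration only; the input string is invented.
import re
p1 = r"(?:[\u2E80-\uFFFD]|[\u201c-\u201d]|[\u002d]|[\u003a])+"
print(' '.join(re.findall(p1, 'abc 自然語言處理 123,gensim 測試!')))
# prints: 自然語言處理 , 測試!   (ASCII letters and digits are removed)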
 
qa_test_clean_word.py:
# -*- coding: utf-8 -*-
 
import jieba
# build the stopword list
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords
  
  
# segment a sentence and remove stopwords
def seg_sentence(sentence):
    sentence_seged = jieba.lcut_for_search(sentence.strip(),HMM=True)
    stopwords = stopwordslist('./test/stopwords.txt')  # path to the stopword list
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    return(outstr)
 
inputs = open('./data/demo5_answer_csv/demo5_sheet5_5.csv', 'r', encoding='utf-8')
outputs = open('./test/demo5_sheet5_5_5.csv', 'w', encoding='utf-8')
for line in inputs:
    line_seg = seg_sentence(line) 
    try:
        if len(line_seg):
            outputs.write(line_seg + '\n')
    except:
        pass
     
outputs.close()
inputs.close()
word_fre.py:
# -*- coding: utf-8 -*-
 
 
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import numpy as np
 
def drawStatBarh():
    '''
    Draw a horizontal bar chart of the top N word frequencies, using a colour gradient.
    '''
    fig, ax = plt.subplots()
    myfont = FontProperties(fname='./data/simfang.ttf')
    N = 30
    words = []
    counts = []
    for line in open('./data/word_fre.txt', encoding='utf-8'):
        if line == '\n':
            continue
        line = line.strip('\n')
 
        words.append(line.split(' ')[0])
        counts.append(int(line.split(' ')[1]))
 
    y_pos = np.arange(N)
 
    colors = ['#FA8072'] # base colour; each following bar gets a slightly different shade for a gradient effect
    for i in range(len(words[:N]) - 1):
        colors.append('#FA' + str(int(colors[-1][3:]) - 1))
 
    rects = ax.barh(y_pos, counts[:N], align='center', color=colors)
 
    ax.set_yticks(np.arange(N))
    ax.set_yticklabels(words[:N],fontproperties=myfont)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_title('報告中的高頻詞彙',fontproperties=myfont, fontsize=17)
    ax.set_xlabel(u"出現次數",fontproperties=myfont)
 
    autolabel(rects, ax)
    plt.show()
 
 
def autolabel(rects, ax):
    """
    Attach a count label to each bar.
    """
    #fig, ax = plt.subplots()
    for rect in rects:
        width = rect.get_width()
        ax.text(1.03 * width, rect.get_y() + rect.get_height()/2., 
            '%d' % int(width),ha='center', va='center')
 
 
def wordCount(segment_list):
    '''
    Count word frequencies and save the result to disk.
    Not needed for the word cloud itself; used when drawing the frequency bar chart.
    '''
    word_lst = []
    word_dict = {}
    with open('./data/word_fre.txt','w', encoding='utf-8') as wf2:
        word_lst.append(segment_list.split(' '))
        for item in word_lst:
            for item2 in item:
                if item2 not in word_dict:
                    word_dict[item2] = 1
                else:
                    word_dict[item2] += 1
        # sort by frequency, descending
        word_dict_sorted = sorted(word_dict.items(), key=lambda kv: kv[1], reverse=True)
        for tup in word_dict_sorted:
            if tup[0] != '':
                wf2.write(tup[0].strip('\n')+' '+str(tup[1])+'\n')
 
 
 
 
if __name__ == "__main__":
    segment_list_remove_stopwords=open('./data/demo5_sheet5_1_1.csv', encoding='utf-8').read()
    wordCount(segment_list_remove_stopwords)
    drawStatBarh()
wordcloud_test2.py:
# -*- coding: utf-8 -*-
 
from os import path
from scipy.misc import imread  # note: removed from newer SciPy releases; imageio.imread can replace it
import matplotlib.pyplot as plt
import jieba
# jieba.load_userdict("txt\userdict.txt")
# loads a user word list as the main dictionary (the built-in dictionary becomes secondary)
from wordcloud import WordCloud, ImageColorGenerator
 
# get the path of the current file
# __file__ refers to this file; running this line in some IDEs raises an error, in which case it can be changed to
# d = path.dirname('.')
d = path.dirname(__file__)
 
stopwords = {}
isCN = 1 # enable Chinese word segmentation by default
back_coloring_path = "data/lz1.jpg" # background (mask) image path
text_path = 'data/demo5_sheet5_1_1.csv' # text file to analyse (convert the source file to ANSI encoding if needed)
font_path = 'data/simfang.ttf' # Chinese font path for matplotlib
stopwords_path = 'data/stopwords.txt' # stopword list
imgname1 = "data/WordCloudDefautColors.png" # output image 1 (shaped by the mask image only)
imgname2 = "data/WordCloudColorsByImg.png" # output image 2 (colours follow the mask image's colour layout)
 
# my_words_list = ['CHENGLEI'] # new words to add to jieba's dictionary
 
back_coloring = imread(path.join(d, back_coloring_path)) # load the background (mask) image
 
# word cloud settings
wc = WordCloud(font_path=font_path,  # font
               background_color="white",  # background colour
               max_words=2000,  # maximum number of words shown
               mask=back_coloring,  # background (mask) image
               max_font_size=100,  # maximum font size
               random_state=42,
               width=1000, height=860, margin=2, # default image size; with a mask the saved image follows the mask size, margin is the spacing around words
               )
 
# use your own word list for segmentation
# def add_word(list):
#     for items in list:
#         jieba.add_word(items)
 
# add_word(my_words_list)
 
text = open(path.join(d, text_path)).read()
 
# def jiebaclearText(text):
#     mywordlist = []
#     seg_list = jieba.cut(text, cut_all=False)
#     liststr="/ ".join(seg_list)
#     f_stop = open(stopwords_path)
#     try:
#         f_stop_text = f_stop.read( )
#         f_stop_text=unicode(f_stop_text,'utf-8')
#     finally:
#         f_stop.close( )
#     f_stop_seg_list=f_stop_text.split('\n')
#     for myword in liststr.split('/'):
#         if not(myword.strip() in f_stop_seg_list) and len(myword.strip())>1:
#             mywordlist.append(myword)
#     return ''.join(mywordlist)
#
# if isCN:
#     text = jiebaclearText(text)
 
# Generate the word cloud. generate() can take the raw text (wordcloud's built-in Chinese segmentation is poor,
# so enabling jieba is recommended); alternatively, compute the frequencies yourself and call generate_from_frequencies().
wc.generate(text)
# wc.generate_from_frequencies(txt_freq)
# txt_freq example: [('word a', 100), ('word b', 90), ('word c', 80)]
# generate colours from the background image
image_colors = ImageColorGenerator(back_coloring)
 
plt.figure()
# display the word cloud
plt.imshow(wc)
plt.axis("off")
plt.show()
# draw the word cloud
 
# save the image
wc.to_file(path.join(d, imgname1))
 
image_colors = ImageColorGenerator(back_coloring)
 
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis("off")
# show the background image used for the colouring
plt.figure()
plt.imshow(back_coloring, cmap=plt.cm.gray)
plt.axis("off")
plt.show()
# save the image
wc.to_file(path.join(d, imgname2))
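
The comment above mentions generate_from_frequencies() as an alternative to generate(). A minimal sketch, assuming the wc, text and jieba objects defined above (recent wordcloud releases accept a plain dict of word -> count):

# Sketch only: compute word frequencies with jieba and feed them to the word cloud.
from collections import Counter

tokens = [w.strip() for w in jieba.cut(text) if len(w.strip()) > 1]  # drop single characters and whitespace
freq = Counter(tokens)                # word -> count (Counter is a dict subclass)
wc.generate_from_frequencies(freq)    # newer wordcloud versions accept a dict of frequencies
plt.imshow(wc)
plt.axis("off")
plt.show()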
 
lda_test_ok.py:
# coding=utf-8        
 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import lda
 
def doc_topic_word():
    print(doc_topic[:, :3])  # document-topic distribution (first 3 columns)
    print(topic_word[:, :3]) # topic-word distribution (first 3 columns)
 
# plot the distributions
def plot_1():
    # word-weight distribution within each selected topic
    f, ax= plt.subplots(2, 1, figsize=(6, 6), sharex=True) 
    for i, k in enumerate([0, 9]):  # pick two topics arbitrarily
        ax[i].stem(topic_word[k,:], linefmt='b-', 
                   markerfmt='bo', basefmt='w-') 
        ax[i].set_xlim(-2,2000) 
        ax[i].set_ylim(0, 1) 
        ax[i].set_ylabel("Prob") 
        ax[i].set_title("topic {}".format(k)) 
       
    ax[1].set_xlabel("word")
    plt.tight_layout() 
    plt.show()
 
def plot_2():
    # show which topics the selected documents are mainly assigned to
 
    f, ax= plt.subplots(2, 1, figsize=(8, 8), sharex=True) 
    for i, k in enumerate([0,9]):  # pick two documents arbitrarily
        ax[i].stem(doc_topic[k,:], linefmt='r-', 
                   markerfmt='ro', basefmt='w-') 
        ax[i].set_xlim(-1, 20)    # x-axis range, i.e. the topic indices
        ax[i].set_ylim(0, 1.2)    # y-axis range
        ax[i].set_ylabel("Prob") 
        ax[i].set_title("Document {}".format(k)) 
    ax[1].set_xlabel("Topic")
    plt.tight_layout()
    plt.show() 
 
  
if __name__ == "__main__":
  
  
    # load the corpus: one line is one document
    corpus = []
    for line in open('./data/demo5_sheet5_1_1.csv', 'r').readlines():
        corpus.append(line.strip())
 
    # convert the documents into a term-frequency matrix; element a[i][j] is the count of word j in document i
    vectorizer = CountVectorizer()
    print (vectorizer)
 
    X = vectorizer.fit_transform(corpus)
    analyze = vectorizer.build_analyzer()
    weight = X.toarray()
    print("type(X): {}".format(type(X))) 
    print("shape: {}\n".format(X.shape))
    print (len(weight))
    print (weight[:5, :5])
 
    # run LDA
    print ('LDA:')
    model = lda.LDA(n_topics=20, n_iter=50, random_state=1)
    # model.fit_transform(X)
    model.fit(np.asarray(weight))     # model.fit_transform(X) is also available?
    topic_word = model.topic_word_    # model.components_ also works
 
    # document-topic distribution
    doc_topic = model.doc_topic_
    print("type(doc_topic): {}".format(type(doc_topic)))
    print("shape: {}".format(doc_topic.shape))
 
    # print the most likely topic for the first 10 documents
    label = []     
    for n in range(10):
        topic_most_pr = doc_topic[n].argmax()
        label.append(topic_most_pr)
        print("doc: {} topic: {}".format(n, topic_most_pr))
 
    # print the top-N keywords of each topic
    word = vectorizer.get_feature_names()
    n = 6 
    for i, topic_dist in enumerate(topic_word): 
        topic_words = np.array(word)[np.argsort(topic_dist)][:-(n+1):-1] 
        print(u'*Topic {}\n- {}'.format(i, ' '.join(topic_words))) 
         
    # doc_topic_word()
    # plot_1()
    plot_2()
 
gensimTopicTest0803.py:
# coding=utf-8        
import re
import xlrd
import codecs
import jieba
from gensim import corpora, models, similarities
 
FILE="demo5" # file to train on
ID='1' # ability ID (sheet) to train on
 
# load the stopword list
stopwords = [line.strip() for line in codecs.open('./data/stopwords.txt', 'r', encoding='utf-8').readlines()]
 
def cleanAnswer(cols_numb):
 
    f1 = open('./result/%s_sheet%s.csv'%(FILE,ID), 'a', encoding='utf-8')
    # open the source workbook
    workbook = xlrd.open_workbook('./data/%s.xls'%FILE)
    # get the sheet by index (or by name); sheet indices start at 0
    sheet0 = workbook.sheet_by_index(int(ID)-1)
    cols1 = sheet0.col_values(cols_numb[3])[1:] # answer column (index 3), skipping the header row
 
    p1 = r"(?:[\u2E80-\uFFFD]|[\u201c-\u201d]|[\u002d]|[\u003a])+" # keep CJK characters, curly quotes, hyphens and colons; (?:) is a non-capturing group over Unicode ranges
    pattern1 = re.compile(p1)
    for i in range(len(cols1)):
        matcher1 = re.findall(pattern1, cols1[i]) # findall returns all matching substrings as a list
        str1=str()
        if matcher1:
            str1 =''.join(matcher1)
            f1.write(str1.strip())
        f1.write('\n')
    f1.close()
 
def ldaAnaly():
 
 
    print("Building the tokenised training corpus ----- train -----")
    # remove stopwords and build the tokenised corpus
    train = []
    fp = codecs.open('./result/%s_sheet%s.csv'%(FILE,ID),'r',encoding='utf8')
    for line in fp.readlines():
        line = line.strip()
        if not len(line): # skip empty lines
            continue
        outstr = ' '
        seg_list =jieba.cut(line,cut_all=False) # precise-mode segmentation gives the best results
        for word in seg_list:
            if word not in stopwords:
                if word != '\t':
                    outstr += word
                    outstr += " "
        train.append(outstr.strip().split(" ")) # convert the string into a list of tokens
    fp.close()
 
    print("Building the dictionary and saving it ---- dict_v1.dict ----")
    dic = corpora.Dictionary(train)
    dic.save('./result/dict_v1.dict')
 
    print("Saving a human-readable copy of the dictionary ---- dic.csv ----")
    fd = codecs.open('./result/dic.csv', 'a',encoding = 'utf-8')
    for word,index in dic.token2id.items():
        fd.write(word +':'+ str(index)+'\n')
    fd.close()
 
    print("Generating the corpus and saving it ----- corpus.mm -----")
    corpus = [dic.doc2bow(text) for text in train]
    corpora.MmCorpus.serialize('./result/corpus.mm', corpus)
 
    print("Saving the tf-idf model ----- corpus.tfidf_model -----")
    tfidf = models.TfidfModel(corpus)
    tfidf.save('./result/corpus.tfidf_model')
 
    print("Running LDA topic analysis and saving the model ----- ldaModel.pkl -----")
    # transform the corpus with the tf-idf model
    corpus_tfidf = tfidf[corpus]
    # 100 LDA topics, 500 iterations
    lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=100, iterations=500 )
    lda.save('./result/ldaModel.pkl')
 
    print("Topic probabilities and word importance for one document ----- demo: document 1 -----")
    for index, score in sorted(lda[corpus_tfidf[0]], key=lambda tup: -1 * tup[1]):
        print("Score: {}\t Topic: {}".format(score, lda.print_topic(index, 10)))
 
    # print the 100 topics
    # ldaOut = lda.print_topics(100)
    # print("by default, the 10 highest-probability words of each topic are returned")
    # print (ldaOut[0])
    # print (ldaOut[1])
    # print (ldaOut[2])
    # corpus_lda = lda[corpus_tfidf]
    # print("probability distribution over topics for each document")
    # k = 0
    # for doc in corpus_lda:
    #     print(doc)
    #     k += 1
    #     if k == 3:
    #         break
 
 
def questionAnswer(cols_numb, questionNumber):
    lda = models.LdaModel.load('./result/ldaModel.pkl')
    dic = corpora.Dictionary.load('./result/dict_v1.dict')
    corpus = corpora.MmCorpus('./result/corpus.mm')
    tfidf = models.TfidfModel.load('./result/corpus.tfidf_model')
 
    # print("Input a question ------------------")
    f1 = open('./result/%s_sheet%s.csv'%(FILE,ID), 'a', encoding='utf-8')
    # open the source workbook
    workbook = xlrd.open_workbook('./data/%s.xls'%FILE)
    sheet0 = workbook.sheet_by_index(int(ID)-1) # sheet indices start at 0
    cols0 = sheet0.col_values(cols_numb[3])[questionNumber]  # answer text (column index 3) of the chosen question
 
    # strip garbled/non-Chinese characters from the question text
    p1 = r"(?:[\u2E80-\uFFFD]|[\u201c-\u201d]|[\u002d]|[\u003a])+" # same non-capturing Unicode-range pattern as in cleanAnswer()
    pattern1 = re.compile(p1)
    matcher1 = re.findall(pattern1, cols0) # all matching substrings, as a list
    query=str()
    if matcher1:
        query =''.join(matcher1)
    # print("question to score (garbled characters removed):", query)
 
    # segment the question and remove stopwords
    seg_list = jieba.cut(query, cut_all=False)
    outstr = ' '
    for word in seg_list:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    inputTest=list(outstr.strip().split(" "))
    # print("segmented question (stopwords removed):", inputTest)
 
    # convert the question into a bag of words
    query_bow = dic.doc2bow(inputTest)
    # print("bag of words:", query_bow)
 
    # the query also needs a tf-idf transform
    query_tfidf = tfidf[query_bow]
    lda_vec_tfidf = lda[query_tfidf]
    # print("topic probabilities of the question (tf-idf):", lda_vec_tfidf)
 
    # print("topic probabilities of the question --------------------")
    # code to print the topic probabilities
    # for index, score in sorted(lda_vec_tfidf, key=lambda tup: -1 * tup[1]):
    #     print("Score: {}\t Topic: {}".format(score, lda.print_topic(index, 20)))
 
 
    # print("find the questions in the corpus most similar to this one, ranked by similarity (tf-idf) --------------------")
    # similarity search
    similarity = similarities.MatrixSimilarity(corpus)
 
    # similarity on top of tf-idf: the query must first be converted with doc2bow and then tf-idf
    lda_vec = lda[query_bow]
 
    # sims = similarity[lda_vec] # similarity against the plain (non-tf-idf) bag of words
    sims = similarity[lda_vec_tfidf] # similarity against the tf-idf vector
 
    # enumerate the results, then sort by similarity
    listSims = enumerate(sims)
    sort_sims = sorted(listSims, key=lambda item: -item[1])
    # print(sort_sims[0:6]) # the top-n matches work best
 
 
    # score prediction -- version 1 --
    sort_sims_list = sort_sims[0:6]
    cols1 = sheet0.col_values(cols_numb[6])[1:] # score column (index 6), skipping the header row
    f1.close()
 
    # similarity-weighted average of the neighbours' scores (weighted-average deviation method)
    sumCore1 = 0
    sumPro = 0
    for i in range(len(sort_sims_list)):
        sumCore1 += float(cols1[sort_sims_list[i][0] - 1]) * sort_sims_list[i][1]
        # print(cols1[sort_sims_list[i][0] - 1])
        sumPro += sort_sims_list[i][1]
 
    preCore1 = sumCore1 / sumPro
    # print("weighted-average prediction: %s, actual score: %s" % (preCore1, cols1[questionNumber-1]))
 
    print("Saving prediction results ---- pre_v1.csv ----")
    return preCore1, cols1[questionNumber-1], abs(preCore1 - float(cols1[questionNumber-1]))
 
if __name__ == '__main__':
 
    cols_numb = [0,1,2,3,4,5,6,7] # column indices to read
    # questionNumber = 124 # question number to test; must not exceed the total number of questions
    # cleanAnswer(cols_numb) # extract the answers from the spreadsheet and strip garbled characters
    # ldaAnaly() # train on the answers and build the topic model
    # questionAnswer(cols_numb, questionNumber) # predict a score for a single question
 
    # demo: loop over questions and predict their scores
    fp = codecs.open('./result/pre_v1.csv', 'a', encoding='utf-8')
    sum = 0
    i = 1
    count = 0
    while( i < 8717 ):
        questionNumber = i
        a = questionAnswer(cols_numb, questionNumber)
        sum += a[2]
        # print(a, a[2])
        # exit()
        i += 8
        count += 1
        fp.write(str(i)+":"+str(a) + '\n')
    fp.close()
 
    ave = sum / count
    print(ave)
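
Note that the prediction loop above assumes the model files referenced in questionAnswer() already exist. On a first run, the commented-out steps in __main__ have to be executed once; a minimal first-run sketch, following the order given by the comments (questionNumber 124 is the value from the commented example):

# First-run order, inferred from the comments in __main__ (sketch only):
cols_numb = [0, 1, 2, 3, 4, 5, 6, 7]
cleanAnswer(cols_numb)                 # extract and clean the answers -> ./result/demo5_sheet1.csv
ldaAnaly()                             # build dict_v1.dict, corpus.mm, corpus.tfidf_model and ldaModel.pkl
print(questionAnswer(cols_numb, 124))  # then individual questions can be scored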