nltk 同樣可以處理中文場景,只需作如下改動:
使用中文分詞器(如我選用的結巴分詞 jieba)
對中文字符作編碼處理,使用 unicode 編碼方式
python 的源碼編碼統一聲明爲 gbk
使用支持中文的語料庫
代碼如下,需要 jieba 的支持:
#!/usr/bin/env python
#-*-coding=gbk-*-
"""Document-similarity demo for mixed Chinese/English text.

The script pre-processes a small list of course/app titles with NLTK and
jieba, trains a gensim LSI model on them, and prints the entries most
similar to a query title.

Third-party requirements: nltk, jieba, gensim.
"""

# Raw data used to build the model.
# This is a trimmed-down sample. Real records are expected to look like
# "name\tsummary\tdetails", with HTML and similar noise already stripped.
courses = [
    u'Writing II: Rhetorical Composing',
    u'Genetics and Society: A Course for Educators',
    u'General Game Playing',
    u'Genes and the Human Condition (From Behavior to Biotechnology)',
    u'A Brief History of Humankind',
    u'New Models of Business in Society',
    u'Analyse Numrique pour Ingnieurs',
    u'Evolution: A Course for Educators',
    u'Coding the Matrix: Linear Algebra through Computer Science Applications',
    u'The Dynamic Earth: A Course for Educators',
    u'Tiny Wings\tYou have always dreamed of flying - but your wings are tiny. Luckily the world is full of beautiful hills. Use the hills as jumps - slide down, flap your wings and fly! At least for a moment - until this annoying gravity brings you back down to earth. But the next hill is waiting for you already. Watch out for the night and fly as fast as you can. ',
    u'Angry Birds Free',
    # The backslash below is a literal backslash character (not a tab); it
    # is written as "\\" because a bare backslash before a non-escape
    # character is an invalid escape sequence in newer Python versions.
    u'沒有\\它很類似',
    u'沒有\t它很類似',
    u'沒有\t他很類似',
    u'沒有\t他不很類似',
    u'沒有',
    u'能夠沒有',
    u'也沒有',
    u'有沒有也無論',
    u'Angry Birds Stella',
    u'Flappy Wings - FREE\tFly into freedom!A parody of the #1 smash hit game!',
    u'沒有一個',
    u'沒有一個2',
]

# Only used to make the final output readable. With real tab-separated
# records this would be:
#   courses_name = [course.split('\t')[0] for course in courses]
courses_name = courses


# --- Pre-processing (requires nltk and jieba) -------------------------------

def pre_process_cn(courses, low_freq_filter=True):
    """Tokenize, clean and stem a list of Chinese/English documents.

    Pipeline:
      1. Split each document with NLTK's ``word_tokenize`` and feed every
         resulting token to ``jieba.analyse.extract_tags`` (at most 10 tags
         per token).
      2. Drop tokens that are one of a fixed set of ASCII punctuation marks.
      3. Stem every remaining token with NLTK's ``LancasterStemmer``.
      4. Optionally drop stems that occur exactly once in the whole
         collection.

    NOTE(review): there is no explicit stop-word step here; any stop-word
    removal is whatever ``extract_tags`` does internally - confirm against
    the jieba documentation if stop words matter for your data.

    Args:
        courses: iterable of unicode strings, one per document.
        low_freq_filter: when true, remove stems that appear only once
            across all documents (step 4).

    Returns:
        A list with one list of stems per input document, in input order.
    """
    from collections import Counter
    from itertools import chain

    import jieba.analyse
    from nltk.stem.lancaster import LancasterStemmer
    from nltk.tokenize import word_tokenize

    # Step 1: NLTK splits on whitespace/punctuation, jieba segments each piece.
    texts_tokenized = []
    for document in courses:
        document_tags = []
        for word in word_tokenize(document):
            document_tags.extend(jieba.analyse.extract_tags(word, 10))
        texts_tokenized.append(document_tags)

    # Step 2: remove punctuation tokens.
    english_punctuations = frozenset([
        ',', '.', ':', ';', '?', '(', ')', '[', ']',
        '&', '!', '*', '@', '#', '$', '%',
    ])
    texts_filtered = [
        [word for word in document if word not in english_punctuations]
        for document in texts_tokenized
    ]

    # Step 3: stemming.
    stemmer = LancasterStemmer()
    texts_stemmed = [
        [stemmer.stem(word) for word in document]
        for document in texts_filtered
    ]

    if not low_freq_filter:
        return texts_stemmed

    # Step 4: count every stem once up front instead of rescanning the
    # flattened list for each distinct stem.
    stem_counts = Counter(chain.from_iterable(texts_stemmed))
    return [
        [stem for stem in text if stem_counts[stem] > 1]
        for text in texts_stemmed
    ]


lib_texts = pre_process_cn(courses)


# --- Model training (requires gensim) ---------------------------------------

def train_by_lsi(lib_texts, num_topics=10):
    """Train an LSI model on pre-processed texts and build a similarity index.

    Args:
        lib_texts: list of token lists, e.g. the output of ``pre_process_cn``.
        num_topics: number of LSI topics. The default of 10 is the original
            author's rule-of-thumb choice, not a tuned value.

    Returns:
        A tuple ``(index, dictionary, lsi)``: the
        ``similarities.MatrixSimilarity`` index over the library, the
        ``corpora.Dictionary`` built from ``lib_texts``, and the trained
        ``models.LsiModel``.
    """
    from gensim import corpora, models, similarities

    # Uncomment to watch gensim's progress log:
    # import logging
    # logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
    #                     level=logging.INFO)

    dictionary = corpora.Dictionary(lib_texts)
    # doc2bow() turns a token list into a bag of words, represented as
    # (word_id, word_frequency) 2-tuples.
    corpus = [dictionary.doc2bow(text) for text in lib_texts]

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary,
                          num_topics=num_topics)

    # NOTE(review): the model is trained on the TF-IDF corpus, but the index
    # here (and the query below in this script) is projected from the raw
    # bag-of-words corpus. Index and query are consistent with each other,
    # so this is left unchanged; confirm whether lsi[corpus_tfidf] was meant.
    index = similarities.MatrixSimilarity(lsi[corpus])
    return (index, dictionary, lsi)


# The library is now built. For large data this part can be computed ahead
# of time and stored.
(index, dictionary, lsi) = train_by_lsi(lib_texts)

# The item to look up.
target_courses = [u'沒有']
# No low-frequency filtering here: in a single query every stem may occur
# only once and would otherwise be dropped.
target_text = pre_process_cn(target_courses, low_freq_filter=False)


# --- Similarity query for the target item -----------------------------------

# Pick the query document.
ml_course = target_text[0]

# Bag-of-words representation, using the library's dictionary.
ml_bow = dictionary.doc2bow(ml_course)

# Project the query into the trained LSI space; ml_lsi is a list of
# (topic_id, topic_value) pairs.
ml_lsi = lsi[ml_bow]

# index[...] goes through MatrixSimilarity.__getitem__ and yields one
# similarity score per library document.
sims = index[ml_lsi]

# Sort by descending similarity for easier reading.
sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])

# Show the ten best (document_position, similarity) pairs. The original
# author expects rank 0 to be the query itself, since the same title is
# also present in the library.
print(sort_sims[0:10])

# Names of the next three matches (ranks 1 to 3).
for doc_position, _similarity in sort_sims[1:4]:
    print(courses_name[doc_position])