Python機器學習(4)——基於k-means和tfidf的文本聚類分析

時間 2020-06-29

標籤 python 機器學習基於 means tfidf 文本分析欄目 Python 简体版

原文原文鏈接

基本步驟包括：
1.使用python+selenium分析dom結構，爬取百度百科和互動百科的文本摘要信息；
2.使用jieba結巴分詞對文本進行中文分詞，同時導入自定義詞典中的關鍵詞；
3.scikit-learn對文本內容進行tfidf計算並構造N*M矩陣(N個文檔 M個特徵詞)；
4.再使用K-means進行文本聚類(省略特徵詞過濾降維過程)；
5.最後對聚類的結果進行簡單的文本處理，按類簇歸類，也可以計算P/R/F特徵值；
6.總結這篇論文及K-means的缺點及知識圖譜的一些內容。

當然這只是一篇最最基礎的文章，更高深的分類、聚類、LDA、SVM、隨機森林等內容，自己以後慢慢學習吧！這篇作爲在線筆記，路漫漫其修遠兮，fighting~


一. 爬蟲實現

爬蟲主要通過Python+Selenium+Phantomjs實現，爬取百度百科和互動百科旅遊景點信息，其中爬取百度百科的代碼如下。
參考前文：[Python爬蟲] Selenium獲取百度百科旅遊景點的InfoBox消息盒

實現原理：
首先從Tourist_spots_5A_BD.txt中讀取景點信息，然後通過調用無界面瀏覽器PhantomJS（Firefox可替代）訪問百度百科鏈接"http://baike.baidu.com/"，通過Selenium獲取輸入對話框ID，輸入關鍵詞如"故宮"，再訪問該百科頁面。最後通過分析DOM樹結構獲取摘要的ID並獲取其值。核心代碼如下：
driver.find_elements_by_xpath("//div[@class='lemma-summary']/div")

PS：Selenium更多應用於自動化測試，推薦Python爬蟲使用scrapy等開源工具。

# coding=utf-8  
""" 
Created on 2015-09-04 @author: Eastmount  
"""  
  
import time          
import re          
import os  
import sys
import codecs
import shutil
from selenium import webdriver      
from selenium.webdriver.common.keys import Keys      
import selenium.webdriver.support.ui as ui      
from selenium.webdriver.common.action_chains import ActionChains  
  
# Start the headless PhantomJS browser shared by the crawler functions below.
# The raw string keeps the backslashes of the Windows path literal instead of
# relying on "\p" not being a recognised escape sequence (same resulting value).
driver = webdriver.PhantomJS(executable_path=r"G:\phantomjs-1.9.1-windows\phantomjs.exe")
# Alternative browser: driver = webdriver.Firefox()
# Explicit-wait helper bound to the driver with a 10 second timeout.
wait = ui.WebDriverWait(driver, 10)
 
#Get the Content of 5A tourist spots  
def getInfobox(entityName, fileName):
    """Save the Baidu Baike summary of one entity to a UTF-8 text file.

    The entity name is typed into the search form of http://baike.baidu.com/
    through the module-level ``driver``. The name, followed by the text of
    every summary node found on the result page, is written to ``fileName``
    with one '\\r\\n'-terminated entry per item.

    Args:
        entityName: Name to search for; trailing line breaks are ignored.
        fileName: Path of the output file, created or overwritten.

    Any exception is reported on stdout and swallowed, so one failing entity
    does not abort a batch crawl.
    """
    # Names read from a text file still carry their line break. Typing it into
    # the search box presumably submits the form before Keys.RETURN is sent
    # (the original code noted stale-element errors here), so drop it once.
    entity = entityName.rstrip('\r\n')
    try:
        print(u'%s %s' % (u'文件名稱: ', fileName))
        # The with-block closes the file on every path. The previous
        # finally-clause failed with NameError when the open itself raised.
        with codecs.open(fileName, 'w', 'utf-8') as info:
            print(u'%s %s' % (u'實體名稱: ', entity))
            driver.get("http://baike.baidu.com/")
            elem_inp = driver.find_element_by_xpath("//form[@id='searchForm']/input")
            elem_inp.send_keys(entity)
            elem_inp.send_keys(Keys.RETURN)
            # codecs streams do not translate '\n', so '\r\n' is written explicitly.
            info.write(entity + '\r\n')
            time.sleep(2)  # fixed pause before the result page is read

            # Summary nodes of the entry page.
            for value in driver.find_elements_by_xpath("//div[@class='lemma-summary']/div"):
                print(value.text)
                info.write(value.text + '\r\n')
            time.sleep(2)
    except Exception as e:
        # Best effort: report and continue (e.g. decoding errors on odd pages).
        print('%s %s' % ("Error: ", e))
    finally:
        print('\n')
  
#Main function  
def main():
    """Crawl the summary of every entity listed in Tourist_spots_5A_BD.txt.

    The output directory "BaiduSpider\\" is recreated from scratch and entity
    number N (1-based line number in the list) is saved as NNNN.txt by
    ``getInfobox``. The browser session is shut down when the run ends,
    whether or not it succeeded.
    """
    path = "BaiduSpider\\"
    if os.path.isdir(path):
        shutil.rmtree(path, True)
    os.makedirs(path)
    try:
        # Decode while reading instead of calling unicode() on every line.
        # utf-8-sig also drops a leading byte-order mark if the file has one;
        # presumably that is the stray '?' mentioned below -- TODO confirm.
        with codecs.open("Tourist_spots_5A_BD.txt", 'r', 'utf-8-sig') as source:
            for num, entityName in enumerate(source, 1):
                if u'故宮' in entityName:   # otherwise a '?' gets added
                    entityName = u'北京故宮'
                fileName = path + ("%04d" % num) + ".txt"
                getInfobox(entityName, fileName)
        print('End Read Files!')
    finally:
        # quit() ends the whole browser session; close() only closes the
        # current window.
        driver.quit()

if __name__ == '__main__':
    main()

運行結果如下圖所示：

二. 中文分詞

中文分詞主要使用的是Python+Jieba分詞工具，同時導入自定義詞典dict_baidu.txt，裏面主要是一些專業景點名詞，如"乾清宮"會被分詞爲"乾/清宮"，如果詞典中存在專有名詞"乾清宮"就會先查找詞典。

結巴中文分詞涉及到的算法包括：
(1) 基於Trie樹結構實現高效的詞圖掃描，生成句子中漢字所有可能成詞情況所構成的有向無環圖（DAG)；
(2) 採用了動態規劃查找最大機率路徑, 找出基於詞頻的最大切分組合；
(3) 對於未登錄詞，採用了基於漢字成詞能力的HMM模型，使用了Viterbi算法。

結巴中文分詞支持的三種分詞模式包括：
(1) 精確模式：試圖將句子最精確地切開，適合文本分析；
(2) 全模式：把句子中所有可以成詞的詞語都掃描出來, 速度非常快，但是不能解決歧義問題；
(3) 搜索引擎模式：在精確模式的基礎上，對長詞再次切分，提升召回率，適合用於搜索引擎分詞。
同時結巴分詞支持繁體分詞和自定義字典方法。
參考前文：[python] 使用Jieba工具中文分詞及文本聚類概念

#encoding=utf-8
import sys
import re
import codecs
import os
import shutil
import jieba
import jieba.analyse
 
# Load the user-defined dictionary dict_baidu.txt into jieba before any segmentation
jieba.load_userdict("dict_baidu.txt")
 
#Read file and cut
def read_file_cut(path="BaiduSpider\\", respath="BaiduSpider_Result\\", count=204):
    """Segment the crawled summary files with jieba in accurate mode.

    Files 0001.txt .. <count>.txt under ``path`` are read as UTF-8. Every
    line is cut with ``jieba.cut(..., cut_all=False)`` and its tokens, joined
    by single spaces, are written as one '\\r\\n'-terminated line to the file
    of the same name under ``respath``.

    Args:
        path: Input directory prefix, including the trailing separator.
        respath: Output directory prefix, including the trailing separator;
            the directory is deleted and recreated first.
        count: Number of consecutively numbered files to process.
    """
    if os.path.isdir(respath):
        shutil.rmtree(respath, True)
    os.makedirs(respath)

    for num in range(1, count + 1):
        name = "%04d" % num
        # Both handles are closed even when decoding or segmentation fails.
        with codecs.open(path + name + ".txt", 'r', 'utf-8') as source:
            with codecs.open(respath + name + ".txt", 'w', 'utf-8') as result:
                # Iterating to end of file means a blank first line no longer
                # causes the rest of the file to be skipped. The line break is
                # stripped so it does not end up as a token of its own.
                for line in source:
                    seglist = jieba.cut(line.rstrip('\r\n'), cut_all=False)  # accurate mode
                    output = ' '.join(seglist)                               # space-joined tokens
                    print(output)
                    result.write(output + '\r\n')
        print('End file: ' + str(num))
    print('End All')

#Run function
if __name__ == '__main__':
    read_file_cut()

按照Jieba精確模式分詞且空格拼接，"0003.txt 頤和園"分詞結果如下圖所示：

爲方便後面的計算或對接一些sklearn或w2v等工具，下面這段代碼將結果存儲在同一個txt中，每行表示一個景點的分詞結果。

# coding=utf-8            
import re          
import os  
import sys
import codecs
import shutil
 
def merge_file(path="BaiduSpider_Result\\", resName="BaiduSpider_Result.txt", count=204):
    """Merge the segmented files into one file with one document per line.

    Files 0001.txt .. <count>.txt under ``path`` are read as UTF-8. All
    whitespace-separated tokens of a file are joined by single spaces and
    written as one '\\r\\n'-terminated line to ``resName``.

    Args:
        path: Input directory prefix, including the trailing separator.
        resName: Output file, created or overwritten.
        count: Number of consecutively numbered files to merge.
    """
    with codecs.open(resName, 'w', 'utf-8') as result:
        for num in range(1, count + 1):
            fileName = path + ("%04d" % num) + ".txt"
            with codecs.open(fileName, 'r', 'utf-8') as source:
                # Reading the whole file means a blank first line no longer
                # drops the document; split() also folds every line break
                # into a plain token separator.
                tokens = source.read().split()
            result.write(u' '.join(tokens) + u'\r\n')
            print('End file: ' + str(num))
    print('End All')

if __name__ == '__main__':
    merge_file()

每行一個景點的分詞結果，運行結果如下圖所示：


三. 計算TF-IDF

此時，需要將文檔相似度問題轉換爲數學向量矩陣問題，可以通過VSM向量空間模型來存儲每個文檔的詞頻和權重，特徵抽取完後，因爲每個詞語對實體的貢獻度不同，所以需要對這些詞語賦予不同的權重。計算詞項在向量中的權重方法——TF-IDF。

相關介紹：
它表示TF（詞頻）和IDF（倒文檔頻率）的乘積：

$$\mathrm{TFIDF} = \mathrm{TF} \times \mathrm{IDF}$$

其中TF表示某個關鍵詞出現的頻率，IDF爲所有文檔的數目除以包含該詞語的文檔數目的對數值：

$$\mathrm{IDF} = \log \frac{|D|}{|w \in d|}$$

|D|表示所有文檔的數目，|w∈d|表示包含詞語w的文檔數目。
最後TF-IDF計算權重越大表示該詞條對這個文本的重要性越大，它的目的是去除一些"的、了、等"出現頻率較高的常用詞。

參考前文：Python簡單實現基於VSM的餘弦類似度計算
 基於VSM的命名實體識別、歧義消解和指代消解

下面是使用scikit-learn工具調用CountVectorizer()和TfidfTransformer()函數計算TF-IDF值，同時後面"四.K-means聚類"代碼也包含了這部分，該部分代碼先提出來介紹。

# coding=utf-8  
""" 
Created on 2015-12-30 @author: Eastmount  
"""  
  
import time          
import re          
import os  
import sys
import codecs
import shutil
from sklearn import feature_extraction  
from sklearn.feature_extraction.text import TfidfTransformer  
from sklearn.feature_extraction.text import CountVectorizer
 
'''
sklearn裏面的TF-IDF主要用到了兩個函數：CountVectorizer()和TfidfTransformer()。
    CountVectorizer是經過fit_transform函數將文本中的詞語轉換爲詞頻矩陣。
    矩陣元素weight[i][j] 表示j詞在第i個文本下的詞頻，即各個詞語出現的次數。
    經過get_feature_names()可看到全部文本的關鍵字，經過toarray()可看到詞頻矩陣的結果。
    TfidfTransformer也有個fit_transform函數，它的做用是計算tf-idf值。
'''
 
if __name__ == "__main__":
    import io

    # Corpus: one pre-segmented, space-separated document per line.
    # io.open decodes explicitly and only breaks lines at \n, \r and \r\n,
    # so each line of the file stays exactly one document.
    corpus = []
    with io.open('BaiduSpider_Result.txt', 'r', encoding='utf-8') as source:
        for line in source:
            print(line)
            corpus.append(line.strip())

    # Term-frequency matrix: element a[i][j] is the count of term j in document i.
    vectorizer = CountVectorizer()

    # Computes the tf-idf weight of every term.
    transformer = TfidfTransformer()

    # Inner fit_transform builds the count matrix, outer one turns it into tf-idf.
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

    # All terms of the bag-of-words model.
    word = vectorizer.get_feature_names()

    # Dense tf-idf matrix: weight[i][j] is the tf-idf weight of term j in document i.
    weight = tfidf.toarray()

    resName = "BaiduTfidf_Result.txt"
    with codecs.open(resName, 'w', 'utf-8') as result:
        # Header line: every term followed by a space.
        result.write(''.join(term + ' ' for term in word))
        result.write('\r\n\r\n')

        # One row of weights per document, written with a single call per row
        # instead of one call per matrix element.
        for i, row in enumerate(weight):
            print(u'%s %s %s' % (u"-------這裏輸出第", i, u"類文本的詞語tf-idf權重------"))
            result.write(''.join(str(value) + ' ' for value in row))
            result.write('\r\n\r\n')

其中輸出如下所示，由於文本摘要不多，總共8368維特徵，其中共400個景點（百度百科200 互動百科200）文本摘要，故構建的矩陣就是[400][8368]，其中每個景點都有對應的矩陣存儲TF-IDF值。

缺點：可以嘗試除去一些停用詞、數字等，同時如果文檔維數過多，可以設置固定的維度，同時進行一些降維操作或構建稀疏矩陣，大家可以自己去研究下。
推薦一些優秀的關於Sklearn工具TF-IDF的文章：
  python scikit-learn計算tf-idf詞語權重 - liuxuejiang158
  用Python開始機器學習（5：文本特徵抽取與向量化） - lsldd大神
   官方scikit-learn文檔 4.3. Preprocessing data

四. K-means聚類

其中K-means聚類算法代碼如下所示，主要是調用sklearn.cluster實現。
強推一些機器學習大神關於Scikit-learn工具的分類聚類文章，非常優秀：
  用Python開始機器學習（10：聚類算法之K均值） -lsldd大神
   應用scikit-learn作文本分類（特徵提取 KNN SVM 聚類） - Rachel-Zhang大神
   Scikit Learn: 在python中機器學習(KNN SVMs K均) - yyliu大神開源中國
 【機器學習實驗】scikit-learn的主要模塊和基本使用 - JasonDing大神
   Scikit-learn學習筆記中文簡介(P30-Cluster) - 百度文庫
   使用sklearn作kmeans聚類分析 - xiaolitnt
  使用sklearn + jieba中文分詞構建文本分類器 - MANYU GOU大神
   sklearn學習(1) 數據集（官方數據集使用） - yuanyu5237大神
   scikit-learn使用筆記與sign prediction簡單小結 - xupeizhi
  http://scikit-learn.org/stable/modules/clustering.html#clustering

代碼以下：

# coding=utf-8  
""" 
Created on 2016-01-06 @author: Eastmount  
"""  
  
import time          
import re          
import os  
import sys
import codecs
import shutil
import numpy as np
from sklearn import feature_extraction  
from sklearn.feature_extraction.text import TfidfTransformer  
from sklearn.feature_extraction.text import CountVectorizer  
 
if __name__ == "__main__":
    import io

    #########################################################################
    #                           Step 1: compute TF-IDF

    # Corpus: one pre-segmented, space-separated document per line.
    # io.open decodes explicitly and only breaks lines at \n, \r and \r\n,
    # so each line of the file stays exactly one document.
    corpus = []
    with io.open('BHSpider_Result.txt', 'r', encoding='utf-8') as source:
        for line in source:
            print(line)
            corpus.append(line.strip())

    # Term-frequency matrix: element a[i][j] is the count of term j in document i.
    vectorizer = CountVectorizer()

    # Computes the tf-idf weight of every term.
    transformer = TfidfTransformer()

    # Inner fit_transform builds the count matrix, outer one turns it into tf-idf.
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

    # All terms of the bag-of-words model.
    word = vectorizer.get_feature_names()

    # Dense tf-idf matrix: weight[i][j] is the tf-idf weight of term j in document i.
    weight = tfidf.toarray()

    # Dump the feature terms and the weight matrix.
    print('Features length: ' + str(len(word)))
    resName = "BHTfidf_Result.txt"
    with codecs.open(resName, 'w', 'utf-8') as result:
        # Header line: every term followed by a space.
        result.write(''.join(term + ' ' for term in word))
        result.write('\r\n\r\n')

        # One row of weights per document, written with a single call per row
        # instead of one call per matrix element.
        for i, row in enumerate(weight):
            print(u'%s %s %s' % (u"-------這裏輸出第", i, u"類文本的詞語tf-idf權重------"))
            result.write(''.join(str(value) + ' ' for value in row))
            result.write('\r\n\r\n')


    ########################################################################
    #                           Step 2: K-means clustering

    print('Start Kmeans:')
    from sklearn.cluster import KMeans
    clf = KMeans(n_clusters=20)
    s = clf.fit(weight)
    print(s)

    # The 20 cluster centres.
    print(clf.cluster_centers_)

    # Cluster label of every sample, then one "<1-based index> <label>" line each.
    print(clf.labels_)
    for i, label in enumerate(clf.labels_, 1):
        print('%s %s' % (i, label))

    # Inertia helps judge whether the number of clusters fits: smaller means
    # tighter clusters; pick the cluster count at the elbow.
    print(clf.inertia_)

輸出如下圖所示，20個類簇中心點和408個簇，對應408個景點，每個文檔對應聚在相應的類0~19。

五. 結果處理

爲了更直觀的顯示結果，經過下面的程序對景點進行簡單結果處理。

# coding=utf-8  
import os  
import sys
import codecs
 
'''
@2016-01-07 By Eastmount
功能:合併實體名稱和聚類結果 共類簇20類
輸入:BH_EntityName.txt Cluster_Result.txt
輸出:ZBH_Cluster_Merge.txt ZBH_Cluster_Result.txt
'''
 
# Input files are decoded while reading, so no per-line unicode() call is needed.
source1 = codecs.open("BH_EntityName.txt", 'r', 'utf-8')
source2 = codecs.open("Cluster_Result.txt", 'r', 'utf-8')
result1 = codecs.open("ZBH_Cluster_Result.txt", 'w', 'utf-8')

#########################################################################
#              Part 1: pair every entity name with its cluster label

lable = []       # cluster label of every entity, in input order
content = []     # entity names, parallel to lable
try:
    for index, raw_name in enumerate(source1):
        # Only every second line of the name file (1st, 3rd, ...) is used.
        # The original toggled a 0/1 flag because an extra blank line always
        # showed up between names.
        if index % 2:
            continue
        name = raw_name.strip('\r\n')

        # Matching line of the cluster file: "<1-based row> <label>".
        res = source2.readline().strip('\r\n')
        value = res.split()
        if len(value) < 2:
            raise ValueError("Cluster_Result.txt has no '<row> <label>' line "
                             "for entity %d: %r" % (len(content) + 1, res))
        lable.append(int(value[1]))
        content.append(name)

        print(u'%s %s' % (name, res))
        result1.write(name + ' ' + res + '\r\n')
    print('OK')
finally:
    # Release all three files even when a line cannot be parsed.
    source1.close()
    source2.close()
    result1.close()

# Test output: entity names and labels correspond one to one.
for i, name in enumerate(content):
    print(u'%s %s %s' % (name, i + 1, lable[i]))

#########################################################################
#              Part 2: group entity names by cluster (label 0, label 1, ...)

# At least 20 slots as before, more when the labels require it, so a label
# of 20 or above no longer raises IndexError.
cluster_count = max([20] + [va + 1 for va in lable])
groups = [[] for _ in range(cluster_count)]
for name, va in zip(content, lable):
    groups[va].append(name + ' ')
output = [''.join(group) for group in groups]

# Write one section per cluster.
result2 = codecs.open("ZBH_Cluster_Merge.txt", 'w', 'utf-8')
try:
    for i, names in enumerate(output):
        print('#######')
        result2.write('#######\r\n')
        print('Label: ' + str(i))
        result2.write('Label: ' + str(i) + '\r\n')
        print(names)
        result2.write(names + '\r\n')
finally:
    result2.close()

輸出結果如下圖所示，其中label19可以發現百度百科和互動百科的"大昭寺、法門寺"文本內容都劃分爲一類，同時也會存在一些錯誤的類別，如Label15中的"橘子洲"。

PS：如果你想進行準確率、召回率、F特徵值比較，可以進一步去學習sklearn官方文檔。通常的文本數據集的類標如"教育、體育、娛樂"，把不同內容的新聞聚在一類，而這個略有區別，它主要是應用於我實際的畢設。

六. 總結與不足

Kmeans聚類是一種自下而上的聚類方法，它的優點是簡單、速度快；缺點是聚類結果與初始中心的選擇有關係，且必須提供聚類的數目。
Kmeans的第二個缺點是致命的，因爲在有些時候，我們不知道樣本集將要聚成多少個類別，這種時候kmeans是不適合的，推薦使用hierarchical 或meanshift來聚類。第一個缺點可以通過多次聚類取最佳結果來解決。

推薦一些關於Kmeans及實驗評估的文章：
  淺談Kmeans聚類 - easymind223
  基於K－Means的文本聚類（強推基礎介紹） - freesum
  基於向量空間模型的文本聚類算法 - helld123
  KMeans文檔聚類python實現（代碼詳解） - skineffect
  Kmeans文本聚類系列之所有C++代碼 - finallyliuyu
  文本聚類—kmeans - zengkui111