The single-pass clustering method

1. Text clustering is usually performed on an existing batch of historical data, with common methods such as kmeans and dbscan. If the requirement is to cluster streaming text (i.e., cluster each document as it arrives), those methods no longer apply. There are of course many other methods for dynamically clustering streaming data, and dynamic clustering comes with its own challenges: the number of clusters is not fixed, and the similarity threshold is hard to set; these remain open problems worth further study. This article implements a simple single-pass clustering method. Similarity between texts is measured with cosine distance, and text vectors can be built with tfidf (the idf here can be estimated on a large document collection and then reused directly for the words of new texts), or with Chinese pretrained models such as word2vec or bert.
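As a side note on the "estimate idf on a large collection, then reuse it" idea above, here is a minimal sketch (not part of the program below) using sklearn's TfidfVectorizer; the toy corpus and document strings are purely illustrative:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy background corpus standing in for a large historical document set;
# texts are assumed to be pre-tokenized and space-joined.
background = ['machine learning method', 'text cluster method', 'stream text cluster']
vectorizer = TfidfVectorizer().fit(background)  # idf is estimated here, once

# New streaming texts are vectorized with the frozen vocabulary/idf and can
# then be compared with cosine similarity, as the program below does.
doc_a = vectorizer.transform(['stream text cluster method'])
doc_b = vectorizer.transform(['text cluster'])
print(cosine_similarity(doc_a, doc_b)[0, 0])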

2. Program

import numpy as np
import os
import sys
import pickle
import collections
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from gensim import corpora, models, matutils
from utils.tokenizer import load_stopwords, load_samples, tokenizer, word_segment, load_data, read_data_to_list
from gensim.models import doc2vec, Doc2Vec
from sklearn.metrics.pairwise import cosine_similarity
'''
Overall flow:
input: doc vector; threshold
output: clusters
begin
    input doc vector
    input threshold
    first doc as first cluster and its vector as the center of the cluster
    while(doc vectors){
        while(clusters){
            max_sim, max_cluster = similarity(doc vector, cluster);
        }
        if(max_sim > threshold){
            max_cluster.put(doc vector);
            max_cluster.update_center()
        }
        else{
            build new cluster(doc vector);
        }
    }
end
'''
class SingelPassCluster(object):

    '''
    1. Compute cosine similarity with tfidf vectors
    '''
    def tfidf_vec(self, corpus, pivot=10, slope=0.25):
        dictionary = corpora.Dictionary(corpus)  # build the word -> id dictionary
        self.dict_size = len(dictionary)
        print('dictionary size:{}'.format(len(dictionary)))
        corpus = [dictionary.doc2bow(text) for text in corpus]  # bag-of-words vector representation
        tfidf = models.TfidfModel(corpus, pivot=pivot, slope=slope)
        corpus_tfidf = tfidf[corpus]
        return corpus_tfidf
    def get_max_similarity(self, cluster_cores, vector):
        max_value = 0
        max_index = -1
        print('vector:{}'.format(vector))
        for k, core in cluster_cores.items():
            print('core:{}'.format(core))
            similarity = matutils.cossim(vector, core)
            if similarity > max_value:
                max_value = similarity
                max_index = k
        return max_index, max_value
    def single_pass(self, corpus_vec, corpus, theta):
        clusters = {}
        cluster_cores = {}
        cluster_text = {}
        num_topic = 0
        cnt = 0
        for vector, text in zip(corpus_vec, corpus):
            if num_topic == 0:
                clusters.setdefault(num_topic, []).append(vector)
                cluster_cores[num_topic] = vector
                cluster_text.setdefault(num_topic, []).append(text)
                num_topic += 1
            else:
                max_index, max_value = self.get_max_similarity(cluster_cores, vector)
                if max_value > theta:
                    clusters[max_index].append(vector)
                    text_matrix = matutils.corpus2dense(clusters[max_index], num_terms=self.dict_size,
                                                        num_docs=len(clusters[max_index])).T  # sparse -> dense
                    core = np.mean(text_matrix, axis=0)  # update the cluster center
                    core = matutils.any2sparse(core)  # convert the dense core back to a sparse vector
                    cluster_cores[max_index] = core
                    cluster_text[max_index].append(text)
                else:  # create a new cluster
                    clusters.setdefault(num_topic, []).append(vector)
                    cluster_cores[num_topic] = vector
                    cluster_text.setdefault(num_topic, []).append(text)
                    num_topic += 1
            cnt += 1
            if cnt % 100 == 0:
                print('processing {}...'.format(cnt))
        return clusters, cluster_text
    def fit_transform(self, corpus, raw_data, theta=0.5):
        tfidf_vec = self.tfidf_vec(corpus)  # tfidf_vec holds sparse (id, weight) vectors
        clusters, cluster_text = self.single_pass(tfidf_vec, raw_data, theta)
        return clusters, cluster_text

    '''
    2. Compute cosine similarity with doc2vec vectors
    '''
    def fit(self, doc2vec_model, corpus, raw_data, theta=0.5):
        doc_vec = self.doc_vec(doc2vec_model, corpus)
        clusters, cluster_text = self.doc2vec_single_pass(doc_vec, raw_data, theta)
        return clusters, cluster_text

    def fit_2(self, doc_vec, text2index, theta):
        clusters, cluster_text = self.doc2vec_single_pass(doc_vec, text2index, theta)
        return clusters, cluster_text
    def doc_vec(self, doc2vec_model, x_train):
        print('doc2vec infered vec...')
        infered_vectors_list = []
        for text, label in x_train:
            vector = doc2vec_model.infer_vector(text)
            infered_vectors_list.append(vector)
            print('infered vector size:{}'.format(len(infered_vectors_list)))
            if len(infered_vectors_list) >= 100:  # only infer the first 100 vectors
                break
        return infered_vectors_list
    def get_doc2vec_similarity(self, cluster_cores, vector):
        max_value = 0
        max_index = -1
        for k, core in cluster_cores.items():  # core -> np.ndarray
            similarity = cosine_similarity(vector.reshape(1, -1), core.reshape(1, -1))
            similarity = similarity[0, 0]
            if similarity > max_value:
                max_value = similarity
                max_index = k
        return max_index, max_value
    def doc2vec_single_pass(self, corpus_vec, corpus, theta):
        clusters = {}
        cluster_cores = {}
        cluster_text = {}
        num_topic = 0
        cnt = 0
        for vector, text in zip(corpus_vec, corpus):
            if num_topic == 0:
                clusters.setdefault(num_topic, []).append(vector)
                cluster_cores[num_topic] = vector
                cluster_text.setdefault(num_topic, []).append(text)
                num_topic += 1
            else:
                max_index, max_value = self.get_doc2vec_similarity(cluster_cores, vector)
                if max_value > theta:
                    clusters[max_index].append(vector)
                    core = np.mean(clusters[max_index], axis=0)  # update the cluster center
                    cluster_cores[max_index] = core
                    cluster_text[max_index].append(text)
                else:  # create a new cluster
                    clusters.setdefault(num_topic, []).append(vector)
                    cluster_cores[num_topic] = vector
                    cluster_text.setdefault(num_topic, []).append(text)
                    num_topic += 1
            cnt += 1
            if cnt % 100 == 0:
                print('processing {}...'.format(cnt))
        return clusters, cluster_text

def sim(doc_vec):
    vector = doc_vec[0]
    print('vector:{}'.format(type(vector)))
    for core in doc_vec:
        similarity = cosine_similarity(vector.reshape(1, -1), core.reshape(1, -1))
        similarity = similarity[0, 0]
        print("similarity:{}".format(similarity))
if __name__ == '__main__':
    base_path = os.path.abspath(os.path.join(os.getcwd(), '../..'))
    process_text = base_path + '/data/process_text.txt'  # path to the preprocessed samples
    doc2vec_path = base_path + '/data/doc2vec.pkl'
    cluster_result = base_path + '/data/cluster_result.txt'
    doc_vec_path = base_path + '/data/doc_vec.vec'  # document vectors inferred by doc2vec

    corpus = load_data(process_text)
    raw_text = load_samples(process_text)

    index2corpus = collections.OrderedDict()
    for index, line in enumerate(raw_text):
        index2corpus[index] = line
    text2index = list(index2corpus.keys())
    print('docs total size:{}'.format(len(text2index)))

    single_cluster = SingelPassCluster()

    cal_vec_type = 'doc2vec'

    if cal_vec_type == 'tfidf':
        clusters, cluster_text = single_cluster.fit_transform(corpus, text2index, theta=0.4)

    if cal_vec_type == 'doc2vec':
        with open(doc_vec_path, 'rb') as file:
            infered_vectors_list = pickle.load(file)
        clusters, cluster_text = single_cluster.fit_2(infered_vectors_list, text2index, theta=0.6)

        '''
        if os.path.exists(doc2vec_path):
            print('doc2vec model loading...')
            doc2vec_model = Doc2Vec.load(doc2vec_path)
        x_train = read_data_to_list(process_text)
        clusters, cluster_text = single_cluster.fit(doc2vec_model, x_train, text2index, theta=0.6)
        '''

    if cal_vec_type == 'd2vsim':
        if os.path.exists(doc2vec_path):
            print('doc2vec model loading...')
            doc2vec_model = Doc2Vec.load(doc2vec_path)
        x_train = read_data_to_list(process_text)
        doc_vec = single_cluster.doc_vec(doc2vec_model, x_train)
        sim(doc_vec)

    print("............................................................................................")
    print("number of clusters obtained: {} ...".format(len(clusters)))
    print("............................................................................................\n")
    # sort clusters by number of member documents, descending
    clusterTopic_list = sorted(cluster_text.items(), key=lambda x: len(x[1]), reverse=True)
    with open(cluster_result, 'w', encoding='utf-8') as file_write:
        for k in clusterTopic_list:
            topic_text = []  # renamed from cluster_text to avoid shadowing the result dict
            for index, value in enumerate(k[1], start=1):
                topic_text.append('(' + str(index) + '): ' + index2corpus[value])
            topic_text = '\n'.join(topic_text)
            file_write.write("[cluster index]: {} \n[docs in cluster]: {} \n[cluster documents]:\n{}".format(k[0], len(k[1]), topic_text))
            file_write.write('\n')
            file_write.flush()
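One possible refinement, not used in the program above: single_pass re-densifies and re-averages every vector in a cluster each time a document is added, which costs O(cluster size) per update. A running mean yields the same center in O(dim) per update. A minimal sketch with dense numpy vectors (all names here are illustrative):

import numpy as np

def update_center(core, new_vec, new_size):
    # Running mean: fold one new vector into the existing center.
    # new_size is the cluster size after new_vec has been added.
    return core + (new_vec - core) / new_size

vecs = np.random.rand(5, 8)  # five dummy 8-dim document vectors
core = vecs[0].copy()
for i, v in enumerate(vecs[1:], start=2):
    core = update_center(core, v, i)
print(np.allclose(core, vecs.mean(axis=0)))  # True: same center, cheaper per update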