The single-pass clustering method

1. Text clustering is usually done on an existing batch of historical data, using common methods such as k-means or DBSCAN. When the requirement is to cluster streaming text (that is, to assign each document to a cluster as it arrives), those methods no longer apply. There are of course many other approaches for dynamically clustering streaming data, and dynamic clustering raises its own challenges: the number of clusters is not fixed, and the similarity threshold is hard to choose. Both remain open research questions. This post implements a simple single-pass clustering method. Similarity between texts is measured with the cosine distance, and texts can be vectorized with tf-idf (the idf statistics can be computed once on a large document collection and then applied directly to the words of incoming texts), or with Chinese pre-trained models such as word2vec or BERT.
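As a minimal standalone sketch of the core similarity test (not part of the program below; the toy English sentences are invented placeholders), the cosine similarity between tf-idf vectors can be computed like this:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Three toy documents; in practice these would be tokenized Chinese texts.
docs = ["stream text cluster", "cluster stream news", "weather sunny today"]
vecs = TfidfVectorizer().fit_transform(docs)
print(cosine_similarity(vecs[0], vecs[1])[0, 0])  # shared terms -> positive score
print(cosine_similarity(vecs[0], vecs[2])[0, 0])  # no shared terms -> 0.0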

2. The program

import numpy as np
import os
import sys
import pickle
import collections
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from gensim import corpora, models, matutils
from utils.tokenizer import load_stopwords, load_samples, tokenizer, word_segment, load_data, read_data_to_list
from gensim.models import doc2vec, Doc2Vec
from sklearn.metrics.pairwise import cosine_similarity

'''
Overall flow:
input: doc vectors; threshold
output: clusters
begin
    input doc vectors
    input threshold
    the first doc becomes the first cluster, and its vector the center of that cluster
    while(doc vectors){
        while(clusters){
            max_sim, max_cluster = similarity(doc vector, cluster);
        }
        if(max_sim > threshold){
            max_cluster.put(doc vector);
            max_cluster.update_center()
        }
        else{
            build new cluster(doc vector);
        }
    }
end
'''
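# Note (added remark): each incoming document is compared against every existing
# cluster center, so clustering n documents that end up in k clusters costs on
# the order of n * k similarity computations; the threshold theta indirectly
# controls how large k becomes.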
class SingelPassCluster(object):

    '''
        1. Cosine similarity on tf-idf vectors
    '''
    def tfidf_vec(self, corpus, pivot=10, slope=0.25):
        dictionary = corpora.Dictionary(corpus)  # build the token-to-id mapping
        self.dict_size = len(dictionary)
        print('dictionary size:{}'.format(len(dictionary)))
        corpus = [dictionary.doc2bow(text) for text in corpus]  # bag-of-words representation of each document
        tfidf = models.TfidfModel(corpus, pivot=pivot, slope=slope)
        corpus_tfidf = tfidf[corpus]
        return corpus_tfidf
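    # Note (added remark): pivot and slope switch gensim's TfidfModel to pivoted
    # document-length normalization, which counteracts the tendency of plain cosine
    # normalization to favor very short documents.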

    def get_max_similarity(self, cluster_cores, vector):
        max_value = 0
        max_index = -1
        for k, core in cluster_cores.items():
            similarity = matutils.cossim(vector, core)  # cosine similarity of two sparse vectors
            if similarity > max_value:
                max_value = similarity
                max_index = k
        return max_index, max_value

    def single_pass(self, corpus_vec, corpus, theta):
        clusters = {}       # cluster id -> member vectors
        cluster_cores = {}  # cluster id -> center vector
        cluster_text = {}   # cluster id -> member texts
        num_topic = 0
        cnt = 0
        for vector, text in zip(corpus_vec, corpus):
            if num_topic == 0:  # the first document founds the first cluster
                clusters.setdefault(num_topic, []).append(vector)
                cluster_cores[num_topic] = vector
                cluster_text.setdefault(num_topic, []).append(text)
                num_topic += 1
            else:
                max_index, max_value = self.get_max_similarity(cluster_cores, vector)
                if max_value > theta:
                    clusters[max_index].append(vector)
                    text_matrix = matutils.corpus2dense(clusters[max_index], num_terms=self.dict_size,
                                                        num_docs=len(clusters[max_index])).T  # sparse to dense
                    core = np.mean(text_matrix, axis=0)  # update the cluster center
                    core = matutils.any2sparse(core)  # convert the dense center back to a sparse vector
                    cluster_cores[max_index] = core
                    cluster_text[max_index].append(text)
                else:  # create a new cluster
                    clusters.setdefault(num_topic, []).append(vector)
                    cluster_cores[num_topic] = vector
                    cluster_text.setdefault(num_topic, []).append(text)
                    num_topic += 1
            cnt += 1
            if cnt % 100 == 0:
                print('processing {}...'.format(cnt))
        return clusters, cluster_text
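    # Design note (added remark): the center update above re-densifies the whole
    # cluster each time a document joins, costing O(cluster_size * dict_size);
    # an incremental running mean over the member vectors would avoid that full pass.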

    def fit_transform(self, corpus, raw_data, theta=0.5):
        tfidf_vec = self.tfidf_vec(corpus)  # tfidf_vec is a list of sparse (token id, weight) vectors
        clusters, cluster_text = self.single_pass(tfidf_vec, raw_data, theta)
        return clusters, cluster_text

    '''
        2. Cosine similarity on doc2vec vectors
    '''
    def fit(self, doc2vec_model, corpus, raw_data, theta=0.5):
        doc_vec = self.doc_vec(doc2vec_model, corpus)
        clusters, cluster_text = self.doc2vec_single_pass(doc_vec, raw_data, theta)
        return clusters, cluster_text

    def fit_2(self, doc_vec, text2index, theta):
        clusters, cluster_text = self.doc2vec_single_pass(doc_vec, text2index, theta)
        return clusters, cluster_text

    def doc_vec(self, doc2vec_model, x_train):
        print('doc2vec infered vec...')
        infered_vectors_list = []
        for text, label in x_train:
            vector = doc2vec_model.infer_vector(text)
            infered_vectors_list.append(vector)
            print('infered vector size:{}'.format(len(infered_vectors_list)))
            if len(infered_vectors_list) >= 100:  # cap at 100 documents, apparently for quick experiments
                break
        return infered_vectors_list
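    # Note (added remark): gensim's infer_vector is stochastic, so repeated calls
    # on the same text give slightly different vectors; passing a larger epochs
    # value (steps in older gensim versions) yields more stable embeddings.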

    def get_doc2vec_similarity(self, cluster_cores, vector):
        max_value = 0
        max_index = -1
        for k, core in cluster_cores.items():  # core -> np.ndarray
            similarity = cosine_similarity(vector.reshape(1, -1), core.reshape(1, -1))
            similarity = similarity[0, 0]
            if similarity > max_value:
                max_value = similarity
                max_index = k
        return max_index, max_value

    def doc2vec_single_pass(self, corpus_vec, corpus, theta):
        clusters = {}
        cluster_cores = {}
        cluster_text = {}
        num_topic = 0
        cnt = 0
        for vector, text in zip(corpus_vec, corpus):
            if num_topic == 0:
                clusters.setdefault(num_topic, []).append(vector)
                cluster_cores[num_topic] = vector
                cluster_text.setdefault(num_topic, []).append(text)
                num_topic += 1
            else:
                max_index, max_value = self.get_doc2vec_similarity(cluster_cores, vector)
                if max_value > theta:
                    clusters[max_index].append(vector)
                    core = np.mean(clusters[max_index], axis=0)  # update the cluster center
                    cluster_cores[max_index] = core
                    cluster_text[max_index].append(text)
                else:  # create a new cluster
                    clusters.setdefault(num_topic, []).append(vector)
                    cluster_cores[num_topic] = vector
                    cluster_text.setdefault(num_topic, []).append(text)
                    num_topic += 1
            cnt += 1
            if cnt % 100 == 0:
                print('processing {}...'.format(cnt))
        return clusters, cluster_text

def sim(doc_vec):
    vector = doc_vec[0]
    print('vector:{}'.format(type(vector)))
    for core in doc_vec:
        similarity = cosine_similarity(vector.reshape(1, -1), core.reshape(1, -1))
        similarity = similarity[0, 0]
        print("similarity:{}".format(similarity))

if __name__ == '__main__':
    base_path = os.path.abspath(os.path.join(os.getcwd(), '../..'))
    process_text = base_path + '/data/process_text.txt'  # path to the preprocessed samples
    doc2vec_path = base_path + '/data/doc2vec.pkl'
    cluster_result = base_path + '/data/cluster_result.txt'
    doc_vec_path = base_path + '/data/doc_vec.vec'  # document vectors inferred by doc2vec

    corpus = load_data(process_text)
    raw_text = load_samples(process_text)

    index2corpus = collections.OrderedDict()
    for index, line in enumerate(raw_text):
        index2corpus[index] = line
    text2index = list(index2corpus.keys())  # cluster over document indices; texts are looked up at output time
    print('docs total size:{}'.format(len(text2index)))

    single_cluster = SingelPassCluster()

    cal_vec_type = 'doc2vec'

    if cal_vec_type == 'tfidf':
        clusters, cluster_text = single_cluster.fit_transform(corpus, text2index, theta=0.4)

    if cal_vec_type == 'doc2vec':
        with open(doc_vec_path, 'rb') as file:
            infered_vectors_list = pickle.load(file)
        clusters, cluster_text = single_cluster.fit_2(infered_vectors_list, text2index, theta=0.6)

        '''
        if os.path.exists(doc2vec_path):
            print('doc2vec model loading...')
            doc2vec_model = Doc2Vec.load(doc2vec_path)
        x_train = read_data_to_list(process_text)
        clusters, cluster_text = single_cluster.fit(doc2vec_model, x_train, text2index, theta=0.6)
        '''

    if cal_vec_type == 'd2vsim':
        if os.path.exists(doc2vec_path):
            print('doc2vec model loading...')
            doc2vec_model = Doc2Vec.load(doc2vec_path)
        x_train = read_data_to_list(process_text)
        doc_vec = single_cluster.doc_vec(doc2vec_model, x_train)
        sim(doc_vec)

    print("............................................................................................")
    print("number of clusters obtained: {} ...".format(len(clusters)))
    print("............................................................................................\n")
    # sort clusters by the number of documents they contain, descending
    clusterTopic_list = sorted(cluster_text.items(), key=lambda x: len(x[1]), reverse=True)
    with open(cluster_result, 'w', encoding='utf-8') as file_write:
        for k in clusterTopic_list:
            doc_lines = []  # renamed from cluster_text to avoid shadowing the dict above
            for index, value in enumerate(k[1], start=1):
                doc_lines.append('(' + str(index) + '): ' + index2corpus[value])
            doc_lines = '\n'.join(doc_lines)
            file_write.write("cluster id:{} \ndocs in cluster:{} \ndocuments:\n{}".format(k[0], len(k[1]), doc_lines))
            file_write.write('\n')
            file_write.flush()
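The doc2vec branch above assumes that data/doc_vec.vec already exists. A minimal sketch of how that pickle might be produced, assuming a Doc2Vec model trained elsewhere and the same read_data_to_list helper used above, yielding (words, tag) pairs as the doc_vec method expects (the epochs value is an illustrative choice, not from the original post):

import pickle
from gensim.models import Doc2Vec
from utils.tokenizer import read_data_to_list

doc2vec_model = Doc2Vec.load('data/doc2vec.pkl')
x_train = read_data_to_list('data/process_text.txt')  # assumed to yield (words, tag) pairs
vectors = [doc2vec_model.infer_vector(words, epochs=50) for words, _ in x_train]
with open('data/doc_vec.vec', 'wb') as f:
    pickle.dump(vectors, f)  # later loaded by the __main__ block above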