import math

import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


# Segment a sentence with jieba and rejoin the tokens with spaces,
# so scikit-learn's whitespace-based vectorizers can split them.
def jieba_function(sent):
    return ' '.join(jieba.cut(sent))


# Cosine similarity between two equal-length vectors.
def count_cos_similarity(vec_1, vec_2):
    if len(vec_1) != len(vec_2):
        return 0.0
    num = sum(vec_1[i] * vec_2[i] for i in range(len(vec_2)))
    den1 = math.sqrt(sum(pow(number, 2) for number in vec_1))
    den2 = math.sqrt(sum(pow(number, 2) for number in vec_2))
    if den1 * den2 == 0:  # guard against all-zero vectors (division by zero)
        return 0.0
    return num / (den1 * den2)


# Term-frequency (bag-of-words) similarity; takes raw strings as input.
def tf(sent1, sent2):
    sent1 = jieba_function(sent1)
    sent2 = jieba_function(sent2)
    count_vec = CountVectorizer()
    sentences = [sent1, sent2]
    # Fit once and reuse, instead of calling fit_transform repeatedly.
    vectors = count_vec.fit_transform(sentences).toarray()
    print('sentences', sentences)
    print('vector', vectors)  # the count-vectorized representation
    # The segmented words, i.e. what each vector dimension stands for
    # (on scikit-learn < 1.0 use get_feature_names() instead).
    print('cut_word', count_vec.get_feature_names_out())
    # Both rows share the same dimensionality, so they can be compared directly.
    similarity = count_cos_similarity(vectors[0], vectors[1])
    print('count_cos_similarity', similarity)
    return similarity


# TF-IDF similarity; takes raw strings as input.
def tfidf(sent1, sent2):
    sent1 = jieba_function(sent1)
    sent2 = jieba_function(sent2)
    tfidf_vec = TfidfVectorizer()
    sentences = [sent1, sent2]
    vectors = tfidf_vec.fit_transform(sentences).toarray()
    return count_cos_similarity(vectors[0], vectors[1])


if __name__ == '__main__':
    sent1 = '我喜歡看電視也喜歡看電影,'
    sent2 = '我不喜歡看電視也不喜歡看電影'
    print('<<<<tf<<<<<<<')
    tf(sent1, sent2)
    print('<<<<tfidf<<<<<<<')
    print('tfidf_cos_similarity', tfidf(sent1, sent2))
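
# ---------------------------------------------------------------------------
# Optional cross-check (a minimal sketch added for illustration; the helper
# name sklearn_cosine_check is hypothetical and not part of the original
# script). scikit-learn's sklearn.metrics.pairwise.cosine_similarity computes
# the same quantity as the hand-rolled count_cos_similarity above, and works
# directly on the sparse matrix without converting it to a dense array.
def sklearn_cosine_check(sent1, sent2):
    from sklearn.metrics.pairwise import cosine_similarity
    docs = [jieba_function(sent1), jieba_function(sent2)]
    mat = TfidfVectorizer().fit_transform(docs)
    # cosine_similarity on the 2-row matrix returns a 2x2 array;
    # entry [0, 1] is the similarity between the two documents.
    return cosine_similarity(mat)[0, 1]
# Usage: sklearn_cosine_check(sent1, sent2) should agree with tfidf(sent1, sent2).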