句子類似度_tf/idf

# Sentence similarity via term-count (TF) and TF-IDF cosine similarity.
import math
from math import isnan

import pandas as pd  # NOTE(review): unused in this chunk; kept — later parts of the file may need it


def jieba_function(sent):
    """Segment a Chinese sentence with jieba.

    Returns a whitespace-delimited token string suitable as input to the
    sklearn vectorizers below.
    """
    import jieba  # local import: module still loads when jieba is absent
    return ' '.join(str(tok) for tok in jieba.cut(sent))


def count_cos_similarity(vec_1, vec_2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Returns 0 when the lengths differ, and 0.0 when either vector is
    all-zero (the original divided anyway, producing NaN/ZeroDivisionError
    that callers then had to filter with isnan()).
    """
    if len(vec_1) != len(vec_2):
        return 0
    dot = sum(a * b for a, b in zip(vec_1, vec_2))
    norm_1 = math.sqrt(sum(a * a for a in vec_1))
    norm_2 = math.sqrt(sum(b * b for b in vec_2))
    if norm_1 == 0 or norm_2 == 0:
        # Zero vector has no direction: define similarity as 0.0 so the
        # callers' isnan() guards remain satisfied without tripping.
        return 0.0
    return dot / (norm_1 * norm_2)


def tf(sent1, sent2):
    """Print and return the cosine similarity of two sentences using raw term counts."""
    from sklearn.feature_extraction.text import CountVectorizer
    sent1 = jieba_function(sent1)
    sent2 = jieba_function(sent2)
    count_vec = CountVectorizer()
    sentences = [sent1, sent2]
    print('sentences', sentences)
    # Fit once and reuse — the original refit the vectorizer three times.
    vectors = count_vec.fit_transform(sentences).toarray()
    print('vector', vectors)  # count vector per sentence (one row each)
    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2
    # (use get_feature_names_out() there); kept for the original environment.
    print('cut_word', count_vec.get_feature_names())
    vec_1, vec_2 = vectors[0], vectors[1]
    similarity = count_cos_similarity(vec_1, vec_2)
    if isnan(similarity):
        similarity = 0.0
    print('count_cos_similarity', similarity)
    return similarity  # consistent with tfidf(); original only printed


def tfidf(sent1, sent2):
    """Return the cosine similarity of two sentences using TF-IDF weights."""
    from sklearn.feature_extraction.text import TfidfVectorizer
    sent1 = jieba_function(sent1)
    sent2 = jieba_function(sent2)
    tfidf_vec = TfidfVectorizer()
    sentences = [sent1, sent2]
    # Fit once and reuse — the original refit the vectorizer twice.
    vectors = tfidf_vec.fit_transform(sentences).toarray()
    vec_1, vec_2 = vectors[0], vectors[1]
    similarity = count_cos_similarity(vec_1, vec_2)
    if isnan(similarity):
        similarity = 0.0
    return similarity


if __name__ == '__main__':
    sent1 = '我喜歡看電視也喜歡看電影,'
    sent2 = '我不喜歡看電視也不喜歡看電影'
    print('<<<<tf<<<<<<<')
    tf(sent1, sent2)
    print('<<<<tfidf<<<<<<<')
    tfidf(sent1, sent2)
相關文章
相關標籤/搜索