import math
import jieba.analyse
from simhash import Simhash
def gen_post_simhash(body, topK=10, withWeight=True, f=56):
    """Return the simhash fingerprint of a text as an integer.

    The text is reduced to its top keywords with jieba's TF-IDF extractor
    and those keywords are fed to ``Simhash``.

    Args:
        body: Text to fingerprint.
        topK: Number of keywords to extract from ``body``.
        withWeight: Forwarded to ``jieba.analyse.extract_tags``. When true
            the extracted tags are ``(tag, weight)`` pairs, otherwise plain
            tag strings; either form is handed to ``Simhash`` unchanged.
        f: Fingerprint width in bits. Use the same width here and in
            ``distance_post_simhash`` when comparing two fingerprints.

    Returns:
        The ``value`` attribute of the resulting ``Simhash`` object.
    """
    tags = jieba.analyse.extract_tags(body, topK=topK, withWeight=withWeight)
    return Simhash(tags, f=f).value
def distance_post_simhash(simhash_value1, simhash_value2, f=56):
    """Return the Hamming distance between two simhash fingerprints.

    Only the lowest ``f`` bits of each value take part in the comparison.

    Args:
        simhash_value1: First fingerprint, as an integer.
        simhash_value2: Second fingerprint, as an integer.
        f: Number of low-order bits to compare.

    Returns:
        The count of bit positions (within the low ``f`` bits) at which the
        two fingerprints differ.
    """
    mask = (1 << f) - 1
    # XOR leaves a 1 wherever the fingerprints disagree; the mask drops any
    # bits above the fingerprint width (and makes the value non-negative).
    differing_bits = (simhash_value1 ^ simhash_value2) & mask
    return bin(differing_bits).count("1")
def dot_product2(v1, v2):
    """Return the dot product of two numeric sequences.

    Elements are paired positionally; if the inputs differ in length the
    extra elements of the longer one are ignored.
    """
    return sum(a * b for a, b in zip(v1, v2))
def vector_cos5(v1, v2):
    """Return the cosine similarity of two numeric vectors.

    Args:
        v1: First vector (sequence of numbers).
        v2: Second vector (sequence of numbers).

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``. If either vector has zero length
        (it is empty or all zeros) the similarity is undefined, and 0.0 is
        returned instead of raising ``ZeroDivisionError``.
    """
    norm_product = (
        math.sqrt(dot_product2(v1, v1)) * math.sqrt(dot_product2(v2, v2))
    )
    if not norm_product:
        # A zero vector has no direction; treat it as "not similar".
        return 0.0
    return dot_product2(v1, v2) / norm_product
def similar_ratio_to_two_texts(body1, body2, topK=10):
    """Return the cosine similarity of two texts based on their keywords.

    Each text is reduced to its top ``topK`` jieba TF-IDF keywords with
    weights. The two weight maps are laid out as vectors over the union of
    their keywords (a missing keyword counts as weight 0) and compared with
    ``vector_cos5``.

    See http://stackoverflow.com/questions/18424228/cosine-similarity-between-2-number-lists

    Args:
        body1: First text.
        body2: Second text.
        topK: Number of keywords to extract from each text.

    Returns:
        The cosine similarity of the two keyword-weight vectors, or 0.0
        when either text yields no keywords (e.g. an empty string).
    """
    weights1 = dict(jieba.analyse.extract_tags(body1, topK=topK, withWeight=True))
    weights2 = dict(jieba.analyse.extract_tags(body2, topK=topK, withWeight=True))

    # With no keywords on one side the vector is all zeros and the cosine
    # is undefined; report "no similarity" rather than dividing by zero.
    if not weights1 or not weights2:
        return 0.0

    # Sort the keyword union so both vectors are built in a reproducible
    # order; set iteration order would otherwise vary between runs and
    # could change the floating-point summation order.
    all_tags = sorted(set(weights1) | set(weights2))
    vec1 = [weights1.get(tag, 0) for tag in all_tags]
    vec2 = [weights2.get(tag, 0) for tag in all_tags]
    return vector_cos5(vec1, vec2)
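# NOTE: a stray "copy code" button label from the source web page was here; it is not part of this module.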