使用TF-IDF和BM25提取文章關鍵詞
評估方法:
人工從文章中提取1-5個關鍵詞,和機器提取的關鍵詞作比較
召回率 = |機器提詞 ∩ 人工提詞| / |人工提詞|
準確率 = |機器提詞 ∩ 人工提詞| / |機器提詞|
TF-IDF
原理參考:https://www.py.cn/jishu/spider/11038.html
實現參考:tf-idf-keyword
其他參考:使用不同的方法計算TF-IDF值
初版 標題和正文加權計算tf-idf
主要策略
(1)使用nlpc切詞服務(可用jieba切詞代替)+TF-IDF提取關鍵詞。
(2)去除停用詞
(3)按照體裁+年級分成若干類型,來訓練模型,示例用高中+敘事類,取了20000條數據訓練
(4)對標題進行加權,標題的每一個詞彙頻率+6,再合在一起計算tf-idf
(5)按照權重取前4個關鍵詞,並過濾掉這4個關鍵詞中權重小於 最小頻率(5,即min_times) * 平均IDF / 總詞數 的關鍵詞
注:以上數據均爲調節後最優解
相關推薦:《Python視頻教程》
代碼實現
# config.py
program = 'composition_term_weight'
logger = logging.getLogger(program)
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s',
                    stream=sys.stderr, datefmt='%a, %d %b %Y %H:%M:%S')
logging.root.setLevel(level=logging.INFO)


# IDFLoader.py
class IDFLoader(object):
    """Load a trained IDF vocabulary from a text file.

    Each line of the file is ``<word> <idf>`` separated by a single space.
    A line whose word is ``LEN_AVG`` carries the average document length and
    ends the file: nothing after it is read.

    Attributes:
        idf_path: path of the vocabulary file.
        idf_freq: mapping of word -> IDF value.
        mean_len: average document length (0 when no ``LEN_AVG`` line exists).
        mean_idf: sum of the loaded IDF values divided by the number of
            loaded lines (0.0 when nothing was loaded).
    """

    def __init__(self, idf_path):
        """Remember ``idf_path`` and load the vocabulary immediately."""
        self.idf_path = idf_path
        self.idf_freq = {}   # word -> idf
        self.mean_len = 0    # average document length
        self.mean_idf = 0.0  # mean idf over the loaded entries
        self.load_idf()

    def load_idf(self):
        """Read ``idf_path`` and fill ``idf_freq``, ``mean_len`` and ``mean_idf``.

        Lines that do not split into exactly two fields, or whose second
        field is not a number, are skipped (best effort).

        Raises:
            IOError/OSError: if the file cannot be opened.
        """
        cnt = 0
        with open(self.idf_path, 'rb') as f:
            for line in f:
                try:
                    word, freq = line.strip().decode('utf-8', errors='ignore').split(' ')
                    if word == 'LEN_AVG':
                        # Go through float() so a fractional average (as
                        # written under true division) is still accepted.
                        self.mean_len = int(float(freq))
                        break
                    self.idf_freq[word] = float(freq)
                except ValueError:
                    # Wrong field count or non-numeric value: skip the line.
                    logger.debug('load_idf: skipping malformed line %r', line)
                    continue
                cnt += 1
        # An empty or fully malformed file must not raise ZeroDivisionError.
        self.mean_idf = sum(self.idf_freq.values()) / cnt if cnt else 0.0
        logger.info('Vocabularies %s loaded: %d mean_idf: %f',
                    self.idf_path, cnt, self.mean_idf)
class TfIdf(object):
    """TF-IDF keyword extraction with extra weight on title words.

    NOTE(review): the string handling here (``encode`` before applying a
    ``str`` regex, writing ``str`` to a file opened in ``'wb'``, ``decode`` on
    lines read from stdin) assumes Python 2 -- confirm before running under
    Python 3.

    ``test_one`` calls ``self.get_bm25``, which is not defined in this class
    body and must be supplied separately.
    """

    # Removes ASCII letters and digits from text before segmentation.
    p_cut = re.compile(r'[a-zA-Z0-9]', re.VERBOSE)
    # Removes '做文', '<digits>字', '<any char>年級' and '_' from titles.
    # NOTE(review): '做文' looks like a character-conversion artifact of
    # '作文' -- confirm against real titles before changing the pattern.
    p_title = re.compile(r'做文|\d+字|.年級|_', re.VERBOSE)
    # Tokens that are always dropped (punctuation and a few common words);
    # these could be moved into the stop-word file instead.
    ignored = ['', ' ', '', '。', ':', ',', ')', '(', '!', '?', '」', '「', '"', '―', '.', '說', '好', '時']
    # Minimum occurrence count behind the weight cut-off in get_term_weight.
    min_times = 5.0
    # Extra occurrences credited to every title word.
    title_add_times = 6.0
    # Number of top-ranked keywords that are considered.
    words_num = 4

    def __init__(self):
        """Load the stop-word list from ``stop_words.utf8.txt``.

        The file is read in binary mode, one stripped entry per line.
        """
        my_stop_words_path = 'stop_words.utf8.txt'
        self.stop_words_dict = []
        with open(my_stop_words_path, 'rb') as fr:
            for line in fr:
                self.stop_words_dict.append(line.strip())

    def my_cut(self, inTxt):
        """Segment ``inTxt`` into words, dropping stop words and ignored tokens.

        ASCII letters and digits are removed first. The text is then split on
        '。' and segmented one sentence at a time (done for performance,
        according to the original comment).

        Returns:
            list of the remaining tokens, in order.
        """
        inTxt = self.p_cut.sub('', str(inTxt))
        # Build the lookup sets once per call instead of scanning the lists
        # for every token.
        stop_words = set(self.stop_words_dict)
        ignored = set(self.ignored)
        words_list = []
        for sentence in inTxt.split('。'):
            # `cut` is the external segmentation service; the original notes
            # that jieba can be used in its place.
            pieces = cut(sentence)
            if pieces is not None:
                words_list += pieces
        return [w for w in words_list
                if w not in stop_words and w not in ignored and len(w.strip()) > 0]

    def get_tfidf(self, idf_loader, title, content):
        """Compute TF-IDF weights for one article.

        Args:
            idf_loader: object exposing ``idf_freq`` (word -> idf) and
                ``mean_idf`` (used for words missing from ``idf_freq``).
            title: article title; filtered with ``p_title`` before segmentation.
            content: article body.

        Returns:
            tuple ``(ranked, word_count, title_words)`` where ``ranked`` is a
            list of ``(word, weight)`` sorted by descending weight,
            ``word_count`` is the number of tokens in title plus body, and
            ``title_words`` is the list of title tokens.
        """
        filter_title = self.p_title.sub('', title.encode('utf-8', errors='ignore'))
        title_words = self.my_cut(filter_title)
        corpus0 = title_words + self.my_cut(content)
        freq = {}
        for w in corpus0:
            freq[w] = freq.get(w, 0.0) + 1.0
        # Boost title words; the count is logged before and after the boost.
        for w in title_words:
            logger.info(freq[w])
            freq[w] = freq.get(w, 0.0) + self.title_add_times
            logger.info(freq[w])
        total = sum(freq.values())
        for k in freq:
            # weight = count * idf / total
            freq[k] *= idf_loader.idf_freq.get(k, idf_loader.mean_idf) / total
        ranked = sorted(freq.items(), key=lambda d: d[1], reverse=True)
        return ranked, len(corpus0), title_words

    def get_term_weight(self, idf_loader, title, content):
        """Return the selected keywords as ``'word:offset:weight'`` strings.

        The top ``words_num`` terms whose weight exceeds
        ``min_times * mean_idf / word_count`` are kept. When none qualifies,
        every ranked term that also appears in the title is used instead.
        ``offset`` is the running character length of the previously emitted
        words; ``weight`` is rounded to 4 decimals.

        Returns:
            list of formatted strings; empty when the article yields no tokens.
        """
        result, words_number, title_words = self.get_tfidf(idf_loader, title, content)
        if not words_number:
            # No tokens at all: nothing to rank and nothing to divide by.
            return []
        bound = self.min_times * idf_loader.mean_idf / words_number
        machine_words = [item for item in result[:self.words_num] if item[1] > bound]
        if not machine_words:
            # Fall back to title words; compare the word (item[0]), not the
            # weight (item[1]).
            machine_words = [item for item in result if item[0] in title_words]
        data = []
        offset = 0
        for word, weight in machine_words:
            data.append('%s:%d:%s' % (word, offset, str(round(weight, 4))))
            offset += self._char_len(word)
        return data

    @staticmethod
    def _char_len(word):
        """Return the length of ``word`` in characters, decoding UTF-8 bytes first."""
        if isinstance(word, bytes):
            word = word.decode('utf-8', errors='ignore')
        return len(word)

    def getCorpus(self, data_path):
        """Read a JSON-lines file and return one token list per document.

        Each line must be a JSON object with ``title`` and ``@merge_text``.
        Documents that produce no tokens are skipped. Progress is logged
        every 1000 kept documents.
        """
        count = 0
        corpus_list = []
        with open(data_path, 'rb') as f:
            for line in f:
                info = json.loads(line.decode('utf-8', errors='ignore'))
                sentence = self.p_title.sub('', info.get('title').encode('utf-8', errors='ignore')) + '。' + info.get(
                    '@merge_text').encode('utf-8', errors='ignore')
                r = self.my_cut(sentence)
                if not r:
                    continue
                corpus_list.append(r)
                count += 1
                if count % 1000 == 0:
                    logger.info("processd " + str(count) + " segment_sentence")
        return corpus_list

    def train(self, dir_name, data_path):
        """Train the IDF table and write it to ``data/<dir_name>/idf.txt``.

        Each output line is ``<word> <log2(N / df)>``, followed by a final
        ``LEN_AVG <average tokens per document>`` line (integer).

        Raises:
            ZeroDivisionError: if ``data_path`` yields no documents.
        """
        idf_path = 'data/%s/idf.txt' % dir_name
        documents = self.getCorpus(data_path)
        id_freq = {}
        i = 0
        len_sum = 0
        for doc in documents:
            len_sum += len(doc)
            for x in set(doc):
                id_freq[x] = id_freq.get(x, 0) + 1
            if i % 1000 == 0:
                logger.info('Documents processed: ' + str(i) + ', time: ' + str(datetime.datetime.now()))
            i += 1
        del documents
        with open(idf_path, 'wb') as f:
            for key, value in id_freq.items():
                # float() avoids integer floor division of N / df on Python 2.
                f.write(key + ' ' + str(math.log(float(i) / value, 2)) + '\n')
            logger.info(str(i) + ' ' + str(len_sum))
            # Explicit floor division: the loader parses LEN_AVG as an integer.
            f.write('LEN_AVG ' + str(len_sum // i))

    def test_one(self, dir_name, method='tfidf'):
        """Score JSON articles read from stdin and print the top 20 terms.

        Args:
            dir_name: model directory; the table is read from
                ``data/<dir_name>/idf.txt``.
            method: ``'tfidf'`` uses ``get_tfidf``; any other value uses
                ``get_bm25``.
        """
        idf_loader = IDFLoader('data/%s/idf.txt' % dir_name)
        for raw in sys.stdin:
            info = json.loads(raw.decode('utf-8', errors='ignore'))
            title = info['title']
            content = info['@merge_text']
            if method == 'tfidf':
                result, words_number, title_words = self.get_tfidf(idf_loader, title, content)
            else:
                result, words_number, title_words = self.get_bm25(idf_loader, title, content)
            # Guard against articles that produce no tokens.
            bound = self.min_times * idf_loader.mean_idf / words_number if words_number else 0.0
            print('_____words_number bound_____')
            print('%s %s' % (words_number, bound))
            print('_____tfidf_result_____')
            for entry in result[:20]:
                print('%s %s' % (entry[0].encode('utf-8', errors='ignore'), entry[1]))
經調優,最優解爲:min_times=5 title_add_times=6.0 words_num=4
結果: 人工抽樣評估了100個
TF-IDF召回率:0.2778
TF-IDF準確率:0.2778
BM25
算法參考: 搜索中的權重度量利器: TF-IDF和BM25
初版
# TfIdf.py -- method to add to the TfIdf class:
def get_bm25(self, idf_loader, title, content):
    """Compute BM25-style weights for one article.

    Title words are boosted by ``self.title_add_times`` occurrences, exactly
    as in the TF-IDF scoring.

    Args:
        idf_loader: object exposing ``idf_freq`` (word -> idf), ``mean_idf``
            and ``mean_len`` (average document length).
        title: article title; filtered with ``self.p_title`` before
            segmentation.
        content: article body.

    Returns:
        tuple ``(ranked, word_count, title_words)`` where ``ranked`` is a list
        of ``(word, score)`` sorted by descending score, ``word_count`` is the
        number of tokens in title plus body, and ``title_words`` is the list
        of title tokens.
    """
    k = 1.2         # caps how far term frequency can raise the score
    b = 0.75        # how strongly the document-length ratio affects the score
    EPSILON = 0.25  # words missing from the IDF table get mean_idf * EPSILON

    filter_title = self.p_title.sub('', title.encode('utf-8', errors='ignore'))
    title_words = self.my_cut(filter_title)
    corpus0 = title_words + self.my_cut(content)

    freq = {}
    for w in corpus0:
        freq[w] = freq.get(w, 0.0) + 1.0
    # Boost title words.
    for w in title_words:
        freq[w] = freq.get(w, 0.0) + self.title_add_times
    total = sum(freq.values())
    logger.info(str((k, b, total, idf_loader.mean_len)))

    # Ratio of this document's (boosted) length to the average length.
    # mean_len is 0 when the IDF file has no LEN_AVG line; use a neutral
    # ratio of 1.0 then instead of dividing by zero.
    if idf_loader.mean_len:
        length_ratio = total / idf_loader.mean_len
    else:
        length_ratio = 1.0
    # Loop-invariant part of the BM25 denominator.
    norm = k * (1.0 - b + b * length_ratio)

    for word in freq:
        tf = freq[word] / total
        idf = idf_loader.idf_freq.get(word, idf_loader.mean_idf * EPSILON)
        freq[word] = idf * ((k + 1) * tf) / (norm + tf)
    ranked = sorted(freq.items(), key=lambda d: d[1], reverse=True)
    return ranked, len(corpus0), title_words
` 經調優,最優解爲:min_times=2.5 title_add_times=6.0 words_num=4 k=1.2 b=0.75 EPSILON=0.25
結果
人工抽樣評估了100個
BM25召回率:0.2889
BM25準確率:0.3333