利用Python實現通過TF-IDF和BM25提取文章關鍵詞

時間 2019-11-16

標籤利用 python 實現經過 idf bm25 提取文章關鍵詞欄目 Python 简体版

原文原文鏈接

使用TF-IDF和BM25提取文章關鍵詞

評估方法：

人工從文章中提取1-5個關鍵詞，和機器提取的關鍵詞作比較

召回率 = |機器提詞 ∩ 人工提詞| / |人工提詞|

準確率 = |機器提詞 ∩ 人工提詞| / |機器提詞|

TF-IDF

原理參考：https://www.py.cn/jishu/spider/11038.html

實現參考：tf-idf-keyword

其餘參考：使用不同的方法計算TF-IDF值

初版：標題和正文加權計算tf-idf

主要策略

（1）使用nlpc切詞服務（可用jieba切詞代替）+TF-IDF提取關鍵詞。

（2）去除停用詞

（3）按照體裁+年級分成若干類型來訓練模型；示例用高中+敘事類，取了20000條數據訓練

（4）對標題進行加權：標題中每個詞的頻率+6，再合在一起計算tf-idf

（5）按照權重取前4個關鍵詞，在這4個關鍵詞中對於權重小於頻率(5)*平均IDF/總詞數的進行過濾

注：以上數據均爲調節後最優解

相關推薦：《Python視頻教程》

代碼實現

# config.py
program = 'composition_term_weight'

# Shared logger used by the loader and the keyword-extraction code.
logger = logging.getLogger(program)
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s',
                    stream=sys.stderr,
                    datefmt='%a, %d %b %Y %H:%M:%S')
logging.root.setLevel(level=logging.INFO)


# IDFLoader.py
class IDFLoader(object):
    """Load a trained IDF table from a text file.

    Each line of the file is ``<word> <idf>``. A line whose first field is
    ``LEN_AVG`` carries the average document length and ends the table;
    nothing after it is read.

    Attributes:
        idf_path: Path of the IDF file.
        idf_freq: Mapping of word to IDF value.
        mean_len: Average document length taken from the ``LEN_AVG`` line,
            or 0 when the file has no such line.
        mean_idf: Arithmetic mean of the loaded IDF values, or 0.0 when the
            table is empty.
    """

    def __init__(self, idf_path):
        self.idf_path = idf_path
        self.idf_freq = {}
        self.mean_len = 0
        self.mean_idf = 0.0
        self.load_idf()

    def load_idf(self):
        """Read ``self.idf_path`` into ``idf_freq``, ``mean_len``, ``mean_idf``.

        Blank lines are ignored. Lines that do not hold exactly two
        whitespace-separated fields, or whose second field is not a number,
        are skipped with a warning instead of aborting the load.
        """
        with open(self.idf_path, 'rb') as f:
            for line_no, raw in enumerate(f, 1):
                fields = raw.decode('utf-8', errors='ignore').split()
                if not fields:
                    continue
                try:
                    word, value = fields
                    number = float(value)
                except ValueError:
                    logger.warning('load_idf: skipping malformed line %d of %s',
                                   line_no, self.idf_path)
                    continue
                if word == 'LEN_AVG':
                    # Parsed as float so a fractional average is not rejected
                    # (int() would raise and leave mean_len at 0).
                    self.mean_len = number
                    break
                self.idf_freq[word] = number

        cnt = len(self.idf_freq)
        # An empty table must not raise ZeroDivisionError.
        self.mean_idf = sum(self.idf_freq.values()) / cnt if cnt else 0.0
        logger.info('Vocabularies %s loaded: %d mean_idf: %.4f',
                    self.idf_path, cnt, self.mean_idf)

class TfIdf(object):
    """Extract keywords from an article by TF-IDF weight with title boosting.

    Text is handled as unicode throughout: UTF-8 byte strings passed in are
    decoded first, so the title/body filters work on characters rather than
    on individual bytes.

    Relies on names defined outside this class: ``cut`` (word segmentation),
    ``logger`` and ``IDFLoader``.
    """

    # Removes ASCII letters and digits from text before segmentation.
    p_cut = re.compile(u'[a-zA-Z0-9]')

    # Removes boilerplate from titles: a fixed word, "<digits>字",
    # "<one character>年級" and underscores.
    # NOTE(review): '做文' may be a transcription of '作文' -- confirm.
    p_title = re.compile(u'做文|\\d+字|.年級|_')

    # Punctuation and very common words to drop; these could equally live in
    # the stop-word file.
    # NOTE(review): the repeated '' entry is presumably a whitespace
    # character lost in transcription -- confirm against the original list.
    ignored = [u'', u' ', u'', u'。', u'：', u'，', u'）', u'（', u'！', u'?',
               u'」', u'「', u'＂', u'―', u'．', u'說', u'好', u'時']

    # Minimum number of occurrences used to derive the weight threshold
    # below which a candidate keyword is discarded.
    min_times = 5.0

    # Extra frequency added for every occurrence of a word in the title.
    title_add_times = 6.0

    # Number of top-weighted words considered as keywords.
    words_num = 4

    def __init__(self, stop_words_path='stop_words.utf8.txt'):
        """Load the stop-word list.

        Args:
            stop_words_path: UTF-8 text file with one stop word per line.
        """
        # A set of text strings: constant-time lookups and the same string
        # type as the tokens produced by my_cut.
        with open(stop_words_path, 'rb') as fr:
            self.stop_words_dict = {self._to_text(line).strip() for line in fr}

    @staticmethod
    def _to_text(value):
        """Return ``value`` as a unicode string, decoding UTF-8 bytes."""
        if isinstance(value, bytes):
            return value.decode('utf-8', 'ignore')
        text_type = type(u'')
        if isinstance(value, text_type):
            return value
        return text_type(value)

    def my_cut(self, inTxt):
        """Segment text into words and drop noise tokens.

        Args:
            inTxt: Text (or UTF-8 bytes) to segment.

        Returns:
            List of unicode tokens with stop words, ignored tokens and
            whitespace-only tokens removed.
        """
        text = self.p_cut.sub(u'', self._to_text(inTxt))
        words_list = []
        # Segment sentence by sentence; the author did this for performance.
        for sentence in text.split(u'。'):
            # NOTE(review): `cut` is defined outside this snippet (an NLPC
            # segmentation service, replaceable by jieba per the article);
            # assumed to accept text and return an iterable of tokens or
            # None -- confirm.
            pieces = cut(sentence)
            if pieces is not None:
                words_list.extend(self._to_text(piece) for piece in pieces)
        return [w for w in words_list
                if w.strip()
                and w not in self.stop_words_dict
                and w not in self.ignored]

    def get_tfidf(self, idf_loader, title, content):
        """Compute the TF-IDF weight of every word in an article.

        Args:
            idf_loader: Object exposing ``idf_freq`` (word -> IDF) and
                ``mean_idf`` (used for words missing from ``idf_freq``).
            title: Article title.
            content: Article body.

        Returns:
            Tuple ``(ranked, words_number, title_words)``: ``ranked`` is a
            list of ``(word, weight)`` sorted by descending weight,
            ``words_number`` is the token count of title plus body, and
            ``title_words`` is the list of tokens found in the title.
        """
        filter_title = self.p_title.sub(u'', self._to_text(title))
        title_words = self.my_cut(filter_title)
        corpus0 = title_words + self.my_cut(content)

        freq = {}
        for w in corpus0:
            freq[w] = freq.get(w, 0.0) + 1.0
        # Boost every title occurrence.
        for w in title_words:
            freq[w] = freq.get(w, 0.0) + self.title_add_times

        # The loop body never runs when freq is empty, so total is non-zero
        # whenever it is used as a divisor.
        total = sum(freq.values())
        for k in freq:
            freq[k] *= idf_loader.idf_freq.get(k, idf_loader.mean_idf) / total
        ranked = sorted(freq.items(), key=lambda d: d[1], reverse=True)
        return ranked, len(corpus0), title_words

    def get_term_weight(self, idf_loader, title, content):
        """Pick the keywords of an article.

        The top ``words_num`` words whose weight exceeds
        ``min_times * mean_idf / words_number`` are kept; when none
        qualifies, the words that occur in the title are used instead.

        Returns:
            List of ``'word:offset:weight'`` strings, where ``offset`` is the
            summed character length of the words emitted before it and
            ``weight`` is rounded to four decimals. Empty when the article
            yields no words.
        """
        result, words_number, title_words = self.get_tfidf(idf_loader, title, content)
        if not words_number:
            return []

        bound = self.min_times * idf_loader.mean_idf / words_number
        machine_words = [item for item in result[:self.words_num] if item[1] > bound]
        if not machine_words:
            # Fall back to the title words (compare the word, not its weight).
            title_set = set(title_words)
            machine_words = [item for item in result if item[0] in title_set]

        data = []
        offset = 0
        for word, weight in machine_words:
            data.append('%s:%d:%s' % (word, offset, str(round(weight, 4))))
            offset += len(self._to_text(word))
        return data

    def getCorpus(self, data_path):
        """Read a JSON-lines file and segment every document.

        Each non-blank line must be a JSON object; its ``title`` and
        ``@merge_text`` fields are joined and segmented. Documents that
        yield no words are skipped.

        Returns:
            List of token lists, one per usable document.
        """
        count = 0
        corpus_list = []
        with open(data_path, 'rb') as f:
            for line in f:
                if not line.strip():
                    continue
                info = json.loads(line.decode('utf-8', errors='ignore'))
                # A missing field is treated as empty text rather than
                # crashing on None.
                doc_title = self.p_title.sub(u'', self._to_text(info.get('title') or u''))
                doc_body = self._to_text(info.get('@merge_text') or u'')
                r = self.my_cut(doc_title + u'。' + doc_body)
                if not r:
                    continue
                corpus_list.append(r)
                count += 1
                if count % 1000 == 0:
                    logger.info("processd %d segment_sentence", count)
        return corpus_list

    def train(self, dir_name, data_path):
        """Train the IDF table and write it to ``data/<dir_name>/idf.txt``.

        The file holds one ``<word> <idf>`` line per word (log base 2 of
        documents / documents containing the word) followed by a final
        ``LEN_AVG <n>`` line with the average document length.

        Raises:
            ValueError: If ``data_path`` yields no usable documents; the
                existing IDF file is left untouched in that case.
        """
        idf_path = 'data/%s/idf.txt' % dir_name
        documents = self.getCorpus(data_path)
        if not documents:
            raise ValueError('no usable documents found in %s' % data_path)

        doc_freq = {}
        len_sum = 0
        for i, doc in enumerate(documents):
            len_sum += len(doc)
            for x in set(doc):
                doc_freq[x] = doc_freq.get(x, 0) + 1
            if i % 1000 == 0:
                logger.info('Documents processed: %d, time: %s', i, datetime.datetime.now())
        doc_count = len(documents)
        del documents

        # True division: integer division would floor the ratio before the log.
        total_docs = float(doc_count)
        lines = [u'%s %r\n' % (self._to_text(word), math.log(total_docs / n, 2))
                 for word, n in doc_freq.items()]
        logger.info('%d %d', doc_count, len_sum)
        # Kept as a whole number so loaders that parse LEN_AVG with int()
        # continue to work.
        lines.append(u'LEN_AVG %d' % (len_sum // doc_count))
        with open(idf_path, 'wb') as f:
            f.write(u''.join(lines).encode('utf-8'))

    def test_one(self, dir_name, method='tfidf'):
        """Score JSON documents read from stdin and print the top 20 words.

        Args:
            dir_name: Model directory under ``data/`` holding ``idf.txt``.
            method: ``'tfidf'`` for TF-IDF; any other value uses
                ``self.get_bm25``.
        """
        idf_loader = IDFLoader('data/%s/idf.txt' % dir_name)
        scorer = self.get_tfidf if method == 'tfidf' else self.get_bm25
        for raw in sys.stdin:
            info = json.loads(self._to_text(raw))
            result, words_number, title_words = scorer(
                idf_loader, info['title'], info['@merge_text'])
            # No words means no threshold can be derived; report 0.0.
            bound = (self.min_times * idf_loader.mean_idf / words_number
                     if words_number else 0.0)
            print('_____words_number bound_____')
            print('%s %s' % (words_number, bound))
            print('_____tfidf_result_____')
            for word, weight in result[:20]:
                print('%s %s' % (word, weight))

經調優，最優解爲：min_times=5 title_add_times=6.0 words_num=4

結果: 人工抽樣評估了100個

TF-IDF召回率：0.2778

TF-IDF準確率：0.2778

BM25

算法參考：搜索中的權重度量利器: TF-IDF和BM25

初版

# TfIdf.py: add the following method to the TfIdf class.
def get_bm25(self, idf_loader, title, content, k=1.2, b=0.75, epsilon=0.25):
    """Score every word of an article with a BM25-style weight.

    Term frequencies are counted over title plus body, title words are
    boosted by ``self.title_add_times``, and each word is scored as
    ``idf * (k + 1) * tf / (k * (1 - b + b * L) + tf)`` where ``tf`` is the
    word's share of the boosted total and ``L`` is that total divided by
    ``idf_loader.mean_len``.

    Args:
        idf_loader: Object exposing ``idf_freq``, ``mean_idf`` and
            ``mean_len``.
        title: Article title.
        content: Article body.
        k: Caps how much a growing term frequency can raise the score.
        b: Controls how strongly the length ratio ``L`` affects the score.
        epsilon: A word missing from ``idf_freq`` gets ``mean_idf * epsilon``
            as its IDF.

    Returns:
        Tuple ``(ranked, words_number, title_words)``: ``ranked`` is a list
        of ``(word, score)`` sorted by descending score, ``words_number`` is
        the token count of title plus body, and ``title_words`` is the list
        of tokens found in the title.
    """
    # Hand p_title the string type it was compiled for (bytes or text), so
    # the substitution cannot fail on a type mismatch.
    pattern_is_bytes = isinstance(self.p_title.pattern, bytes)
    if pattern_is_bytes and not isinstance(title, bytes):
        title = title.encode('utf-8', 'ignore')
    elif not pattern_is_bytes and isinstance(title, bytes):
        title = title.decode('utf-8', 'ignore')
    filter_title = self.p_title.sub(type(title)(), title)

    title_words = self.my_cut(filter_title)
    corpus0 = title_words + self.my_cut(content)

    freq = {}
    for w in corpus0:
        freq[w] = freq.get(w, 0.0) + 1.0
    # Boost every title occurrence.
    for w in title_words:
        freq[w] = freq.get(w, 0.0) + self.title_add_times

    total = sum(freq.values())
    mean_len = idf_loader.mean_len
    logger.info('%s', (k, b, total, mean_len))

    # Without a usable average length, treat the document as average-sized
    # instead of dividing by zero.
    length_ratio = total / mean_len if mean_len else 1.0
    norm = k * (1.0 - b + b * length_ratio)
    for term in freq:
        tf = freq[term] / total
        idf = idf_loader.idf_freq.get(term, idf_loader.mean_idf * epsilon)
        freq[term] = idf * ((k + 1) * tf) / (norm + tf)
    ranked = sorted(freq.items(), key=lambda d: d[1], reverse=True)
    return ranked, len(corpus0), title_words

` 經調優，最優解爲：min_times=2.5 title_add_times=6.0 words_num=4 k=1.2 b=0.75 EPSILON=0.25

結果

人工抽樣評估了100個

BM25召回率：0.2889

BM25準確率：0.3333