NLP（二十四）利用ALBERT實現命名實體識別

時間 2020-03-12

標籤 nlp 二十四利用 albert 實現命名實體識別简体版

原文鏈接

本文將會介紹如何利用ALBERT來實現命名實體識別。如果有對命名實體識別不清楚的讀者，請參考筆者的文章《NLP入門（四）命名實體識別（NER）》。
本文的項目結構如下：

其中，albert_zh爲ALBERT提取文本特徵模塊，這方面的代碼已經由別人開源，我們只需要拿來使用即可。data目錄下爲我們本次講解所需要的數據，圖中只有example開頭的數據集，這是人民日報的標註語料，實體爲人名（PER）、地名（LOC）和組織機構名（ORG）。數據集一行一個字符以及標註符號，標註系統採用BIO系統，我們以example.train的第一句爲例，標註信息如下：

海 O
釣 O
比 O
賽 O
地 O
點 O
在 O
廈 B-LOC
門 I-LOC
與 O
金 B-LOC
門 I-LOC
之 O
間 O
的 O
海 O
域 O
。 O

在utils.py文件中，配置了一些關於文件路徑和模型參數方面的信息，其中規定了輸入的文本長度最大爲128，代碼如下：

# -*- coding: utf-8 -*-
# author: Jclian91
# place: Pudong Shanghai
# time: 2020-03-11 21:12

# Dataset configuration: each split is stored as ./data/<event_type>.<split>
event_type = "example"

train_file_path, dev_file_path, test_file_path = (
    "./data/{}.{}".format(event_type, split) for split in ("train", "dev", "test")
)

# Model configuration
MAX_SEQ_LEN = 128   # maximum length of the input text

在load_data.py文件中，我們將處理訓練集、驗證集和測試集數據，並將標籤轉換爲id，形成label2id.json文件，代碼如下：

# -*- coding: utf-8 -*-
# author: Jclian91
# place: Pudong Shanghai
# time: 2020-03-11 10:04
import json

from utils import train_file_path, event_type


# Read a BIO-annotated dataset file
def read_data(file_path):
    """Parse a "token tag" per-line file into sentences and tag sequences.

    Every stripped line that contains a space contributes its first
    whitespace-separated field as the character and its last field as the
    tag. Any line without a space (for example a blank line) ends the current
    sentence. Empty sentences are never emitted.

    :param file_path: path of a UTF-8 encoded annotation file
    :return: ``(sentences, tags)`` where ``sentences`` is a list of strings
        and ``tags`` is a list of tag lists, one per sentence
    """
    sentences, tags = [], []
    chars, labels = [], []

    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            line = raw_line.strip()

            if ' ' in line:
                fields = line.split()
                chars.append(fields[0])
                labels.append(fields[-1])
                continue

            # Separator line: flush the sentence collected so far, if any
            if chars:
                sentences.append(''.join(chars))
                tags.append(labels)
            chars, labels = [], []

    # The file may end without a trailing separator line
    if chars:
        sentences.append(''.join(chars))
        tags.append(labels)

    return sentences, tags


# Read the training set and map every tag to an integer id
def label2id():
    """Build the tag -> id mapping from the training set and save it as JSON.

    Tags are numbered from 1 in order of first appearance in the training
    data, so id 0 is never assigned to a tag. The mapping is written to
    ``<event_type>_label2id.json`` in the current working directory.
    """
    _, train_tags = read_data(train_file_path)

    # dict.fromkeys drops duplicates while keeping first-seen order
    ordered_tags = list(dict.fromkeys(tag for seq in train_tags for tag in seq))
    label_id_dict = {tag: idx for idx, tag in enumerate(ordered_tags, start=1)}

    with open("%s_label2id.json" % event_type, "w", encoding="utf-8") as g:
        g.write(json.dumps(label_id_dict, ensure_ascii=False, indent=2))

# Script entry point: generate the label -> id JSON file when run directly
if __name__ == '__main__':
    label2id()

運行代碼，生成的example_label2id.json文件如下：

{
  "O": 1,
  "B-LOC": 2,
  "I-LOC": 3,
  "B-PER": 4,
  "I-PER": 5,
  "B-ORG": 6,
  "I-ORG": 7
}

生成該文件是爲了方便我們後邊的模型訓練和預測的時候調用。
接着就是最重要的模型訓練部分了，模型的結構圖如下：

我們採用ALBERT作爲文本特徵提取，後接經典的序列標註算法——Bi-LSTM算法。albert_model_train.py的完整代碼如下：

# -*- coding: utf-8 -*-

import json
import numpy as np
from keras.models import Model, Input
from keras.layers import Dense, Bidirectional, Dropout, LSTM, TimeDistributed, Masking
from keras.utils import to_categorical, plot_model
from seqeval.metrics import classification_report
import matplotlib.pyplot as plt

from utils import event_type
from utils import MAX_SEQ_LEN, train_file_path, test_file_path, dev_file_path
from load_data import read_data
from albert_zh.extract_feature import BertVector

# ALBERT feature extractor shared by the whole script
bert_model = BertVector(pooling_strategy="NONE", max_seq_len=MAX_SEQ_LEN)


def f(text):
    """Encode a single text and return the first entry of the "encodes" result."""
    return bert_model.encode([text])["encodes"][0]


# Load the label -> id dictionary written by load_data.py
with open("%s_label2id.json" % event_type, "r", encoding="utf-8") as h:
    label_id_dict = json.load(h)

# Reverse mapping: id -> label
id_label_dict = {label_id: label for label, label_id in label_id_dict.items()}


# Load one dataset split as model inputs and one-hot targets
def input_data(file_path):
    """Read a dataset file and turn it into feature and label arrays.

    :param file_path: path of the annotated dataset file
    :return: ``(x, y)``; ``x`` is the array of ALBERT encodings, one per
        sentence, and ``y`` has shape
        ``(number of sentences, MAX_SEQ_LEN, number of labels + 1)`` holding
        the one-hot encoded tag ids. Tag sequences are truncated to
        ``MAX_SEQ_LEN`` or right-padded with id 0.
    """
    sentences, tags = read_data(file_path)
    print("sentences length: %s " % len(sentences))
    print("last sentence: ", sentences[-1])

    # Encode every sentence with ALBERT
    print("start ALBERT encding")
    x = np.array([f(sent) for sent in sentences])
    print("end ALBERT encoding")

    # One extra class for the padding id 0
    n_classes = len(label_id_dict.keys()) + 1
    y = np.empty(shape=(len(tags), MAX_SEQ_LEN, n_classes))

    for row, seq in enumerate(tags):
        # Bring each tag-id sequence to exactly MAX_SEQ_LEN entries
        ids = [label_id_dict[tag] for tag in seq][:MAX_SEQ_LEN]
        ids.extend([0] * (MAX_SEQ_LEN - len(ids)))
        y[row, :, :] = to_categorical(ids, num_classes=n_classes)

    return x, y


# Build model
def build_model(max_para_length, n_tags, embedding_dim=312, lstm_units=128, dropout_rate=0.1):
    """Create and compile the Bi-LSTM sequence tagger fed with precomputed features.

    The network is: input -> bidirectional LSTM -> dropout -> per-timestep
    softmax over ``n_tags`` classes. It is compiled with the Adam optimizer
    and categorical cross-entropy, its summary is printed and its diagram is
    written to ``albert_bi_lstm.png``.

    :param max_para_length: number of timesteps of every input sequence
    :param n_tags: number of output classes per timestep
    :param embedding_dim: width of each per-timestep feature vector; the
        default 312 is the value this function previously hard-coded
    :param lstm_units: hidden units of each LSTM direction
    :param dropout_rate: dropout rate applied to the Bi-LSTM output
    :return: the compiled Keras model
    """
    # Precomputed embeddings are the model input
    bert_output = Input(shape=(max_para_length, embedding_dim, ), name="bert_output")
    # LSTM model
    lstm = Bidirectional(LSTM(units=lstm_units, return_sequences=True), name="bi_lstm")(bert_output)
    drop = Dropout(dropout_rate, name="dropout")(lstm)
    out = TimeDistributed(Dense(n_tags, activation="softmax"), name="time_distributed")(drop)
    model = Model(inputs=bert_output, outputs=out)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Print and plot the model structure
    model.summary()
    plot_model(model, to_file="albert_bi_lstm.png", show_shapes=True)

    return model


# Train the model and evaluate it on the test split
def train_model():
    """Train the tagger, save it, plot the learning curves and score the test set.

    Loads the train/dev/test splits through ``input_data``, fits the model
    returned by ``build_model`` (batch size 32, 10 epochs), saves it to
    ``<event_type>_ner.h5``, writes the loss/accuracy curves to
    ``<event_type>_loss_acc.png`` and prints a seqeval classification report
    comparing the predicted tags with the gold tags of the test split.
    """
    # Load the training, validation and test data
    train_x, train_y = input_data(train_file_path)
    dev_x, dev_y = input_data(dev_file_path)
    test_x, _ = input_data(test_file_path)

    # Train the model
    model = build_model(MAX_SEQ_LEN, len(label_id_dict.keys())+1)

    history = model.fit(train_x, train_y, validation_data=(dev_x, dev_y), batch_size=32, epochs=10)

    model.save("%s_ner.h5" % event_type)

    # Plot the loss and accuracy curves
    curves = history.history
    # Newer Keras releases record the metric as 'accuracy' instead of 'acc'
    acc_key = 'acc' if 'acc' in curves else 'accuracy'
    epochs = range(len(curves['loss']))

    plt.subplot(2, 1, 1)
    plt.plot(epochs, curves['loss'], label='loss')
    plt.plot(epochs, curves['val_loss'], label='val_loss')
    plt.legend()

    plt.subplot(2, 1, 2)
    plt.plot(epochs, curves[acc_key], label='acc')
    plt.plot(epochs, curves['val_' + acc_key], label='val_acc')
    plt.legend()
    plt.savefig("%s_loss_acc.png" % event_type)

    # Evaluate on the test set: predicted class id for every timestep
    pred_ids = np.argmax(model.predict(test_x), axis=2)
    _, test_tags = read_data(test_file_path)

    # Timestep j of a label row belongs to the j-th character of the sentence
    # (input_data fills labels from index 0 and pads the tail with id 0), so
    # predictions are aligned with the gold tags by position. A predicted
    # padding id inside the sentence counts as 'O', and sentences longer than
    # the model input are completed with 'O'.
    final_tags = []
    for gold, ids in zip(test_tags, pred_ids):
        pred = [id_label_dict.get(int(idx), 'O') for idx in ids[:len(gold)]]
        pred.extend(['O'] * (len(gold) - len(pred)))
        final_tags.append(pred)

    # Score the predictions against the gold tags with seqeval
    print(classification_report(test_tags, final_tags, digits=4))


# Script entry point: run training and evaluation when executed directly
if __name__ == '__main__':
    train_model()

模型訓練過程中的輸出結果如下（部分輸出省略）：

sentences length: 20864 
last sentence:  思想自由是對自我而言，用中國傳統的說法是有所爲；兼容幷包是指對待他人，要有所不爲。
start ALBERT encding
end ALBERT encoding
sentences length: 2318 
last sentence:  良性腫瘤、惡性腫瘤雖然只是一字之差，但二者有根本性的差異。
start ALBERT encding
end ALBERT encoding
sentences length: 4636 
last sentence:  所以，村民進行民主選舉的心態是在這樣一種背景映託下加以表現的，這無疑給該片增添了幾分厚重的歷史文化氛圍。
start ALBERT encding
end ALBERT encoding
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
bert_output (InputLayer)     (None, 128, 312)          0         
_________________________________________________________________
bi_lstm (Bidirectional)      (None, 128, 256)          451584    
_________________________________________________________________
dropout (Dropout)            (None, 128, 256)          0         
_________________________________________________________________
time_distributed (TimeDistri (None, 128, 8)            2056      
=================================================================
Total params: 453,640
Trainable params: 453,640
Non-trainable params: 0
_________________________________________________________________
Train on 20864 samples, validate on 2318 samples
......
......
......
20864/20864 [==============================] - 97s 5ms/step - loss: 0.0091 - acc: 0.9969 - val_loss: 0.0397 - val_acc: 0.9900
           precision    recall  f1-score   support

      ORG     0.9001    0.9112    0.9056      2185
      LOC     0.9383    0.8898    0.9134      3658
      PER     0.9543    0.9415    0.9479      1864

micro avg     0.9310    0.9084    0.9196      7707
macro avg     0.9313    0.9084    0.9195      7707

在測試集上的F1值爲91.96%。同時，訓練過程中的loss和acc曲線如下圖：

模型預測部分的代碼（腳本爲model_predict.py）如下：

# -*- coding: utf-8 -*-
# author: Jclian91
# place: Pudong Shanghai
# time: 2020-03-11 13:16
import json
import numpy as np
from albert_zh.extract_feature import BertVector
from keras.models import load_model
from collections import defaultdict
from pprint import pprint

from utils import MAX_SEQ_LEN, event_type

# Load the label -> id dictionary and build the reverse id -> label mapping
with open("%s_label2id.json" % event_type, "r", encoding="utf-8") as h:
    label_id_dict = json.load(h)

id_label_dict = {label_id: label for label, label_id in label_id_dict.items()}

# ALBERT feature extractor
bert_model = BertVector(pooling_strategy="NONE", max_seq_len=MAX_SEQ_LEN)


def f(text):
    """Encode a single text and return the first entry of the "encodes" result."""
    return bert_model.encode([text])["encodes"][0]


# Load the trained NER model
ner_model = load_model("%s_ner.h5" % event_type)


# Extract the entities described by a predicted tag sequence
def get_entity(sent, tags_list):
    """Group the characters of ``sent`` into entities according to BIO tags.

    An entity starts at a tag that begins with ``B-`` and extends over the
    directly following tags equal to ``I-<same type>``. Only the positions
    covered by both ``sent`` and ``tags_list`` are considered.

    :param sent: the sentence, indexable character by character
    :param tags_list: list of BIO tags, where ``tags_list[i]`` labels ``sent[i]``
    :return: plain dict mapping each entity type to the list of entity
        strings found, in order of appearance; empty when there is none
    """
    entity_dict = defaultdict(list)
    limit = min(len(sent), len(tags_list))

    for start, tag in enumerate(tags_list[:limit]):
        if not tag.startswith('B-'):
            continue

        entity_type = tag[2:]
        # Exact comparison, so a different type sharing a prefix never extends the entity
        inside_tag = 'I-' + entity_type
        end = start + 1
        while end < limit and tags_list[end] == inside_tag:
            end += 1

        entity_dict[entity_type].append(''.join(sent[start:end]))

    return dict(entity_dict)


# Interactive loop: read a sentence, predict its tags and print the entities
while True:
    # Read the sentence and drop the spaces it contains
    text = input("Please enter an sentence: ").replace(' ', '')
    # Predict the class id of every timestep with the trained model
    train_x = np.array([f(text)])
    y = np.argmax(ner_model.predict(train_x), axis=2)
    # Keep one tag per character, aligned by position: id 0 (not present in
    # the label dictionary) is reported as 'O' instead of being dropped, which
    # would shift every later tag onto the wrong character.
    y = [id_label_dict.get(int(idx), 'O') for idx in y[0][:len(text)]]

    # Print the extracted entities
    pprint(get_entity(text, y))

隨機在網上找幾條新聞測試，結果如下：

Please enter an sentence: 昨天進行的女單半決賽中，陳夢4-2擊敗了隊友王曼昱，伊藤美誠則以4-0橫掃了中國選手丁寧。
{'LOC': ['中國'], 'PER': ['陳夢', '王曼昱', '伊藤美誠', '丁寧']}
Please enter an sentence: 報道還提到，德國衛生部長延斯·施潘在會上也表示，若是不能率先開發出且使用疫苗，那麼60%至70%的人可能會被感染新冠病毒。
{'ORG': ['德國衛生部'], 'PER': ['延斯·施潘']}
Please enter an sentence: 「隔離結束回來，發現公司不見了」，網上的段子，真發生在了崑山達鑫電子有限公司員工身上。
{'ORG': ['崑山達鑫電子有限公司']}
Please enter an sentence: 真人版的《花木蘭》由新西蘭導演妮基·卡羅執導，由劉亦菲、甄子丹、鄭佩佩、鞏俐、李連杰等加盟，幾乎是全亞洲整容。
{'LOC': ['新西蘭', '亞洲'], 'PER': ['妮基·卡羅', '劉亦菲', '甄子丹', '鄭佩佩', '鞏俐', '李連杰']}

本項目已經開源，Github網址爲：https://github.com/percent4/ALBERT_NER_KERAS 。
本文到此結束，感謝大家閱讀，歡迎關注筆者的微信公衆號：Python爬蟲與算法。

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。