醫學詞向量訓練---詞向量訓練和可視化

醫學詞向量訓練---詞向量訓練和可視化

詞向量訓練

在這裏插入圖片描述

python代碼
import jieba
import os
import json
import codecs
import multiprocessing
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.word2vec import LineSentence
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import random
# Configure matplotlib with a CJK-capable font (SimHei) so the Chinese
# plot title and word labels in visualization() render correctly instead
# of appearing as empty boxes.
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']


class medicalWord2vec:
    """
    Medical word-embedding pipeline.

    Steps (each a separate method, run in order):
      1. get_cut_word()   - enrich the jieba user dictionary from corpus keywords
      2. get_corpus()     - segment the raw JSON corpus into a training file
      3. train_word2vec() - train Word2Vec embeddings on the segmented corpus
      4. test_word2vec()  - print nearest neighbours for a few probe words
      5. visualization()  - t-SNE scatter plot of a random sample of vectors
    """

    def __init__(self):
        self.stopwords_path = '../../詞向量/word/stop.txt'  # stop-word list, one word per line
        # NOTE: the two comments below were swapped in the original source.
        self.new_cut_words_path = "../../詞向量/word/jieba.txt"  # NEW (enriched) dictionary, written by get_cut_word()
        self.origin_cut_words_path = "../../詞向量/word/my_words.txt"  # ORIGINAL dictionary, read as the base
        self.data_path = '../../詞向量/data/content/'  # root directory of the corpus JSON files
        self.corpus_path = "../../詞向量/word/content.txt"  # segmented training corpus (one document per line)
        self.word_vector_path = "../../詞向量/word/word2vec.vector"  # binary word2vec output file
        self.word_vector_dimension = 400  # embedding dimensionality
        self.test_words = ['新冠', '全身疼痛', '感冒', '病毒感染', '肚子疼', '咳嗽', "頭疼", "頭痛"]  # probe words
        self._stopwords = None  # lazily-loaded stop-word set (cache for seg_sentence)

    def _load_stopwords(self):
        """Load the stop-word file once, cache it as a set for O(1) lookups."""
        if self._stopwords is None:
            with open(self.stopwords_path, 'r', encoding='utf-8') as f:
                self._stopwords = {line.strip() for line in f}
        return self._stopwords

    def seg_sentence(self, sentence):
        """
        Segment one sentence with jieba and remove stop words.

        Fix vs. original: the stop-word file was re-read (and never closed)
        on EVERY call, and membership was tested against a list; the stop
        words are now loaded once into a cached set.

        :param sentence: raw sentence string
        :return: list of tokens, stop words and tab characters removed
        """
        stopwords = self._load_stopwords()
        return [word for word in jieba.cut(sentence.strip())
                if word not in stopwords and word != "\t"]

    def get_cut_word(self):
        """
        Enrich the segmentation dictionary from 39net Q&A keyword labels.

        Reads the original dictionary, collects every multi-character
        'label' from the corpus JSON files, de-duplicates the union and
        writes it to self.new_cut_words_path.
        """
        print("------開始更新分詞詞庫------")
        with open(self.origin_cut_words_path, encoding="utf8") as f:
            base_words = [line.replace("\n", "") for line in f]
        collected = []
        for file_one in os.listdir(self.data_path):
            sub_dir = self.data_path + file_one + "/"
            for file_two in os.listdir(sub_dir):
                with open(sub_dir + file_two, encoding='utf8') as fh:
                    data = json.load(fh)
                # keep only labels longer than one character
                collected.extend(element['label'] for element in data['key_word']
                                 if len(element['label']) > 1)
            print(file_one + "-----------已完成")
        with open(self.new_cut_words_path, "w", encoding="utf8") as out:
            for element in set(collected + base_words):
                out.write(element + "\n")
        print("------分詞詞庫更新完成------")

    def get_corpus(self):
        """
        Segment the whole corpus into self.corpus_path.

        Fix vs. original: `writelines(line_seg)` wrote the string with no
        trailing newline, collapsing every document onto a single line so
        LineSentence could not see document boundaries. Each segmented
        document is now written as its own line.
        """
        print("------開始語料分詞------")
        jieba.load_userdict(self.new_cut_words_path)
        with codecs.open(self.corpus_path, 'w', encoding="utf8") as target:
            for file_one in os.listdir(self.data_path):
                sub_dir = self.data_path + file_one + "/"
                for file_two in os.listdir(sub_dir):
                    with open(sub_dir + file_two, encoding='utf8') as fh:
                        content = json.load(fh)['content']
                    target.write(" ".join(self.seg_sentence(content)) + "\n")
                print(file_one + "-----------已完成")
        print("------語料分詞結束------")

    def train_word2vec(self):
        """
        Train Word2Vec on the segmented corpus and save binary vectors.

        Uses the gensim 3.x API (`size=`); gensim 4.x renamed this
        parameter to `vector_size`.
        """
        print("------開始詞向量訓練------")
        model = Word2Vec(LineSentence(self.corpus_path),
                         size=self.word_vector_dimension,
                         window=5,
                         min_count=5,
                         workers=multiprocessing.cpu_count())
        model.wv.save_word2vec_format(self.word_vector_path, binary=True)
        print("------詞向量訓練結束------")

    def test_word2vec(self):
        """Print the most similar words for each probe word in self.test_words."""
        word2vec_model = KeyedVectors.load_word2vec_format(self.word_vector_path, binary=True)
        for word in self.test_words:
            print("測試詞語:-----", word)
            print("類似詞:-----", word2vec_model.most_similar(word))

    def visualization(self, sample_size=300):
        """
        Plot a t-SNE projection of a random sample of word vectors.

        Fixes vs. original: t-SNE was fitted on the ENTIRE vocabulary even
        though only 300 points were plotted, and the plotting loop crashed
        when the vocabulary held fewer than 300 words. The sample size is
        now a parameter (default 300, the original hard-coded value) and
        t-SNE runs only on the sampled words.

        :param sample_size: number of randomly chosen words to project and plot
        """
        word2vec_model = KeyedVectors.load_word2vec_format(self.word_vector_path, binary=True)
        # gensim 3.x KeyedVectors exposes .vocab directly; the original's
        # `.wv.vocab` went through a deprecated alias. (gensim 4.x would
        # use .key_to_index instead.)
        words = list(word2vec_model.vocab)
        random.shuffle(words)
        print("詞向量總數:----------", len(words))
        n = min(sample_size, len(words))
        sample = words[:n]
        vectors = word2vec_model[sample]
        embedded = TSNE(n_components=2, init='pca', verbose=1).fit_transform(vectors)
        plt.figure(figsize=(14, 10))
        plt.scatter(embedded[:, 0], embedded[:, 1])
        for i in range(n):
            plt.text(embedded[i][0], embedded[i][1], sample[i])
        plt.title('醫學詞向量可視化', size=16)
        plt.savefig('../../詞向量/word/TSNE.jpg', dpi=200)
        plt.show()

結果

訓練結果

在這裏插入圖片描述

詞向量可視化

在這裏插入圖片描述

相關文章
相關標籤/搜索