This post is mainly based on https://github.com/zhedongzheng/finch. It differs from the original code in two ways: `tf.estimator` is not used, and the data preprocessing has been partly modified (the `tf.data.Dataset` API is used instead).

```python
# -*- coding:utf-8 -*-
from collections import Counter
import tensorflow as tf
import numpy as np
import re

PARAMS = {
    'min_freq': 5,
    'window_size': 3,
    'n_sampled': 100,
    'embed_dim': 200,
}

def preprocess_text(text):
    # 1. Replace newlines with spaces
    text = text.replace('\n', ' ')
    # 2. Collapse repeated whitespace into single spaces and lowercase everything
    text = re.sub(r'\s+', ' ', text).strip().lower()
    # 3. Split on spaces, i.e. a simple form of tokenization
    words = text.split()
    # 4. Count word frequencies
    word2freq = Counter(words)
    # 5. Drop low-frequency words
    words = [word for word in words if word2freq[word] > PARAMS['min_freq']]
    print("Total words:", len(words))
    # 6. Deduplicate to build the vocabulary
    _words = set(words)
    PARAMS['word2idx'] = {c: i for i, c in enumerate(_words)}
    PARAMS['idx2word'] = {i: c for i, c in enumerate(_words)}
    PARAMS['vocab_size'] = len(PARAMS['idx2word'])
    print('Vocabulary size:', PARAMS['vocab_size'])
    indexed = [PARAMS['word2idx'][w] for w in words]
    # 7. Subsample very frequent words
    indexed = filter_high_freq(indexed)
    print("Word preprocessing completed ...")
    return indexed

def filter_high_freq(int_words, t=1e-5, threshold=0.8):
    int_word_counts = Counter(int_words)
    total_count = len(int_words)
    # 1. Compute each word's probability, count / total
    word_freqs = {w: c / total_count for w, c in int_word_counts.items()}
    # 2. Compute each word's drop probability: the higher the frequency, the higher
    #    the drop probability. E.g. 'the' occurs very often but carries little
    #    information, so it should be removed.
    prob_drop = {w: 1 - np.sqrt(t / word_freqs[w]) for w in int_word_counts}
    # 3. Drop words whose drop probability exceeds the threshold
    train_words = [w for w in int_words if prob_drop[w] < threshold]
    return train_words

def make_data(int_words):
    x, y = [], []
    for i in range(PARAMS['window_size'], len(int_words) - PARAMS['window_size']):
        # 1. Build the context of the current word
        inputs = get_x(int_words, i)
        # 2. All context words of one center word form one example, appended to x:
        #    x = [['a','b','d','e'], ['b','c','e','f'], ...]
        x.append(inputs)
        # 3. Each label is appended to y as a sub-list:
        #    y = [['c'], ['d'], ...]
        # 4. So each example is a context:word pair
        y.append([int_words[i]])
    return np.array(x), np.array(y)

def get_x(words, idx):
    left = idx - PARAMS['window_size']
    right = idx + PARAMS['window_size']
    return words[left: idx] + words[idx + 1: right + 1]

# 1. Preprocess the data
with open(r'E:\nlp_data\ptb_train.txt') as f:
    x_train, y_train = make_data(preprocess_text(f.read()))

# 2. Wrap the data in a Dataset
# What is one example here?
# One example is x = 6 words, y = 1 word, i.e. x_train[i], y_train[i],
# because make_data does x.append(inputs) and y.append([int_words[i]])
dataset = tf.data.Dataset.from_tensor_slices(tensors=(x_train, y_train))
dataset = dataset.batch(batch_size=100).repeat(5)
iterator = dataset.make_one_shot_iterator()
next_data = iterator.get_next()

# 3. Build the CBOW model
# The placeholders are shape=(None, 6) and shape=(None, 1),
# again because make_data does x.append(inputs) and y.append([int_words[i]]).
# window_size = 3, so the context size is 6.
# None becomes 100 at run time, because of dataset.batch(batch_size=100).
x = tf.placeholder(shape=(None, 6), dtype=tf.int32)
y_ = tf.placeholder(shape=(None, 1), dtype=tf.int32)
E = tf.get_variable(name="E", shape=(PARAMS['vocab_size'], PARAMS['embed_dim']))
embedding = tf.nn.embedding_lookup(params=E, ids=x)
embedding = tf.reduce_mean(embedding, axis=[1])
W = tf.get_variable(name="w", shape=(PARAMS['vocab_size'], PARAMS['embed_dim']), dtype=tf.float32)
b = tf.get_variable(name="b", shape=(PARAMS['vocab_size'],), dtype=tf.float32)
loss_op = tf.reduce_mean(tf.nn.sampled_softmax_loss(
    weights=W,
    biases=b,
    labels=y_,
    inputs=embedding,
    num_sampled=PARAMS['n_sampled'],
    num_classes=PARAMS['vocab_size']))
opt = tf.train.GradientDescentOptimizer(learning_rate=0.5).minimize(loss=loss_op)
init = tf.global_variables_initializer()
with tf.Session() as session:
    session.run(init)
    try:
        while True:
            inputs, labels = session.run(next_data)
            session.run(fetches=opt, feed_dict={x: inputs, y_: labels})
    except tf.errors.OutOfRangeError:
        print("train complete")
```
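
The script above only trains the embedding matrix `E` and never reads it back. As a minimal follow-up sketch (not part of the original code), the trained embeddings can be evaluated inside the same session and queried for nearest neighbours with plain NumPy; the helper `nearest_words` below is a hypothetical name, and the query word `'money'` is only an example that assumes it survives the frequency filtering.

```python
import numpy as np

def nearest_words(embeddings, query, topk=5):
    # Cosine similarity between the query word and every vocabulary word
    vecs = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    q = vecs[PARAMS['word2idx'][query]]
    sims = vecs @ q
    best = np.argsort(-sims)[1:topk + 1]   # skip the query word itself
    return [PARAMS['idx2word'][i] for i in best]

# Inside the same `with tf.Session() as session:` block, after training:
#     embeddings = session.run(E)
#     print(nearest_words(embeddings, 'money'))
```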