This post is mainly based on https://github.com/zhedongzheng/finch. It differs from the original code in two ways: `tf.estimator` is not used, and the data preprocessing has been partly modified (the `tf.data.Dataset` API is used instead).

```python
# -*- coding:utf-8 -*-
from collections import Counter
import tensorflow as tf
import numpy as np
import re

PARAMS = {
    'min_freq': 5,
    'window_size': 3,
    'n_sampled': 100,
    'embed_dim': 200,
}

def preprocess_text(text):
    # 1. Replace newlines with spaces
    text = text.replace('\n', ' ')
    # 2. Collapse repeated whitespace into single spaces and lowercase everything
    text = re.sub(r'\s+', ' ', text).strip().lower()
    # 3. Split on spaces, i.e. a simple form of tokenization
    words = text.split()
    # 4. Count word frequencies
    word2freq = Counter(words)
    # 5. Drop low-frequency words
    words = [word for word in words if word2freq[word] > PARAMS['min_freq']]
    print("Total words:", len(words))
    # 6. Deduplicate to build the vocabulary
    _words = set(words)
    PARAMS['word2idx'] = {c: i for i, c in enumerate(_words)}
    PARAMS['idx2word'] = {i: c for i, c in enumerate(_words)}
    PARAMS['vocab_size'] = len(PARAMS['idx2word'])
    print('Vocabulary size:', PARAMS['vocab_size'])
    indexed = [PARAMS['word2idx'][w] for w in words]
    # 7. Subsample very frequent words
    indexed = filter_high_freq(indexed)
    print("Word preprocessing completed ...")
    return indexed

def filter_high_freq(int_words, t=1e-5, threshold=0.8):
    int_word_counts = Counter(int_words)
    total_count = len(int_words)
    # 1. Compute each word's probability, count / total
    word_freqs = {w: c / total_count for w, c in int_word_counts.items()}
    # 2. Compute each word's drop probability: the higher the frequency, the higher
    #    the drop probability. E.g. 'the' occurs very often but carries little
    #    information, so it should be removed.
    prob_drop = {w: 1 - np.sqrt(t / word_freqs[w]) for w in int_word_counts}
    # 3. Drop words whose drop probability exceeds the threshold
    train_words = [w for w in int_words if prob_drop[w] < threshold]
    return train_words

def make_data(int_words):
    x, y = [], []
    for i in range(PARAMS['window_size'], len(int_words) - PARAMS['window_size']):
        # 1. Build the context of the current word
        inputs = get_x(int_words, i)
        # 2. All context words of one center word form one example, appended to x:
        #    x = [['a','b','d','e'], ['b','c','e','f'], ...]
        x.append(inputs)
        # 3. Each label is appended to y as a sub-list:
        #    y = [['c'], ['d'], ...]
        # 4. So each example is a context:word pair
        y.append([int_words[i]])
    return np.array(x), np.array(y)

def get_x(words, idx):
    left = idx - PARAMS['window_size']
    right = idx + PARAMS['window_size']
    return words[left: idx] + words[idx + 1: right + 1]

# 1. Preprocess the data
with open(r'E:\nlp_data\ptb_train.txt') as f:
    x_train, y_train = make_data(preprocess_text(f.read()))

# 2. Wrap the data in a Dataset
# What is one example here?
# One example is x = 6 words, y = 1 word, i.e. x_train[i], y_train[i],
# because make_data does x.append(inputs) and y.append([int_words[i]])
dataset = tf.data.Dataset.from_tensor_slices(tensors=(x_train, y_train))
dataset = dataset.batch(batch_size=100).repeat(5)
iterator = dataset.make_one_shot_iterator()
next_data = iterator.get_next()

# 3. Build the CBOW model
# The placeholders are shape=(None, 6) and shape=(None, 1),
# again because make_data does x.append(inputs) and y.append([int_words[i]]).
# window_size = 3, so the context size is 6.
# None becomes 100 at run time, because of dataset.batch(batch_size=100).
x = tf.placeholder(shape=(None, 6), dtype=tf.int32)
y_ = tf.placeholder(shape=(None, 1), dtype=tf.int32)
E = tf.get_variable(name="E", shape=(PARAMS['vocab_size'], PARAMS['embed_dim']))
embedding = tf.nn.embedding_lookup(params=E, ids=x)
embedding = tf.reduce_mean(embedding, axis=[1])
W = tf.get_variable(name="w", shape=(PARAMS['vocab_size'], PARAMS['embed_dim']), dtype=tf.float32)
b = tf.get_variable(name="b", shape=(PARAMS['vocab_size'],), dtype=tf.float32)
loss_op = tf.reduce_mean(tf.nn.sampled_softmax_loss(
    weights=W,
    biases=b,
    labels=y_,
    inputs=embedding,
    num_sampled=PARAMS['n_sampled'],
    num_classes=PARAMS['vocab_size']))
opt = tf.train.GradientDescentOptimizer(learning_rate=0.5).minimize(loss=loss_op)
init = tf.global_variables_initializer()
with tf.Session() as session:
    session.run(init)
    try:
        while True:
            inputs, labels = session.run(next_data)
            session.run(fetches=opt, feed_dict={x: inputs, y_: labels})
    except tf.errors.OutOfRangeError:
        print("train complete")
```
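
The script above only trains the embedding matrix `E` and never reads it back. As a minimal follow-up sketch (not part of the original code), the trained embeddings can be evaluated inside the same session and queried for nearest neighbours with plain NumPy; the helper `nearest_words` below is a hypothetical name, and the query word `'money'` is only an example that assumes it survives the frequency filtering.

```python
import numpy as np

def nearest_words(embeddings, query, topk=5):
    # Cosine similarity between the query word and every vocabulary word
    vecs = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    q = vecs[PARAMS['word2idx'][query]]
    sims = vecs @ q
    best = np.argsort(-sims)[1:topk + 1]   # skip the query word itself
    return [PARAMS['idx2word'][i] for i in best]

# Inside the same `with tf.Session() as session:` block, after training:
#     embeddings = session.run(E)
#     print(nearest_words(embeddings, 'money'))
```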