1 Overview
This text classification series will have about ten posts, covering text classification based on word2vec pretrained word vectors as well as classification based on the latest pretrained models (ELMo, BERT, etc.). The series covers the following models:
word2vec pretrained word vectors
textCNN model
charCNN model
Bi-LSTM model
RCNN model
Transformer model
All of the code is in the textClassifier repository.
2 Dataset
The dataset is the IMDB movie review dataset. There are three data files under the /data/rawData directory: unlabeledTrainData.tsv, labeledTrainData.tsv, and testData.tsv. For text classification we need labeled data (labeledTrainData). The preprocessing is the same as in Text Classification in Practice (1) — word2vec pretrained word vectors; the preprocessed file is /data/preprocess/labeledTrain.csv.
3 Adversarial LSTM model
The Adversarial LSTM model comes from the paper Adversarial Training Methods For Semi-Supervised Text Classification. Its model structure is shown on the right side of the figure below:
In the figure above, the left side is a normal LSTM and the right side is the Adversarial LSTM; you can see that noise is added to the word embeddings.
The core idea of Adversarial LSTM is to generate adversarial examples by adding noise to the word embeddings, feed the adversarial examples to the model in exactly the same way as the original samples to obtain an adversarial loss, add this to the loss of the original samples to get a new loss, and train the model by optimizing this new loss. The authors argue that this acts as a regularizer on the word embeddings and helps avoid overfitting.
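Concretely, the perturbation is the gradient of the loss with respect to the embeddings, rescaled so that its L2 norm equals a fixed ε. The following is only a minimal numpy sketch of that idea (the TensorFlow implementation comes later in this post; the toy embedding and gradient values here are made up for illustration):

```python
import numpy as np

def scale_l2(g, epsilon):
    # Rescale g so that its L2 norm over the whole sequence equals epsilon.
    alpha = np.max(np.abs(g)) + 1e-12                      # for numerical stability
    l2_norm = alpha * np.sqrt(np.sum((g / alpha) ** 2) + 1e-6)
    return epsilon * g / l2_norm

# Toy values: an "embedded sentence" of 3 time steps x 4 dims and its loss gradient
embedded = np.random.randn(3, 4)
grad = np.random.randn(3, 4)                               # stands in for d(loss)/d(embedded)

perturbed = embedded + scale_l2(grad, epsilon=5.0)         # adversarial version of the input
# The model is run a second time on `perturbed`, and its loss is added to the original loss.
```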
4 Parameter configuration
```python
import os
import csv
import time
import datetime
import random
import json
import threading
import warnings
from collections import Counter
from math import sqrt

import gensim
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

warnings.filterwarnings("ignore")
```
```python
# Configuration parameters
class TrainingConfig(object):
    epoches = 5
    evaluateEvery = 100
    checkpointEvery = 100
    learningRate = 0.001


class ModelConfig(object):
    embeddingSize = 200
    hiddenSizes = 128  # number of units in the LSTM cell
    dropoutKeepProb = 0.5
    l2RegLambda = 0.0
    epsilon = 5


class Config(object):
    sequenceLength = 200  # roughly the mean length of all sequences
    batchSize = 128

    dataSource = "../data/preProcess/labeledTrain.csv"
    stopWordSource = "../data/english"

    numClasses = 1  # set to 1 for binary classification, or to the number of classes for multi-class
    rate = 0.8  # proportion of the data used for training

    training = TrainingConfig()
    model = ModelConfig()


# Instantiate the configuration object
config = Config()
```
5 Generating the training data
1) Load the data, split the sentences into word tokens, and remove low-frequency words and stop words.
2) Map the words to indices, build a word-to-index mapping, and save it in json format so it can be used later for inference. (Note: some words may not be in the pretrained word2vec vectors; such words are represented directly as UNK.)
3) Read the word vectors from the pretrained word2vec model and feed them into the model as initialization values.
4) Split the dataset into a training set and a test set. (A small sketch of steps 2) and 4) follows below; the full implementation is the Dataset class after it.)
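To make steps 2) and 4) concrete, here is a minimal, self-contained sketch (not the project code — the toy sentences, vocabulary, and 4-token sequence length are made up for illustration) of mapping words to indices, padding/truncating to a fixed length, and splitting by a ratio:

```python
import numpy as np

# Toy vocabulary; indices 0/1 are reserved for "PAD"/"UNK", as in the Dataset class below
word2idx = {"PAD": 0, "UNK": 1, "movie": 2, "great": 3, "bad": 4}
sequenceLength = 4  # made-up fixed length for this example
rate = 0.8          # train/eval split ratio

reviews = [["great", "movie"], ["bad", "movie", "really", "bad", "acting"]]

# Step 2): words -> indices, unknown words fall back to UNK
reviewIds = [[word2idx.get(w, word2idx["UNK"]) for w in r] for r in reviews]

# Pad or truncate every review to sequenceLength
fixed = [r[:sequenceLength] if len(r) >= sequenceLength
         else r + [word2idx["PAD"]] * (sequenceLength - len(r)) for r in reviewIds]

# Step 4): split by ratio
trainIndex = int(len(fixed) * rate)
trainX, evalX = np.asarray(fixed[:trainIndex]), np.asarray(fixed[trainIndex:])
print(fixed)  # [[3, 2, 0, 0], [4, 2, 1, 4]]
```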
```python
# Data preprocessing class that generates the training and evaluation sets
class Dataset(object):
    def __init__(self, config):
        self.config = config
        self._dataSource = config.dataSource
        self._stopWordSource = config.stopWordSource

        self._sequenceLength = config.sequenceLength  # every input sequence is padded/truncated to this length
        self._embeddingSize = config.model.embeddingSize
        self._batchSize = config.batchSize
        self._rate = config.rate

        self._stopWordDict = {}

        self.trainReviews = []
        self.trainLabels = []

        self.evalReviews = []
        self.evalLabels = []

        self.wordEmbedding = None

        self.indexFreqs = []  # for each word in the vocabulary, the number of reviews it appears in

        self.labelList = []

    def _readData(self, filePath):
        """
        Read the dataset from the csv file
        """
        df = pd.read_csv(filePath)

        if self.config.numClasses == 1:
            labels = df["sentiment"].tolist()
        elif self.config.numClasses > 1:
            labels = df["rate"].tolist()

        review = df["review"].tolist()
        reviews = [line.strip().split() for line in review]

        return reviews, labels

    def _labelToIndex(self, labels, label2idx):
        """
        Convert labels to index representation
        """
        labelIds = [label2idx[label] for label in labels]
        return labelIds

    def _wordToIndex(self, reviews, word2idx):
        """
        Convert words to indices
        """
        reviewIds = [[word2idx.get(item, word2idx["UNK"]) for item in review] for review in reviews]
        return reviewIds

    def _genTrainEvalData(self, x, y, word2idx, rate):
        """
        Generate the training and evaluation sets
        """
        reviews = []
        for review in x:
            if len(review) >= self._sequenceLength:
                reviews.append(review[:self._sequenceLength])
            else:
                reviews.append(review + [word2idx["PAD"]] * (self._sequenceLength - len(review)))

        trainIndex = int(len(x) * rate)

        trainReviews = np.asarray(reviews[:trainIndex], dtype="int64")
        trainLabels = np.array(y[:trainIndex], dtype="float32")

        evalReviews = np.asarray(reviews[trainIndex:], dtype="int64")
        evalLabels = np.array(y[trainIndex:], dtype="float32")

        return trainReviews, trainLabels, evalReviews, evalLabels

    def _genVocabulary(self, reviews, labels):
        """
        Build the word embeddings and the word-to-index mapping; the full dataset can be used here
        """
        allWords = [word for review in reviews for word in review]

        # Remove stop words
        subWords = [word for word in allWords if word not in self.stopWordDict]

        wordCount = Counter(subWords)  # count word frequencies
        sortWordCount = sorted(wordCount.items(), key=lambda x: x[1], reverse=True)

        # Remove low-frequency words
        words = [item[0] for item in sortWordCount if item[1] >= 5]

        vocab, wordEmbedding = self._getWordEmbedding(words)
        self.wordEmbedding = wordEmbedding

        word2idx = dict(zip(vocab, list(range(len(vocab)))))

        # Compute document frequencies for the vocabulary
        self._getWordIndexFreq(vocab, reviews, word2idx)

        uniqueLabel = list(set(labels))
        label2idx = dict(zip(uniqueLabel, list(range(len(uniqueLabel)))))
        self.labelList = list(range(len(uniqueLabel)))

        # Save the word-to-index mapping as json so it can be loaded directly at inference time
        with open("../data/wordJson/word2idx.json", "w", encoding="utf-8") as f:
            json.dump(word2idx, f)

        with open("../data/wordJson/label2idx.json", "w", encoding="utf-8") as f:
            json.dump(label2idx, f)

        return word2idx, label2idx

    def _getWordEmbedding(self, words):
        """
        Look up the pretrained word2vec vectors for the words in our dataset
        """
        wordVec = gensim.models.KeyedVectors.load_word2vec_format("../word2vec/word2Vec.bin", binary=True)
        vocab = []
        wordEmbedding = []

        # Add "PAD" and "UNK"
        vocab.append("PAD")
        vocab.append("UNK")
        wordEmbedding.append(np.zeros(self._embeddingSize))
        wordEmbedding.append(np.random.randn(self._embeddingSize))

        for word in words:
            try:
                vector = wordVec.wv[word]
                vocab.append(word)
                wordEmbedding.append(vector)
            except:
                print(word + " is not in the pretrained word vectors")

        return vocab, np.array(wordEmbedding)

    def _getWordIndexFreq(self, vocab, reviews, word2idx):
        """
        Count, for each word in the vocabulary, how many reviews it appears in
        """
        reviewDicts = [dict(zip(review, range(len(review)))) for review in reviews]
        indexFreqs = [0] * len(vocab)
        for word in vocab:
            count = 0
            for review in reviewDicts:
                if word in review:
                    count += 1
            indexFreqs[word2idx[word]] = count

        self.indexFreqs = indexFreqs

    def _readStopWord(self, stopWordPath):
        """
        Read the stop words
        """
        with open(stopWordPath, "r") as f:
            stopWords = f.read()
            stopWordList = stopWords.splitlines()
            # Store the stop words in a dict so that lookups are fast
            self.stopWordDict = dict(zip(stopWordList, list(range(len(stopWordList)))))

    def dataGen(self):
        """
        Initialize the training and evaluation sets
        """
        # Initialize the stop words
        self._readStopWord(self._stopWordSource)

        # Load the dataset
        reviews, labels = self._readData(self._dataSource)

        # Build the word-to-index mapping and the word embedding matrix
        word2idx, label2idx = self._genVocabulary(reviews, labels)

        # Convert labels and sentences to numeric ids
        labelIds = self._labelToIndex(labels, label2idx)
        reviewIds = self._wordToIndex(reviews, word2idx)

        # Split into training and evaluation sets
        trainReviews, trainLabels, evalReviews, evalLabels = self._genTrainEvalData(reviewIds, labelIds, word2idx,
                                                                                    self._rate)
        self.trainReviews = trainReviews
        self.trainLabels = trainLabels

        self.evalReviews = evalReviews
        self.evalLabels = evalLabels


data = Dataset(config)
data.dataGen()
```
6 Generating batch data
Batches are fed to the model via a generator (a generator avoids loading all of the data into memory at once).
```python
# Yield batches of data
def nextBatch(x, y, batchSize):
    """
    Generate batches, output with a generator
    """
    perm = np.arange(len(x))
    np.random.shuffle(perm)
    x = x[perm]
    y = y[perm]

    numBatches = len(x) // batchSize

    for i in range(numBatches):
        start = i * batchSize
        end = start + batchSize
        batchX = np.array(x[start: end], dtype="int64")
        batchY = np.array(y[start: end], dtype="float32")

        yield batchX, batchY
```
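For example, the generator can be consumed like this (a minimal sketch with made-up dummy arrays, just to show the shapes):

```python
import numpy as np

# Toy data: 10 "reviews" of length 200, with binary labels
dummyX = np.zeros((10, 200), dtype="int64")
dummyY = np.zeros(10, dtype="float32")

for batchX, batchY in nextBatch(dummyX, dummyY, batchSize=2):
    print(batchX.shape, batchY.shape)  # (2, 200) (2,)
```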
7 Adversarial LSTM model code
```python
# Build the model
class AdversarialLSTM(object):
    """
    Adversarial LSTM for text classification
    """
    def __init__(self, config, wordEmbedding, indexFreqs):

        # Define the model inputs
        self.inputX = tf.placeholder(tf.int32, [None, config.sequenceLength], name="inputX")
        self.inputY = tf.placeholder(tf.int32, [None], name="inputY")
        self.dropoutKeepProb = tf.placeholder(tf.float32, name="dropoutKeepProb")
        self.config = config

        # Compute weights from the word frequencies
        indexFreqs[0], indexFreqs[1] = 20000, 10000
        weights = tf.cast(tf.reshape(indexFreqs / tf.reduce_sum(indexFreqs), [1, len(indexFreqs)]), dtype=tf.float32)

        # Word embedding layer
        with tf.name_scope("embedding"):
            # Compute the new word embedding matrix using the word frequencies
            normWordEmbedding = self._normalize(tf.cast(wordEmbedding, dtype=tf.float32, name="word2vec"), weights)
            # Map the input word ids to word vectors; shape [batch_size, sequence_length, embedding_size]
            self.embeddedWords = tf.nn.embedding_lookup(normWordEmbedding, self.inputX)

        # Compute the cross-entropy loss
        with tf.name_scope("loss"):
            with tf.variable_scope("Bi-LSTM", reuse=None):
                self.logits = self._Bi_LSTMAttention(self.embeddedWords)

                if config.numClasses == 1:
                    self.predictions = tf.cast(tf.greater_equal(self.logits, 0.0), tf.float32, name="predictions")
                    losses = tf.nn.sigmoid_cross_entropy_with_logits(
                        logits=self.logits,
                        labels=tf.cast(tf.reshape(self.inputY, [-1, 1]), dtype=tf.float32))
                elif config.numClasses > 1:
                    self.predictions = tf.argmax(self.logits, axis=-1, name="predictions")
                    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.inputY)

                loss = tf.reduce_mean(losses)

        # Compute the adversarial loss on the perturbed embeddings
        with tf.name_scope("perturLoss"):
            with tf.variable_scope("Bi-LSTM", reuse=True):
                perturWordEmbedding = self._addPerturbation(self.embeddedWords, loss)
                perturPredictions = self._Bi_LSTMAttention(perturWordEmbedding)
                perturLosses = tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=perturPredictions,
                    labels=tf.cast(tf.reshape(self.inputY, [-1, 1]), dtype=tf.float32))
                perturLoss = tf.reduce_mean(perturLosses)

        self.loss = loss + perturLoss

    def _Bi_LSTMAttention(self, embeddedWords):
        """
        Bi-LSTM + Attention model structure
        """
        config = self.config

        # Define the bidirectional LSTM structure
        with tf.name_scope("Bi-LSTM"):
            # Forward LSTM cell
            lstmFwCell = tf.nn.rnn_cell.DropoutWrapper(
                tf.nn.rnn_cell.LSTMCell(num_units=config.model.hiddenSizes, state_is_tuple=True),
                output_keep_prob=self.dropoutKeepProb)
            # Backward LSTM cell
            lstmBwCell = tf.nn.rnn_cell.DropoutWrapper(
                tf.nn.rnn_cell.LSTMCell(num_units=config.model.hiddenSizes, state_is_tuple=True),
                output_keep_prob=self.dropoutKeepProb)

            # Use dynamic rnn, which supports variable sequence lengths; with no length given, the full sequence is used
            # outputs is a tuple (output_fw, output_bw); both have shape [batch_size, max_time, hidden_size],
            # and fw and bw use the same hidden_size
            # self.current_state is the final state, a tuple (state_fw, state_bw); state_fw = [batch_size, s],
            # where s is a tuple (h, c)
            # Note: use the embeddings passed in, so that the perturbed embeddings actually take effect
            outputs, self.current_state = tf.nn.bidirectional_dynamic_rnn(lstmFwCell, lstmBwCell,
                                                                          embeddedWords,
                                                                          dtype=tf.float32, scope="bi-lstm")

        # As in the Bi-LSTM + Attention paper, sum the forward and backward outputs
        with tf.name_scope("Attention"):
            H = outputs[0] + outputs[1]

            # Get the attention output
            output = self._attention(H)
            outputSize = config.model.hiddenSizes

        # Output of the fully connected layer
        with tf.name_scope("output"):
            outputW = tf.get_variable(
                "outputW",
                shape=[outputSize, config.numClasses],
                initializer=tf.contrib.layers.xavier_initializer())

            outputB = tf.Variable(tf.constant(0.1, shape=[config.numClasses]), name="outputB")
            predictions = tf.nn.xw_plus_b(output, outputW, outputB, name="predictions")

        return predictions

    def _attention(self, H):
        """
        Use the attention mechanism to obtain the sentence representation
        """
        # Number of units in the last LSTM layer
        hiddenSize = config.model.hiddenSizes

        # Initialize a trainable weight vector
        W = tf.Variable(tf.random_normal([hiddenSize], stddev=0.1))

        # Apply a nonlinear transformation to the Bi-LSTM output
        M = tf.tanh(H)

        # Multiply M and W; M = [batch_size, time_step, hidden_size] is reshaped to
        # [batch_size * time_step, hidden_size] before the matmul.
        # newM = [batch_size, time_step, 1]: each time step is collapsed from a vector to a single number
        newM = tf.matmul(tf.reshape(M, [-1, hiddenSize]), tf.reshape(W, [-1, 1]))

        # Reshape newM to [batch_size, time_step]
        restoreM = tf.reshape(newM, [-1, config.sequenceLength])

        # Normalize with softmax, [batch_size, time_step]
        self.alpha = tf.nn.softmax(restoreM)

        # Use alpha to compute a weighted sum of H, directly with matrix operations
        r = tf.matmul(tf.transpose(H, [0, 2, 1]), tf.reshape(self.alpha, [-1, config.sequenceLength, 1]))

        # Squeeze the 3-D tensor to 2-D, sequeezeR = [batch_size, hidden_size]
        sequeezeR = tf.squeeze(r)

        sentenceRepren = tf.tanh(sequeezeR)

        # Apply dropout to the attention output
        output = tf.nn.dropout(sentenceRepren, self.dropoutKeepProb)

        return output

    def _normalize(self, wordEmbedding, weights):
        """
        Standardize the word embeddings using the frequency weights
        """
        mean = tf.matmul(weights, wordEmbedding)
        print(mean)
        powWordEmbedding = tf.pow(wordEmbedding - mean, 2.)

        var = tf.matmul(weights, powWordEmbedding)
        print(var)
        stddev = tf.sqrt(1e-6 + var)

        return (wordEmbedding - mean) / stddev

    def _addPerturbation(self, embedded, loss):
        """
        Add the adversarial perturbation to the word embeddings
        """
        grad, = tf.gradients(
            loss,
            embedded,
            aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)
        grad = tf.stop_gradient(grad)
        perturb = self._scaleL2(grad, self.config.model.epsilon)
        return embedded + perturb

    def _scaleL2(self, x, norm_length):
        # shape(x) = (batch, num_timesteps, d)
        # Divide x by max(abs(x)) for a numerically stable L2 norm.
        # 2norm(x) = a * 2norm(x/a)
        # Scale over the full sequence, dims (1, 2)
        alpha = tf.reduce_max(tf.abs(x), (1, 2), keepdims=True) + 1e-12
        l2_norm = alpha * tf.sqrt(
            tf.reduce_sum(tf.pow(x / alpha, 2), (1, 2), keepdims=True) + 1e-6)
        x_unit = x / l2_norm
        return norm_length * x_unit
```
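The _normalize step may look unusual: following the paper, the embedding matrix is standardized with a frequency-weighted mean and variance, so that frequent words do not dominate the scale of the perturbation. A small numpy sketch of the same computation (the vocabulary size and frequencies below are made up for illustration):

```python
import numpy as np

vocabSize, embeddingSize = 6, 4
wordEmbedding = np.random.randn(vocabSize, embeddingSize)
indexFreqs = np.array([20000, 10000, 300, 150, 80, 5], dtype=np.float32)

weights = (indexFreqs / indexFreqs.sum()).reshape(1, vocabSize)  # [1, vocab]

mean = weights @ wordEmbedding               # [1, embedding] frequency-weighted mean
var = weights @ (wordEmbedding - mean) ** 2  # [1, embedding] frequency-weighted variance
normEmbedding = (wordEmbedding - mean) / np.sqrt(var + 1e-6)
```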
8 Defining the metric functions
""" 定義各種性能指標 """ def mean(item: list) -> float: """ 計算列表中元素的平均值 :param item: 列表對象 :return: """ res = sum(item) / len(item) if len(item) > 0 else 0 return res def accuracy(pred_y, true_y): """ 計算二類和多類的準確率 :param pred_y: 預測結果 :param true_y: 真實結果 :return: """ if isinstance(pred_y[0], list): pred_y = [item[0] for item in pred_y] corr = 0 for i in range(len(pred_y)): if pred_y[i] == true_y[i]: corr += 1 acc = corr / len(pred_y) if len(pred_y) > 0 else 0 return acc def binary_precision(pred_y, true_y, positive=1): """ 二類的精確率計算 :param pred_y: 預測結果 :param true_y: 真實結果 :param positive: 正例的索引表示 :return: """ corr = 0 pred_corr = 0 for i in range(len(pred_y)): if pred_y[i] == positive: pred_corr += 1 if pred_y[i] == true_y[i]: corr += 1 prec = corr / pred_corr if pred_corr > 0 else 0 return prec def binary_recall(pred_y, true_y, positive=1): """ 二類的召回率 :param pred_y: 預測結果 :param true_y: 真實結果 :param positive: 正例的索引表示 :return: """ corr = 0 true_corr = 0 for i in range(len(pred_y)): if true_y[i] == positive: true_corr += 1 if pred_y[i] == true_y[i]: corr += 1 rec = corr / true_corr if true_corr > 0 else 0 return rec def binary_f_beta(pred_y, true_y, beta=1.0, positive=1): """ 二類的f beta值 :param pred_y: 預測結果 :param true_y: 真實結果 :param beta: beta值 :param positive: 正例的索引表示 :return: """ precision = binary_precision(pred_y, true_y, positive) recall = binary_recall(pred_y, true_y, positive) try: f_b = (1 + beta * beta) * precision * recall / (beta * beta * precision + recall) except: f_b = 0 return f_b def multi_precision(pred_y, true_y, labels): """ 多類的精確率 :param pred_y: 預測結果 :param true_y: 真實結果 :param labels: 標籤列表 :return: """ if isinstance(pred_y[0], list): pred_y = [item[0] for item in pred_y] precisions = [binary_precision(pred_y, true_y, label) for label in labels] prec = mean(precisions) return prec def multi_recall(pred_y, true_y, labels): """ 多類的召回率 :param pred_y: 預測結果 :param true_y: 真實結果 :param labels: 標籤列表 :return: """ if isinstance(pred_y[0], list): pred_y = [item[0] for item in pred_y] recalls = [binary_recall(pred_y, true_y, label) for label in labels] rec = mean(recalls) return rec def multi_f_beta(pred_y, true_y, labels, beta=1.0): """ 多類的f beta值 :param pred_y: 預測結果 :param true_y: 真實結果 :param labels: 標籤列表 :param beta: beta值 :return: """ if isinstance(pred_y[0], list): pred_y = [item[0] for item in pred_y] f_betas = [binary_f_beta(pred_y, true_y, beta, label) for label in labels] f_beta = mean(f_betas) return f_beta def get_binary_metrics(pred_y, true_y, f_beta=1.0): """ 獲得二分類的性能指標 :param pred_y: :param true_y: :param f_beta: :return: """ acc = accuracy(pred_y, true_y) recall = binary_recall(pred_y, true_y) precision = binary_precision(pred_y, true_y) f_beta = binary_f_beta(pred_y, true_y, f_beta) return acc, recall, precision, f_beta def get_multi_metrics(pred_y, true_y, labels, f_beta=1.0): """ 獲得多分類的性能指標 :param pred_y: :param true_y: :param labels: :param f_beta: :return: """ acc = accuracy(pred_y, true_y) recall = multi_recall(pred_y, true_y, labels) precision = multi_precision(pred_y, true_y, labels) f_beta = multi_f_beta(pred_y, true_y, labels, f_beta) return acc, recall, precision, f_beta
9 Training the model
During training we write tensorBoard summaries and save the model in two different ways.
```python
# Train the model

# Training and evaluation data
trainReviews = data.trainReviews
trainLabels = data.trainLabels
evalReviews = data.evalReviews
evalLabels = data.evalLabels

wordEmbedding = data.wordEmbedding
indexFreqs = data.indexFreqs
labelList = data.labelList

# Define the computation graph
with tf.Graph().as_default():

    session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    session_conf.gpu_options.allow_growth = True
    session_conf.gpu_options.per_process_gpu_memory_fraction = 0.9  # limit gpu memory usage

    sess = tf.Session(config=session_conf)

    # Define the session
    with sess.as_default():
        lstm = AdversarialLSTM(config, wordEmbedding, indexFreqs)

        globalStep = tf.Variable(0, name="globalStep", trainable=False)
        # Define the optimizer with the learning rate
        optimizer = tf.train.AdamOptimizer(config.training.learningRate)
        # Compute gradients, obtaining (gradient, variable) pairs
        gradsAndVars = optimizer.compute_gradients(lstm.loss)
        # Apply the gradients to the variables, producing the train op
        trainOp = optimizer.apply_gradients(gradsAndVars, global_step=globalStep)

        # Write summaries for tensorBoard
        gradSummaries = []
        for g, v in gradsAndVars:
            if g is not None:
                tf.summary.histogram("{}/grad/hist".format(v.name), g)
                tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))

        outDir = os.path.abspath(os.path.join(os.path.curdir, "summarys"))
        print("Writing to {}\n".format(outDir))

        lossSummary = tf.summary.scalar("loss", lstm.loss)
        summaryOp = tf.summary.merge_all()

        trainSummaryDir = os.path.join(outDir, "train")
        trainSummaryWriter = tf.summary.FileWriter(trainSummaryDir, sess.graph)

        evalSummaryDir = os.path.join(outDir, "eval")
        evalSummaryWriter = tf.summary.FileWriter(evalSummaryDir, sess.graph)

        # Saver for checkpoint files
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

        # One way of saving the model: as a SavedModel pb file
        savedModelPath = "../model/adversarialLSTM/savedModel"
        if os.path.exists(savedModelPath):
            os.rmdir(savedModelPath)
        builder = tf.saved_model.builder.SavedModelBuilder(savedModelPath)

        # Initialize all variables
        sess.run(tf.global_variables_initializer())

        def trainStep(batchX, batchY):
            """
            Training step
            """
            feed_dict = {
                lstm.inputX: batchX,
                lstm.inputY: batchY,
                lstm.dropoutKeepProb: config.model.dropoutKeepProb
            }
            _, summary, step, loss, predictions = sess.run(
                [trainOp, summaryOp, globalStep, lstm.loss, lstm.predictions],
                feed_dict)

            if config.numClasses == 1:
                acc, recall, prec, f_beta = get_binary_metrics(pred_y=predictions, true_y=batchY)
            elif config.numClasses > 1:
                acc, recall, prec, f_beta = get_multi_metrics(pred_y=predictions, true_y=batchY, labels=labelList)

            trainSummaryWriter.add_summary(summary, step)

            return loss, acc, prec, recall, f_beta

        def devStep(batchX, batchY):
            """
            Evaluation step
            """
            feed_dict = {
                lstm.inputX: batchX,
                lstm.inputY: batchY,
                lstm.dropoutKeepProb: 1.0
            }
            summary, step, loss, predictions = sess.run(
                [summaryOp, globalStep, lstm.loss, lstm.predictions],
                feed_dict)

            if config.numClasses == 1:
                acc, recall, prec, f_beta = get_binary_metrics(pred_y=predictions, true_y=batchY)
            elif config.numClasses > 1:
                acc, recall, prec, f_beta = get_multi_metrics(pred_y=predictions, true_y=batchY, labels=labelList)

            evalSummaryWriter.add_summary(summary, step)

            return loss, acc, prec, recall, f_beta

        for i in range(config.training.epoches):
            # Train the model
            print("start training model")
            for batchTrain in nextBatch(trainReviews, trainLabels, config.batchSize):
                loss, acc, prec, recall, f_beta = trainStep(batchTrain[0], batchTrain[1])

                currentStep = tf.train.global_step(sess, globalStep)
                print("train: step: {}, loss: {}, acc: {}, recall: {}, precision: {}, f_beta: {}".format(
                    currentStep, loss, acc, recall, prec, f_beta))

                if currentStep % config.training.evaluateEvery == 0:
                    print("\nEvaluation:")

                    losses = []
                    accs = []
                    f_betas = []
                    precisions = []
                    recalls = []

                    for batchEval in nextBatch(evalReviews, evalLabels, config.batchSize):
                        loss, acc, precision, recall, f_beta = devStep(batchEval[0], batchEval[1])
                        losses.append(loss)
                        accs.append(acc)
                        f_betas.append(f_beta)
                        precisions.append(precision)
                        recalls.append(recall)

                    time_str = datetime.datetime.now().isoformat()
                    print("{}, step: {}, loss: {}, acc: {}, precision: {}, recall: {}, f_beta: {}".format(
                        time_str, currentStep, mean(losses), mean(accs), mean(precisions), mean(recalls),
                        mean(f_betas)))

                if currentStep % config.training.checkpointEvery == 0:
                    # The other way of saving the model: checkpoint files
                    path = saver.save(sess, "../model/adversarialLSTM/model/my-model", global_step=currentStep)
                    print("Saved model checkpoint to {}\n".format(path))

        inputs = {"inputX": tf.saved_model.utils.build_tensor_info(lstm.inputX),
                  "keepProb": tf.saved_model.utils.build_tensor_info(lstm.dropoutKeepProb)}
        outputs = {"predictions": tf.saved_model.utils.build_tensor_info(lstm.predictions)}

        prediction_signature = tf.saved_model.signature_def_utils.build_signature_def(
            inputs=inputs, outputs=outputs,
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
        legacy_init_op = tf.group(tf.tables_initializer(), name="legacy_init_op")
        builder.add_meta_graph_and_variables(sess, [tf.saved_model.tag_constants.SERVING],
                                             signature_def_map={"predict": prediction_signature},
                                             legacy_init_op=legacy_init_op)

        builder.save()
```
10 Prediction code
x = "this movie is full of references like mad max ii the wild one and many others the ladybug´s face it´s a clear reference or tribute to peter lorre this movie is a masterpiece we´ll talk much more about in the future" # 注:下面兩個詞典要保證和當前加載的模型對應的詞典是一致的 with open("../data/wordJson/word2idx.json", "r", encoding="utf-8") as f: word2idx = json.load(f) with open("../data/wordJson/label2idx.json", "r", encoding="utf-8") as f: label2idx = json.load(f) idx2label = {value: key for key, value in label2idx.items()} xIds = [word2idx.get(item, word2idx["UNK"]) for item in x.split(" ")] if len(xIds) >= config.sequenceLength: xIds = xIds[:config.sequenceLength] else: xIds = xIds + [word2idx["PAD"]] * (config.sequenceLength - len(xIds)) graph = tf.Graph() with graph.as_default(): gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333) session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False, gpu_options=gpu_options) sess = tf.Session(config=session_conf) with sess.as_default(): checkpoint_file = tf.train.latest_checkpoint("../model/adversarialLSTM/model/") saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) saver.restore(sess, checkpoint_file) # 得到須要餵給模型的參數,輸出的結果依賴的輸入值 inputX = graph.get_operation_by_name("inputX").outputs[0] dropoutKeepProb = graph.get_operation_by_name("dropoutKeepProb").outputs[0] # 得到輸出的結果 predictions = graph.get_tensor_by_name("output/predictions:0") pred = sess.run(predictions, feed_dict={inputX: [xIds], dropoutKeepProb: 1.0})[0] pred = [idx2label[item] for item in pred] print(pred)