In the previous post we looked at classifying spam email with a support vector machine, reaching an AUC of 73.3%. This post continues the series and shows how to implement email classification with PaddlePaddle, applying deep learning to text classification.
Building a network model with PaddlePaddle is actually quite simple. First you need to understand the format PaddlePaddle expects for input data, then how to construct the network, and finally how to train it. For input-data preprocessing, see my earlier post 【深度學習系列】PaddlePaddle之數據預處理. We will start with a shallow neural network.
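To make the input convention concrete, here is a minimal sketch of PaddlePaddle v2's reader contract (the name `fake_reader` and the toy data are purely illustrative, not from this project): a reader is a zero-argument function that yields one sample per iteration, with fields matching the network's data layers.

```python
# Illustrative only: a reader yields one (features, label) tuple per sample.
def fake_reader():
    for i in range(4):
        yield [0.0, 1.0, 2.0], i % 2  # dense feature vector, integer label

for sample in fake_reader():
    print sample
```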
In PaddlePaddle we need to create a reader to read the data. In the previous post we already cleaned the raw data, with positive and negative samples in ham.txt and spam.txt respectively, so here we only need to load them.
Code:
```python
import numpy as np

# Load the data
def loadfile():
    # Load positive samples
    fopen = open('ham.txt', 'r')
    pos = []
    for line in fopen:
        pos.append(line)
    # Load negative samples
    fopen = open('spam.txt', 'r')
    neg = []
    for line in fopen:
        neg.append(line)
    combined = np.concatenate((pos, neg))
    # Create the labels: 1 for positive (ham), 0 for negative (spam)
    y = np.concatenate((np.ones(len(pos), dtype=int),
                        np.zeros(len(neg), dtype=int)))
    return combined, y

# Create the reader that feeds data to PaddlePaddle
def reader_creator(dataset, label):
    def reader():
        for i in xrange(len(dataset)):
            yield dataset[i, :], int(label[i])
    return reader
```
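Once built, the reader is wrapped for shuffling and batching before being handed to the trainer; this is the same pattern that appears in the training code further below:

```python
# x_train / y_train come from the train/validation split made later on
train_reader = paddle.batch(
    paddle.reader.shuffle(reader_creator(x_train, y_train), buf_size=20),
    batch_size=4)
```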
Build the word index:
```python
from gensim.corpora.dictionary import Dictionary
from keras.preprocessing import sequence

# Build the word dictionary and return each word's index, its word vector,
# and the index sequence for each sentence
def create_dictionaries(model=None, combined=None):
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # Indices of all words with frequency above 10
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # Word vectors of all words with frequency above 10
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(combined):
            '''Words become integers'''
            data = []
            for sentence in combined:
                new_txt = []
                sentences = sentence.split(' ')
                for word in sentences:
                    try:
                        word = unicode(word, errors='ignore')
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # Index sequence for each sentence; words with frequency below 10 get index 0
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print 'No data provided...'
```
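For intuition, pad_sequences left-pads every index sequence with zeros up to a fixed length; a toy example (the index values here are made up):

```python
from keras.preprocessing import sequence

data = [[4, 9, 2], [7]]
print sequence.pad_sequences(data, maxlen=5)
# [[0 0 4 9 2]
#  [0 0 0 0 7]]
```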
這裏咱們採起sklearn的train_test_split函數對數據集進行劃分,訓練集和驗證集的比例爲4:1。
Code:
```python
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split

# Load the trained word2vec model
def word2vec_train(combined):
    model = Word2Vec.load('lstm_data/model/Word2vec_model.pkl')
    index_dict, word_vectors, combined = create_dictionaries(model=model, combined=combined)
    return index_dict, word_vectors, combined

# Build the training and validation sets
def get_data(index_dict, word_vectors, combined, y):
    # Total number of word indices; words with frequency below 10 share index 0, hence the +1
    n_symbols = len(index_dict) + 1
    # The word at index 0 gets an all-zero vector
    embedding_weights = np.zeros((n_symbols, vocab_dim))
    # Starting from index 1, map each word to its word vector
    for word, index in index_dict.items():
        embedding_weights[index, :] = word_vectors[word]
    # test_size=0.2 gives the 4:1 train/validation split described above
    x_train, x_val, y_train, y_val = train_test_split(combined, y, test_size=0.2)
    print x_train.shape, y_train.shape
    return n_symbols, embedding_weights, x_train, y_train, x_val, y_val
```
Next we define the network model and the trainer:

```python
import paddle.v2 as paddle

with_gpu = False  # set to True to train on GPU

class NeuralNetwork(object):
    def __init__(self, X_train, Y_train, X_val, Y_val, vocab_dim, n_symbols, num_classes=2):
        paddle.init(use_gpu=with_gpu, trainer_count=1)
        self.X_train = X_train
        self.Y_train = Y_train
        self.X_val = X_val
        self.Y_val = Y_val
        self.vocab_dim = vocab_dim
        self.n_symbols = n_symbols
        self.num_classes = num_classes

    # Define the network model
    def get_network(self):
        # Classification model: two fully connected layers plus a Softmax output
        x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(self.vocab_dim))
        y = paddle.layer.data(name='y', type=paddle.data_type.integer_value(self.num_classes))
        fc1 = paddle.layer.fc(input=x, size=1280, act=paddle.activation.Linear())
        fc2 = paddle.layer.fc(input=fc1, size=640, act=paddle.activation.Relu())
        prob = paddle.layer.fc(input=fc2, size=self.num_classes, act=paddle.activation.Softmax())
        # classification_cost (cross-entropy) also adds the
        # classification_error_evaluator seen in the training log
        cost = paddle.layer.classification_cost(input=prob, label=y)
        return cost

    # Define the trainer
    def get_trainer(self):
        cost = self.get_network()
        # Create the parameters
        parameters = paddle.parameters.create(cost)
        # Candidate optimizers; only the Adam optimizer below is actually used
        optimizer0 = paddle.optimizer.Momentum(
            momentum=0.9,
            regularization=paddle.optimizer.L2Regularization(rate=0.0002 * 128),
            learning_rate=0.01 / 128.0,
            learning_rate_decay_a=0.01,
            learning_rate_decay_b=50000 * 100)
        optimizer1 = paddle.optimizer.Momentum(
            momentum=0.9,
            regularization=paddle.optimizer.L2Regularization(rate=0.0002 * 128),
            learning_rate=0.001,
            learning_rate_schedule="pass_manual",
            learning_rate_args="1:1.0, 8:0.1, 13:0.01")
        optimizer = paddle.optimizer.Adam(
            learning_rate=2e-3,
            regularization=paddle.optimizer.L2Regularization(rate=8e-4),
            model_average=paddle.optimizer.ModelAverage(average_window=0.5))
        # Create the trainer
        trainer = paddle.trainer.SGD(
            cost=cost,
            parameters=parameters,
            update_equation=optimizer)
        return parameters, trainer

    # Start training
    def start_trainer(self, X_train, Y_train, X_val, Y_val):
        parameters, trainer = self.get_trainer()
        result_lists = []

        def event_handler(event):
            if isinstance(event, paddle.event.EndIteration):
                if event.batch_id % 100 == 0:
                    print "\nPass %d, Batch %d, Cost %f, %s" % (
                        event.pass_id, event.batch_id, event.cost, event.metrics)
            if isinstance(event, paddle.event.EndPass):
                # Save the trained parameters
                with open('params_pass_%d.tar' % event.pass_id, 'w') as f:
                    parameters.to_tar(f)
                # Evaluate on the validation set
                result = trainer.test(reader=val_reader)
                print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
                result_lists.append((event.pass_id, result.cost,
                                     result.metrics['classification_error_evaluator']))

        # Start training
        train_reader = paddle.batch(paddle.reader.shuffle(
            reader_creator(X_train, Y_train), buf_size=20), batch_size=4)
        val_reader = paddle.batch(paddle.reader.shuffle(
            reader_creator(X_val, Y_val), buf_size=20), batch_size=4)
        trainer.train(reader=train_reader, num_passes=5, event_handler=event_handler)

        # Find the pass with the lowest test cost
        best = sorted(result_lists, key=lambda item: float(item[1]))[0]
        print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1])
        print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100)
```
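Each pass's parameters are saved to params_pass_N.tar, so a trained model can be reloaded for prediction later. A minimal inference sketch (not from the original post; it assumes the `prob` layer from get_network() and the saved file name above):

```python
# Hypothetical inference snippet: load the saved parameters of the best pass
# and predict class probabilities for the validation samples.
with open('params_pass_4.tar', 'r') as f:
    parameters = paddle.parameters.Parameters.from_tar(f)

probs = paddle.infer(output_layer=prob,   # the Softmax layer from get_network()
                     parameters=parameters,
                     input=[(x,) for x in x_val])
```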
```python
# Train the model and save it
def train():
    print 'Loading Data...'
    combined, y = loadfile()
    print len(combined), len(y)

    print 'Tokenising...'
    # tokenizer is the jieba-based word segmentation helper from the previous post
    combined = tokenizer(combined)

    print 'Training a Word2vec model...'
    index_dict, word_vectors, combined = word2vec_train(combined)

    print 'Setting up Arrays for Keras Embedding Layer...'
    n_symbols, embedding_weights, x_train, y_train, x_val, y_val = get_data(
        index_dict, word_vectors, combined, y)
    print x_train.shape, y_train.shape

    network = NeuralNetwork(X_train=x_train, Y_train=y_train, X_val=x_val,
                            Y_val=y_val, vocab_dim=vocab_dim,
                            n_symbols=n_symbols, num_classes=2)
    network.start_trainer(x_train, y_train, x_val, y_val)

if __name__ == '__main__':
    train()
```
Training for 5 passes gives the following output:
```
Using TensorFlow backend.
Loading Data...
63000 63000
Tokenising...
Building prefix dict from the default dictionary ...
[DEBUG 2018-01-29 00:29:19,184 __init__.py:111] Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
[DEBUG 2018-01-29 00:29:19,185 __init__.py:131] Loading model from cache /tmp/jieba.cache
Loading model cost 0.253 seconds.
[DEBUG 2018-01-29 00:29:19,437 __init__.py:163] Loading model cost 0.253 seconds.
Prefix dict has been built succesfully.
[DEBUG 2018-01-29 00:29:19,437 __init__.py:164] Prefix dict has been built succesfully.
I0128 12:29:17.325337 16772 GradientMachine.cpp:101] Init parameters done.
Pass 0, Batch 0, Cost 0.519137, {'classification_error_evaluator': 0.25}
Pass 0, Batch 100, Cost 0.410812, {'classification_error_evaluator': 0}
Pass 0, Batch 200, Cost 0.486661, {'classification_error_evaluator': 0.25}
···
Pass 4, Batch 12200, Cost 0.508126, {'classification_error_evaluator': 0.25}
Pass 4, Batch 12300, Cost 0.312028, {'classification_error_evaluator': 0.25}
Pass 4, Batch 12400, Cost 0.259026, {'classification_error_evaluator': 0.0}
Pass 4, Batch 12500, Cost 0.177996, {'classification_error_evaluator': 0.25}

Test with Pass 4, {'classification_error_evaluator': 0.15238096714019775}
Best pass is 4, testing Avgcost is 0.716855627394
The classification accuracy is 84.76%
```
As we can see, after only 5 passes PaddlePaddle already reaches 84.76% accuracy; more passes would push the accuracy higher still.
This post showed how to classify spam email with PaddlePaddle, training a simple shallow neural network that reaches 84.76% accuracy after 5 passes. In practice you can increase the number of passes to improve accuracy, or switch to other models, such as a text CNN or an LSTM, for better results.
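As an illustration of that last point, a text-CNN variant could be sketched with PaddlePaddle v2's built-in sequence_conv_pool. This is not code from the post, and the sizes here (embedding dimension 128, context window 3) are illustrative:

```python
# Hypothetical text-CNN network for the same task: the input is the word-index
# sequence built by create_dictionaries(), not a dense vector.
seq = paddle.layer.data(name='x',
                        type=paddle.data_type.integer_value_sequence(n_symbols))
y = paddle.layer.data(name='y', type=paddle.data_type.integer_value(2))
emb = paddle.layer.embedding(input=seq, size=128)
conv = paddle.networks.sequence_conv_pool(input=emb,
                                          context_len=3,
                                          hidden_size=128)
prob = paddle.layer.fc(input=conv, size=2, act=paddle.activation.Softmax())
cost = paddle.layer.classification_cost(input=prob, label=y)
```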
This article first appeared on 景略集智 (Jizhi), which also produced it as the video series 「PaddlePaddle調戲郵件詐騙犯」. If anything is unclear, feel free to ask in the comments!