The source code for this chapter has been debugged and runs end to end. It is recorded below first, to be worked through and understood in Python step by step.
#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd
import numpy as np
import pickle
import keras
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Activation, Dropout, Embedding, Reshape, Dot, Concatenate, Multiply
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences
from keras.models import model_from_json
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

plt.rcParams['figure.figsize'] = (20, 10)

# Read in the data

# In[2]:

# The data file can be downloaded from: http://dataset.cs.mcgill.ca/ubuntu-corpus-1.0/ubuntu_blobs.tgz
with open("dataset.pkl", "rb") as f:
    data = pickle.load(f)

# In[3]:

print("size ======= %s" % len(data))

# In[4]:

import gc
gc.collect()

# Take a look at what is in the data

# In[5]:

for j in range(len(data)):
    print("======= %s" % j)
    for i, k in enumerate(data[j]):
        print(k)

# Find the length of the longest sentence

# In[6]:

length = map(len, data[0]['c'])
res = list(length)
context_length = np.max(res[:])
print(context_length)

length = map(len, data[0]['r'])
res = list(length)
response_length = np.max(res[:])
print(response_length)

# Find the size of the whole vocabulary

# In[7]:

context_size = np.max(list(map(lambda x: max(x) if len(x) > 0 else 0, data[0]['c'])))
print(context_size)
response_size = max(list(map(lambda x: max(x) if len(x) > 0 else 0, data[0]['r'])))
print(response_size)

# In[8]:

max(data[0]['r'][1])

# In[9]:

embedding_dim = 64
lstm_dim = 64

context_length = np.max(list(map(len, data[0]['c'])))
#print(context_length)
response_length = np.max(list(map(len, data[0]['r'])))
#print(response_length)

Y = data[0]['r']
print('Begin Modeling...')

context_size = np.max(list(map(lambda x: max(x) if len(x) > 0 else 0, data[0]['c'])))
response_size = max(list(map(lambda x: max(x) if len(x) > 0 else 0, data[0]['r'])))
volcabulary_size = max(context_size, response_size)

context_length = 120
# Embed and encode the context side
context = Input(shape=(context_length,), name='context_input')
context_embedded = Embedding(input_length=context_length,
                             output_dim=embedding_dim,
                             input_dim=volcabulary_size + 1)(context)  # +1 so the largest word id is a valid index
context_lstm = LSTM(lstm_dim)(context_embedded)

# Embed and encode the response side
response_length = 120
response = Input(shape=(response_length,), name='response_input')
response_embedded = Embedding(input_length=response_length,
                              output_dim=embedding_dim,
                              input_dim=volcabulary_size + 1)(response)
response_lstm = LSTM(lstm_dim)(response_embedded)
#print(response_lstm.outputs)

# Score each (context, response) pair by the dot product of the two LSTM encodings
x = Dot(axes=[1, 1])([context_lstm, response_lstm])
#x = Multiply()([context_lstm, response_lstm])
yhat = Dense(2, activation='softmax')(x)

model = Model(inputs=[context, response], outputs=yhat)
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
print('Finish compiling...')
model.summary()
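# Before wiring up the real data, it is worth a quick sanity check that the two-input
# model accepts batches of the expected shape. This is a minimal sketch added for
# illustration (not part of the original notebook): it feeds random word ids through
# the model defined above and confirms that the output is one softmax pair per sample.

dummy_batch = 4
dummy_context = np.random.randint(1, volcabulary_size + 1, size=(dummy_batch, context_length))
dummy_response = np.random.randint(1, volcabulary_size + 1, size=(dummy_batch, response_length))
dummy_scores = model.predict([dummy_context, dummy_response])
print(dummy_scores.shape)  # expected: (4, 2)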
# In[10]:

# A generator customized for this dataset. In general the three parts should be
# separated first, and the generator written on top of that.
def data_gen(data, batch_size=100):
    contextRaw = data['c']
    responseRaw = data['r']
    yRaw = data['y']

    number_of_batches = len(contextRaw) // batch_size
    counter = 0

    context_length = np.max(list(map(len, contextRaw))) // 3
    response_length = np.max(list(map(len, responseRaw))) // 3
    context_length = 120
    response_length = 120

    while 1:
        lowerBound = batch_size * counter
        upperBound = batch_size * (counter + 1)

        Ctemp = contextRaw[lowerBound:upperBound]
        C_batch = pad_sequences(Ctemp, maxlen=context_length, padding='post')
        C_res = np.zeros((batch_size, context_length), dtype=int)

        Rtemp = responseRaw[lowerBound:upperBound]
        R_batch = pad_sequences(Rtemp, maxlen=response_length, padding='post')
        R_res = np.zeros((batch_size, response_length), dtype=int)

        for k in np.arange(batch_size):
            C_res[k, :] = C_batch[k, :]
            R_res[k, :] = R_batch[k, :]
        y_res = keras.utils.to_categorical(yRaw[lowerBound:upperBound])

        counter += 1
        yield ([C_res.astype('float32'), R_res.astype('float32')], y_res.astype('float32'))
        # Start over from the first batch once the whole dataset has been consumed
        if counter >= number_of_batches:
            counter = 0

# Now train the model. On a GTX 1060 with 6 GB of GPU memory, the mini-batch size
# cannot exceed 200. Readers with more time can try more epochs and see the effect.

# In[11]:

#Y = keras.utils.to_categorical(data[0]['y'], num_classes=2)
batch_size = 168
model.fit_generator(data_gen(data[0], batch_size=batch_size),
                    steps_per_epoch=len(data[0]['c']) // batch_size,
                    validation_data=data_gen(data[1], batch_size=batch_size),
                    validation_steps=len(data[1]['c']) // batch_size,
                    epochs=1)

# Next we save the model to disk. We could also use the checkpoint option during
# fitting to save the result of every step to its own file on disk.

# In[12]:

# Save the model architecture as JSON
model_json = model.to_json()
with open("dual_lstm_model.json", "w") as json_file:
    json_file.write(model_json)

# Save the fitted weights to an HDF5 file
model.save_weights("dual_lstm_model.h5")
print("Model has been written to disk")

# In[13]:

# To reuse an existing model, proceed as follows.
# Load the model architecture from disk
json_file = open('dual_lstm_model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)
# Load the model weights from disk
model.load_weights("dual_lstm_model.h5")
print("Model loaded")

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
print('Model compiled...')

# Now make predictions. The input data should be organized exactly as in the data
# generator above, so we can reuse the existing generator through the
# predict_generator method, only applied to the test set instead of the training set.

# In[14]:

batch_size = 256
ypred = model.predict_generator(data_gen(data[2], batch_size=batch_size),
                                steps=(len(data[2]['c']) // batch_size),
                                verbose=1)

# In[15]:

yTest = data[2]['y']  # labels of the test set, which is what was predicted above
# Collapse the two softmax columns into a 0/1 label (equivalent to an argmax over axis 1)
ypred2 = (2 - (ypred[:, 0] > ypred[:, 1])) - 1
z = [str(ypred2[i]) == yTest[i] for i in range(len(ypred2))]
np.mean(z)
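# The note before cell In[12] mentions that a checkpoint option can also be used during
# fitting to write each epoch's result to its own file. A minimal sketch of that option,
# assuming the same model and data_gen as above; the filename pattern and epoch count
# below are illustrative and not taken from the original notebook.

from keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint("dual_lstm_weights.{epoch:02d}.h5",  # one weights file per epoch
                             monitor='val_loss',
                             save_weights_only=True)

batch_size = 168
model.fit_generator(data_gen(data[0], batch_size=batch_size),
                    steps_per_epoch=len(data[0]['c']) // batch_size,
                    validation_data=data_gen(data[1], batch_size=batch_size),
                    validation_steps=len(data[1]['c']) // batch_size,
                    epochs=3,
                    callbacks=[checkpoint])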