用tensorflow遷移學習貓狗分類

時間 2019-12-06

標籤 tensorflow 遷移學習分類简体版

原文鏈接

筆者這幾天在跟着莫煩學習TensorFlow，正好學到遷移學習（至於什麼是遷移學習，看這篇），莫煩老師做的是預測貓和老虎尺寸大小的學習。作爲一個有爲的學生，筆者當然不能再預測貓啊狗啊的大小啦，正好之前做過貓狗大戰數據集的圖像分類，做好的數據都還在，二話不說，開擼。

既然是VGG16模型，當然首先上模型代碼了：

  1 def conv_layers_simple_api(net_in):
  2     with tf.name_scope('preprocess'):
  3         # Notice that we include a preprocessing layer that takes the RGB image
  4         # with pixels values in the range of 0-255 and subtracts the mean image
  5         # values (calculated over the entire ImageNet training set).
  6         mean = tf.constant([123.68, 116.779, 103.939], dtype=tf.float32, shape=[1, 1, 1, 3], name='img_mean')
  7         net_in.outputs = net_in.outputs - mean
  8 
  9     # conv1
 10     network = Conv2d(net_in, n_filter=64, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 11                      name='conv1_1')
 12     network = Conv2d(network, n_filter=64, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 13                      name='conv1_2')
 14     network = MaxPool2d(network, filter_size=(2, 2), strides=(2, 2), padding='SAME', name='pool1')
 15 
 16     # conv2
 17     network = Conv2d(network, n_filter=128, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 18                      name='conv2_1')
 19     network = Conv2d(network, n_filter=128, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 20                      name='conv2_2')
 21     network = MaxPool2d(network, filter_size=(2, 2), strides=(2, 2), padding='SAME', name='pool2')
 22 
 23     # conv3
 24     network = Conv2d(network, n_filter=256, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 25                      name='conv3_1')
 26     network = Conv2d(network, n_filter=256, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 27                      name='conv3_2')
 28     network = Conv2d(network, n_filter=256, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 29                      name='conv3_3')
 30     network = MaxPool2d(network, filter_size=(2, 2), strides=(2, 2), padding='SAME', name='pool3')
 31 
 32     # conv4
 33     network = Conv2d(network, n_filter=512, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 34                      name='conv4_1')
 35     network = Conv2d(network, n_filter=512, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 36                      name='conv4_2')
 37     network = Conv2d(network, n_filter=512, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 38                      name='conv4_3')
 39     network = MaxPool2d(network, filter_size=(2, 2), strides=(2, 2), padding='SAME', name='pool4')
 40 
 41     # conv5
 42     network = Conv2d(network, n_filter=512, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 43                      name='conv5_1')
 44     network = Conv2d(network, n_filter=512, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 45                      name='conv5_2')
 46     network = Conv2d(network, n_filter=512, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 47                      name='conv5_3')
 48     network = MaxPool2d(network, filter_size=(2, 2), strides=(2, 2), padding='SAME', name='pool5')
 49     return network``
 50 def conv_layers_simple_api(net_in):
 51     with tf.name_scope('preprocess'):
 52         # Notice that we include a preprocessing layer that takes the RGB image
 53         # with pixels values in the range of 0-255 and subtracts the mean image
 54         # values (calculated over the entire ImageNet training set).
 55         mean = tf.constant([123.68, 116.779, 103.939], dtype=tf.float32, shape=[1, 1, 1, 3], name='img_mean')
 56         net_in.outputs = net_in.outputs - mean
 57 
 58     # conv1
 59     network = Conv2d(net_in, n_filter=64, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 60                      name='conv1_1')
 61     network = Conv2d(network, n_filter=64, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 62                      name='conv1_2')
 63     network = MaxPool2d(network, filter_size=(2, 2), strides=(2, 2), padding='SAME', name='pool1')
 64 
 65     # conv2
 66     network = Conv2d(network, n_filter=128, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 67                      name='conv2_1')
 68     network = Conv2d(network, n_filter=128, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 69                      name='conv2_2')
 70     network = MaxPool2d(network, filter_size=(2, 2), strides=(2, 2), padding='SAME', name='pool2')
 71 
 72     # conv3
 73     network = Conv2d(network, n_filter=256, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 74                      name='conv3_1')
 75     network = Conv2d(network, n_filter=256, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 76                      name='conv3_2')
 77     network = Conv2d(network, n_filter=256, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 78                      name='conv3_3')
 79     network = MaxPool2d(network, filter_size=(2, 2), strides=(2, 2), padding='SAME', name='pool3')
 80 
 81     # conv4
 82     network = Conv2d(network, n_filter=512, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 83                      name='conv4_1')
 84     network = Conv2d(network, n_filter=512, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 85                      name='conv4_2')
 86     network = Conv2d(network, n_filter=512, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 87                      name='conv4_3')
 88     network = MaxPool2d(network, filter_size=(2, 2), strides=(2, 2), padding='SAME', name='pool4')
 89 
 90     # conv5
 91     network = Conv2d(network, n_filter=512, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 92                      name='conv5_1')
 93     network = Conv2d(network, n_filter=512, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 94                      name='conv5_2')
 95     network = Conv2d(network, n_filter=512, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
 96                      name='conv5_3')
 97     network = MaxPool2d(network, filter_size=(2, 2), strides=(2, 2), padding='SAME', name='pool5')
 98     return network``
 99 def conv_layers_simple_api(net_in):
100     with tf.name_scope('preprocess'):
101         # Notice that we include a preprocessing layer that takes the RGB image
102         # with pixels values in the range of 0-255 and subtracts the mean image
103         # values (calculated over the entire ImageNet training set).
104         mean = tf.constant([123.68, 116.779, 103.939], dtype=tf.float32, shape=[1, 1, 1, 3], name='img_mean')
105         net_in.outputs = net_in.outputs - mean
106 
107     # conv1
108     network = Conv2d(net_in, n_filter=64, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
109                      name='conv1_1')
110     network = Conv2d(network, n_filter=64, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
111                      name='conv1_2')
112     network = MaxPool2d(network, filter_size=(2, 2), strides=(2, 2), padding='SAME', name='pool1')
113 
114     # conv2
115     network = Conv2d(network, n_filter=128, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
116                      name='conv2_1')
117     network = Conv2d(network, n_filter=128, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
118                      name='conv2_2')
119     network = MaxPool2d(network, filter_size=(2, 2), strides=(2, 2), padding='SAME', name='pool2')
120 
121     # conv3
122     network = Conv2d(network, n_filter=256, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
123                      name='conv3_1')
124     network = Conv2d(network, n_filter=256, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
125                      name='conv3_2')
126     network = Conv2d(network, n_filter=256, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
127                      name='conv3_3')
128     network = MaxPool2d(network, filter_size=(2, 2), strides=(2, 2), padding='SAME', name='pool3')
129 
130     # conv4
131     network = Conv2d(network, n_filter=512, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
132                      name='conv4_1')
133     network = Conv2d(network, n_filter=512, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
134                      name='conv4_2')
135     network = Conv2d(network, n_filter=512, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
136                      name='conv4_3')
137     network = MaxPool2d(network, filter_size=(2, 2), strides=(2, 2), padding='SAME', name='pool4')
138 
139     # conv5
140     network = Conv2d(network, n_filter=512, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
141                      name='conv5_1')
142     network = Conv2d(network, n_filter=512, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
143                      name='conv5_2')
144     network = Conv2d(network, n_filter=512, filter_size=(3, 3), strides=(1, 1), act=tf.nn.relu, padding='SAME',
145                      name='conv5_3')
146     network = MaxPool2d(network, filter_size=(2, 2), strides=(2, 2), padding='SAME', name='pool5')
147     return network

筆者偷懶直接用的是TensorLayer庫中的VGG16模型，至於什麼是TensorLayer請移步這裏

按照莫煩老師的教程，改寫最後的全連接層做二分類學習：

def fc_layers(net):
    """Attach the two-class classification head to the convolutional features.

    Args:
        net: layer produced by the convolutional stack.

    Returns:
        The final dense layer; its outputs are the two raw class scores
        (identity activation, no softmax applied here).
    """
    # Flatten the feature maps before the fully connected layers.
    flattened = FlattenLayer(net, name='flatten')
    # Counterpart in the tutorial being followed:
    #   tf.layers.dense(self.flatten, 256, tf.nn.relu, name='fc6')
    hidden = DenseLayer(flattened, n_units=256, act=tf.nn.relu, name='fc1_relu')
    # A second hidden layer is intentionally left disabled:
    #   DenseLayer(hidden, n_units=4096, act=tf.nn.relu, name='fc2_relu')
    # Counterpart of the output layer in the tutorial being followed:
    #   self.out = tf.layers.dense(self.fc6, 1, name='out')
    scores = DenseLayer(hidden, n_units=2, act=tf.identity, name='fc3_relu')
    return scores

定義輸入輸出、損失函數以及訓練步驟：

 1 # 輸入
 2 x = tf.placeholder(tf.float32, [None, 224, 224, 3])
 3 # 輸出
 4 y_ = tf.placeholder(tf.int32, shape=[None, ], name='y_')
 5 net_in = InputLayer(x, name='input')
 6 # net_cnn = conv_layers(net_in)               # professional CNN APIs
 7 net_cnn = conv_layers_simple_api(net_in)  # simplified CNN APIs
 8 network = fc_layers(net_cnn)
 9 y = network.outputs
10 # probs = tf.nn.softmax(y)
11 y_op = tf.argmax(tf.nn.softmax(y), 1)
12 cost = tl.cost.cross_entropy(y, y_, name='cost')
13 correct_prediction = tf.equal(tf.cast(tf.argmax(y, 1), tf.float32), tf.cast(y_, tf.float32))
14 acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
15 # 定義 optimizer
16 train_params = network.all_params[26:]
17 # print(train_params)
18 global_step = tf.Variable(0)
19 # --------------學習速率的設置（學習速率呈指數降低）--------------------- #將 global_step/decay_steps 強制轉換爲整數
20 # learning_rate = tf.train.exponential_decay(1e-2, global_step, decay_steps=1000, decay_rate=0.98, staircase=True)
21 train_op = tf.train.AdamOptimizer(learning_rate=0.0001, beta1=0.9, beta2=0.999,
22                                   epsilon=1e-08, use_locking=False).minimize(cost, var_list=train_params)

讀取訓練、驗證數據，加載模型參數：

 1 img, label = read_and_decode("F:\\001-python\\train.tfrecords")
 2 img_v, label_v = read_and_decode("F:\\001-python\\val.tfrecords")
 3 # 使用shuffle_batch能夠隨機打亂輸入
 4 X_train, y_train = tf.train.shuffle_batch([img, label],
 5                                           batch_size=30, capacity=400,
 6                                           min_after_dequeue=300)
 7 X_Val, y_val = tf.train.shuffle_batch([img_v, label_v],
 8                                       batch_size=30, capacity=400,
 9                                       min_after_dequeue=300)
10 tl.layers.initialize_global_variables(sess)
11 network.print_params()
12 network.print_layers()
13 npz = np.load('vgg16_weights.npz')
14 params = []
15 for val in sorted(npz.items())[0:25]:
16     # print("  Loading %s" % str(val[1].shape))
17     params.append(val[1])
18 加載預訓練的參數
19 tl.files.assign_params(sess, params, network)

加載好以後，開始訓練，200個epoch：

 1 for epoch in range(n_epoch):
 2     start_time = time.time()
 3     val, l = sess.run([X_train, y_train])
 4     for X_train_a, y_train_a in tl.iterate.minibatches(val, l, batch_size, shuffle=True):
 5         sess.run(train_op, feed_dict={x: X_train_a, y_: y_train_a})
 6     if epoch + 1 == 1 or (epoch + 1) % 5 == 0:
 7         print("Epoch %d of %d took %fs" % (epoch + 1, n_epoch, time.time() - start_time))
 8         train_loss, train_acc, n_batch = 0, 0, 0
 9         for X_train_a, y_train_a in tl.iterate.minibatches(val, l, batch_size, shuffle=True):
10             err, ac = sess.run([cost, acc], feed_dict={x: X_train_a, y_: y_train_a})
11             train_loss += err
12             train_acc += ac
13             n_batch += 1
14         print("   train loss: %f" % (train_loss / n_batch))
15         print("   train acc: %f" % (train_acc / n_batch))

保存訓練的參數：

# Persist every network parameter to model.npz.
tl.files.save_npz(network.all_params, name='model.npz', sess=sess)

下面就是開始訓練啦，筆者很高興地拿着自己的筆記本顯卡呼呼地跑了一遍：

~~~~~~~~~~~~~~~~~~~~~~~~下面是漫長的等待

.......
[TL] Epoch 138 of 150 took 0.999402s
[TL]    val loss: 0.687194
[TL]    val acc: 0.562500
[TL] Epoch 140 of 150 took 3.782207s
[TL]    val loss: 0.619966
[TL]    val acc: 0.750000
[TL] Epoch 142 of 150 took 0.983802s
[TL]    val loss: 0.685686
[TL]    val acc: 0.562500
[TL] Epoch 144 of 150 took 0.986604s
[TL]    val loss: 0.661224
[TL]    val acc: 0.687500
[TL] Epoch 146 of 150 took 1.022403s
[TL]    val loss: 0.675885
[TL]    val acc: 0.687500
[TL] Epoch 148 of 150 took 0.991802s
[TL]    val loss: 0.682124
[TL]    val acc: 0.625000
[TL] Epoch 150 of 150 took 3.487811s
[TL]    val loss: 0.674932
[TL]    val acc: 0.687500
[TL] Total training time: 319.859640s
[TL] [*] model.npz saved

額~~~~~~~~~~~~~~~~~

0.68的正確率，羣裏一位朋友看了以後說：跟猜差不多了(一臉黑線)。問題出哪兒呢？難道是筆者訓練的次數不夠多？莫煩老師可是100次就能出很好的結果啊

無論怎麼樣，要試試，筆者因而加載剛剛保存的model.npz參數繼續跑100個epoch

~~~~~~~~~~~~~~~~~~~~~~~~又是漫長的等待

[TL] Epoch 1 of 100 took 8.477617s
[TL]    val loss: 0.685957
[TL]    val acc: 0.562500
[TL] Epoch 2 of 100 took 0.999402s
[TL]    val loss: 0.661529
[TL]    val acc: 0.625000
......
[TL] Epoch 94 of 100 took 0.992208s
[TL]    val loss: 0.708815
[TL]    val acc: 0.562500
[TL] Epoch 96 of 100 took 0.998406s
[TL]    val loss: 0.710636
[TL]    val acc: 0.562500
[TL] Epoch 98 of 100 took 0.992807s
[TL]    val loss: 0.621505
[TL]    val acc: 0.687500
[TL] Epoch 100 of 100 took 0.986405s
[TL]    val loss: 0.670647
[TL]    val acc: 0.625000
[TL] Total training time: 156.734633s
[TL] [*] model.npz saved

坑爹啊這是，還不如以前的結果。

筆者陷入深深的沉思中，難道是改了全連接層導致的？於是筆者又把之前去掉的全連接層加上：

def fc_layers(net):
    """Attach a classification head with two hidden layers and two output scores.

    Args:
        net: layer produced by the convolutional stack.

    Returns:
        The final dense layer; its outputs are the two raw class scores
        (identity activation, no softmax applied here).
    """
    # Flatten the feature maps before the fully connected layers.
    network = FlattenLayer(net, name='flatten')
    # tf.layers.dense(self.flatten, 256, tf.nn.relu, name='fc6')
    network = DenseLayer(network, n_units=256, act=tf.nn.relu, name='fc1_relu')
    # Second hidden layer, re-enabled for this experiment.
    network = DenseLayer(network, n_units=256, act=tf.nn.relu, name='fc2_relu')
    # self.out = tf.layers.dense(self.fc6, 1, name='out')
    network = DenseLayer(network, n_units=2, act=tf.identity, name='fc3_relu')
    return network

接着訓練

~~~~~~~~~~~~~~~~~~~~~~~~下面又是漫長的等待

1 [TL] Epoch 1 of 100 took 8.477229s
2 [TL]    val loss: 2.370650
3 [TL]    val acc: 0.562500
4 ...
5 [TL] Epoch 100 of 100 took 1.016002s
6 [TL]    val loss: 0.762171
7 [TL]    val acc: 0.437500
8 [TL] Total training time: 156.836465s
9 [TL] [*] model.npz saved

還是一樣，筆者已經崩潰了，一定是哪兒不對啊啊啊....於是筆者去翻莫煩老師的代碼，一點點對下來，每一層參數肯定不會有錯，那就是訓練設置的參數有問題。

self.train_op = tf.train.RMSPropOptimizer(0.001).minimize(self.loss)  # Morvan's code
train_op = tf.train.AdamOptimizer(learning_rate=0.0001, beta1=0.9, beta2=0.999,
                                  epsilon=1e-08, use_locking=False).minimize(cost, var_list=train_params)  # the author's code

看到train_params，難道是這個train_params的問題？筆者只優化了最後的全連接層參數，而莫煩老師優化的是所有參數。

已經深夜了，筆者表示即便不睡覺也要跑一遍試試，於是改爲

 1 # 定義 optimizer
 2 train_params = network.all_params
 3 ~~~~~~~~~~~~~~~~~~~~~~~~於是又是漫長的等待
 4 
 5 [TL] Epoch 1 of 100 took 20.286640s
 6 [TL]    val loss: 11.938850
 7 [TL]    val acc: 0.312500
 8 [TL] Epoch 2 of 100 took 3.091806s
 9 [TL]    val loss: 2.890055
10 [TL]    val acc: 0.625000
11 [TL] Epoch 4 of 100 took 3.074205s
12 [TL]    val loss: 24.055895
13 [TL]    val acc: 0.687500
14 [TL] ....
15 [TL]    val loss: 0.699907
16 [TL]    val acc: 0.500000
17 [TL] Epoch 98 of 100 took 3.089206s
18 [TL]    val loss: 0.683627
19 [TL]    val acc: 0.562500
20 [TL] Epoch 100 of 100 took 3.091806s
21 [TL]    val loss: 0.708496
22 [TL]    val acc: 0.562500
23 [TL] Total training time: 375.727307s
24 [TL] [*] model.npz saved

效果變得更差了....

排除參數的問題，已經深夜1點了，明天還要上班，不得不睡啦。

繼續崩潰第三天~~~

第四天~~~

第五天，今天供應商過來公司調試機器，正好是一個學圖像處理的小夥子，我提到這個說：我訓練了這麼多代，爲啥還是像猜一樣的概率....？小夥兒說：莫不是過擬合了吧？我說：不可能啊，現成的數據、現成的模型和參數，不應該的啊！

不過我還是得檢查一下數據處理的代碼

 1 # 生成是數據文件
 2 def create_record(filelist):
 3     random.shuffle(filelist)
 4     i = 0
 5     writer = tf.python_io.TFRecordWriter(recordpath)
 6     for file in filelist:
 7         name = file.split(sep='.')
 8         lable_val = 0
 9         if name[0] == 'cat':
10             lable_val = 0
11         else:
12             lable_val = 1
13         img_path = file_dir + file
14         img = Image.open(img_path)
15         img = img.resize((240, 240))
16         img_raw = img.tobytes()  # 將圖片轉化爲原生bytes
17         example = tf.train.Example(features=tf.train.Features(feature={
18             "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[lable_val])),
19             'img_raw': tf.train.Feature(bytes_list=tf.train.BytesList(value=[img_raw]))
20        })) #example對象對label和image進行封裝
21         writer.write(example.SerializeToString())
22         i=i+1
23         print(name[1])
24         print(lable_val)
25         print(i)
26     writer.close()
# Read the records through an input queue.
def read_and_decode(filename):
    """Create graph ops that read one (image, label) example from a TFRecord file.

    Each record is expected to hold an int64 ``label`` and an ``img_raw`` byte
    string of exactly 224 * 224 * 3 uint8 values.

    Args:
        filename: path of the TFRecord file to read.

    Returns:
        A tuple ``(img, label)``: ``img`` is a float32 tensor of shape
        [224, 224, 3] holding the unscaled 0-255 pixel values, ``label`` is an
        int32 scalar tensor.
    """
    # Build a filename queue from the single file name.
    filename_queue = tf.train.string_input_producer([filename])
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)  # returns a (key, serialized record) pair
    features = tf.parse_single_example(serialized_example,
                                       features={
                                           'label': tf.FixedLenFeature([], tf.int64),
                                           'img_raw': tf.FixedLenFeature([], tf.string),
                                       })
    img = tf.decode_raw(features['img_raw'], tf.uint8)
    img = tf.reshape(img, [224, 224, 3])
    # Keep the raw 0-255 range. The former rescaling `* (1. / 255) - 0.5` is
    # dropped because the 'preprocess' step of conv_layers_simple_api already
    # subtracts the ImageNet channel means from 0-255 pixels; rescaling first
    # feeds the pretrained VGG16 filters values far outside what they expect.
    img = tf.cast(img, tf.float32)
    label = tf.cast(features['label'], tf.int32)
    return img, label

img = tf.cast(img, tf.float32) * (1. / 255) - 0.5 難道是這一步處理多餘？註釋掉以後，重新訓練模型

 1 Epoch 85 of 200 took 1.234071s
 2    train loss: 14.689816
 3    train acc: 0.900000
 4 [TL] [*] model3.npz saved
 5 Epoch 90 of 200 took 1.241071s
 6    train loss: 17.104382
 7    train acc: 0.800000
 8 [TL] [*] model3.npz saved
 9 Epoch 95 of 200 took 1.236071s
10    train loss: 11.190630
11    train acc: 0.850000
12 [TL] [*] model3.npz saved
13 Epoch 100 of 200 took 1.238071s
14    train loss: 0.000000
15    train acc: 1.000000
16 [TL] [*] model3.npz saved
17 Epoch 105 of 200 took 1.236071s
18    train loss: 7.622324
19    train acc: 0.900000
20 [TL] [*] model3.npz saved
21 Epoch 110 of 200 took 1.234071s
22    train loss: 2.164670
23    train acc: 0.950000
24 [TL] [*] model3.npz saved
25 Epoch 115 of 200 took 1.237071s
26    train loss: 0.000000
27    train acc: 1.000000
28 [TL] [*] model3.npz saved