Training a Cat vs. Dog Classifier

Downloading the Dataset

Download link: https://www.kaggle.com/c/dogs-vs-cats/data

The downloaded training set contains 25,000 images of cats and dogs. I only use the training-set archive here; the validation and test sets can both be split off from it.

Looking at the files reveals the naming convention: cat images are named cat.<number>.jpg and dog images dog.<number>.jpg, with 12,500 of each.
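As a quick sanity check, a sketch like the following counts the files per class (it assumes the archive has been extracted to ./origin/train, which matches the origin_dir used in the script below):

import os

# Count the images of each class in the extracted Kaggle training archive
origin_dir = './origin/train'   # assumed extraction path, same as in the script below
files = os.listdir(origin_dir)
n_cats = sum(f.startswith('cat.') and f.endswith('.jpg') for f in files)
n_dogs = sum(f.startswith('dog.') and f.endswith('.jpg') for f in files)
print('cats: {}, dogs: {}'.format(n_cats, n_dogs))   # expected: 12500 each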

 

Organizing the Data

The data needs to be split into three parts: a training set, a validation set, and a test set.

I plan to use 19,000 images for the training set, 4,000 for the validation set, and 2,000 for the test set.

import os,shutil
from tensorflow import keras
import matplotlib.pyplot as plt


# Directory holding the original images
origin_dir = './origin/train'

# Root directory for the prepared datasets
base_dir = './data'

# Training, validation, and test set directories
train_dir = base_dir + '/train'
validation_dir = base_dir + '/validation'
test_dir = base_dir + '/test'

# If the target directory already exists, remove it first
if os.path.exists(base_dir):
    shutil.rmtree(base_dir)
os.makedirs(base_dir)

# Per-class subdirectory paths
validation_dog_dir = validation_dir + '/dog'
validation_cat_dir = validation_dir + '/cat'
test_dog_dir = test_dir + '/dog'
test_cat_dir = test_dir + '/cat'
train_dog_dir = train_dir + '/dog'
train_cat_dir = train_dir + '/cat'

# Create the directories
os.makedirs(validation_dog_dir)
os.makedirs(validation_cat_dir)
os.makedirs(test_dog_dir)
os.makedirs(test_cat_dir)
os.makedirs(train_dog_dir)
os.makedirs(train_cat_dir)


# Copy 2,000 dog images into the validation set's dog directory
files = ['dog.{}.jpg'.format(i) for i in range(2000)]
for file in files :
    src = os.path.join(origin_dir,file)
    dst = os.path.join(validation_dog_dir,file)
    shutil.copy(src,dst)

# Copy 2,000 cat images into the validation set's cat directory
files = ['cat.{}.jpg'.format(i) for i in range(2000)]
for file in files :
    src = os.path.join(origin_dir,file)
    dst = os.path.join(validation_cat_dir,file)
    shutil.copy(src,dst)


# Copy 1,000 dog images into the test set's dog directory
files = ['dog.{}.jpg'.format(i) for i in range(2000,3000)]
for file in files :
    src = os.path.join(origin_dir,file)
    dst = os.path.join(test_dog_dir,file)
    shutil.copy(src,dst)

# Copy 1,000 cat images into the test set's cat directory
files = ['cat.{}.jpg'.format(i) for i in range(2000,3000)]
for file in files :
    src = os.path.join(origin_dir,file)
    dst = os.path.join(test_cat_dir,file)
    shutil.copy(src,dst)

# Copy 9,500 dog images into the training set's dog directory
files = ['dog.{}.jpg'.format(i) for i in range(3000,12500)]
for file in files :
    src = os.path.join(origin_dir,file)
    dst = os.path.join(train_dog_dir,file)
    shutil.copy(src,dst)

# Copy 9,500 cat images into the training set's cat directory
files = ['cat.{}.jpg'.format(i) for i in range(3000,12500)]
for file in files :
    src = os.path.join(origin_dir,file)
    dst = os.path.join(train_cat_dir,file)
    shutil.copy(src,dst)
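After copying, a quick check that each split contains the expected number of images can catch path mistakes early; this verification step is my own addition, not part of the original script:

# Sanity check: count the images in each split directory
for name, d in [('train', train_dir), ('validation', validation_dir), ('test', test_dir)]:
    total = sum(len(os.listdir(os.path.join(d, cls))) for cls in ('dog', 'cat'))
    print('{}: {} images'.format(name, total))
# Expected: train 19000, validation 4000, test 2000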

 

Building the Network

img_width=350
img_height=350
img_channel = 3

model = keras.models.Sequential([
    keras.layers.Conv2D(32,(3,3),activation='relu',input_shape=(img_width,img_height,img_channel)),
    keras.layers.MaxPool2D((2,2)),
    keras.layers.Conv2D(64,(3,3),activation='relu'),
    keras.layers.MaxPool2D((2,2)),
    keras.layers.Conv2D(128,(3,3),activation='relu'),
    keras.layers.MaxPool2D((2,2)),
    keras.layers.Conv2D(128,(3,3),activation='relu'),
    keras.layers.MaxPool2D((2,2)),
    keras.layers.Flatten(),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(512,activation='relu',kernel_regularizer=keras.regularizers.l2()),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(1,activation='sigmoid')
])

Four convolutional layers plus two fully connected layers, with Dropout and L2 regularization added to suppress overfitting.
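To see the output shape and parameter count of each layer (helpful when adjusting the 350x350 input size or the number of filters), Keras can print a summary:

# Print the layer-by-layer output shapes and parameter counts
model.summary()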

 

Compiling the Model

The optimizer is Adam and the loss function is binary cross-entropy.

model.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])
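For a label y in {0, 1} and a predicted probability p, binary cross-entropy is -(y*log(p) + (1-y)*log(1-p)), averaged over the batch. A tiny illustration of the computation (my own sketch, not part of the training script):

import numpy as np

# Binary cross-entropy for a single prediction, the quantity that
# loss='binary_crossentropy' averages over each batch
def binary_crossentropy(y_true, p_pred):
    return -(y_true * np.log(p_pred) + (1 - y_true) * np.log(1 - p_pred))

print(binary_crossentropy(1, 0.9))   # small loss: confident and correct
print(binary_crossentropy(1, 0.1))   # large loss: confident and wrong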

 

Data Generators

Because the dataset is large, reading everything into memory before training would cause an out-of-memory error, so generators are used to feed the training.

batch_size=32
epochs = 25

train_datagen = keras.preprocessing.image.ImageDataGenerator(rescale=1. / 255)
validation_datagen = keras.preprocessing.image.ImageDataGenerator(rescale=1. / 255)
test_datagen = keras.preprocessing.image.ImageDataGenerator(rescale=1. / 255)

train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=(img_width, img_height),
    batch_size=batch_size,
    class_mode='binary')

validation_generator = validation_datagen.flow_from_directory(
    validation_dir,
    target_size=(img_width, img_height),
    batch_size=batch_size,
    class_mode='binary')

test_generator = test_datagen.flow_from_directory(
    test_dir,
    target_size=(img_width, img_height),
    batch_size=batch_size,
    class_mode='binary')
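The generators yield batches on demand instead of loading all images into memory. To confirm the shapes and the 1/255 rescaling, you can pull a single batch (a quick check I added, not part of the original flow):

# Fetch one batch to verify shapes and the rescaling
x_batch, y_batch = next(train_generator)
print(x_batch.shape)                   # (32, 350, 350, 3)
print(y_batch.shape)                   # (32,)
print(x_batch.min(), x_batch.max())    # pixel values scaled into [0, 1]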

 

Running the Training

history = model.fit(
    train_generator,
    steps_per_epoch=train_generator.n // batch_size,
    epochs=epochs,
    validation_data=validation_generator,
    validation_steps=validation_generator.n // batch_size,
    verbose=1)
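Since a full run takes a long time (see the note at the end), it may be worth saving the trained model so it can be reloaded later without retraining. This is a standard Keras call; the filename is my own choice:

# Save the trained model so it can be reloaded without retraining
model.save('cats_vs_dogs.h5')
# Later: model = keras.models.load_model('cats_vs_dogs.h5')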

 

Evaluating the Model

score = model.evaluate(test_generator, steps=test_generator.n // batch_size)
print('Test accuracy: {}, test loss: {}'.format(score[1], score[0]))
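To classify a single picture with the trained model, something like the sketch below works; the image path is just an example from the test split, and because flow_from_directory assigns labels alphabetically (cat=0, dog=1), the sigmoid output is the probability of the dog class:

# Predict one image; cat=0, dog=1 per train_generator.class_indices
img = keras.preprocessing.image.load_img('./data/test/dog/dog.2000.jpg',
                                          target_size=(img_width, img_height))
x = keras.preprocessing.image.img_to_array(img) / 255.0
x = x.reshape((1,) + x.shape)
prob_dog = model.predict(x)[0][0]
print('dog' if prob_dog > 0.5 else 'cat', prob_dog)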

 

Visualizing the Accuracy and Loss Curves

plt.rcParams['font.sans-serif']=['SimHei']
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

plt.subplot(1, 2, 1)
plt.plot(acc, label='Training Acc')
plt.plot(val_acc, label='Validation Acc')
plt.title('Accuracy Curves')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(loss, label='Training Loss')
plt.plot(val_loss, label='Validation Loss')
plt.title('Loss Curves')
plt.legend()
plt.show()

 

Because of the large amount of data, training is very slow; one run of the program takes nearly two hours, so you can imagine how painful hyperparameter tuning is. After three days of tuning I pushed the accuracy to around 90% and didn't want to push it any further.
