Training your own deep learning Keras classification model with Python

1. Crawling Baidu Images to Build the Dataset

The basic idea follows this post:
http://www.javashuo.com/article/p-uljltjxl-mp.html
However, training a classification model such as VGG/Inception/ResNet requires a huge number of images, so the urllib/requests + BeautifulSoup approach described there is not recommended: it is far too slow. Instead, this post uses Python's powerful crawler framework Scrapy. The principle is the same as in the linked post, so I won't repeat it; the code is below:

import scrapy
from baidu_image.items import BaiduImageItem
import re
import os
from tqdm import tqdm


kindList = ['keyword1','keyword2','keyword3']  # list of search keywords


class BkSpider(scrapy.Spider):
    name = 'bdimg'
    # allowed_domains = ['image.baidu.com']
    start_urls = ['http://image.baidu.com']
    url_start = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word='
    url_end = '&ct=&ic=0&lm=-1&width=0&height=0'
    pages = 30  # crawl 30 result pages in total
    step = 30  # offset step between pages: 0, 30, 60, ... (how Baidu Images paginates)

    def start_requests(self):
        print(len(kindList))
        for key_word in kindList:
            path = os.path.join(r'../images', key_word)
            if not os.path.exists(path):  # create a folder named after key_word
                os.makedirs(path)
            end = self.pages * self.step
            for page in range(0, end, self.step):
                gsm = hex(page)
                url = self.url_start + key_word + '&pn=' + str(page) + '&gsm=' + str(gsm) + self.url_end  # assemble the request URL
                request = scrapy.Request(url,callback=self.get_one_page_urls, dont_filter=False)
                request.meta['kind'] = key_word
                request.meta['page'] = page
                yield request

    def parse(self, response):

        print('>>>>>>>>>>>>>>>>>>>>>>>>')
        item = BaiduImageItem()
        item['img'] = response.body
        item['kind'] = response.meta['kind']
        item['name'] = response.meta['name']
        item['type'] = response.meta['type']
        # print('????????')
        yield item

    def get_one_page_urls(self,response):

        kind = response.meta['kind']
        page = response.meta['page']
        # print(response.body)
        urls = re.findall('"objURL":"(.*?)",', response.body.decode('utf-8'), re.S)
        # print(urls)

        for i in tqdm(range(len(urls))):

            request1 = scrapy.Request(urls[i], callback=self.parse)
            request1.meta['kind'] = kind
            request1.meta['name'] = str(page // self.step) + '_' + str(i)  # image name: <page index>_<position in page>, e.g. 1_10
            request1.meta['type'] = urls[i].split('.')[-1]  # file extension: jpg, jpeg, png, etc.

            yield request1

This is the main Scrapy spider; tqdm simply adds a progress bar.
The crawled images are saved automatically into separate folders, one per keyword (the saving itself is done by an item pipeline, sketched below).
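The BaiduImageItem class and the item pipeline that actually writes the files are not shown in the post. A minimal sketch of what they might look like, consistent with the spider above (the pipeline saves each image to ../images/<kind>/<name>.<type> and has to be registered under ITEM_PIPELINES in settings.py; BaiduImagePipeline is a name chosen for this sketch):

# items.py (sketch)
import scrapy

class BaiduImageItem(scrapy.Item):
    img = scrapy.Field()   # raw image bytes (response.body)
    kind = scrapy.Field()  # keyword, i.e. the class name
    name = scrapy.Field()  # file name stem such as "1_10"
    type = scrapy.Field()  # file extension such as "jpg"


# pipelines.py (sketch)
import os

class BaiduImagePipeline(object):
    def process_item(self, item, spider):
        folder = os.path.join(r'../images', item['kind'])
        if not os.path.exists(folder):
            os.makedirs(folder)
        file_path = os.path.join(folder, item['name'] + '.' + item['type'])
        with open(file_path, 'wb') as f:
            f.write(item['img'])  # dump the downloaded bytes to disk
        return item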

Source code (git): https://github.com/okfu-DL/baidu_images.git

2. Label Encoding

The keywords need to be converted into one-hot codes, using LabelEncoder and OneHotEncoder from sklearn.preprocessing: the former converts string classes into integer classes, and the latter converts integer classes into one-hot codes.

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import os
import numpy as np

path = r'G:\crops_train\check'
dirnames = os.listdir(path)

label_word_dict = {}
word_label_dict = {}

num_label = LabelEncoder().fit_transform(dirnames)
print(type(num_label))
onehot_label = OneHotEncoder(sparse=False).fit_transform(np.asarray(num_label).reshape([-1,1]))
# onehot_label = list(list for each in onehot_label)

for i in range(len(dirnames)):
    label_word_dict[num_label[i]] = dirnames[i]
    word_label_dict[dirnames[i]] = list(onehot_label[i])
with open('label_word_dict.txt', 'w', encoding='utf-8') as f:
    f.write(str(label_word_dict))
with open('word_label_dict.txt', 'w', encoding='utf-8') as f:
    f.write(str(word_label_dict))

Two dicts are saved here. word_label_dict has the structure {class name: one-hot code} and is used when building the dataset, to look up the one-hot label for a class name. label_word_dict has the structure {numeric label: class name} and is used at prediction time, to map the model's output class back to a class name.
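As a quick sketch of how the two dicts are used later on (the softmax vector below is a hand-made stand-in for a real model.predict output, not part of the original code):

import numpy as np

# load the dicts saved above (they were written with str(), so eval() reads them back)
with open('word_label_dict.txt', 'r', encoding='utf-8') as f:
    word_label_dict = eval(f.read())   # {class name: one-hot code}, used when building the .npy files
with open('label_word_dict.txt', 'r', encoding='utf-8') as f:
    label_word_dict = eval(f.read())   # {numeric label: class name}, used at prediction time

# prediction stage: argmax of the softmax output gives the numeric label,
# and label_word_dict maps it back to a class name
pred = np.zeros(len(label_word_dict))  # stand-in for one row of model.predict(...)
pred[3] = 1.0
print(label_word_dict[int(np.argmax(pred))])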

[Figure: sample contents of word_label_dict]

[Figure: sample contents of label_word_dict]

3. Converting the Image Files to NumPy ndarray (.npy) Files

First, why convert the images into .npy files at all?
With the image count running up to a million, they cannot all be read into memory at once; they have to be read in batches. Each time, one .npy file is loaded for training, then the memory is released and the next .npy file is read. Not saving to .npy files would cause three problems:
1. Reading takes too long. Every batch would need a loop that reads and preprocesses a number of images, which costs a lot of time.
2. Reading files in a loop loads images of the same class consecutively; the order is never shuffled, which is bad for training.
3. Some of the crawled images are corrupted and should be filtered out in advance.
The code first:

import os
from PIL import Image
from tqdm import tqdm
import numpy as np
import random
import gc
LOC = list(np.zeros((216,),dtype=int))  # global: a 216-dimensional vector recording the current file position within each of the 216 classes

path = r'G:\crops_train\check'  # image folder

Width, Height = 224,224

trainpath = r'G:\crops_train\dataset\train'
testpath = r'G:\crops_train\dataset\test'
valpath = r'G:\crops_train\dataset\val'


# convert a single image into a numpy ndarray
def img_pretreat(file):

    row_img = Image.open(file)
    img = row_img.resize((Width, Height))
    points = np.asanyarray(img, dtype=np.float32)
    points = points * 1. / 255
    points = np.reshape(points, [Width, Height, 3])
    return points


def img2arr_train(path,step):

    if step == 6:
        NUM = 50
    else:
        NUM = 100

    with open('word_label_dict.txt', 'r', encoding='utf-8') as f:
        js = eval(f.read())

    img_data_list = []
    row_label = []

    dirnames = os.listdir(path)
    for cls,dirname in enumerate(tqdm(dirnames)):
        dir = os.path.join(path,dirname)
        start = LOC[cls]
        num = 0
        for parent,_,filenames in os.walk(dir):
            for filename in filenames[start:]:
                LOC[cls] += 1  # advance this class's position whether or not the image loads successfully
                try:

                    file = os.path.join(parent,filename)
                    i = random.randint(0, len(img_data_list))  # random position: shuffles samples as they are inserted
                    img_data = img_pretreat(file)
                    img_data_list.insert(i, img_data)
                    row_label.insert(i,js[dirname])  # one-hot label looked up from word_label_dict
                    num += 1
                    # print(num)
                    if num > NUM:
                        break
                except Exception as e:
                    # print(e)
                    continue
        print(LOC[cls])

    inputs = np.array(img_data_list)
    labels = np.array(row_label)
    save_path = ''
    if step < 6:
        save_path = trainpath
    if step == 6:
        save_path = valpath
    if step == 7:
        save_path = testpath

    np.save(os.path.join(save_path,'inputs'+str(step)+'.npy'),inputs)
    np.save(os.path.join(save_path,'labels'+str(step)+'.npy'),labels)
    del inputs,labels
    gc.collect()


if __name__ == "__main__":

    for step in range(0,8):
        img2arr_train(path,step)

The loop runs img2arr_train 8 times. The first 6 runs each take 100 images from every class to form one training set file (inputs0-5, labels0-5); the 7th run takes 50 images per class for the validation set (inputs6, labels6); the 8th takes 100 images per class for the test set (inputs7, labels7). My machine has 48 GB of RAM and building one training set file takes about 30 GB, so adjust the training set size to your own memory. If you can, use an SSD; reading and writing these files is much faster.

Notes:

1. When inserting images into the list, do not use append; draw a random integer and use insert instead, so the samples are shuffled (see the small demonstration after these notes). This produces 6 training files, 1 validation file and 1 test file.
2. Wrap each PIL.Image.open in try/except, because some images may be corrupted. On success, num (the count of successfully processed images) is incremented, and the loop breaks once it reaches 100/50. LOC[cls] records the current file index inside the cls-th class folder and is incremented on every read, whether it succeeds or not.
3. On releasing memory: after each file is saved, delete the arrays and call gc.collect(). When converting the list to an ndarray, be sure to use np.array() rather than np.asarray(); the latter avoids copying where it can and keeps a reference to the original data, so the memory cannot be reclaimed.
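A tiny standalone demonstration of the insert-at-a-random-index idea from note 1 (not tied to the image code):

import random

items = []
for x in range(10):
    # inserting every new element at a random position shuffles the list as it is built
    items.insert(random.randint(0, len(items)), x)
print(items)  # e.g. [3, 7, 0, 9, 1, 5, 2, 8, 4, 6]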

4. The Keras Generator Function

Keras's batch-training method model.fit_generator needs a generator function; every training batch pulls its input data from this generator. If the mechanism is unfamiliar, see the official documentation: https://keras-cn.readthedocs.io/en/latest/models/sequential/#fit_generator

The official example:
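The figure in the original post showed the example from the Keras documentation; it is roughly of the following form (process_line stands in for whatever per-line parsing you need):

def generate_arrays_from_file(path):
    while 1:
        with open(path) as f:
            for line in f:
                # build numpy arrays of inputs and labels from each line of the file
                x, y = process_line(line)
                yield (x, y)

model.fit_generator(generate_arrays_from_file('/my_file.txt'),
                    steps_per_epoch=1000, epochs=10)

The version adapted for the .npy files built in section 3 is below: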

def generate_arrays_from_file_1(trainsetpath, set_len=21600, file_nums=6, has_remainder=0, batch_size=32):

    '''
    :param trainsetpath: path to the training set files
    :param set_len: number of images in each training file
    :param file_nums: number of training files
    :param has_remainder: whether there is a remainder, i.e. has_remainder = 0 if set_len % batch_size == 0 else 1
    :param batch_size: batch size
    :return: yields (batch_inputs, batch_labels) tuples
    '''
    
    cnt = 0 
    pos = 0
    inputs = None
    labels = None
    while 1:
        if cnt % (set_len//batch_size+has_remainder) == 0:  # check whether a whole file has been consumed
            pos = 0
            seq = cnt//(set_len//batch_size+has_remainder) % file_nums  # index of the file to load this round
            del inputs,labels
            inputs = np.load(os.path.join(trainsetpath, 'inputs'+str(seq)+'.npy'))
            labels = np.load(os.path.join(trainsetpath, 'labels'+str(seq)+'.npy'))
        start = pos*batch_size
        end = min((pos+1)*batch_size, set_len-1)
        batch_inputs = inputs[start:end]
        batch_labels = labels[start:end]
        pos += 1
        cnt += 1
        yield (batch_inputs,batch_labels)

Here cnt is incremented on every step, while pos indexes into the arrays of the currently loaded file and is reset to zero whenever a new file is read.
Note the has_remainder in if cnt % (set_len//batch_size+has_remainder) == 0: if batch_size does not divide set_len evenly, one extra step is needed for the leftover samples that do not fill a whole batch. The corresponding end = min((pos+1)*batch_size, set_len-1) takes the smaller of the next batch boundary and the file length.
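For the concrete numbers used in this post (set_len=21600 images per training file, batch_size=32, file_nums=6), the quantities work out as follows:

set_len, batch_size, file_nums = 21600, 32, 6
has_remainder = 0 if set_len % batch_size == 0 else 1    # 21600 % 32 == 0, so has_remainder = 0
steps_per_file = set_len // batch_size + has_remainder   # 675 batches per training file
steps_per_epoch = steps_per_file * file_nums             # 675 * 6 = 4050, the value used in fit_generator below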

5. Batch Training

This post uses ResNet50, with the number of units in the last Dense layer changed to the required number of classes; everything else is unchanged.

# coding=utf-8
from keras.models import Model,load_model
from keras.layers import Input, Dense, BatchNormalization, Conv2D, MaxPooling2D, AveragePooling2D, ZeroPadding2D
from keras.layers import add, Flatten
# from keras.layers.convolutional import Conv2D,MaxPooling2D,AveragePooling2D
from keras.optimizers import SGD
import numpy as np
from keras.callbacks import TensorBoard
from keras.callbacks import ModelCheckpoint

from pretreat import *
from img2arr import img_pretreat

seed = 7
np.random.seed(seed)


def Conv2d_BN(x, nb_filter, kernel_size, strides=(1, 1), padding='same', name=None):
    if name is not None:
        bn_name = name + '_bn'
        conv_name = name + '_conv'
    else:
        bn_name = None
        conv_name = None

    x = Conv2D(nb_filter, kernel_size, padding=padding, strides=strides, activation='relu', name=conv_name)(x)
    x = BatchNormalization(axis=3, name=bn_name)(x)
    return x


def Conv_Block(inpt, nb_filter, kernel_size, strides=(1, 1), with_conv_shortcut=False):
    x = Conv2d_BN(inpt, nb_filter=nb_filter[0], kernel_size=(1, 1), strides=strides, padding='same')
    x = Conv2d_BN(x, nb_filter=nb_filter[1], kernel_size=(3, 3), padding='same')
    x = Conv2d_BN(x, nb_filter=nb_filter[2], kernel_size=(1, 1), padding='same')
    if with_conv_shortcut:
        shortcut = Conv2d_BN(inpt, nb_filter=nb_filter[2], strides=strides, kernel_size=kernel_size)
        x = add([x, shortcut])
        return x
    else:
        x = add([x, inpt])
        return x


def resnet50():
    inpt = Input(shape=(224, 224, 3))
    x = ZeroPadding2D((3, 3))(inpt)
    x = Conv2d_BN(x, nb_filter=64, kernel_size=(7, 7), strides=(2, 2), padding='valid')
    x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), padding='same')(x)

    x = Conv_Block(x, nb_filter=[64, 64, 256], kernel_size=(3, 3), strides=(1, 1), with_conv_shortcut=True)
    x = Conv_Block(x, nb_filter=[64, 64, 256], kernel_size=(3, 3))
    x = Conv_Block(x, nb_filter=[64, 64, 256], kernel_size=(3, 3))

    x = Conv_Block(x, nb_filter=[128, 128, 512], kernel_size=(3, 3), strides=(2, 2), with_conv_shortcut=True)
    x = Conv_Block(x, nb_filter=[128, 128, 512], kernel_size=(3, 3))
    x = Conv_Block(x, nb_filter=[128, 128, 512], kernel_size=(3, 3))
    x = Conv_Block(x, nb_filter=[128, 128, 512], kernel_size=(3, 3))

    x = Conv_Block(x, nb_filter=[256, 256, 1024], kernel_size=(3, 3), strides=(2, 2), with_conv_shortcut=True)
    x = Conv_Block(x, nb_filter=[256, 256, 1024], kernel_size=(3, 3))
    x = Conv_Block(x, nb_filter=[256, 256, 1024], kernel_size=(3, 3))
    x = Conv_Block(x, nb_filter=[256, 256, 1024], kernel_size=(3, 3))
    x = Conv_Block(x, nb_filter=[256, 256, 1024], kernel_size=(3, 3))
    x = Conv_Block(x, nb_filter=[256, 256, 1024], kernel_size=(3, 3))

    x = Conv_Block(x, nb_filter=[512, 512, 2048], kernel_size=(3, 3), strides=(2, 2), with_conv_shortcut=True)
    x = Conv_Block(x, nb_filter=[512, 512, 2048], kernel_size=(3, 3))
    x = Conv_Block(x, nb_filter=[512, 512, 2048], kernel_size=(3, 3))
    x = AveragePooling2D(pool_size=(7, 7))(x)
    x = Flatten()(x)
    x = Dense(216, activation='softmax')(x)

    model = Model(inputs=inpt, outputs=x)
    model.summary()
    return model


def train():

    val_X = np.load(os.path.join(valpath,'inputs6.npy'))
    val_Y = np.load(os.path.join(valpath,'labels6.npy'))

    print('load ok')
    model = resnet50()
    sgd = SGD(lr=0.0001, decay=1e-6, momentum=0.9, nesterov=True)  # optimizer: set the learning rate (lr) and other hyper-parameters
    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
    model.fit_generator(generate_arrays_from_file_1(trainpath, batch_size=32), epochs=100, verbose=1, workers=1, steps_per_epoch=4050,validation_data=(val_X,val_Y))
    model.save('model/res50_216class_01.h5')

model.fit_generator(generate_arrays_from_file_1(trainpath, batch_size=32), epochs=100, verbose=1, workers=1, steps_per_epoch=4050,validation_data=(val_X,val_Y))
steps_per_epoch must be computed correctly, otherwise you will hit unpredictable errors. As the name suggests, it is the number of steps in one epoch, namely

(set_len//batch_size+has_remainder)*file_nums

Source code: https://github.com/okfu-DL/crops_classify.git