Handwritten digit recognition is the "hello world" of neural networks. This post builds it step by step with PyTorch and, after training and tuning, reaches "100%" accuracy.
1. Quick start
1.1 Define the neural network class, inheriting from torch.nn.Module, in a file named digit_recog.py
import torch.nn as nn


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Sequential(nn.Conv2d(1, 6, 5, 1, 2),
                                   nn.ReLU(),
                                   nn.MaxPool2d(2, 2))
        self.conv2 = nn.Sequential(nn.Conv2d(6, 16, 5),
                                   nn.ReLU(),
                                   nn.MaxPool2d(2, 2))
        self.fc1 = nn.Sequential(
            nn.Linear(16 * 5 * 5, 120),
            # nn.Dropout2d(),
            nn.ReLU()
        )
        self.fc2 = nn.Sequential(
            nn.Linear(120, 84),
            nn.Dropout2d(),
            nn.ReLU()
        )
        self.fc3 = nn.Linear(84, 10)

    # Forward pass
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        # Linear layers work on flat features, so flatten the multi-dimensional tensor
        x = x.view(x.size()[0], -1)
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x
The class above defines the network structure: two convolutional blocks followed by three fully connected layers. The size of the last layer is determined by the problem: ten outputs, one per digit 0-9.
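To see why fc1 takes 16 * 5 * 5 inputs and what the network produces, a quick shape check helps (a minimal sketch, assuming the class above has been saved as digit_recog.py):

import torch
from digit_recog import Net

net = Net()
# One fake grayscale 28x28 image: [batch, channel, height, width]
dummy = torch.randn(1, 1, 28, 28)
out = net(dummy)
print(out.shape)  # torch.Size([1, 10]) -- one score per digit 0-9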
1.2 Start training:
import torch
import torchvision as tv
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import os
import copy
import time
from digit_recog import Net
from digit_recog_mydataset import MyDataset


# Load a previously saved model state if one exists
def getmodel(pth, net):
    state_filepath = pth
    if os.path.exists(state_filepath):
        # Load the saved parameters
        nn_state = torch.load(state_filepath)
        # Load them into the model
        net.load_state_dict(nn_state)
        # Return a copy
        return copy.deepcopy(nn_state)
    else:
        return net.state_dict()


# Build the datasets
def getdataset(batch_size):
    # Preprocessing applied to every image
    transform = transforms.ToTensor()

    # Training dataset
    trainset = tv.datasets.MNIST(
        root='./data/',
        train=True,
        download=True,
        transform=transform)
    # Uncomment to add your own dataset
    # trainset += MyDataset(os.path.abspath("./data/myimages/"), 'train.txt', transform=transform)

    # Training loader
    trainloader = torch.utils.data.DataLoader(
        trainset,
        batch_size=batch_size,
        shuffle=True,
    )

    # Test dataset
    testset = tv.datasets.MNIST(
        root='./data/',
        train=False,
        download=True,
        transform=transform)
    # Uncomment to add your own dataset
    # testset += MyDataset(os.path.abspath("./data/myimages/"), 'test.txt', transform=transform)

    # Test loader
    testloader = torch.utils.data.DataLoader(
        testset,
        batch_size=batch_size,
        shuffle=False,
    )
    return trainloader, testloader


# Training loop
def training(device, net, model, dataset_loader, epochs, criterion, optimizer, save_model_path):
    trainloader, testloader = dataset_loader
    # Best model weights seen so far
    best_model_wts = model
    # Best accuracy seen so far
    best_acc = 0.0
    # Start the clock
    since = time.time()

    for epoch in range(epochs):
        sum_loss = 0.0
        # Iterate over the training set
        for i, data in enumerate(trainloader):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # Zero the gradients so they do not carry over into the next step
            optimizer.zero_grad()

            # Forward pass
            outputs = net(inputs)
            # Loss
            loss = criterion(outputs, labels)
            # Backpropagation
            loss.backward()
            # Optimizer step
            optimizer.step()

            # Accumulate the loss
            sum_loss += loss.item()
            # Print the average loss every 100 batches
            if i % 100 == 99:
                print('[%d, %d] loss: %.03f'
                      % (epoch + 1, i + 1, sum_loss / 100))
                sum_loss = 0.0

        # After every epoch, measure accuracy on the test set
        with torch.no_grad():
            correct = 0
            total = 0
            for data in testloader:
                images, labels = data
                images, labels = images.to(device), labels.to(device)
                outputs = net(images)
                # Take the class with the highest score
                _, predicted = torch.max(outputs.data, 1)
                # print(labels)
                # print(torch.nn.Softmax(dim=1)(outputs.data).detach().numpy()[0])
                # print(torch.nn.functional.normalize(outputs.data).detach().numpy()[0])
                total += labels.size(0)
                correct += (predicted == labels).sum()

            print('Test result: {}/{}'.format(correct, total))
            epoch_acc = correct.double() / total
            print('Current score: {}  best score: {}'.format(epoch_acc, best_acc))
            if epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(net.state_dict())

            print('Accuracy after epoch %d: %d%%' % (epoch + 1, (100 * correct / total)))

    time_elapsed = time.time() - since
    print('Training finished in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best score: {:4f}'.format(best_acc))

    # Save the best model weights
    if save_model_path is not None:
        torch.save(best_model_wts, save_model_path)


# Run on the GPU if available, otherwise on the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NET = Net().to(DEVICE)

# Hyperparameters
EPOCHS = 8        # number of training epochs
BATCH_SIZE = 64   # batch size
LR = 0.001        # learning rate

# Cross-entropy loss, the usual choice for multi-class classification
CRITERION = nn.CrossEntropyLoss()
# Optimizer
# OPTIMIZER = optim.SGD(NET.parameters(), lr=LR, momentum=0.9)
OPTIMIZER = optim.Adam(NET.parameters(), lr=LR)

MODEL = getmodel(os.path.join('model/', 'net.pth'), NET)
training(DEVICE, NET, MODEL, getdataset(BATCH_SIZE), EPOCHS, CRITERION, OPTIMIZER,
         os.path.join('model/', 'net.pth'))
Trained on the standard MNIST dataset alone, the recognition accuracy reaches about 99%.
2. Get involved
The goal is to recognize your own images, which makes the exercise more hands-on.
2.1 Open Paint (in Windows Accessories), draw a few digits with the mouse, then save them with a screenshot tool.
2.2 Implement your own dataset:
digit_recog_mydataset.py
from PIL import Image
import torch
import os


# Custom dataset
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, root, datafile, transform=None, target_transform=None):
        super(MyDataset, self).__init__()
        datas = []
        with open(os.path.join(root, datafile), 'r') as fh:
            for line in fh:
                # Strip the trailing newline
                line = line.rstrip()
                # Split on whitespace (spaces, tabs, newlines, ...)
                words = line.split()
                # words[0] is the image filename, words[1] is the label
                datas.append((words[0], int(words[1])))

        self.datas = datas
        self.transform = transform
        self.target_transform = target_transform
        self.root = root

    # Required: return one sample by index
    def __getitem__(self, index):
        # Image filename and label, i.e. words[0] and words[1] above
        img, label = self.datas[index]
        # Open the image, resize it, and convert to grayscale
        img = Image.open(os.path.join(self.root, img)).resize((28, 28)).convert('L')
        # Apply the preprocessing transform
        if self.transform is not None:
            img = self.transform(img)
        return img, label

    # Required: return the size of the dataset
    def __len__(self):
        return len(self.datas)
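A quick way to sanity-check the class (a minimal sketch; it assumes the data/myimages/ folder and the train.txt file described in the next step already exist):

import torchvision.transforms as transforms
from digit_recog_mydataset import MyDataset

dataset = MyDataset('./data/myimages/', 'train.txt', transform=transforms.ToTensor())
print(len(dataset))        # number of lines in train.txt
img, label = dataset[0]
print(img.shape, label)    # torch.Size([1, 28, 28]) and the integer label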
2.3 In the image folder, create two files, train.txt and test.txt, listing the training and test samples respectively, in the following format.
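The original listing of the file contents is not reproduced here, but from the way MyDataset parses each line (an image filename, whitespace, then an integer label) the two files look like this; the filenames below are just placeholders:

img001.png 1
img002.png 2
img003.png 7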
The training and test data must be kept strictly separate; otherwise the reported accuracy of the trained model will be misleading.
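If you have many images, a small helper can write the two files for you. This is only a sketch under my own assumption (not something from this post) that each filename starts with its digit label, e.g. 7_01.png; roughly 80% of the files go to train.txt and the rest to test.txt:

import glob
import os
import random

image_folder = './data/myimages/'                        # assumed folder layout
files = sorted(glob.glob(os.path.join(image_folder, '*.png')))
random.shuffle(files)

split = int(len(files) * 0.8)                            # 80% train / 20% test
with open(os.path.join(image_folder, 'train.txt'), 'w') as train_f, \
     open(os.path.join(image_folder, 'test.txt'), 'w') as test_f:
    for i, path in enumerate(files):
        name = os.path.basename(path)
        label = name[0]                                  # assumption: first character is the digit label
        line = '{} {}\n'.format(name, label)
        (train_f if i < split else test_f).write(line)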
2.4 Add your data to the training and test sets
Uncomment these two lines in the training script:
# trainset += MyDataset(os.path.abspath("./data/myimages/"), 'train.txt', transform=transform)
# testset += MyDataset(os.path.abspath("./data/myimages/"), 'test.txt', transform=transform)
Run the training again; the best accuracy I reached here was 98%.
2.5 Test the model
# -*- coding: utf-8 -*-
import torch
import numpy as np
from PIL import Image
import os
import matplotlib
import matplotlib.pyplot as plt
import glob
from digit_recog import Net

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net = Net().to(device)

# Load the saved parameters
nn_state = torch.load(os.path.join('model/', 'net.pth'))
# Load the parameters into the model
net.load_state_dict(nn_state)

# Set a default font that can render the Chinese plot titles
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
# Keep the minus sign '-' from being drawn as a box
matplotlib.rcParams['axes.unicode_minus'] = False

# Images to recognize
file_list = glob.glob(os.path.join('data/test_image/', '*'))
grid_rows = len(file_list) // 5 + 1  # integer row count for the subplot grid

for i, file in enumerate(file_list):
    # Read the image and resize it
    image = Image.open(file).resize((28, 28))
    # Convert to grayscale
    gray_image = image.convert('L')

    # Convert to a tensor of shape [batch, channel, height, width]
    im_data = np.array(gray_image)
    im_data = torch.from_numpy(im_data).float()
    im_data = im_data.view(1, 1, 28, 28).to(device)

    # Forward pass
    outputs = net(im_data)
    # Take the class with the highest score
    _, pred = torch.max(outputs, 1)
    # print(torch.nn.Softmax(dim=1)(outputs).detach().numpy()[0])
    # print(torch.nn.functional.normalize(outputs).detach().numpy()[0])

    # Show the image with its prediction ("你是X?" means "Are you X?")
    plt.subplot(grid_rows, 5, i + 1)
    plt.imshow(gray_image)
    plt.title(u"你是{}?".format(pred.item()), fontsize=8)
    plt.axis('off')

    print('[{}] predicted digit: [{}]'.format(file, pred.item()))

plt.show()
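One detail worth noting: during training the images went through transforms.ToTensor(), which scales pixel values into [0, 1], while the loop above feeds raw 0-255 floats to the network. If you want the inference preprocessing to match training exactly, a minimal sketch of the conversion looks like this (the filename is a placeholder):

import torch
import torchvision.transforms as transforms
from PIL import Image

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
transform = transforms.ToTensor()   # same preprocessing as during training

gray_image = Image.open('data/test_image/sample.png').resize((28, 28)).convert('L')
im_data = transform(gray_image).unsqueeze(0).to(device)  # [1, 1, 28, 28], values in [0, 1]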
Visualized results:
These images were recognized after image augmentation; the accuracy still has room for improvement.
3. Optimization
3.1 More samples:
These are hard to collect.
3.2 Data augmentation:
Apply some simple processing to your own handwritten digit images:
# -*- coding: utf-8 -*-
import os
import glob
import numpy as np
from PIL import Image
from scipy.ndimage import gaussian_filter


# Simple augmentation: rotate (and optionally blur) every image in a folder
class ImageProcceed:
    def __init__(self, image_folder):
        self.image_folder = image_folder

    def save(self, rotate, filter=None, to_gray=True):
        file_list = glob.glob(os.path.join(self.image_folder, '*.png'))
        print(len(file_list))
        for i, file in enumerate(file_list):
            # Read the image
            image = Image.open(file)  # .resize((28, 28))
            # Convert to grayscale
            if to_gray:
                image = image.convert('L')
            # Rotate
            image = image.rotate(rotate)
            if filter is not None:
                # Gaussian blur; gaussian_filter works on arrays, so convert back to a PIL image
                image = Image.fromarray(gaussian_filter(np.asarray(image), filter))
            # Save next to the original with a suffix recording the rotation
            filename = os.path.basename(file)
            fileext = os.path.splitext(filename)[1]
            savefile = filename.replace(fileext, '-rt{}{}'.format(rotate, fileext))
            print(savefile)
            image.save(os.path.join(self.image_folder, savefile))


ip = ImageProcceed('data/myimages/')
ip.save(20, filter=0.5)
3.3 Change the network size:
For example, reduce the three fully connected layers in the Net class above to two, as sketched below.
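A hedged illustration of such a variant; the name SmallNet and the exact layer sizes are my own, not a configuration from this post:

import torch.nn as nn


# Illustrative only: a variant of Net with two fully connected layers instead of three
class SmallNet(nn.Module):
    def __init__(self):
        super(SmallNet, self).__init__()
        self.conv1 = nn.Sequential(nn.Conv2d(1, 6, 5, 1, 2), nn.ReLU(), nn.MaxPool2d(2, 2))
        self.conv2 = nn.Sequential(nn.Conv2d(6, 16, 5), nn.ReLU(), nn.MaxPool2d(2, 2))
        self.fc1 = nn.Sequential(nn.Linear(16 * 5 * 5, 120), nn.ReLU())
        self.fc2 = nn.Linear(120, 10)  # the output layer now follows fc1 directly

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = x.view(x.size()[0], -1)
        x = self.fc1(x)
        x = self.fc2(x)
        return x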
3.4 Tune the hyperparameters:
Change the learning rate, train for more epochs, and so on; a small example follows.
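For instance, the hyperparameter block in the training script could be varied like this (illustrative values only, not the settings behind the results reported here):

EPOCHS = 20       # train for more epochs
BATCH_SIZE = 128  # try a larger batch
LR = 0.0005       # try a smaller learning rate

# Or switch the optimizer back to SGD with momentum:
# OPTIMIZER = optim.SGD(NET.parameters(), lr=LR, momentum=0.9)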
Later I adjusted two things in the Net class and the accuracy finally reached 100%. That is only its performance on my small test set; real-world prediction can never reach 100%, every machine may differ, and every run can give slightly different results. Here is the code again:
import torch.nn as nn


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # Convolution: 1 input channel, 6 output channels, 5x5 kernel, stride 1, padding 2
        # ReLU is the usual activation; LeakyReLU/PReLU are later refinements of it
        # MaxPool2d pooling, typically with a 2x2 window
        self.conv1 = nn.Sequential(nn.Conv2d(1, 6, 5, 1, 2),
                                   nn.PReLU(),
                                   nn.MaxPool2d(2, 2))
        self.conv2 = nn.Sequential(nn.Conv2d(6, 16, 5),
                                   nn.PReLU(),
                                   nn.MaxPool2d(2, 2))
        self.fc1 = nn.Sequential(
            nn.Linear(16 * 5 * 5, 120),  # 16 output channels times the 5*5 feature map after conv2 and pooling
            # nn.Dropout2d(),  # Dropout is meant for Linear outputs, Dropout2d for Conv2d feature maps
            nn.PReLU()
        )
        self.fc2 = nn.Sequential(
            nn.Linear(120, 84),
            nn.Dropout(p=0.2),
            nn.PReLU()
        )
        self.fc3 = nn.Linear(84, 10)  # 10 output nodes, one for each digit 0-9

    # Forward pass
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        # Linear layers work on flat features, so flatten the multi-dimensional tensor
        x = x.view(x.size()[0], -1)
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x
Two things changed above: the ReLU activations were replaced with PReLU, and the Dropout regularization now uses p=0.2. Below are the results after running the test again.