本週的學習重心依舊是深度學習,主要實踐項目爲基於TensorFlow深度學習框架的、由字母和數字組成的驗證碼識別。在上週進行了MNIST手寫數字識別以後,本以爲驗證碼識別是一件很簡單的事,但實踐起來發現並非那麼回事。首先在訓練量上,手寫數字識別的識別類型只有0-9十個數字共十種類型,且每次僅有一個手寫數字參與識別;而驗證碼識別的識別類型有字母+數字共26+26+10=62種,且每次有四個字符參與識別,即每次可能的結果有62的4次方(14,776,336)種。再然後就是數據集,手寫數字有TensorFlow入門級的MNIST數據集,網上有現成的可以直接下載使用,而驗證碼則沒有一套專門的數據集,再加上驗證碼圖片大小的不一致性,給數據集的處理又增加了難度。
訓練神經網絡的驗證碼數據集由代碼生成,因爲數量過多沒有保存數據集文件,只保存了訓練好的神經網絡模型(本意是將識別準確率提升到98%以上,但實踐中發現達到92%都很難:在訓練時間達到十個小時的時候識別準確率在87%左右,在訓練時間達到十九個小時的時候識別準確率仍在88%左右徘徊,而且很少出現90%+的情況;因時間問題,在準確率出現92.25%時停止了訓練,共訓練52000次,每次喂入64個驗證碼數據)。
因爲網上沒有驗證碼的數據集,所以參考了在百度上搜到的生成驗證碼圖片的博文,幾乎所有博文都用相同的方法生成數據集,在此不具體列出鏈接。
另參考騰訊視頻TensorFlow相關課程。
程序源碼如下:
import os
import random
import tkinter.filedialog

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from captcha.image import ImageCaptcha
from PIL import Image

number = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
ALPHABET = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
            'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']

# Image size and label layout. Every function below reads these as module
# globals, so they are defined at import time rather than only under
# ``__main__``.
IMAGE_HEIGHT = 60
IMAGE_WIDTH = 160
MAX_CAPTCHA = 4
char_set = number + alphabet + ALPHABET
# char_set = number
CHAR_SET_LEN = len(char_set)


def random_captcha_text(char_set=number+alphabet+ALPHABET, captcha_size=4):
    """Return a list of ``captcha_size`` characters drawn at random from ``char_set``."""
    return [random.choice(char_set) for _ in range(captcha_size)]


def _to_inverted_gray(image):
    """Convert a PIL image to 8-bit grayscale and invert it (dark <-> light)."""
    return image.convert('L').point(lambda i: 255 - i)


def gen_captcha_text_and_image():
    """Generate one random captcha.

    Returns:
        (text, image): the captcha string and the inverted grayscale image as
        a 2-D numpy array with pixel values in 0..255.
    """
    image = ImageCaptcha(width=IMAGE_WIDTH, height=IMAGE_HEIGHT)
    captcha_text = ''.join(random_captcha_text())
    captcha = image.generate(captcha_text)
    # image.write(captcha_text, captcha_text + '.jpg')  # save the image to disk
    # Inverting the image makes the model converge faster (author's note).
    captcha_image = _to_inverted_gray(Image.open(captcha))
    return captcha_text, np.array(captcha_image)


def _char2pos(c):
    """Map one captcha character to its class index.

    Layout: '0'-'9' -> 0..9, 'A'-'Z' -> 10..35, 'a'-'z' -> 36..61, '_' -> 62.

    Raises:
        ValueError: if the character has no mapping, or its index does not fit
            in CHAR_SET_LEN (e.g. '_' when only 62 classes are configured).
            Without this check the index would spill into the next
            character's slot of the label vector.
    """
    if '0' <= c <= '9':
        pos = ord(c) - ord('0')
    elif 'A' <= c <= 'Z':
        pos = ord(c) - ord('A') + 10
    elif 'a' <= c <= 'z':
        pos = ord(c) - ord('a') + 36
    elif c == '_':
        pos = 62
    else:
        raise ValueError('No Map')
    if pos >= CHAR_SET_LEN:
        raise ValueError('No Map')
    return pos


def text2vec(text):
    """Encode a captcha string as a flat one-hot vector.

    The vector has MAX_CAPTCHA * CHAR_SET_LEN entries; character ``i`` sets
    exactly one entry inside the slice
    ``[i * CHAR_SET_LEN, (i + 1) * CHAR_SET_LEN)``.

    Raises:
        ValueError: if ``text`` is longer than MAX_CAPTCHA or contains a
            character outside the supported set.
    """
    if len(text) > MAX_CAPTCHA:
        raise ValueError('驗證碼最長4個字符')
    vector = np.zeros(MAX_CAPTCHA*CHAR_SET_LEN)
    for i, c in enumerate(text):
        vector[i * CHAR_SET_LEN + _char2pos(c)] = 1
    return vector


def vec2text(vec):
    """Decode a one-hot vector produced by ``text2vec`` back into a string."""
    text = []
    for pos in vec.nonzero()[0]:
        char_idx = pos % CHAR_SET_LEN
        if char_idx < 10:
            char_code = char_idx + ord('0')
        elif char_idx < 36:
            char_code = char_idx - 10 + ord('A')
        elif char_idx < 62:
            char_code = char_idx - 36 + ord('a')
        elif char_idx == 62:
            char_code = ord('_')
        else:
            raise ValueError('error')
        text.append(chr(char_code))
    return "".join(text)


def get_next_batch(batch_size=64):
    """Generate a training batch.

    Returns:
        (batch_x, batch_y): ``batch_x`` has shape
        [batch_size, IMAGE_HEIGHT * IMAGE_WIDTH] (flattened images) and
        ``batch_y`` has shape [batch_size, MAX_CAPTCHA * CHAR_SET_LEN]
        (one-hot labels).
    """
    batch_x = np.zeros([batch_size, IMAGE_HEIGHT * IMAGE_WIDTH])
    batch_y = np.zeros([batch_size, MAX_CAPTCHA * CHAR_SET_LEN])

    def wrap_get_label_and_image():
        # The generator occasionally yields an image of another size; retry
        # until the shape matches. The array is grayscale (2-D), so compare
        # against (height, width) and not (height, width, 3).
        while True:
            text, image = gen_captcha_text_and_image()
            if image.shape == (IMAGE_HEIGHT, IMAGE_WIDTH):
                return text, image

    for i in range(batch_size):
        text, image = wrap_get_label_and_image()
        batch_x[i, :] = image.flatten()  # flatten the 2-D image to 1-D
        batch_y[i, :] = text2vec(text)

    return batch_x, batch_y


def convert2gray(img):
    """Convert a colour image array to grayscale (colour does not help recognition).

    Arrays with more than two dimensions are averaged over the last axis;
    2-D arrays are returned unchanged.
    """
    if len(img.shape) > 2:
        # Channel mean is the fast variant. The standard conversion would be:
        #   r, g, b = img[:, :, 0], img[:, :, 1], img[:, :, 2]
        #   gray = 0.2989 * r + 0.5870 * g + 0.1140 * b
        return np.mean(img, -1)
    return img


def _image_to_model_input(image):
    """Turn a PIL image into the flat float vector the network was trained on.

    Applies the same preprocessing as ``gen_captcha_text_and_image``
    (grayscale, inverted, pixel values 0..255) and resizes to
    IMAGE_WIDTH x IMAGE_HEIGHT so the result matches the ``X`` placeholder.
    """
    gray = image.convert('L')
    if gray.size != (IMAGE_WIDTH, IMAGE_HEIGHT):
        gray = gray.resize((IMAGE_WIDTH, IMAGE_HEIGHT))
    gray = gray.point(lambda i: 255 - i)
    return np.array(gray, dtype=np.float32).flatten()


def crack_captcha_cnn(w_alpha=0.01, b_alpha=0.1):
    """Build the CNN graph on the global ``X`` / ``keep_prob`` placeholders.

    Three conv + max-pool + dropout stages, one dense layer and a linear
    output layer.

    NOTE: variables are created in a fixed order (weights, then bias, layer by
    layer). Saved checkpoints depend on the auto-generated variable names, so
    do not reorder the ``tf.Variable`` calls.

    Returns:
        Logits tensor of shape [batch, MAX_CAPTCHA * CHAR_SET_LEN].
    """
    x = tf.reshape(X, shape=[-1, IMAGE_HEIGHT, IMAGE_WIDTH, 1])

    # Conv/pool layer 1
    w_c1 = tf.Variable(tf.random_normal([3, 3, 1, 32], stddev=w_alpha))
    b_c1 = tf.Variable(tf.random_normal([32], stddev=b_alpha))
    conv1 = tf.nn.relu(tf.nn.bias_add(tf.nn.conv2d(x, w_c1, strides=[1, 1, 1, 1], padding='SAME'), b_c1))
    conv1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    conv1 = tf.nn.dropout(conv1, keep_prob)

    # Conv/pool layer 2
    w_c2 = tf.Variable(tf.random_normal([3, 3, 32, 64], stddev=w_alpha))
    b_c2 = tf.Variable(tf.random_normal([64], stddev=b_alpha))
    conv2 = tf.nn.relu(tf.nn.bias_add(tf.nn.conv2d(conv1, w_c2, strides=[1, 1, 1, 1], padding='SAME'), b_c2))
    conv2 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    conv2 = tf.nn.dropout(conv2, keep_prob)

    # Conv/pool layer 3
    w_c3 = tf.Variable(tf.random_normal([3, 3, 64, 64], stddev=w_alpha))
    b_c3 = tf.Variable(tf.random_normal([64], stddev=b_alpha))
    conv3 = tf.nn.relu(tf.nn.bias_add(tf.nn.conv2d(conv2, w_c3, strides=[1, 1, 1, 1], padding='SAME'), b_c3))
    conv3 = tf.nn.max_pool(conv3, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    conv3 = tf.nn.dropout(conv3, keep_prob)

    # Fully connected layer. For a 60x160 input, three 2x2 SAME poolings give
    # 8 x 20 x 64 = 10240 features, which equals 8 * 32 * 40.
    w_d = tf.Variable(tf.random_normal([8 * 32 * 40, 1024], stddev=w_alpha))
    b_d = tf.Variable(tf.random_normal([1024], stddev=b_alpha))
    dense = tf.reshape(conv3, [-1, w_d.get_shape().as_list()[0]])
    dense = tf.nn.relu(tf.add(tf.matmul(dense, w_d), b_d))
    dense = tf.nn.dropout(dense, keep_prob)

    # Output layer
    w_out = tf.Variable(tf.random_normal([1024, MAX_CAPTCHA * CHAR_SET_LEN], stddev=w_alpha))
    b_out = tf.Variable(tf.random_normal([MAX_CAPTCHA * CHAR_SET_LEN], stddev=b_alpha))
    out = tf.add(tf.matmul(dense, w_out), b_out)
    return out


def train_crack_captcha_cnn(target_acc=0.9):
    """Train the network until the per-character accuracy exceeds ``target_acc``.

    Every 10 steps a fresh batch of 100 captchas is evaluated and a checkpoint
    is written to ./model/crack_capcha1.model-<step>. When the accuracy
    exceeds ``target_acc`` a final checkpoint ./model/crack_capcha.model-<step>
    is written and training stops.
    """
    output = crack_captcha_cnn()
    # Either softmax or sigmoid can be used for the final classification:
    # loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(output, Y))
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=output, labels=Y))
    # To speed up training the learning rate should ideally start large and
    # decay; a constant rate is used here.
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)

    predict = tf.reshape(output, [-1, MAX_CAPTCHA, CHAR_SET_LEN])
    max_idx_p = tf.argmax(predict, 2)
    max_idx_l = tf.argmax(tf.reshape(Y, [-1, MAX_CAPTCHA, CHAR_SET_LEN]), 2)
    correct_pred = tf.equal(max_idx_p, max_idx_l)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    # Make sure the checkpoint directory exists before the first save.
    os.makedirs("./model", exist_ok=True)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        step = 0
        while True:
            batch_x, batch_y = get_next_batch(64)
            sess.run(optimizer, feed_dict={X: batch_x, Y: batch_y, keep_prob: 0.8})
            # Measure accuracy every 10 steps.
            if step % 10 == 0:
                batch_x_test, batch_y_test = get_next_batch(100)
                # Dropout must be disabled (keep_prob = 1) when evaluating,
                # otherwise the reported accuracy is pessimistic and noisy.
                acc, loss_ = sess.run([accuracy, loss], feed_dict={X: batch_x_test, Y: batch_y_test, keep_prob: 1.0})
                print("step=%d, loss=%g, acc=%g"%(step, loss_, acc))
                saver.save(sess, "./model/crack_capcha1.model", global_step=step)
                # Accuracy target reached: save the model and finish training.
                if acc > target_acc:
                    saver.save(sess, "./model/crack_capcha.model", global_step=step)
                    break
            step += 1


def crack_captcha(captcha_image):
    """Predict the text of one captcha.

    Args:
        captcha_image: flat array of IMAGE_HEIGHT * IMAGE_WIDTH pixel values,
            preprocessed the same way as the training images.

    Returns:
        The predicted captcha string.
    """
    output = crack_captcha_cnn()

    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, r"F:\pyProgram\verification_code\model\crack_capcha1.model-52000")

        predict = tf.argmax(tf.reshape(output, [-1, MAX_CAPTCHA, CHAR_SET_LEN]), 2)
        text_list = sess.run(predict, feed_dict={X: [captcha_image], keep_prob: 1})

        # Rebuild a one-hot vector from the per-position class indices so the
        # decoding logic of vec2text can be reused.
        vector = np.zeros(MAX_CAPTCHA*CHAR_SET_LEN)
        for i, n in enumerate(text_list[0].tolist()):
            vector[i*CHAR_SET_LEN + n] = 1
        return vec2text(vector)


if __name__ == '__main__':
    print("驗證碼文本字符數", MAX_CAPTCHA)

    X = tf.placeholder(tf.float32, [None, IMAGE_HEIGHT * IMAGE_WIDTH])
    Y = tf.placeholder(tf.float32, [None, MAX_CAPTCHA * CHAR_SET_LEN])
    keep_prob = tf.placeholder(tf.float32)  # dropout

    # Uncomment to train a model instead of running a prediction:
    # train_crack_captcha_cnn()
    root = tkinter.Tk()
    root.withdraw()
    default_dir = r"C:\Users\animator\Desktop"
    file_path = tkinter.filedialog.askopenfilename(title=u'選擇文件', initialdir=(os.path.expanduser(default_dir)))
    if not file_path:
        # The dialog returns an empty value when the user cancels.
        raise SystemExit("未選擇文件")
    image = Image.open(file_path)
    plt.imshow(image)
    # Use the same preprocessing as training (grayscale, inverted, 0..255,
    # 160x60); a mismatch here degrades the prediction.
    image = _image_to_model_input(image)
    print("預測結果:"+crack_captcha(image))
    plt.show()
爲了防止錯過最佳訓練模型,對每一個訓練模型都進行了保存,後來在出現92.25%的訓練結果後手動結束了程序,訓練過程如下:
可見第52000次訓練的準確率爲0.9225。
刪掉多餘的訓練模型後剩餘:
測試結果(圖片位於桌面,是從百度隨機下載的一張驗證碼圖片):
圖片中驗證碼爲1TjV,但是識別結果爲1TJY,大小寫識別和部分易混淆字母的識別不準確。
解決思路:因爲神經網絡訓練模型對單個數字或字母的識別率很高,且模型訓練較容易,可以增加圖片的分割操作,將所有字符一個一個地分開識別。這樣的好處一是提升識別精度,二是避免驗證碼字符個數的限制,可以識別由任意個字符組成的驗證碼。
目前只有思路,因時間問題還未具體實踐。
Java的學生管理系統樣卷內容和小學期進行的學生管理系統差異不大,用Java代碼實現起來也非常簡單,這裏不再列出其代碼。
除此之外,還實驗了支持下載隊列的多線程網絡爬蟲,代碼如下:
import io
import os
import re
import sys
import threading
from time import ctime, sleep
from urllib import request

from bs4 import BeautifulSoup

# Create a "urls" sub-directory in the current directory for the downloaded
# HTML files.
os.makedirs('urls', exist_ok=True)
# Download queue. The entry-point URL is its first element (the "geekori"
# tutorial site is used as the example).
insertURL = ["https://geekori.com"]
# URLs that have been handled (successfully or not) are appended here.
delURL = []

# The worker threads share the two lists above. Without a lock, several
# threads take the same head URL and then delete entries that nobody has
# processed yet.
_queue_lock = threading.Lock()
# Every URL that has ever been queued, so pages already processed or currently
# in progress are not queued a second time.
_seenURL = set(insertURL)


def _safe_filename(title):
    """Turn a page title into a file name that is valid on common file systems."""
    cleaned = str(title).replace('\n', '').replace('\r', '')
    cleaned = re.sub(r'[\\/:*?"<>|]', '_', cleaned).strip()
    return cleaned or 'untitled'


def _download_and_parse(url):
    """Download ``url``, save it under ./urls/ and queue the links it contains."""
    html = request.urlopen(url, timeout=10).read()
    soup = BeautifulSoup(html, 'lxml')

    # The page title becomes the file name; fall back when there is no <title>.
    title_tag = soup.find(name='title')
    title = title_tag.get_text() if title_tag is not None else ''
    path = os.path.join('urls', _safe_filename(title) + '.html')
    with open(path, 'w', encoding='utf-8') as fp:
        # Replace undecodable bytes instead of dropping the whole page.
        fp.write(html.decode('utf-8', errors='replace'))

    # Queue every <a href> that starts with "http" and has not been seen yet.
    for each in soup.find_all(name='a'):
        urlStr = each.get('href')
        if str(urlStr)[:4] != 'http':
            continue
        with _queue_lock:
            if urlStr in _seenURL:
                continue
            _seenURL.add(urlStr)
            insertURL.append(urlStr)
        print(urlStr)


def getURL():
    """Worker loop: take URLs from the queue, download and analyse them.

    Intended to run in several threads at once. Each URL is claimed by exactly
    one thread. A failing URL is reported on stderr and skipped, so one bad
    page does not kill the worker. The loop never returns.
    """
    while True:
        # Claim the head of the queue atomically.
        with _queue_lock:
            url = insertURL.pop(0) if insertURL else None
        if url is None:
            # Nothing to do yet; wait for other workers to queue more links.
            sleep(2)
            continue

        succeeded = True
        try:
            _download_and_parse(url)
        except Exception as exc:
            # Network errors, parse errors and file-system errors are all
            # non-fatal for the crawler as a whole.
            succeeded = False
            print('failed to process %s: %s' % (url, exc), file=sys.stderr)

        with _queue_lock:
            delURL.append(url)
        if succeeded:
            # Pause between successful downloads, as the original loop did.
            sleep(2)


# Three worker threads run getURL.
threads = [threading.Thread(target=getURL) for _ in range(3)]

if __name__ == '__main__':
    for t in threads:
        # Daemon threads do not keep the process alive on their own.
        t.daemon = True
        t.start()
    for tt in threads:
        tt.join()
運行效果如圖:
urls目錄下已下載的文件如圖:
OK,以上就是本週的全部內容了,往後幾周將會複習鞏固暑假的進度,準備開學的考覈,加油!