軟件工程學習進度第七週暨暑期學習進度之第七週彙總

時間 2019-11-18

標籤軟件工程學習進度第七暑期彙總简体版

原文原文鏈接

本週的學習重心依舊是深度學習，主要實踐項目爲基於TensorFlow深度學習框架的字母、數字組成的驗證碼識別。在上週進行了mnist手寫數字識別以後，本以爲驗證碼識別是一件很簡單的事，但實踐起來發現並非那麼回事。首先在訓練量上，手寫數字識別的識別類型只有0-9十個數字十種類型，且僅有一個手寫數字參與識別，而驗證碼識別的識別類型有字母+數字共26+26+10=62種，且每次有四個字符參與識別，即每次可能的結果有62的4次方種。其次是數據集，手寫數字有TensorFlow入門級的mnist數據集，網上有現成的可以直接下載使用，而驗證碼則沒有一套專門的數據集，再加上驗證碼圖片大小的不一致性，給數據集的處理又增加了難度。

訓練神經網絡的驗證碼數據集由代碼生成，由於數量過多沒有保存數據集文件，只保存了訓練好的神經網絡模型（本意是將識別準確率提升到98%以上，但實踐中發現達到92%都很難：在訓練時間達到十個小時的時候識別準確率在87%左右，在訓練時間達到十九個小時的時候識別準確率仍在88%左右徘徊，而且很少出現90%以上的情況。由於時間問題，在準確率出現92.25%時中止了訓練，共訓練52000次，每次喂入64個驗證碼數據）。

因爲網上沒有驗證碼的數據集，因此參考了百度上關於生成驗證碼圖片的博文。幾乎所有博文都用相同的方法生成數據集，在此不具體列出鏈接。

另參考騰訊視頻TensorFlow相關課程。

程序源碼如下：

  1 import tensorflow as tf
  2 from captcha.image import ImageCaptcha
  3 import numpy as np
  4 from PIL import Image
  5 import matplotlib.pyplot as plt
  6 import os
  7 import tkinter.filedialog
  8 import random
  9 
 10 number = ['0','1','2','3','4','5','6','7','8','9']
 11 alphabet = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
 12 ALPHABET = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
 13 
 14 # 傳入數據集，從數據集中隨機選擇四個元素，而後返回這四個元素
 15 def random_captcha_text(char_set=number+alphabet+ALPHABET, captcha_size=4):
 16 # def random_captcha_text(char_set=number, captcha_size=4):
 17     captcha_text = []
 18     for i in range(captcha_size):
 19         c = random.choice(char_set)
 20         captcha_text.append(c)
 21     return captcha_text
 22 
 23 # 生成驗證碼圖片，返回圖片轉化後的numpy數組，以及驗證碼字符文本
 24 def gen_captcha_text_and_image():
 25     image = ImageCaptcha()
 26     captcha_text = random_captcha_text()
 27     captcha_text = ''.join(captcha_text)
 28     captcha = image.generate(captcha_text)
 29     # image.write(captcha_text, captcha_text + '.jpg')  # 將圖片保存到硬盤
 30     captcha_image = Image.open(captcha)
 31     captcha_image = captcha_image.convert('L')
 32     captcha_image = captcha_image.point(lambda i: 255 - i)
 33     # 將圖片取反，黑色變爲白色，白色變爲黑色，這樣模型收斂更塊
 34     captcha_image = np.array(captcha_image)
 35     return captcha_text, captcha_image
 36 
 37 def text2vec(text):
 38     text_len = len(text)
 39     if text_len > MAX_CAPTCHA:
 40         raise ValueError('驗證碼最長4個字符')
 41     vector = np.zeros(MAX_CAPTCHA*CHAR_SET_LEN)
 42     def char2pos(c):
 43         if c =='_':
 44             k = 62
 45             return k
 46         k = ord(c)-48
 47         if k > 9:
 48             k = ord(c) - 55
 49             if k > 35:
 50                 k = ord(c) - 61
 51                 if k > 61:
 52                     raise ValueError('No Map')
 53         return k
 54     for i, c in enumerate(text):
 55         idx = i * CHAR_SET_LEN + char2pos(c)
 56         vector[idx] = 1
 57     return vector
 58 
 59 # 向量轉回文本
 60 def vec2text(vec):
 61     char_pos = vec.nonzero()[0]
 62     text = []
 63     for i, c in enumerate(char_pos):
 64         char_at_pos = i  # c/63
 65         char_idx = c % CHAR_SET_LEN
 66         if char_idx < 10:
 67             char_code = char_idx + ord('0')
 68         elif char_idx < 36:
 69             char_code = char_idx - 10 + ord('A')
 70         elif char_idx < 62:
 71             char_code = char_idx - 36 + ord('a')
 72         elif char_idx == 62:
 73             char_code = ord('_')
 74         else:
 75             raise ValueError('error')
 76         text.append(chr(char_code))
 77     return "".join(text)
 78 
 79 def get_next_batch(batch_size=64):
 80     batch_x = np.zeros([batch_size, IMAGE_HEIGHT * IMAGE_WIDTH])
 81     batch_y = np.zeros([batch_size, MAX_CAPTCHA * CHAR_SET_LEN])
 82 
 83     # 有時生成圖像大小不是（60,160,3）
 84     def wrap_get_label_and_image():
 85         # 獲取一張圖，判斷其是否符合（60,160,3）
 86         while True:
 87             text, image = gen_captcha_text_and_image()
 88             if image.shape == (60, 160, 3):
 89                 return text, image
 90 
 91     for i in range(batch_size):
 92         text, image = gen_captcha_text_and_image()
 93         batch_x[i, :] = image.flatten()  # 將二維數組拉平爲一維
 94         batch_y[i, :] = text2vec(text)
 95 
 96     return batch_x, batch_y
 97 
 98 # 把彩色圖像轉爲灰度圖像（色彩對識別驗證碼沒有什麼用）
 99 def convert2gray(img):
100     if len(img.shape) > 2:
101         gray = np.mean(img, -1)
102         # 上面的轉法較快，正規轉法以下
103         # r, g, b = img[:,:,0], img[:,:,1], img[:,:,2]
104         # gray = 0.2989 * r + 0.5870 * g + 0.1140 * b
105         return gray
106     else:
107         return img
108 
def crack_captcha_cnn(w_alpha=0.01, b_alpha=0.1):
    """Build the captcha CNN on top of the module-level placeholders.

    The graph is three identical stages (3x3 convolution, bias, ReLU,
    2x2 max-pool with stride 2, dropout) with 32, 64 and 64 output
    channels, then a 1024-unit ReLU dense layer with dropout, then a
    linear output layer.

    Reads the module-level names ``X``, ``keep_prob``, ``IMAGE_HEIGHT``,
    ``IMAGE_WIDTH``, ``MAX_CAPTCHA`` and ``CHAR_SET_LEN``.

    Args:
        w_alpha: standard deviation used to initialise weight tensors.
        b_alpha: standard deviation used to initialise bias tensors.

    Returns:
        The un-activated output tensor of shape
        ``[batch, MAX_CAPTCHA * CHAR_SET_LEN]``.
    """
    layer = tf.reshape(X, shape=[-1, IMAGE_HEIGHT, IMAGE_WIDTH, 1])

    # Variables are created unnamed and in the same order as before
    # (weights, then bias, stage by stage), so their auto-generated names
    # should still line up with existing checkpoints -- TODO confirm
    # against the saved model.
    height, width, in_channels = IMAGE_HEIGHT, IMAGE_WIDTH, 1
    for out_channels in (32, 64, 64):
        weights = tf.Variable(tf.random_normal([3, 3, in_channels, out_channels], stddev=w_alpha))
        biases = tf.Variable(tf.random_normal([out_channels], stddev=b_alpha))
        layer = tf.nn.relu(tf.nn.bias_add(
            tf.nn.conv2d(layer, weights, strides=[1, 1, 1, 1], padding='SAME'), biases))
        layer = tf.nn.max_pool(layer, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        layer = tf.nn.dropout(layer, keep_prob)
        # SAME-padded pooling with stride 2 rounds the spatial size up.
        height = (height + 1) // 2
        width = (width + 1) // 2
        in_channels = out_channels

    # Fully connected layer. For the 60x160 input this is 8 * 20 * 64 = 10240,
    # the same number the previous hard-coded 8 * 32 * 40 produced.
    flat_size = height * width * in_channels
    w_d = tf.Variable(tf.random_normal([flat_size, 1024], stddev=w_alpha))
    b_d = tf.Variable(tf.random_normal([1024], stddev=b_alpha))
    dense = tf.reshape(layer, [-1, flat_size])
    dense = tf.nn.relu(tf.add(tf.matmul(dense, w_d), b_d))
    dense = tf.nn.dropout(dense, keep_prob)

    # Output layer
    w_out = tf.Variable(tf.random_normal([1024, MAX_CAPTCHA * CHAR_SET_LEN], stddev=w_alpha))
    b_out = tf.Variable(tf.random_normal([MAX_CAPTCHA * CHAR_SET_LEN], stddev=b_alpha))
    out = tf.add(tf.matmul(dense, w_out), b_out)
    return out
145 
def train_crack_captcha_cnn():
    """Train the captcha CNN until a test batch reaches over 90% accuracy.

    Each step trains on a freshly generated batch of 64 captchas with
    dropout ``keep_prob`` 0.8. Every 10 steps a fresh batch of 100 is
    evaluated, the result is printed and a checkpoint is written under
    ``./model``. Training stops, after writing one more checkpoint, as
    soon as the evaluated accuracy exceeds 0.9.

    Accuracy is per character: the mean over all character positions of
    "predicted class equals labelled class".
    """
    output = crack_captcha_cnn()
    # loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(output, Y))
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=output, labels=Y))
    # Either softmax or sigmoid can be used for the final classification.
    # A learning rate that starts large and decays would speed up training;
    # a fixed rate is used here.
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)

    predict = tf.reshape(output, [-1, MAX_CAPTCHA, CHAR_SET_LEN])
    max_idx_p = tf.argmax(predict, 2)
    max_idx_l = tf.argmax(tf.reshape(Y, [-1, MAX_CAPTCHA, CHAR_SET_LEN]), 2)
    correct_pred = tf.equal(max_idx_p, max_idx_l)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    # Make sure the checkpoint directory exists before the first save.
    os.makedirs("./model", exist_ok=True)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        step = 0
        while True:
            batch_x, batch_y = get_next_batch(64)
            sess.run(optimizer, feed_dict={X: batch_x, Y: batch_y, keep_prob: 0.8})
            # Evaluate accuracy once every 10 steps
            if step % 10 == 0:
                batch_x_test, batch_y_test = get_next_batch(100)
                # Dropout is switched off (keep_prob 1.0) for evaluation so the
                # reported accuracy reflects the full network.
                acc, loss_ = sess.run(
                    [accuracy, loss],
                    feed_dict={X: batch_x_test, Y: batch_y_test, keep_prob: 1.0})
                print("step=%d, loss=%g, acc=%g"%(step, loss_, acc))
                saver.save(sess, "./model/crack_capcha1.model", global_step=step)
                # Once accuracy exceeds 90%, save the model and stop training
                if acc > 0.9:
                    saver.save(sess, "./model/crack_capcha.model", global_step=step)
                    break
            step += 1
179 
180 
181 
def crack_captcha(captcha_image,
                  model_path=r"F:\pyProgram\verification_code\model\crack_capcha1.model-52000"):
    """Predict the text of one captcha image with a saved model.

    Args:
        captcha_image: flattened image, fed to the ``X`` placeholder as a
            batch of one.
        model_path: checkpoint to restore; defaults to the path the
            original script used.

    Returns:
        The predicted captcha text.

    Note:
        Every call builds a new copy of the network via
        ``crack_captcha_cnn`` in the default graph; calling this more than
        once per graph presumably breaks checkpoint restore because the
        second copy's variables get different auto-generated names --
        TODO confirm.
    """
    output = crack_captcha_cnn()
    predict = tf.argmax(tf.reshape(output, [-1, MAX_CAPTCHA, CHAR_SET_LEN]), 2)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, model_path)
        # keep_prob 1 disables dropout for inference
        text_list = sess.run(predict, feed_dict={X: [captcha_image], keep_prob: 1})

    # Turn the per-position class indices back into a one-hot vector so that
    # vec2text can decode it.
    class_ids = text_list[0].tolist()
    vector = np.zeros(MAX_CAPTCHA * CHAR_SET_LEN)
    for position, class_id in enumerate(class_ids):
        vector[position * CHAR_SET_LEN + class_id] = 1
    return vec2text(vector)
200 
if __name__ == '__main__':
    # Image size
    IMAGE_HEIGHT = 60
    IMAGE_WIDTH = 160
    MAX_CAPTCHA = 4
    print("驗證碼文本字符數", MAX_CAPTCHA)
    char_set = number + alphabet + ALPHABET
    # char_set = number
    CHAR_SET_LEN = len(char_set)

    # Placeholders shared by the model-building and training functions
    X = tf.placeholder(tf.float32, [None, IMAGE_HEIGHT * IMAGE_WIDTH])
    Y = tf.placeholder(tf.float32, [None, MAX_CAPTCHA * CHAR_SET_LEN])
    keep_prob = tf.placeholder(tf.float32)  # dropout

    # Uncomment to train a model instead of running a prediction:
    # train_crack_captcha_cnn()
    root = tkinter.Tk()
    root.withdraw()
    default_dir = r"C:\Users\animator\Desktop"
    file_path = tkinter.filedialog.askopenfilename(title=u'選擇文件', initialdir=(os.path.expanduser(default_dir)))
    # The dialog returns an empty value when cancelled; stop cleanly instead
    # of failing inside Image.open.
    if not file_path:
        raise SystemExit("未選擇文件")
    image = Image.open(file_path)
    plt.imshow(image)
    # Preprocess the same way the training images are produced in
    # gen_captcha_text_and_image / get_next_batch: PIL grayscale ('L'),
    # inverted, raw 0-255 values with no rescaling.
    gray = image.convert('L')
    if gray.size != (IMAGE_WIDTH, IMAGE_HEIGHT):
        # The network input size is fixed, so other sizes are resized
        # (PIL sizes are (width, height)).
        gray = gray.resize((IMAGE_WIDTH, IMAGE_HEIGHT))
    gray = gray.point(lambda value: 255 - value)
    image = np.array(gray).flatten()
    print("預測結果："+crack_captcha(image))
    plt.show()

爲了防止錯過最佳訓練模型，對每一個訓練模型都進行了保存，後來在出現92.25%的訓練結果後手動結束了程序，訓練過程如下：

可見第52000次訓練的準確率爲0.9225。

刪掉多餘的訓練模型後剩餘：

測試結果（圖片位於桌面，是從百度隨機下載的一張驗證碼圖片）：

圖片中驗證碼爲1TjV，但是識別結果爲1TJY，大小寫識別和部分易混淆字母的識別不準確。

解決思路：由於神經網絡模型對單個數字或字母的識別率很高，且模型訓練較容易，可以增加圖片分割操作，將所有字符逐個分開識別。這樣的好處一是提升識別精度，二是避免驗證碼字符個數的限制，可以識別任意個字符組成的驗證碼。

目前只有思路，時間問題還未具體實踐

Java的學生管理系統樣卷內容和小學期進行的學生管理系統差異不大，用Java代碼實現起來也非常簡單，這裏不再列出其代碼。

除此以外，還實驗了支持下載隊列的多線程網絡爬蟲，代碼如下：

 1 from urllib import request
 2 import re
 3 from bs4 import BeautifulSoup
 4 from time import ctime,sleep
 5 import os,sys,io
 6 import threading
 7 # 在當前目錄建立一個urls子目錄，用於保存下載的HTML文件
 8 os.makedirs('urls',exist_ok=True)
 9 # 下載隊列，入口點的URL會做爲下載隊列的第一個元素，這裏以「極客教程」網站爲例
10 insertURL = ["https://geekori.com"]
11 # 已經處理完的URL會添加到這個隊列中
12 delURL = []
13 # 負責下載和分析的HTML代碼的函數，該函數會在多個線程中執行
14 def getURL():
15     while(1):
16         global insertURL
17         global delURL
18         try:
19             if len(insertURL)>0:
20                 # 從隊列頭取一個URL
21                 html = request.urlopen(insertURL[0]).read()
22                 soup = BeautifulSoup(html,'lxml')
23                 # 開始分析HTML代碼
24                 title = soup.find(name='title').get_text().replace('\n','')
25                 fp = open("./urls/"+str(title)+".html","w",encoding='utf-8')
26                 # 將HTML代碼保存到相應的文件中
27                 fp.write(str(html.decode('utf-8')))
28                 fp.close()
29                 # 開始查找全部的a標籤
30                 href_ = soup.find_all(name='a')
31                 # 對全部的a標籤進行迭代
32                 for each in href_:
33                     urlStr = each.get('href')
34                     if str(urlStr)[:4]=='http' and urlStr not in insertURL:
35                         # 添加全部以http開頭而且沒有處理過的URL
36                         insertURL.append(urlStr)
37                         print(urlStr)
38                 # 將處理完的URL添加到delURL隊列中
39                 delURL.append(insertURL[0])
40                 # 刪除inserURL中處理完的URL
41                 del insertURL[0]
42         except:
43             delURL.append(insertURL[0])
44             del insertURL[0]
45             continue
46         sleep(2)
# Create three worker threads that all run getURL
threads = [threading.Thread(target=getURL) for _ in range(3)]
t1, t2, t3 = threads

if __name__=='__main__':
    for t in threads:
        # Thread.setDaemon() is deprecated; set the attribute instead.
        t.daemon = True
        t.start()
    # getURL never returns, so these joins keep the main thread alive until
    # the process is interrupted.
    for tt in threads:
        tt.join()