class Parse多線程架構圖

時間 2020-09-30

原文鏈接

多線程爬蟲

多線程爬蟲
- 今日內容

今日內容

1. 併發與並行(**)
2. 多線程導致數據不安全(**) --> 理解不了, 那就記住結論(多線程共同操作數據會導致數據不安全)
3. 多線程爬蟲架構(*****)
4. 多線程爬蟲的代碼(*****)

1.併發與並行

1. 併發: 在同一時間段內, 多個任務交替執行(宏觀上看起來像同時運行).
2. 並行: 在同一時刻, 多個任務真正同時執行(需要多個 CPU 核心).

2.多線程

i = 0
i += 1
i -= 1
print(i)

多線程共同操作數據會導致數據不安全: `i += 1` 不是原子操作(讀取、計算、寫回是三個步驟), 多個線程交錯執行這些步驟時, 最終結果可能不是 0.

3.多線程架構圖

1. 取出 url, 發請求, 獲取響應
2. 數據解析
3. 數據持久化

from threading import Thread from threading import Lock from queue import Queue import requests import pymysql from lxml import etree # base_url = 'http://xiaohua.zol.com.cn/youmo/%s.html' # 爬蟲類 class Sqider(Thread): def __init__(self, sname, urlQueue, dataQueue): super().__init__() self.sname = sname self.urlQueue = urlQueue self.dataQueue = dataQueue self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36' } # 爬取數據 def run(self, ): base_url = 'http://xiaohua.zol.com.cn/youmo/%s.html' while 1: # block 表明阻塞，block爲True即爲阻塞。block爲False爲不阻塞 try: print('%s正在爬取數據' % self.name) page = self.urlQueue.get(block=False) res = requests.get(url=base_url % page, headers=self.headers) self.dataQueue.put(res.text) print('%s提交數據完畢--' % self.name) except: break # 解析類 class Parse(Thread): def __init__(self, pname, dataQueue, conn, cursor, lock): super().__init__() self.pname = pname self.dataQueue = dataQueue self.conn = conn self.cursor = cursor self.lock = lock def run(self): # 實現數據解析的過程 print('run方法已經調用') self.parse(self.dataQueue) # 調用parse方法 def parse(self, dQueue): # 實現具體的解析過程 while 1: try: html = dQueue.get() tree = etree.HTML(html) li_list = tree.xpath('//li[@class="article-summary"]') for li in li_list: title = li.xpath('.//span[@class="article-title"]/a/text()')[0] content = li.xpath('.//div[@class="summary-text"]//text()') data = {'title': title, 'content': content} with self.lock: self.save(data) except: break # 存儲數據 def save(self, data): sql = 'insert into joke values ("%s","%s")' % (data['title'], data['content']) try: self.cursor.execute(sql) self.conn.commit() print('%s存儲數據成功' % self.pname) except Exception as e: print(e) self.conn.rollback() # 主函數 defmain():# 盛放url的地方 url隊列 urlQueue =Queue()for page in range(1,101): urlQueue.put(page)# 放置響應數據 dataQueue =Queue()# 開啓爬蟲線程 snames =['爬蟲1號','爬蟲2號','爬蟲3號'] slist =[]for sname in snames:# 實例化線程對象 s =Sqider(sname, urlQueue, dataQueue)# 
開啓線程 s.start() slist.append(s)for s in slist: s.join()# 連接數據庫 conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='1234', charset='utf8', database='xiaohua') cursor = conn.cursor()# 解析線程的開啓 lock =Lock()# 上鎖 plist =[] pnl =['解析1號','解析2號','解析3號']for pname in pnl: p =Parse(pname, dataQueue, conn, cursor, lock) p.start() plist.append(p)for p in plist: p.join()# 程序入口if __name__ =='__main__':