<Casual notes> A multithreading example

'''
	While one thread is using a shared resource, the other threads must wait for it to finish.
	This is enforced with a "lock", whose job is to stop multiple threads from touching the same memory at once.
	Process: one execution of a program.
	Thread: the basic unit of CPU scheduling.
	Multithreading: suits heavy I/O-bound work; while one thread waits on a response, the others keep working.
	Multiprocessing: suits heavy CPU-bound parallel computation.
	scrapy: an asynchronous networking framework (many coroutines doing the work).
	Pipeline: page queue -- crawl threads take page numbers and fetch (network I/O) -- data queue (the responses) -- parse threads parse the pages (disk I/O) -- parsed data is stored.
'''
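
A minimal standalone sketch of the lock idea above; the shared counter and worker function are made up for illustration:

import threading

counter = 0                          # shared state that every thread touches
counter_lock = threading.Lock()

def worker():
	global counter
	for _ in range(100000):
		with counter_lock:           # only one thread may enter at a time
			counter += 1

threads = [threading.Thread(target=worker) for _ in range(2)]
for t in threads:
	t.start()
for t in threads:
	t.join()
print(counter)                       # 200000 with the lock; updates may be lost without it
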
# HTTP requests
import requests
# thread-safe FIFO queue for handing work between threads
from queue import Queue
# threads
from threading import Thread
import threading
# HTML parsing
from lxml import etree
# storage
import json
import time


class ThreadCrawl(Thread):
	def __init__(self, threadName, pageQueue, dataQueue):
		# call the parent class's initializer
		super(ThreadCrawl, self).__init__()
		self.threadName = threadName
		self.pageQueue = pageQueue
		self.dataQueue = dataQueue
		self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36"}

	# thread.start() invokes this run() method
	def run(self):
		print("Starting " + self.threadName)
		while not CRAWL_EXIT:
			try:
				# Take one page number off the page queue.
				# The optional block argument defaults to True:
				# 1. If the queue is empty and block is True, get() blocks
				#    until a new value arrives.
				# 2. If the queue is empty and block is False, get() raises
				#    queue.Empty (see the small demo after this class).
				page = self.pageQueue.get(False)
				url = "https://www.qiushibaike.com/text/page/" + str(page) + "/"
				content = requests.get(url, headers=self.headers).text
				# put the page source on the data queue
				self.dataQueue.put(content)
			except Exception:
				pass
		print("Exiting " + self.threadName)
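
A tiny standalone demo of the non-blocking get() described in the comments above:

from queue import Queue, Empty

q = Queue()
try:
	q.get(False)              # non-blocking get on an empty queue
except Empty:
	print("queue is empty")   # raised immediately instead of blocking
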

class ThreadParse(Thread):
	def __init__(self,threadName,dataQueue,filename,lock):
		super(ThreadParse,self).__init__()
		self.threadName = threadName
		self.dataQueue = dataQueue
		self.filename = filename
		self.lock = lock

	def run(self):
		while not PARSE_EXIT:
			try:
				# non-blocking get; raises queue.Empty when no data is ready
				html = self.dataQueue.get(False)
				self.parse(html)
			except Exception:
				pass

	def parse(self, html):
		# parse the page source into an element tree
		root = etree.HTML(html)
		print(root)

		# A with statement guarantees that __enter__ and __exit__ both run,
		# no matter what happens in the body:
		# acquire the lock, do the work, release the lock.
		with self.lock:
			# serialize the tree back to a string so json can encode it,
			# then append it as one JSON line
			data = etree.tostring(root, encoding="unicode")
			self.filename.write(json.dumps(data, ensure_ascii=False) + "\n")
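
The with self.lock: block above is shorthand for the manual acquire/release pattern; a short equivalent sketch:

import threading

lock = threading.Lock()

lock.acquire()                # what __enter__ does
try:
	pass                      # critical section, e.g. the file write
finally:
	lock.release()            # what __exit__ does, even if the body raised
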


CRAWL_EXIT = False
PARSE_EXIT = False

def main():
	# page queue; it can hold at most 20 values
	pageQueue = Queue(20)
	# enqueue page numbers 1-20, first in, first out
	for i in range(1, 21):
		pageQueue.put(i)

	# data queue for the HTML source; no size argument means unbounded
	dataQueue = Queue()

	# create the lock
	lock = threading.Lock()

	# names for the crawl threads
	crawlList = ["Crawl thread 1", "Crawl thread 2", "Crawl thread 3"]

	# keep references to the crawl threads
	thread_crawl = []
	for threadName in crawlList:
		# create one crawl thread
		thread = ThreadCrawl(threadName, pageQueue, dataQueue)
		thread.start()
		thread_crawl.append(thread)

	filename = open("duanzi.json", "a", encoding="utf-8")
	# names for the parse threads
	parseList = ["Parse thread 1", "Parse thread 2", "Parse thread 3"]
	threadparse = []
	for threadName in parseList:
		thread = ThreadParse(threadName,dataQueue,filename,lock)
		thread.start()
		threadparse.append(thread)



	# busy-wait here as long as the page queue is not empty
	# (a cleaner shutdown pattern is sketched after the script)
	while not pageQueue.empty():
		pass
	# once the queue is empty, set CRAWL_EXIT = True so the crawl threads exit
	global CRAWL_EXIT
	CRAWL_EXIT = True

	# join() blocks: the main thread continues only after each crawl thread finishes
	for thread in thread_crawl:
		thread.join()
		print(thread)

	while not dataQueue.empty():
		pass

	global PARSE_EXIT
	PARSE_EXIT = True

	for thread in threadparse:
		thread.join()
		print(thread)

	with lock:
		# close the output file
		filename.close()
	print("All done")

if __name__ == '__main__':
	main()
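
The two busy-wait loops in main() poll empty() in a tight loop, burning CPU. A common alternative is Queue.task_done()/Queue.join() plus a sentinel value that tells each worker to quit; a minimal sketch of that pattern (worker and SENTINEL are illustrative names, not part of the script above):

from queue import Queue
from threading import Thread

SENTINEL = None                        # marker meaning "no more work"

def worker(q):
	while True:
		item = q.get()                 # blocks until an item is available
		if item is SENTINEL:
			q.task_done()
			break
		print("processing", item)      # stand-in for the fetch/parse work
		q.task_done()                  # mark this item as handled

q = Queue()
threads = [Thread(target=worker, args=(q,)) for _ in range(3)]
for t in threads:
	t.start()
for page in range(1, 21):
	q.put(page)
for _ in threads:
	q.put(SENTINEL)                    # one sentinel per worker, so each one exits
q.join()                               # returns once every item was task_done()'d
for t in threads:
	t.join()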