# Author: toloy
# Queue module providing the thread-safe page and data queues
import queue
# Threading module for the crawl and parse worker threads
import threading
# lxml's etree for XPath parsing of the fetched HTML
from lxml import etree
# requests for issuing the HTTP requests
import requests
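
# Architecture overview: a two-stage producer/consumer pipeline.
# ThreadCrawl workers (producers) take page numbers from pageQueue, fetch
# the corresponding listing pages, and push the raw HTML into dataQueue;
# ParseThread workers (consumers) take HTML from dataQueue, extract the
# product titles with XPath, and append them to a local file.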


class ThreadCrawl(threading.Thread):
    '''
    Crawl thread: takes a page number from the page queue, builds the URL,
    requests the page, and puts the response body into the data queue.
    '''
    def __init__(self, threadName, pageQueue, dataQueue):
        '''
        Constructor.
        :param threadName: name of this thread
        :param pageQueue: queue of page numbers to fetch
        :param dataQueue: queue the fetched HTML is written to
        '''
        super(ThreadCrawl, self).__init__()
        self.threadName = threadName
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue
        # Browser-like User-Agent; some sites reject the default requests UA
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"}
    def run(self):
        '''
        Thread body: drain the page queue, fetching one listing page per
        iteration.
        :return:
        '''
        # pageQueue is filled once before the threads start and never
        # refilled, so empty() is a safe loop condition here
        while not self.pageQueue.empty():
            try:
                pageNum = self.pageQueue.get(False)
                url = "http://www.dfenqi.cn/Product/Category?category=4945805937081335060-0-0&pageIndex=" + str(pageNum)
                content = requests.get(url, headers=self.headers, timeout=10).text
                self.dataQueue.put(content)
                print(self.threadName + ' fetched a page')
            except queue.Empty:
                # Another crawl thread drained the queue between our
                # empty() check and get(); nothing left to do
                break
            except requests.RequestException:
                # Network error: skip this page and carry on
                pass


# Flag telling the parse threads to exit once all data has been handled
PARSE_THREAD_EXIT = False
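# A threading.Event would be the more idiomatic signal here (main calls
# event.set(), the workers poll event.is_set()); a plain module-level
# boolean also works in CPython, where the GIL keeps this read/write safe.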


class ParseThread(threading.Thread):
    '''
    Parse thread: extracts product titles from the fetched pages.
    '''
    def __init__(self, threadName, dataQueue, fileName):
        '''
        Constructor.
        :param threadName: name of this thread
        :param dataQueue: queue of fetched HTML to parse
        :param fileName: file the extracted titles are appended to
        '''
        super(ParseThread, self).__init__()
        self.threadName = threadName
        self.dataQueue = dataQueue
        self.fileName = fileName
    def run(self):
        '''
        Thread body: take HTML from the data queue, parse it, and append
        the extracted titles to the local file.
        :return:
        '''
        while not PARSE_THREAD_EXIT:
            try:
                html = self.dataQueue.get(False)
                print(self.threadName + ' got a page')
                text = etree.HTML(html)
                # Extract the product titles; adapt this XPath (or plug in
                # a dedicated parser class) to pull out whatever you need
                titleList = text.xpath("//div[@class='liebiao']/ul/li/a/p/text()")
                with open(self.fileName, 'a', encoding='utf-8') as f:
                    for title in titleList:
                        f.write(title + "\n")
                print(self.threadName + ' parsed a page')
            except queue.Empty:
                # No data yet; keep polling until the exit flag is raised
                continue
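
# The XPath above assumes the site's listing markup
# (div.liebiao > ul > li > a > p) as it stood when this was written;
# if the page layout changes, only that expression needs updating.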


def main():
    '''
    Entry point: builds the queues, starts the worker threads, and waits
    for them to finish.
    :return:
    '''
    # Page-number queue holding the 50 listing pages to crawl
    pageQueue = queue.Queue(50)
    for i in range(1, 51):
        pageQueue.put(i)
    # Data queue passing the fetched HTML to the parse threads
    dataQueue = queue.Queue()
    # File the parsed titles are saved to
    fileName = 'file.txt'
    # Names for the crawl threads
    crawlThreadNameList = ['Thread 1', 'Thread 2']
    crawlThreadList = []
    for threadName in crawlThreadNameList:
        thread = ThreadCrawl(threadName, pageQueue, dataQueue)
        thread.start()
        crawlThreadList.append(thread)
    # Names for the parse threads
    parseThreadNameList = ['Parse thread 1', 'Parse thread 2']
    parseThreadList = []
    for threadName in parseThreadNameList:
        thread = ParseThread(threadName, dataQueue, fileName)
        thread.start()
        parseThreadList.append(thread)
    # Wait for the crawl threads to finish
    for thread in crawlThreadList:
        thread.join()
    # All pages are fetched; wait for the parse threads to drain the data
    # queue, then raise the exit flag so they stop polling
    while not dataQueue.empty():
        pass
    global PARSE_THREAD_EXIT
    PARSE_THREAD_EXIT = True
    # Wait for the parse threads to exit
    for thread in parseThreadList:
        thread.join()


if __name__ == "__main__":
    main()
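
# Running this module directly writes the collected titles to file.txt,
# one per line; the interleaved progress messages from the crawl and
# parse threads show the two pipeline stages overlapping.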