8. Simple multithreaded crawling of web page data, parsed with XPath and saved locally

# Author: toloy
# Queue module for the page-number and data queues
import queue
# Threading support
import threading
# lxml's etree for XPath parsing
from lxml import etree
# HTTP request handling
import requests

class ThreadCrawl(threading.Thread):
    '''
    Crawl-worker class: takes a page number from the page queue, builds the
    URL, fetches the page, and puts the response body into the data queue.
    '''
    def __init__(self, threadName, pageQueue, dataQueue):
        '''
        Constructor.
        :param threadName: name of this thread
        :param pageQueue: queue of page numbers to crawl
        :param dataQueue: queue the fetched HTML is pushed into
        '''
        super(ThreadCrawl, self).__init__()
        self.threadName = threadName
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"}

    def run(self):
        '''
        Thread body: keep pulling page numbers until the page queue is empty.
        :return:
        '''
        while not self.pageQueue.empty():
            try:
                pageNum = self.pageQueue.get(False)
            except queue.Empty:
                # Another worker drained the queue between empty() and get()
                continue
            url = "http://www.dfenqi.cn/Product/Category?category=4945805937081335060-0-0&pageIndex=" + str(pageNum)
            try:
                content = requests.get(url, headers=self.headers, timeout=10).text
                self.dataQueue.put(content)
                print(self.threadName + ' fetched page ' + str(pageNum))
            except requests.RequestException:
                # Skip pages that fail to download instead of killing the thread
                pass

# Flag telling the parse threads to exit once all data has been handled
PARSE_THREAD_EXIST = False

class ParseThread(threading.Thread):
    '''
    Parse-worker class: extracts titles from the fetched HTML and writes them
    to a local file.
    '''
    def __init__(self, threadName, dataQueue, fileName):
        '''
        Constructor.
        :param threadName: name of this thread
        :param dataQueue: queue the crawl threads feed HTML into
        :param fileName: path of the output file
        '''
        super(ParseThread, self).__init__()
        self.threadName = threadName
        self.dataQueue = dataQueue
        self.fileName = fileName

    def run(self):
        '''
        Thread body: take HTML from the data queue, extract the titles with
        XPath, and append them to the local file.
        :return:
        '''
        while not PARSE_THREAD_EXIST:
            try:
                html = self.dataQueue.get(False)
            except queue.Empty:
                continue
            print(self.threadName + ' got data')
            text = etree.HTML(html)
            # Extract the item titles; adapt the XPath (or plug in a dedicated
            # parser class) to match whatever fields you need
            titleList = text.xpath("//div[@class='liebiao']/ul/li/a/p/text()")
            with open(self.fileName, 'a', encoding='utf-8') as f:
                for title in titleList:
                    f.write(title + "\n")
            print(self.threadName + ' finished parsing')

def main():
    '''
    Entry point: build the queues, start the crawl and parse threads, and
    wait for all of them to finish.
    :return:
    '''
    # Page-number queue holding the 50 pages to crawl
    pageQueue = queue.Queue(50)
    for i in range(1, 51):
        pageQueue.put(i)
    # Data queue holding fetched HTML for the parse threads
    dataQueue = queue.Queue()
    # File the parsed titles are appended to
    fileName = 'file.txt'
    # Names of the crawl threads
    cawlThreadNameList = ['Crawl-1', 'Crawl-2']
    crawThreadlList = []
    for threadName in cawlThreadNameList:
        thread = ThreadCrawl(threadName, pageQueue, dataQueue)
        thread.start()
        crawThreadlList.append(thread)

    # Names of the parse threads
    parseThreadNameList = ['Parse-1', 'Parse-2']
    parseThreadList = []
    for threadName in parseThreadNameList:
        thread = ParseThread(threadName, dataQueue, fileName)
        thread.start()
        parseThreadList.append(thread)
    # Wait for the crawl threads to finish
    for thread in crawThreadlList:
        thread.join()
    # Only signal the parse threads to exit once the data queue has drained;
    # checking pageQueue alone could abandon HTML still sitting in dataQueue
    while not dataQueue.empty():
        pass
    global PARSE_THREAD_EXIST
    PARSE_THREAD_EXIST = True
    # Wait for the parse threads to exit
    for thread in parseThreadList:
        thread.join()

if __name__ == "__main__":
    main()
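
A note on the shutdown logic: the global PARSE_THREAD_EXIST flag plus the busy-wait drain in main() works, but queue.Queue already provides a cleaner mechanism via task_done() and join(), usually combined with sentinel values. Below is a minimal sketch of that pattern; the SENTINEL name and the consumer function are illustrative, not part of the original code.

import queue
import threading

SENTINEL = None  # hypothetical marker telling each consumer to stop

def consumer(dataQueue):
    while True:
        item = dataQueue.get()       # blocks until an item is available
        if item is SENTINEL:
            dataQueue.task_done()
            break
        # ... parse `item` here ...
        dataQueue.task_done()        # mark this item as fully processed

dataQueue = queue.Queue()
workers = [threading.Thread(target=consumer, args=(dataQueue,)) for _ in range(2)]
for w in workers:
    w.start()
for html in ('<html>page 1</html>', '<html>page 2</html>'):
    dataQueue.put(html)              # crawl threads would put fetched HTML here
for _ in workers:
    dataQueue.put(SENTINEL)          # one sentinel per consumer
dataQueue.join()                     # blocks until every put() has been task_done()
for w in workers:
    w.join()

Because join() only returns after every queued item has been marked done, no page can be dropped between crawling and parsing, and the exit flag disappears entirely.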
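To check what the XPath expression in ParseThread extracts, it can help to run it against a small standalone fragment first. The snippet below is a sketch under an assumed markup: the HTML structure is inferred from the expression itself and may not match the real pages of www.dfenqi.cn.

from lxml import etree

# Hypothetical fragment shaped the way the XPath above expects
html = """
<div class="liebiao">
  <ul>
    <li><a href="/p/1"><p>Product one</p></a></li>
    <li><a href="/p/2"><p>Product two</p></a></li>
  </ul>
</div>
"""

tree = etree.HTML(html)
titles = tree.xpath("//div[@class='liebiao']/ul/li/a/p/text()")
print(titles)  # ['Product one', 'Product two']

Each match comes back as a plain string, which is why ParseThread can write the titles straight to the output file.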