'''
Scrape jokes from Qiushibaike: grab each post's content and link, and write them to CSV.
Techniques used: multithreading, locks, queues, XPath, csv.
'''
import csv
import threading
from queue import Queue, Empty

import requests
from lxml import etree


class Creeper(threading.Thread):
    """Producer: fetches list pages and pushes (content, link) pairs onto content_queue."""

    def __init__(self, url_queue, content_queue, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.url_queue = url_queue
        self.content_queue = content_queue

    def run(self):
        while True:
            try:
                # A non-blocking get avoids the race between empty() and get()
                # when several producer threads drain the same queue.
                url = self.url_queue.get_nowait()
            except Empty:
                break
            self.parse_page(url)

    def parse_page(self, url):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/72.0.3626.109 Safari/537.36"
        }
        response = requests.get(url, headers=headers)
        html = etree.HTML(response.text)
        divs = html.xpath('//div[contains(@class,"article block")]')
        for div in divs:
            # "contentHerf" is the (misspelled) class name used in the site's own HTML.
            content = div.xpath('.//a[@class="contentHerf"]//span[1]//text()')
            new_content = "\n".join(x.replace('\n', '') for x in content)
            a_url = "https://www.qiushibaike.com" + div.xpath('.//a[@class="contentHerf"]/@href')[0]
            self.content_queue.put((new_content, a_url))


class SaveFile(threading.Thread):
    """Consumer: pops (content, link) pairs off content_queue and appends them to the CSV."""

    def __init__(self, content_queue, writer, lock, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.content_queue = content_queue
        self.writer = writer
        self.lock = lock

    def run(self):
        while True:
            try:
                # Give up after 30 seconds with no new items.
                content, link = self.content_queue.get(timeout=30)
            except Empty:
                break
            # csv.writer is not thread-safe, so writes to the file must hold the lock.
            with self.lock:
                self.writer.writerow((content, link))
            print('Saved one record')


def main():
    url_queue = Queue(100)
    content_queue = Queue(300)
    base_url = "https://www.qiushibaike.com/text/page/{}/"
    g_lock = threading.Lock()

    # utf-8-sig writes a BOM so the Chinese text opens correctly in Excel.
    f = open('糗事百科.csv', 'a', encoding='utf-8-sig', newline="")
    writer = csv.writer(f)
    writer.writerow(['content', 'link'])

    for i in range(1, 13):
        url_queue.put(base_url.format(i))

    creepers = [Creeper(url_queue, content_queue) for _ in range(2)]
    for c in creepers:
        c.start()

    savers = [SaveFile(content_queue, writer, g_lock) for _ in range(2)]
    for s in savers:
        s.start()

    # Wait for every thread before closing the file, so no buffered writes are lost.
    for t in creepers + savers:
        t.join()
    f.close()


if __name__ == '__main__':
    main()
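

# A possible variant (a minimal sketch of my own, not part of the original script;
# SentinelSaveFile is a hypothetical name): a sentinel-based consumer. Instead of
# waiting out SaveFile's 30-second timeout, main() could join the Creeper threads,
# then put one None per consumer, letting each consumer exit as soon as the queue
# is drained.
class SentinelSaveFile(threading.Thread):
    def __init__(self, content_queue, writer, lock, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.content_queue = content_queue
        self.writer = writer
        self.lock = lock

    def run(self):
        while True:
            item = self.content_queue.get()  # block until work or a sentinel arrives
            if item is None:  # sentinel: producers are finished
                break
            content, link = item
            with self.lock:  # csv.writer is not thread-safe; serialize writes
                self.writer.writerow((content, link))

# Usage sketch: after `for c in creepers: c.join()`, run
# `for _ in savers: content_queue.put(None)` and then join the savers.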