基於以前的爬取代碼，咱們進行函數的封裝，並且加入多線程。
以前的代碼：http://www.javashuo.com/article/p-gazlsdjj-do.html
from concurrent import futures
導入多線程所需的模塊
ex = futures.ThreadPoolExecutor(max_workers =22) # create a thread pool with at most 22 worker threads
函數
ex.submit(方法,方法須要傳入的參數)
url
"""Scrape meme images from doutula.com article list pages using a thread pool.

Each list page is fetched sequentially; the images found on a page are then
downloaded concurrently into an ``img`` directory next to this script.
"""
import itertools
import os
from concurrent import futures  # thread pool for concurrent downloads

import requests
from lxml.html import etree

url = 'http://www.doutula.com/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',}

# Seconds to wait on a single HTTP request. Without a timeout a stalled
# connection would block its worker thread (and the whole crawl) forever.
REQUEST_TIMEOUT = 10
# Upper bound on simultaneous image downloads.
MAX_WORKERS = 22


def img_url_lis(url):
    """Return the ``data-original`` attribute of every ``<img>`` on the page at *url*.

    The response body is decoded as UTF-8 before being parsed as HTML.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf8'
    response_html = etree.HTML(response.text)
    return response_html.xpath('.//img/@data-original')


# Create the image output directory next to this script.
img_file_path = os.path.join(os.path.dirname(__file__), 'img')
# exist_ok avoids the check-then-create race of exists() followed by mkdir().
os.makedirs(img_file_path, exist_ok=True)
print(img_file_path)


def dump_one_img(url):
    """Download the image at *url* into ``img_file_path``.

    The file name is the last ``/``-separated segment of the URL.
    """
    name = str(url).split('/')[-1]
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    img_path = os.path.join(img_file_path, name)
    with open(img_path, 'wb') as fw:
        fw.write(response.content)


def dump_imgs(urls: list):
    """Download every image in *urls* concurrently and wait for all of them.

    A single executor is shared by the whole batch, so at most
    ``MAX_WORKERS`` downloads run at once, and its threads are released when
    the ``with`` block exits. A failed download is reported and skipped
    rather than aborting the rest of the batch.
    """
    with futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
        pending = {ex.submit(dump_one_img, img_url): img_url for img_url in urls}
        for done in futures.as_completed(pending):
            # Inspect each future so that download errors are not lost silently.
            error = done.exception()
            if error is not None:
                print(f'download failed: {pending[done]} ({error!r})')


def run():
    """Crawl list pages from page 1 upward until a page yields no image URLs.

    Prints the page number at which the crawl stopped.
    """
    for count in itertools.count(1):
        # NOTE(review): page 10 is skipped on purpose in the original code;
        # the reason is not visible here -- confirm it is still needed.
        if count == 10:
            continue
        lis = img_url_lis(f'http://www.doutula.com/article/list/?page={count}')
        if not lis:
            print(count)
            break
        dump_imgs(lis)
        print(f'第{count}頁也就完成')


if __name__ == '__main__':
    run()
這樣就能夠更加快速地爬取多個內容。
線程