代碼以下:
from gevent import monkey

# Must run before any other imports so that socket/thread primitives
# used by requests are cooperative under gevent.
monkey.patch_all()

from os import path, makedirs
from time import time
from threading import Thread
from multiprocessing import Pool as MPool

from gevent.pool import Pool
import requests
from bs4 import BeautifulSoup
from functools import wraps

# A stalled connection would otherwise hang a worker forever.
REQUEST_TIMEOUT = 10


def timer(fun):
    """Decorator for DownLoadBaiDu methods: print elapsed seconds and
    the total number of images downloaded (read from ``self.count``)."""
    @wraps(fun)
    def wrapper(*args, **kwargs):
        self = args[0]  # decorated callables are bound methods
        start_at = time()
        f = fun(*args, **kwargs)
        use_dt = int(time() - start_at)
        print("一共耗時{}秒".format(use_dt))
        print("一共下載{}張照片".format(self.count))
        return f
    return wrapper


class DownLoadBaiDu(object):
    """Search Baidu Tieba for *keyword* and download every .jpg image
    found in the matching threads into an ``img/`` directory.

    :param keyword: search keyword for the Tieba forum search.
    :param full: when True, process every thread found; otherwise only
        the slice ``[start:stop]`` of the result list.
    :param start: slice start index (used when ``full`` is False).
    :param stop: slice stop index (used when ``full`` is False).
    """

    def __init__(self, keyword, full=False, start: int = 0, stop: int = 1):
        self.full = full
        self.search_url = "https://tieba.baidu.com/f?ie=utf-8&kw={}&fr=search".format(keyword)
        self.start = start
        self.stop = stop
        self.count = 0  # number of images written so far

    @property
    def image_path(self):
        """Directory the images are written to; created on first access."""
        img_path = path.join(path.dirname(__file__), "img")
        # exist_ok avoids a check-then-create race between workers.
        makedirs(img_path, exist_ok=True)
        return img_path

    def save_jpg(self, url):
        """Download one image *url* and write it under :attr:`image_path`,
        keeping the remote file's basename. Increments :attr:`count`."""
        print("url:{}".format(url))
        _jpg = path.join(self.image_path, path.split(url)[1])
        jpg_content = requests.get(url, timeout=REQUEST_TIMEOUT).content
        with open("{}".format(_jpg), "wb") as f:
            f.write(jpg_content)
        self.count += 1

    def download_img(self, uri):
        """Fetch the thread page at *uri* (relative to tieba.baidu.com)
        and save every absolute-URL .jpg image it contains."""
        url = "https://tieba.baidu.com/{}".format(uri)
        html = requests.get(url, timeout=REQUEST_TIMEOUT).text
        print("\n帖子連接:{}\n".format(url))
        # Explicit parser: without it bs4 picks whatever is installed,
        # which is non-deterministic and raises a warning.
        soup = BeautifulSoup(html, "html.parser")
        for img in soup.find_all("img"):
            _img = img.attrs.get("src", "")
            # Only absolute jpg URLs; skips data: URIs and relative paths.
            if _img.startswith("http") and _img.endswith("jpg"):
                self.save_jpg(url=_img)

    def get_url_number(self):
        """獲取搜索結果的id

        Return the list of thread URIs (href values) from the search
        result page, optionally sliced to ``[start:stop]``.
        """
        numbers = []
        h = requests.get(self.search_url, timeout=REQUEST_TIMEOUT).text
        soup = BeautifulSoup(h, "html.parser")
        ds = soup.find_all(attrs={"class": "j_th_tit"})
        print("一共獲取到{}個帖子".format(len(ds)))
        if not self.full:
            ds = ds[self.start: self.stop]
        for index, i in enumerate(ds):
            try:
                if "href" in i.attrs:
                    uri = i.attrs["href"]
                else:
                    # The class can also land on a wrapper element; the
                    # link is then on a descendant <a>. (The original
                    # ``i.href`` always returned None here — bs4 attribute
                    # access looks up child *tags*, not HTML attributes.)
                    uri = i.find("a").attrs["href"]
            except (AttributeError, KeyError):
                # Log the offending element, then let the error surface.
                print(i)
                raise
            numbers.append(uri)
        return numbers

    @timer
    def multipr_down(self):
        """多進程,跑多進程時註釋掉gevent的相關import"""
        urls = self.get_url_number()
        p = MPool()
        for url in urls:
            p.apply_async(self.download_img, args=(url,))
        p.close()
        p.join()

    @timer
    def thread_down(self):
        """多線程"""
        urls = self.get_url_number()
        ts = []
        for url in urls:
            t = Thread(target=self.download_img, args=[url])
            ts.append(t)
        for i in ts:
            i.start()
        for i in ts:
            i.join()

    @timer
    def gevent_down(self):
        """gevent處理"""
        urls = self.get_url_number()
        pool = Pool(len(urls))
        pool.map(self.download_img, urls)

    @timer
    def get(self):
        """單進程"""
        numbers = self.get_url_number()
        for index, number in enumerate(numbers):
            print("正在下載第{}個帖子的圖片".format(index + 1))
            self.download_img(number)


if __name__ == '__main__':
    d = DownLoadBaiDu(keyword="美女", full=True)
    d.gevent_down()


"""
單進程:
一共耗時155秒
一共下載718張照片

多線程:
一共耗時34秒
一共下載866張照片

Gevent:
一共耗時30秒
一共下載770張照片
"""