# 如何提高requests模塊爬取數據的效率?
- 多進程或多線程(不建議) 太耗費資源
- 線程池或進程池(適當使用)
- 單線程 + 異步協程(推薦)
# Thread-pool example: download the videos listed on a Pear Video category page.
import random
import re
from multiprocessing.dummy import Pool  # thread-backed Pool with the multiprocessing API

import requests
from lxml import etree

url = 'https://www.pearvideo.com/category_3'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}

# Collect the relative links of the detail pages from the category listing.
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
all_video = tree.xpath('//ul[@id="listvideoListUl"]/li/div/a/@href')

# Visit every detail page and extract the direct mp4 address from its source.
mp4_list = []
for video in all_video:
    video_url = 'https://www.pearvideo.com/%s' % video
    page_video = requests.get(url=video_url, headers=headers).text
    # findall(...)[0] raises IndexError when the page carries no srcUrl entry.
    mp4_url = re.findall('srcUrl="(.*?)",vdoUrl', page_video, re.S)[0]
    mp4_list.append(mp4_url)

# Thread pool with four workers, used for the slow download and save steps.
pool = Pool(4)


def mp4_request(url):
    """Download one video and return its body as bytes.

    Args:
        url: Direct address of the mp4 file.
    """
    return requests.get(url=url, headers=headers).content


def mp4_save(mp4_data):
    """Write one downloaded video into the current directory.

    Args:
        mp4_data: Raw bytes of the video.
    """
    # The name already carries the extension, so it is not appended a second
    # time when the path is built (the file used to end in ".mp4.mp4").
    # NOTE: names are random in 0..9999, so two videos can draw the same name
    # and the later write replaces the earlier file.
    name = str(random.randint(0, 9999)) + '.mp4'
    with open("./%s" % name, 'wb') as f:
        f.write(mp4_data)
    print(name, ',download ok')


mp4_data_list = pool.map(mp4_request, mp4_list)  # fetch the binary streams
pool.map(mp4_save, mp4_data_list)  # persist the data to disk
print('Task is OK!')  # completion notice
pool.close()  # no more work will be submitted
pool.join()  # wait for the worker threads to exit
# Variant of the thread-pool example that saves each video under its real title.
import re
from multiprocessing.dummy import Pool

import requests
from lxml import etree

url = 'https://www.pearvideo.com/category_3'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}

# Collect the relative links of the detail pages from the category listing.
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
all_video = tree.xpath('//ul[@id="listvideoListUl"]/li/div/a/@href')

# Build a list of single-entry dicts {title: mp4 address}, one per video.
mp4_list = []
for video in all_video:
    video_url = 'https://www.pearvideo.com/%s' % video
    page_video = requests.get(url=video_url, headers=headers).text
    tree = etree.HTML(page_video)
    name = tree.xpath('//*[@id="detailsbd"]/div[1]/div[2]/div/div[1]/h1/text()')[0]
    mp4_url = re.findall('srcUrl="(.*?)",vdoUrl', page_video, re.S)[0]
    mp4_list.append({name: mp4_url})

# NOTE(review): the first two entries are dropped here; the notes do not say
# why - confirm whether this is still wanted.
mp4_list = mp4_list[2:]
print(mp4_list)

# Thread pool with four workers, used for the slow download and save steps.
pool = Pool(4)


def mp4_request(url):
    """Download one video.

    Args:
        url: Single-entry dict mapping the video title to its mp4 address
            (only the first entry is used).

    Returns:
        A single-entry dict mapping the same title to the downloaded bytes.
    """
    title, address = next(iter(url.items()))
    return {title: requests.get(url=address, headers=headers).content}


def mp4_save(mp4_data):
    """Write one downloaded video to ``./<title>.mp4``.

    Args:
        mp4_data: Single-entry dict mapping the video title to its bytes
            (only the first entry is used).
    """
    title, content = next(iter(mp4_data.items()))
    # Titles come straight from the web page; replace characters that are
    # path separators or reserved in file names so open() cannot fail or
    # write outside the current directory.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
    with open("./%s.mp4" % safe_title, 'wb') as f:
        f.write(content)
    print(title, ',download ok')


mp4_data_list = pool.map(mp4_request, mp4_list)  # fetch the binary streams
pool.map(mp4_save, mp4_data_list)  # persist the data to disk
print('Task is OK!')
pool.close()  # no more work will be submitted
pool.join()  # wait for the worker threads to exit
# -- 下面內容都是異步的內容 --
# Basic usage: run a single coroutine on an event loop.
import asyncio


async def hello(name):
    """Print a greeting for ``name``."""
    print('hello to:', name)


# Calling an async function only builds a coroutine object
# (<coroutine object hello at 0x...>); the body has not run yet.
c = hello('zc')

# Create an event loop object explicitly instead of relying on the implicit
# current loop, and make sure it is closed once the demo is over.
loop = asyncio.new_event_loop()
try:
    # Register the coroutine with the loop and run the loop until it finishes.
    loop.run_until_complete(c)  # prints "hello to: zc"
finally:
    loop.close()
# Using a task: a single coroutine wrapped in a Task object.
import asyncio


async def hello(name):
    """Print a greeting for ``name``."""
    print('hello to:', name)


c = hello('zc')

# Create an event loop object explicitly and close it when the demo is over.
loop = asyncio.new_event_loop()
try:
    # Wrap the coroutine in a Task that is bound to this loop.
    task = loop.create_task(c)
    print(task)  # printed before the loop runs: the task is still pending
    loop.run_until_complete(task)
    print(task)  # printed after the loop ran: the task is finished
finally:
    loop.close()
# Using a future: wrap the coroutine with asyncio.ensure_future.
import asyncio


async def hello(name):
    """Print a greeting for ``name``."""
    print('hello to:', name)


c = hello('zc')

# Create an event loop object explicitly and close it when the demo is over.
loop = asyncio.new_event_loop()
try:
    # Pass the loop explicitly: no loop is running yet, so ensure_future would
    # otherwise have to fall back on the implicit current loop.
    task = asyncio.ensure_future(c, loop=loop)
    print(task)  # printed before the loop runs: still pending
    loop.run_until_complete(task)
    print(task)  # printed after the loop ran: finished
finally:
    loop.close()
# future 綁定回調
# Binding a done-callback to a future.
import asyncio


def callback(task):
    """Done-callback: print the value returned by the finished task.

    Args:
        task: The completed task; ``task.result()`` is the coroutine's
            return value.
    """
    print('I am callback', task.result())


async def hello(name):
    """Print a greeting for ``name`` and return ``name``."""
    print('hello to:', name)
    return name


c = hello('zc')  # build the coroutine object

# Create an event loop object explicitly and close it when the demo is over.
loop = asyncio.new_event_loop()
try:
    # Pass the loop explicitly: no loop is running yet, so ensure_future would
    # otherwise have to fall back on the implicit current loop.
    task = asyncio.ensure_future(c, loop=loop)
    print(task)  # printed before the loop runs: still pending
    task.add_done_callback(callback)  # invoked by the loop once the task is done
    loop.run_until_complete(task)  # start executing now that everything is set up
    print(task)  # printed after the loop ran: finished, with its result
finally:
    loop.close()
# Single thread + multiple asynchronous coroutine tasks.
# aiohttp provides network requests that support async: pip install aiohttp
import asyncio
import time

import aiohttp


async def get_page(url):
    """Fetch ``url`` and print the response body.

    Args:
        url: Address to request with HTTP GET.
    """
    async with aiohttp.ClientSession() as session:
        # session.get() is used directly as an async context manager; an
        # extra ``await`` in front of it is not needed.
        async with session.get(url=url) as response:
            # text() yields the body as text; read() gives the binary form
            # of the response data, and json() is also available.
            page_text = await response.text()
            print('響應數據:', page_text)
            # Quieter alternative: print('ok %s' % url)


start = time.time()
urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/tom',
]
tasks = []  # task list holding one task object per URL

# Create an event loop object explicitly and close it when the demo is over.
loop = asyncio.new_event_loop()
try:
    for url in urls:
        c = get_page(url)  # build the coroutine object
        task = asyncio.ensure_future(c, loop=loop)  # wrap it in a task on this loop
        tasks.append(task)  # add it to the task list
    # Register the whole task list with the event loop and run until all finish.
    loop.run_until_complete(asyncio.wait(tasks))
    print('總耗時', time.time() - start)
finally:
    loop.close()

# -- Sample output kept from the original notes (labels translated) --
# NOTE(review): these messages do not match the print calls above, so the
# sample was presumably captured from a variant of this script - confirm.
# downloading http://127.0.0.1:5000/bobo
# downloading http://127.0.0.1:5000/jay
# downloading http://127.0.0.1:5000/tom
# download ok http://127.0.0.1:5000/bobo
# download ok http://127.0.0.1:5000/jay
# download ok http://127.0.0.1:5000/tom
# total time 2.0021142959594727
# Simple web server built on the Python Flask framework. Its three routes
# match the 127.0.0.1:5000 URLs requested by the aiohttp example.
import time

from flask import Flask

app = Flask(__name__)


@app.route('/bobo')
def index_bobo():
    """Respond to /bobo after a two-second pause that simulates slow I/O."""
    time.sleep(2)
    return 'Hello bobo'


@app.route('/jay')
def index_jay():
    """Respond to /jay after a two-second pause that simulates slow I/O."""
    time.sleep(2)
    return 'Hello jay'


@app.route('/tom')
def index_tom():
    """Respond to /tom after a two-second pause that simulates slow I/O."""
    time.sleep(2)
    return 'Hello tom'


if __name__ == '__main__':
    # threaded=True lets the development server handle each request in its
    # own thread, so simultaneous requests are not served one after another.
    app.run(threaded=True)
# High-performance asynchronous I/O against real web sites.
import asyncio
import time

import aiohttp


async def get_page(url):
    """Fetch ``url`` and print a short confirmation once the body is read.

    Args:
        url: Address to request with HTTP GET.
    """
    async with aiohttp.ClientSession() as session:
        # session.get() is used directly as an async context manager; an
        # extra ``await`` in front of it is not needed.
        async with session.get(url=url) as response:
            # text() yields the body as text; read() gives the binary form
            # of the response data, and json() is also available.
            page_text = await response.text()
            # Verbose alternative: print('響應數據:', page_text)
            print('ok %s' % url)


start = time.time()
urls = [
    'https://baidu.com',
    'https://y.qq.com',
    'https://www.taobao.com',
]
tasks = []  # task list holding one task object per URL

# Create an event loop object explicitly and close it when the demo is over.
loop = asyncio.new_event_loop()
try:
    for url in urls:
        c = get_page(url)
        task = asyncio.ensure_future(c, loop=loop)  # wrap it in a task on this loop
        tasks.append(task)
    # Register the whole task list with the event loop and run until all finish.
    loop.run_until_complete(asyncio.wait(tasks))
    print('總耗時', time.time() - start)
finally:
    loop.close()
0 and False => 0;0 or False => False