import time

import requests
from multiprocessing.dummy import Pool  # thread-based Pool (not processes)

start_time = time.time()


def get_page(url):
    """Download *url* with requests and return its URL plus body text.

    The extra sleep simulates a slow, I/O-bound job so the speed-up from
    running the downloads in a thread pool is visible.
    """
    print("正在下載:", url)
    response = requests.get(url)
    time.sleep(3)
    print("下載完成", url)
    return {'url': url, 'content': response.text}


urls = [
    'http://www.jd.com',
    'https://www.baidu.com',
    'https://www.python.org'
]

if __name__ == '__main__':
    # Instantiate a pool of 4 worker threads.
    pool = Pool(4)
    # Map each URL in the list onto get_page; calls overlap in the pool.
    pool.map(get_page, urls)
    # Fix: the original never closed/joined the pool and recorded
    # start_time without ever reporting the elapsed time.
    pool.close()
    pool.join()
    print(time.time() - start_time)
import re
import time

import requests
from lxml import etree
from multiprocessing.dummy import Pool

# A thread pool suits blocking, time-consuming I/O such as these downloads.
start_time = time.time()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}

# Request the category page, then parse out every video's detail-page URL
# and display name.
url = "https://www.pearvideo.com/category_5"
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')

urls = []  # collects {'name': ..., 'url': ...} for every video found
for li in li_list:
    detail_url = "https://www.pearvideo.com/" + li.xpath("./div/a/@href")[0]
    name = li.xpath('./div/a/div[2]/text()')[0] + ".mp4"
    print(detail_url, name)
    # Fetch the detail page; the real video address sits inside its JS.
    detail_page_text = requests.get(url=detail_url, headers=headers).text
    # Raw string for the regex; original used a plain literal.
    matches = re.findall(r'srcUrl="(.*?)",vdoUrl', detail_page_text)
    # Robustness fix: the original indexed [0] unconditionally and raised
    # IndexError whenever the pattern was missing; skip such pages instead.
    if not matches:
        continue
    urls.append({'name': name, "url": matches[0]})


def get_video_data(data):
    """Download one video (dict with 'url' and 'name') and save it to disk."""
    url = data['url']
    video_name = data['name']
    print(video_name, "正在下載")
    video_data = requests.get(url=url, headers=headers).content
    # Persist the raw bytes to a local .mp4 file.
    with open(video_name, 'wb') as fp:
        fp.write(video_data)
    print(video_name, "下載成功!")


# Use the thread pool for the blocking, time-consuming download step.
pool = Pool(4)
pool.map(get_video_data, urls)
pool.close()
pool.join()
event_loop: 事件循環, 至關於一個無限循環, 咱們能夠把一些函數註冊到這個事件循環上,當滿足某些條件的時候,函數就會被循環執行
coroutine:協程對象, 咱們能夠將協程對象註冊到事件循環中, 它會被事件循環調用。咱們可使用async關鍵字來定義一個方法, 這個方法在調用時不會當即被執行,而是返回一個協程對象
task: 任務, 它是對協程對象的進一步封裝, 包含了任務的各個狀態
async: 定義一個協程
await: 用來掛起阻塞方法的執行
import asyncio


async def request(url):
    """Coroutine body: the two prints stand in for real work on *url*."""
    print("正在請求的url是", url)
    print("請求成功", url)


# An async-def function does not run when called; calling it merely
# produces a coroutine object.
coro = request("www.baidu.com")

# Build an event loop, register the coroutine object on it, and drive it
# until it finishes.
event_loop = asyncio.get_event_loop()
event_loop.run_until_complete(coro)
task是對 coroutine 對象的進一步封裝,它裏面相比 coroutine 對象多了運行狀態,好比 running、finished 等,咱們能夠用這些狀態來獲取協程對象的執行狀況。
import asyncio


async def request(url):
    """Coroutine body: the two prints stand in for real work on *url*."""
    print("正在請求的url是", url)
    print("請求成功", url)


# Calling the async-def function only builds a coroutine object.
coro = request("www.baidu.com")

event_loop = asyncio.get_event_loop()
# Wrap the coroutine in a task via the loop; the task carries run state.
task = event_loop.create_task(coro)
print(task)
event_loop.run_until_complete(task)
另外定義 task 對象還有一種方式,就是直接經過 asyncio 的 ensure_future() 方法,返回結果也是 task 對象,這樣的話咱們就能夠不借助於 loop 來定義,即便咱們尚未聲明 loop 也能夠提早定義好 task 對象,寫法以下:
import asyncio


async def request(url):
    """Coroutine body: the two prints stand in for real work on *url*."""
    print("正在請求的url是", url)
    print("請求成功", url)


# Calling the async-def function only builds a coroutine object.
coro = request("www.baidu.com")

event_loop = asyncio.get_event_loop()
# ensure_future() wraps a coroutine into a task without going through the
# loop object; printed before and after running to show the state change.
task = asyncio.ensure_future(coro)
print(task)
event_loop.run_until_complete(task)
print(task)
import asyncio


async def request(url):
    """Coroutine body: the two prints stand in for real work on *url*."""
    print("正在請求的url是", url)
    print("請求成功", url)


# Calling the async-def function only builds a coroutine object.
coro = request("www.baidu.com")

event_loop = asyncio.get_event_loop()
task = asyncio.ensure_future(coro)


def callback_full(done_task):
    """Runs once the task completes; prints the coroutine's return value."""
    print(done_task.result())


# Attach the completion callback to the task object.
task.add_done_callback(callback_full)
event_loop.run_until_complete(task)
若是咱們想執行屢次請求應該怎麼辦呢?咱們能夠定義一個 task 列表,而後使用 asyncio 的 wait() 方法便可執行。
import asyncio
import time

start = time.time()


async def request(url):
    """Simulate a 2-second download of *url*, then return the url.

    Synchronous calls such as time.sleep would block the whole event loop
    and defeat asyncio; asyncio.sleep is awaitable, so while this task is
    suspended the loop is free to run the other tasks.
    """
    print("正在下載", url)
    await asyncio.sleep(2)
    print("下載完畢", url)
    return url


urls = [
    "www.baidu.com",
    "www.sougou.com",
    "www.goubanjia.com"
]

# Task list: wrap every coroutine in a task object up front.
stasks = [asyncio.ensure_future(request(u)) for u in urls]

loop = asyncio.get_event_loop()
# The task list must be handed to wait() before entering the loop.
loop.run_until_complete(asyncio.wait(stasks))

print(time.time() - start)
import asyncio
import time

import requests

start = time.time()

urls = [
    'http://www.jd.com',
    'https://www.baidu.com',
    'https://www.python.org'
]


async def get_page(url):
    """Anti-example: requests.get is synchronous, so these tasks run serially.

    A genuinely asynchronous fetch needs an async HTTP client (e.g. aiohttp)
    instead of requests.
    """
    print("正在下載", url)
    response = requests.get(url=url)
    print("下載完畢", response.text)


# Wrap each coroutine in a task object before handing them to wait().
tasks = [asyncio.ensure_future(get_page(u)) for u in urls]

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))

end = time.time()
print(end - start)
# Environment setup: pip install aiohttp — we use its ClientSession.
import asyncio
import time

import aiohttp
# Fix: the original also imported requests here, but this script never
# uses it (that is the whole point of switching to aiohttp).

start = time.time()

urls = [
    'http://www.jd.com',
    'https://www.baidu.com',
    'https://www.python.org'
]


async def get_page(url):
    """Fetch *url* through aiohttp (async HTTP client) and print the body."""
    print("正在下載", url)
    async with aiohttp.ClientSession() as session:
        # session.get()/post() mirror requests: both accept headers
        # (UA spoofing), params/data payloads, and proxy="http://ip:port".
        async with await session.get(url) as response:
            # text() -> str body, read() -> bytes, json() -> parsed object.
            # NOTE: the response data must be awaited before it can be read.
            page_text = await response.text()
            print("下載完畢", page_text)


tasks = []
for url in urls:
    c = get_page(url)
    task = asyncio.ensure_future(c)
    tasks.append(task)

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))

end = time.time()
print(end - start)