Web Crawling: Single-Threaded Multi-Task Asynchronous Fetching

Coroutines

import asyncio
import time
# Define a "special" function.
# Special: calling it returns a coroutine object, and the statements inside the function body are not executed right away.
# Create a coroutine object:
# async def test(num):
#     print(num)
#
# c = test(10)
# print(c)
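Running the uncommented snippet prints something like <coroutine object test at 0x...> rather than 10, and because the coroutine is never awaited, Python also emits RuntimeWarning: coroutine 'test' was never awaited.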


# Wrap a coroutine in a task object
# async def test(num):
#     print(num)
#
# c = test(10)
# # Wrap the coroutine object into a task object
# task = asyncio.ensure_future(c)
# print(task)
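Here print(task) shows a Task in the pending state. Note that asyncio.ensure_future() only works outside a running loop because it falls back to get_event_loop(); from Python 3.7 on, asyncio.create_task(c) is the preferred spelling, but it must be called from inside a running event loop.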

# The event loop object
async def request(url):
    print('requesting:', url)
    time.sleep(2)  # a blocking call; acceptable for this first single-task demo
    print('request finished!', url)


c1 = request('www.1.com')

task_A = asyncio.ensure_future(c1)


# Create an event loop object
loop = asyncio.get_event_loop()
# Register the task object with the loop and start the event loop
loop.run_until_complete(task_A)
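On Python 3.7+, the same demo can be written without managing the loop by hand; a minimal sketch:

import asyncio

async def request(url):
    print('requesting:', url)
    await asyncio.sleep(2)
    print('request finished!', url)

asyncio.run(request('www.1.com'))  # creates, runs, and closes the event loop for you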

Binding a callback to a task object

import asyncio
import time

async def request(url):
    print('requesting:', url)
    time.sleep(2)
    print('request finished!', url)

    return url


# Define a callback function for the task object.
# The task parameter is the task object this function gets bound to.
def task_callback(task):
    print('i am task_callback()')
    print(task.result())
    # task.result() returns the return value of the special function behind the task

c = request('www.xxx.com')

task = asyncio.ensure_future(c)
task.add_done_callback(task_callback)

loop = asyncio.get_event_loop()
loop.run_until_complete(task)
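add_done_callback() always passes exactly one argument, the finished task itself. If the callback needs extra arguments, the usual pattern is functools.partial; a minimal sketch (the tag parameter is just for illustration):

from functools import partial

def task_callback(tag, task):
    print(tag, task.result())

task.add_done_callback(partial(task_callback, 'my-tag'))  # called as task_callback('my-tag', task)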

Multi-task asynchronous coroutines

import asyncio
import time
start = time.time()
# Code from modules that do not support async must not appear inside the special function
async def request(url):
    print('requesting:', url)
    # time.sleep(2)  # the time module does not support async
    await asyncio.sleep(2)  # blocking operations must be suspended with the await keyword
    print('request finished!', url)

    return url

urls = [
    'www.1.com',
    'www.2.com',
    'www.3.com'
]
def task_callback(task):
    print(task.result())

tasks = []  # task list: holds multiple task objects
for url in urls:
    c = request(url)
    task = asyncio.ensure_future(c)
    task.add_done_callback(task_callback)
    tasks.append(task)  # load each task object into the task list

loop = asyncio.get_event_loop()
# Register multiple tasks at once.
# asyncio.wait suspends the task objects in the task list until they complete.
loop.run_until_complete(asyncio.wait(tasks))

print(time.time()-start)
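All three tasks sleep concurrently, so the printed elapsed time is about 2 seconds rather than 6. asyncio.gather is a common alternative to asyncio.wait that returns the coroutines' results directly, making the callback unnecessary; a minimal sketch:

import asyncio
import time

async def request(url):
    await asyncio.sleep(2)
    return url

async def main():
    results = await asyncio.gather(*[request(u) for u in ['www.1.com', 'www.2.com', 'www.3.com']])
    print(results)  # ['www.1.com', 'www.2.com', 'www.3.com']

start = time.time()
asyncio.run(main())
print(time.time() - start)  # roughly 2 seconds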

A multi-task asynchronous crawler

import asyncio
import time
import requests
start = time.time()
# Code from modules that do not support async must not appear inside the special function.
# requests is exactly such a module: requests.get() blocks, so despite the async syntax
# the tasks below still run one after another. This motivates aiohttp in the next section.
async def request(url):
    print('requesting:', url)
    response = requests.get(url)
    return response.text

urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/tom',
    'http://127.0.0.1:5000/jay'
]

def parse(task):
    page_text = task.result()
    print(page_text + ', data received!!!')

tasks = []
for url in urls:
    c = request(url)
    task = asyncio.ensure_future(c)
    task.add_done_callback(parse)
    tasks.append(task)

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))


print(time.time()-start)
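The local URLs assume a small test server on 127.0.0.1:5000. A minimal Flask sketch of what it might look like (the 2-second delay and the response bodies are assumptions, not from the original):

from flask import Flask, request
import time

app = Flask(__name__)

@app.route('/bobo')
@app.route('/tom')
@app.route('/jay')
def index():
    time.sleep(2)  # simulate a slow response so the blocking vs. async timing difference is visible
    return 'Hello ' + request.path.strip('/')

if __name__ == '__main__':
    app.run(port=5000)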

Using aiohttp

import asyncio
import time
import aiohttp
start = time.time()

Code from modules that do not support async must not appear inside the special function.
A simple basic skeleton (structure only, not yet runnable):

async def request(url):
    with aiohttp.ClientSession() as s:
        # s.get/post works almost the same as get/post in requests: url, headers, data/params
        # to route s.get through a proxy: proxy="http://ip:port"
        with s.get(url) as response:
            # string response body: response.text()
            # raw bytes: response.read()
            page_text = response.text()
            return page_text

On top of this skeleton, only two details need filling in:
    Detail 1: add the async keyword before each with.
    Detail 2: add the await keyword before the get call and before response.text() to suspend them manually.

async def request(url):
    async with aiohttp.ClientSession() as s:
        # s.get/post works almost the same as get/post in requests: url, headers, data/params
        # to route s.get through a proxy: proxy="http://ip:port"
        async with await s.get(url) as response:
            # string response body: await response.text()
            # raw bytes: await response.read()
            page_text = await response.text()
            return page_text

urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/tom',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/tom',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/tom',
    'http://127.0.0.1:5000/jay',
]
# Or stress-test with 500 requests against one endpoint:
# urls = []
# for i in range(500):
#     urls.append('http://127.0.0.1:5000/bobo')

def parse(task):
    page_text = task.result()
    print(page_text + ', data received!!!')

tasks = []
for url in urls:
    c = request(url)
    task = asyncio.ensure_future(c)
    task.add_done_callback(parse)
    tasks.append(task)

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))
print(time.time()-start)
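Assuming the test server delays each response by about 2 seconds, the nine aiohttp requests above finish in roughly 2 seconds in total, where the blocking requests-based version needs around 18.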

Case study

import aiohttp
import asyncio
from lxml import etree

all_titles = []

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
async def request(url):
    async with aiohttp.ClientSession() as s:
        async with await s.get(url,headers=headers) as response:
            page_text = await response.text()
            return page_text

urls = []
url = 'http://wz.sun0769.com/index.php/question/questionType?type=4&page=%d'
for page in range(100):
    u_page = page * 30  # the site pages by record offset, 30 records per page
    new_url = url % u_page
    urls.append(new_url)

tasks = []
def parse(task):
    page_text = task.result()
    # the site serves GB2312/GBK-encoded pages; round-trip the text through the
    # codecs to normalize any mis-decoded characters (gbk is a superset of gb2312)
    page_text = page_text.encode('gb2312').decode('gbk')
    tree = etree.HTML(page_text)
    tr_list = tree.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
    for tr in tr_list:
        title = tr.xpath('./td[2]/a[2]/text()')[0]
        print(title)
        all_titles.append(title)

for url in urls:
    c = request(url)
    task = asyncio.ensure_future(c)
    task.add_done_callback(parse)
    tasks.append(task)

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))
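Launching all 100 page requests at once can overwhelm the target site or get the client blocked. One common way to cap concurrency is asyncio.Semaphore; a minimal sketch of how request() could be wrapped (the limit of 10 is an arbitrary choice, not from the original):

sem = asyncio.Semaphore(10)  # allow at most 10 requests in flight at a time

async def bounded_request(url):
    async with sem:
        return await request(url)

# then build the tasks from bounded_request(url) instead of request(url)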