pip install git+https://github.com/gaojiuli/ashttp
python
AsHTTP 是一款面向將來的、純異步的 HTTP 庫,比 aiohttp 更快;由於解析器採用了 httptools,能以最快的速度發送最多的請求。同時內置了 HTML 解析模塊,讓爬蟲開發更快速、運行也更快速。
import time
import ashttp as http
# Shared accumulator for the crawl: get_list() appends one
# {"link": ..., "title": ...} dict per listing entry, and get_detail()
# later adds a "download_link" key to each of those dicts in place.
movie_list = []
async def get_list(url):
    """Fetch one listing page and record every movie link found on it.

    Requests ``url`` with ``http.get``, awaits the response's ``html()``
    and, for each element returned by ``find("a.ulink")``, appends a
    ``{"link": <href>, "title": <text>}`` dict to the module-level
    ``movie_list``. Returns ``None``.
    """
    response = await http.get(url)
    page = await response.html()
    # A generator (not a list) so entries are appended one at a time,
    # exactly as an explicit append loop would do.
    movie_list.extend(
        {"link": anchor.href, "title": anchor.text}
        for anchor in page.find("a.ulink")
    )
async def get_detail(movie):
    """Fetch a movie's detail page and store its download link on ``movie``.

    Args:
        movie: Mapping with a ``"link"`` key holding the detail-page URL.
            It is updated in place with a ``"download_link"`` key.

    The stored value is the ``text`` of the element returned by
    ``find_one("#Zoom table a")``, or ``None`` when no element is returned.
    Returns ``None``.
    """
    r = await http.get(movie['link'])
    html = await r.html()
    anchor = html.find_one("#Zoom table a")
    # NOTE(review): assumes find_one() yields None when nothing matches --
    # confirm against the ashttp HTML API. Without this guard a single page
    # lacking the link raises AttributeError, which propagates out of
    # asyncio.gather() and aborts the whole crawl.
    link_text = anchor.text if anchor is not None else None
    movie.update({"download_link": link_text})
async def main():
    """Crawl the listing pages, then the detail page of every movie found.

    Phase 1 runs ``get_list`` concurrently for listing pages 1..197, which
    fills the module-level ``movie_list``. Phase 2 runs ``get_detail``
    concurrently for every entry collected in phase 1. Progress markers are
    printed before and after. Returns ``None``.
    """
    # Local import: keeps main() usable when this module is imported rather
    # than executed as a script.
    import asyncio

    print("Downloading...")
    tasks = [
        get_list(f"https://www.ygdy8.net/html/gndy/dyzz/list_23_{i + 1}.html")
        for i in range(197)
    ]
    await asyncio.gather(*tasks)
    # Built only after every listing coroutine has finished, so movie_list
    # already holds all entries that phase 1 produced.
    tasks = [get_detail(i) for i in movie_list]
    await asyncio.gather(*tasks)
    print("Down!")
if __name__ == '__main__':
    import asyncio

    # Create the loop explicitly: calling asyncio.get_event_loop() with no
    # running/current loop is deprecated and fails on newer Python versions.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    # NOTE(review): nothing visible in this file acquires `sem`, so it does
    # not currently limit concurrency; kept for compatibility.
    sem = asyncio.Semaphore(500)  # limit concurrent
    s = time.time()
    try:
        loop.run_until_complete(main())
    finally:
        # Always release the loop, even if the crawl raised.
        loop.close()
    print("Time Usage", time.time() - s)
    print("Movie Count:", len(movie_list))
    print()
    # Guard the empty case: movie_list[0] would raise IndexError if the
    # crawl collected nothing.
    print("First Movie:", movie_list[0] if movie_list else None)
複製代碼
Downloading...
Down!
Time Usage 50.032444477081299
Movie Count: 4913
複製代碼