基於線程池的視頻爬取 -- 爬蟲

沒有使用線程池的視頻爬取代碼如下:

 1 from lxml import etree
 2 from uuid import uuid4
 3 from urllib import request
 4 import requests
 5 import re
 6 
 7 url = 'https://www.pearvideo.com/category_3'
 8 headers = {
 9     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
10 }
11 page_text = requests.get(url=url, headers=headers).text
12 # 解析:詳情url  視頻的標題
13 tree = etree.HTML(page_text)
14 li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
15 for li in li_list:
16     filename = uuid4()
17     detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
18     detail_text = requests.get(url=detail_url, headers=headers).text
19     # 解析:視頻的url
20     exp ='srcUrl=\"(.*?)\"'
21     video_url = re.findall(exp, detail_text, re.S)[0]
22     request.urlretrieve(url=video_url, filename=f'{filename}.mp4')

基於線程池的視頻爬取,爬取梨視頻中的視頻數據

  使用併發機制進行多線程的數據下載:在數據下載和 IO 存儲中使用併發機制

 1 from lxml import etree
 2 from uuid import uuid4
 3 from urllib import request
 4 from multiprocessing.dummy import Pool
 5 import requests
 6 import re
 7 
 8 # 實例化一個線程池對象, 參數表示的是開啓線程的個
 9 pool = Pool(10)
10 url = 'https://www.pearvideo.com/category_3'
11 headers = {
12     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
13 }
14 page_text = requests.get(url=url, headers=headers).text
15 # 解析:詳情url  視頻的標題
16 tree = etree.HTML(page_text)
17 li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
18 video_url_list = []
19 
20 for li in li_list:
21     filename = uuid4()
22 
23     detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
24     detail_text = requests.get(url=detail_url, headers=headers).text
25     # 解析:視頻的url
26     exp = 'srcUrl=\"(.*?)\"'
27     video_url = re.findall(exp, detail_text, re.S)[0]
28     # 將解析到的視頻的url存儲到video_url_list列表中
29     video_url_list.append(video_url)
30     # 使用線程池進行視頻數據的併發下載
31     pool.map(lambda u: request.urlretrieve(url=u, filename=f'{filename}.mp4'),
32              video_url_list)
33 pool.close()
34 pool.join()
相關文章
相關標籤/搜索