沒有使用線程池進行視頻爬取的代碼如下:
from lxml import etree
from uuid import uuid4
from urllib import request
import requests
import re

# Sequential crawler: fetch the category listing page, visit every video's
# detail page, extract the video URL from it, and download the videos one
# after another into the current working directory.

url = 'https://www.pearvideo.com/category_3'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
}
# Pattern for the srcUrl="..." assignment embedded in a detail page.
exp = r'srcUrl="(.*?)"'

page_text = requests.get(url=url, headers=headers).text
# Parse the listing page: each <li> holds the link to one detail page.
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
for li in li_list:
    hrefs = li.xpath('./div/a/@href')
    if not hrefs:
        # A list entry without a detail link must not abort the whole crawl.
        continue
    detail_url = 'https://www.pearvideo.com/' + hrefs[0]
    detail_text = requests.get(url=detail_url, headers=headers).text
    # Extract the video URL from the detail page.
    matches = re.findall(exp, detail_text, re.S)
    if not matches:
        # Report and skip pages that do not expose srcUrl instead of
        # crashing with IndexError.
        print(f'no srcUrl found on {detail_url}, skipping')
        continue
    video_url = matches[0]
    # A random UUID gives every downloaded file a unique name.
    filename = uuid4()
    request.urlretrieve(url=video_url, filename=f'{filename}.mp4')
基於線程池的視頻爬取:併發爬取梨視頻中的視頻數據
使用併發機制進行多線程的數據下載:在數據下載和 IO 存儲中使用併發機制
from lxml import etree
from uuid import uuid4
from urllib import request
from multiprocessing.dummy import Pool
import requests
import re

# Thread-pool crawler: collect every video URL sequentially, then download
# all of the videos concurrently with a pool of worker threads.


def download_video(video_url):
    """Download one video into a new, uniquely named .mp4 file.

    The file name is generated here, once per download. Reading a shared
    ``filename`` variable from the enclosing scope would make every worker
    write to the same file, so the downloads would overwrite each other.
    """
    request.urlretrieve(url=video_url, filename=f'{uuid4()}.mp4')


url = 'https://www.pearvideo.com/category_3'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
}
# Pattern for the srcUrl="..." assignment embedded in a detail page.
exp = r'srcUrl="(.*?)"'

page_text = requests.get(url=url, headers=headers).text
# Parse the listing page: each <li> holds the link to one detail page.
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
video_url_list = []

for li in li_list:
    hrefs = li.xpath('./div/a/@href')
    if not hrefs:
        # A list entry without a detail link must not abort the whole crawl.
        continue
    detail_url = 'https://www.pearvideo.com/' + hrefs[0]
    detail_text = requests.get(url=detail_url, headers=headers).text
    # Extract the video URL from the detail page.
    matches = re.findall(exp, detail_text, re.S)
    if not matches:
        # Report and skip pages that do not expose srcUrl instead of
        # crashing with IndexError.
        print(f'no srcUrl found on {detail_url}, skipping')
        continue
    # Queue the video URL for the concurrent download phase.
    video_url_list.append(matches[0])

# Create the pool only once there is work for it; the argument is the number
# of worker threads.
pool = Pool(10)
# Download the videos concurrently; map() blocks until every download is done.
pool.map(download_video, video_url_list)
pool.close()
pool.join()