python爬蟲爬取校花網視頻,單線程爬取html
爬蟲----爬取校花網視頻,包含多線程版本python
上述兩篇文章都是對校花網視頻的爬取。因爲時間相隔很久,校花網上的一些視頻已經不存在了,所以上述文章中的代碼在運行時會出現一些異常。本篇文章主要是對上述文章中的代碼進行優化和異常處理,在此作筆記記錄,方便之後查閱。修改如下:
一、添加的異常處理如下(紅色部分代碼):
#-*- coding=utf-8 -*-
"""Single-threaded crawler that downloads videos from www.xiaohuar.com.

Flow: fetch each listing page, extract the detail-page links, fetch each
detail page, extract the video URL and save the video under ``./video/``.
Every network request is guarded so that one dead link does not abort the
whole crawl.
"""
import re
import requests
import hashlib
import time
import os

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36',
    'Referer': 'http://www.xiaohuar.com'
}

# Seconds to wait for a connection / for the next chunk of data before the
# request is abandoned. Without it a stalled server blocks the crawl forever.
REQUEST_TIMEOUT = 30


def get_index(url):
    """Fetch a listing page.

    Args:
        url: Address of the listing page.

    Returns:
        The page HTML as text, or ``None`` when the request fails or the
        server does not answer with status 200.
    """
    try:
        respose = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException as e:
        print(repr(e))
        return None
    if respose.status_code == 200:
        return respose.text
    return None


def parse_index(res):
    """Extract the detail-page links from a listing page.

    Args:
        res: HTML text returned by ``get_index`` (may be ``None``).

    Returns:
        A list of href values; empty when ``res`` is ``None`` or empty.
    """
    if not res:
        # get_index returns None on failure; re.findall would raise TypeError.
        return []
    # re.S lets "." match newlines, so the pattern can span several lines.
    urls = re.findall(r'class="items".*?href="(.*?)"', res, re.S)
    return urls


def get_detail(urls):
    """Visit every detail page and download the video it embeds.

    Args:
        urls: Iterable of detail-page links; relative links are resolved
            against ``http://www.xiaohuar.com``.
    """
    for url in urls:
        if not url.startswith('http'):
            url = 'http://www.xiaohuar.com%s' % url
        try:
            result = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as e:
            # A broken detail page must not stop the remaining downloads.
            print(repr(e))
            continue
        if result.status_code == 200:
            mp4_url_list = re.findall(r'id="media".*?src="(.*?)"', result.text, re.S)
            if mp4_url_list:
                mp4_url = mp4_url_list[0]
                save(mp4_url)


path = os.getcwd() + '/video/'


def save(url):
    """Download one video and write it into the ``video`` directory.

    The file name is the MD5 of the URL plus the current time, so repeated
    downloads of the same URL do not overwrite each other.

    Args:
        url: Address of the video file.
    """
    try:  # the video may no longer exist on the server
        video = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException as e:
        print(repr(e))
        return

    if video.status_code == 200:
        m = hashlib.md5()
        m.update(url.encode('utf-8'))
        m.update(str(time.time()).encode('utf-8'))
        filename = r'%s.mp4' % m.hexdigest()
        filepath = path + filename
        print(filepath)
        # The target directory is not created anywhere else; without this
        # open() fails with FileNotFoundError on a fresh checkout.
        os.makedirs(path, exist_ok=True)
        with open(filepath, 'wb') as f:
            f.write(video.content)
    else:
        print(f'視頻不存在了:{url}')


def main():
    """Crawl listing pages 0-4 and download every video found on them."""
    for i in range(5):
        res1 = get_index('http://www.xiaohuar.com/list-3-%s.html' % i)  # fetch one listing page
        res2 = parse_index(res1)  # extract all detail-page urls on that page
        get_detail(res2)  # download the videos behind those urls


if __name__ == '__main__':
    main()
#-*- coding=utf-8 -*-
"""Multi-threaded crawler that downloads videos from www.xiaohuar.com.

Listing pages are fetched on a thread pool; every detail page found on them
is then handed to the same pool as its own task, so downloads run in
parallel. ``main`` waits for all submitted work before it returns.
"""

import requests
import re
import os
import hashlib
import time
from concurrent.futures import ThreadPoolExecutor, as_completed, wait

p = ThreadPoolExecutor(30)

# Seconds to wait for a connection / for the next chunk of data before the
# request is abandoned. Without it a stalled server blocks a worker forever.
REQUEST_TIMEOUT = 30

# NOTE(review): unlike the single-threaded version, no User-Agent/Referer
# headers are sent here - confirm the site still serves the pages without them.


def get_index(url):
    """Fetch a listing page.

    Args:
        url: Address of the listing page.

    Returns:
        The page HTML as text, or ``None`` when the request fails or the
        server does not answer with status 200.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException as e:
        print(repr(e))
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_index(res):
    """Extract detail-page links from a finished ``get_index`` future.

    Each link is submitted to the pool as a separate ``get_detail`` task so
    that all workers can be used, not just one per listing page.

    Args:
        res: A completed ``Future`` whose result is the listing-page HTML
            (or ``None``).

    Returns:
        The list of futures of the submitted ``get_detail`` tasks; empty
        when the page could not be fetched.
    """
    try:
        html = res.result()
    except Exception as e:  # thread boundary: report instead of losing it
        print(repr(e))
        return []
    if not html:
        # get_index returns None on failure; re.findall would raise TypeError.
        return []
    urls = re.findall(r'class="items".*?href="(.*?)"', html, re.S)
    return [p.submit(get_detail, [url]) for url in urls]


def get_detail(urls):
    """Visit every detail page and download the video it embeds.

    Args:
        urls: Iterable of detail-page links; relative links are resolved
            against ``http://www.xiaohuar.com``.
    """
    for url in urls:
        if not url.startswith('http'):
            url = 'http://www.xiaohuar.com%s' % url
        try:
            r1 = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as e:
            # A broken detail page must not stop the remaining downloads.
            print(repr(e))
            continue
        if r1.status_code == 200:
            url_list = re.findall(r'id="media".*?src="(.*?)"', r1.text, re.S)
            if url_list:
                mp4_url = url_list[0]
                save(mp4_url)


path = os.getcwd() + '/video_mutil/'
# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(path, exist_ok=True)


def save(url):
    """Download one video and write it into the ``video_mutil`` directory.

    The file name is the MD5 of the URL plus the current time, so repeated
    downloads of the same URL do not overwrite each other.

    Args:
        url: Address of the video file.
    """
    try:  # the video may no longer exist on the server
        r2 = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException as e:
        print(repr(e))
        return

    if r2.status_code == 200:
        m = hashlib.md5()
        m.update(url.encode('utf-8'))
        m.update(str(time.time()).encode('utf-8'))
        filename = '%s.mp4' % m.hexdigest()
        file_path = path + filename
        with open(file_path, 'wb') as f:
            f.write(r2.content)
            print('視頻下載完成:%s' % file_path)
    else:
        print(f'視頻不存在了:{url}')


def main():
    """Crawl listing pages 0-4 concurrently and wait for all downloads.

    Follow-up tasks are submitted from this thread as each listing page
    finishes, rather than from a done-callback: a callback may run after
    ``main`` has returned, when the interpreter is shutting down and the
    pool rejects new work with RuntimeError.
    """
    index_futures = [
        p.submit(get_index, 'http://www.xiaohuar.com/list-3-%s.html' % i)
        for i in range(5)
    ]
    detail_futures = []
    for future in as_completed(index_futures):
        detail_futures.extend(parse_index(future))

    done, _ = wait(detail_futures)
    for future in done:
        # Surface errors (e.g. disk write failures) that would otherwise
        # stay hidden inside the future object.
        error = future.exception()
        if error is not None:
            print(repr(error))


if __name__ == '__main__':
    main()
資源下載地址:Python爬取校花網視頻-單線程和多線程版本
轉載聲明:本站文章無特別說明,皆爲原創,版權所有,轉載請註明:朝十晚八