import re
import time
from concurrent.futures import ThreadPoolExecutor

import redis
import requests

from myredis import POOL
from setting import CATEGORY_ID, DETAIL_URL, MAIN_URL, PAGE, START

# Default timeout (seconds) for every HTTP request; without it a hung
# socket would stall a pool worker forever.
REQUEST_TIMEOUT = 30


class CrawlVideo:
    """Crawl video listing pages, resolve direct video links, download them.

    Detail-page fetches and file downloads both run on a shared thread
    pool; Redis is used to remember already-downloaded links across runs.
    """

    # Shared executor for detail-page requests and downloads alike.
    pools = ThreadPoolExecutor(100)

    def __init__(self, page=PAGE):
        # page: listing-page size used to advance the `start` offset.
        self.page = page
        # Producer/consumer buffer of {"title": ..., "video_link": ...}
        # dicts; filled by get_detail callbacks, drained by download_video.
        self.video_info_dic_list = []
        self.conn = redis.Redis(connection_pool=POOL)

    def async_download(self, video_dic):
        """Download one video file unless its link was already processed."""
        video_link = video_dic["video_link"]
        if self.conn.get(video_link):
            return  # de-dup: downloaded in a previous run
        # NOTE(review): only the first 3 characters of the title become the
        # filename, which can collide; kept for backward compatibility.
        video_name = video_dic["title"][:3]
        response = requests.get(video_link, timeout=REQUEST_TIMEOUT)
        if response.status_code == 200:
            with open("%s.mp4" % video_name, "wb") as f:
                f.write(response.content)
            # Mark as done only after the file was written successfully.
            self.conn.set(video_link, video_link)

    def download_video(self, category_id=CATEGORY_ID, start=START, num=PAGE):
        """Crawl up to `num` video ids and download each on the pool.

        Waits for the asynchronous detail-page parsing to produce results,
        but gives up after a period of no progress so a failed parse cannot
        spin this loop forever (the original busy-waited indefinitely).
        """
        crawl_ids_list = self.crawl_videolist(category_id, start, num)
        print(len(crawl_ids_list))
        self.get_video_info(crawl_ids_list)
        submitted = 0
        idle_polls = 0
        while submitted < len(crawl_ids_list):
            try:
                # Bug fix: catch only the expected empty-list IndexError
                # instead of a bare `except Exception`.
                video_dic = self.video_info_dic_list.pop()
            except IndexError:
                idle_polls += 1
                if idle_polls > 300:  # ~60s with no new results: some
                    break             # detail pages failed; stop waiting.
                time.sleep(0.2)
                continue
            idle_polls = 0
            self.pools.submit(self.async_download, video_dic)
            submitted += 1

    def get_video_ids(self, category_id, start):
        """Return the list of video ids found on one listing page.

        Bug fix: the original swallowed every exception and returned None,
        which crashed crawl_videolist's extend(); now returns [] on failure.
        """
        main_url = MAIN_URL.format(category_id, start)
        try:
            response = requests.get(main_url, timeout=REQUEST_TIMEOUT)
            return re.findall(r'<a href="(video_\d+)"', response.text)
        except requests.RequestException as e:
            print(e)
            return []

    # Crawl the list of individual video ids; requests can then be made
    # from this list.  (Translated from the original Chinese comment.)
    def crawl_videolist(self, category_id, start, num):
        """Collect at most `num` video ids, paging through listing pages."""
        crawl_ids_list = []
        for _ in range(self.get_page_num(num)):
            crawl_ids_list.extend(self.get_video_ids(category_id, start))
            start += self.page
        # Drop the overshoot from the final page in one slice-delete
        # instead of popping one element per loop iteration.
        del crawl_ids_list[num:]
        return crawl_ids_list

    def get_detail(self, obj):
        """Future callback: parse title and direct link from a detail page.

        Bug fix: re.search can return None (layout change, error page);
        the original called .group(1) unconditionally and the resulting
        AttributeError vanished inside the future.  Skip such pages.
        """
        response = obj.result()
        title = re.search(r'<title>(.*?)</title>', response.text)
        video_link = re.search(r'srcUrl="(.*?)"', response.text)
        if title is None or video_link is None:
            return
        self.video_info_dic_list.append(
            {"title": title.group(1), "video_link": video_link.group(1)}
        )

    def async_request(self, url, video_addr):
        """Fetch one detail page; runs on the thread pool."""
        return requests.get(url.format(video_addr), timeout=REQUEST_TIMEOUT)

    def get_video_info(self, video_id_list):
        """Submit one detail-page request per id; get_detail consumes them."""
        url = DETAIL_URL
        try:
            for video_addr in video_id_list:
                obj = self.pools.submit(self.async_request, url, video_addr)
                obj.add_done_callback(self.get_detail)
        except Exception as e:
            print(e)

    def get_page_num(self, num):
        """Return how many listing pages are needed to cover `num` videos."""
        # Ceiling division replaces the original three-way branch;
        # matches it for every num >= 0 (0 -> 0 pages).
        return (num + self.page - 1) // self.page


if __name__ == "__main__":
    # Guarded so importing this module no longer fires HTTP requests
    # as a side effect (the original ran the crawl at import time).
    crawl = CrawlVideo()
    crawl.download_video(start=1, num=2)