Python 爬取 bilibili 視頻信息

抓包時發現子菜單請求數據時通常須要rid,但的確存在一些如遊戲->遊戲賽事不使用rid,對於這種未進行處理,此外rid通常在主菜單的響應中,但有的如番劇這種,rid在子菜單的url中,此外返回的data中含有頁數相關信息,能夠據此定義爬取的頁面數量。

 1 # -*- coding: utf-8 -*-
 2 # @author: Tele
 3 # @Time : 2019/04/08 下午 1:01
 4 import requests  5 import json  6 import os  7 import re  8 import shutil  9 from lxml import etree  10 
 11 
 12 # 爬取每一個菜單的前5頁內容
 13 class BiliSplider:  14     def __init__(self, save_dir, menu_list):  15         self.target = menu_list  16         self.url_temp = "https://www.bilibili.com/"
 17         self.headers = {  18             "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",  19          #  "Cookie": "LIVE_BUVID=AUTO6715546997211617; buvid3=07192BD6-2288-4BA5-9259-8E0BF6381C9347193infoc; stardustvideo=1; CURRENT_FNVAL=16; sid=l0fnfa5e; rpdid=bfAHHkDF:cq6flbmZ:Ohzhw:1Hdog8",  20  }  21         self.proxies = {  22             "http": "http://61.190.102.50:15845"
 23  }  24         self.father_dir = save_dir  25 
 26     def get_menu_list(self):  27         regex = re.compile("//")  28         response = requests.get(self.url_temp, headers=self.headers)  29         html_element = etree.HTML(response.content)  30         nav_menu_list = html_element.xpath("//div[@id='primary_menu']/ul[@class='nav-menu']/li/a")  31 
 32         menu_list = list()  33         for item in nav_menu_list:  34             menu = dict()  35             title = item.xpath("./*/text()")  36             menu["title"] = title[0] if len(title) > 0 else None  37             href = item.xpath("./@href")  38             menu["href"] = "https://" + regex.sub("", href[0]) if len(href) > 0 else None  39 
 40             # 子菜單
 41             submenu_list = list()  42             sub_nav_list = item.xpath("./../ul[@class='sub-nav']/li")  43             if len(sub_nav_list) > 0:  44                 for sub in sub_nav_list:  45                     submenu = dict()  46                     sub_title = sub.xpath("./a/span/text()")  47                     submenu["title"] = sub_title[0] if len(sub_title) > 0 else None  48                     sub_href = sub.xpath("./a/@href")  49                     submenu["href"] = "https://" + regex.sub("", sub_href[0]) if len(sub_href) > 0 else None  50  submenu_list.append(submenu)  51             menu["submenu_list"] = submenu_list if len(submenu_list) > 0 else None  52  menu_list.append(menu)  53         return menu_list  54 
 55     # rid=tid
 56     def parse_index_url(self, url):  57         result_list = list()  58         # 正則匹配
 59         regex = re.compile("<script>window.__INITIAL_STATE__=(.*)</script>")  60         response = requests.get(url, headers=self.headers)  61         result = regex.findall(response.content.decode())  62         temp = re.compile("(.*);\(function").findall(result[0]) if len(result) > 0 else None  63         sub_list = json.loads(temp[0])["config"]["sub"] if temp else list()  64         if len(sub_list) > 0:  65             for sub in sub_list:  66                 # 一些子菜單沒有rid,須要請求不一樣的url,暫不處理
 67                 if "tid" in sub:  68                     if sub["tid"]:  69                         sub_menu = dict()  70                         sub_menu["rid"] = sub["tid"] if sub["tid"] else None  71                         sub_menu["title"] = sub["name"] if sub["name"] else None  72  result_list.append(sub_menu)  73                 else:  74                     pass
 75 
 76         return result_list  77 
 78     # 最新動態 region?callback
 79     # 數據 newlist?callback
 80     def parse_sub_url(self, item):  81         self.headers["Referer"] = item["referer"]  82         url_pattern = "https://api.bilibili.com/x/web-interface/newlist?rid={}&type=0&pn={}&ps=20"
 83 
 84         # 每一個菜單爬取前5頁
 85         for i in range(1, 6):  86             data = dict()  87             url = url_pattern.format(item["rid"], i)  88             print(url)  89             try:  90                 response = requests.get(url, headers=self.headers, proxies=self.proxies, timeout=10)  91             except:  92                 return
 93             if response.status_code == 200:  94                 data["content"] = json.loads(response.content.decode())["data"]  95                 data["title"] = item["title"]  96                 data["index"] = i  97                 data["menu"] = item["menu"]  98                 # 保存數據
 99  self.save_data(data) 100             else: 101                 print("請求超時")  # 通常是403,被封IP了
102 
103     def save_data(self, data): 104         if len(data["content"]) == 0: 105             return
106         parent_path = self.father_dir + "/" + data["menu"] + "/" + data["title"] 107         if not os.path.exists(parent_path): 108  os.makedirs(parent_path) 109         file_dir = parent_path + "/" + "" + str(data["index"]) + "頁.txt"
110 
111         # 保存
112         with open(file_dir, "w", encoding="utf-8") as file: 113             file.write(json.dumps(data["content"], ensure_ascii=False, indent=2)) 114 
115     def run(self): 116         # 清除以前保存的數據
117         if os.path.exists(self.father_dir): 118  shutil.rmtree(self.father_dir) 119 
120         menu_list = self.get_menu_list() 121         menu_info = list() 122         # 得到目標菜單信息
123         # 特殊列表,一些菜單的rid必須從子菜單的url中得到
124         special_list = list() 125         for menu in menu_list: 126             for t in self.target: 127                 if menu["title"] == t: 128                     if menu["title"] == "番劇" or menu["title"] == "國創" or menu["title"] == "影視": 129  special_list.append(menu) 130  menu_info.append(menu) 131                     break
132 
133         # 目標菜單的主頁
134         if len(menu_info) > 0: 135             for info in menu_info: 136                 menu_index_url = info["href"] 137                 # 處理特殊列表
138                 if info in special_list: 139                     menu_index_url = info["submenu_list"][0]["href"] 140                 # 得到rid
141                 result_list = self.parse_index_url(menu_index_url) 142                 print(result_list) 143                 if len(result_list) > 0: 144                     for item in result_list: 145                         # 大菜單
146                         item["menu"] = info["title"] 147                         item["referer"] = menu_index_url 148                         # 爬取子菜單
149  self.parse_sub_url(item) 150 
151 
152 def main(): 153     target = ["動畫", "番劇", "國創", "音樂", "舞蹈", "遊戲", "科技", "數碼", "生活", "鬼畜", "時尚", "廣告", "娛樂", "影視"] 154     splider = BiliSplider("f:/bili_splider", target) 155  splider.run() 156 
157 
158 if __name__ == '__main__': 159     main()

 能夠看到番劇少了新番時間表與番劇索引,由於這兩個請求不遵循https://api.bilibili.com/x/web-interface/newlist?rid={}&type=0&pn={}&ps=20的格式,相似的再也不贅述。

 

相關文章
相關標籤/搜索