抓包時發現子菜單請求數據時通常須要rid,但的確存在一些如遊戲->遊戲賽事不使用rid,對於這種未進行處理。此外rid通常在主菜單的響應中,但有的如番劇這種,rid在子菜單的url中。另外返回的data中含有頁數相關信息,能夠據此定義爬取的頁面數量。
# -*- coding: utf-8 -*-
# @author: Tele
# @Time : 2019/04/08 下午 1:01
import json
import os
import re
import shutil

import requests
from lxml import etree
11
12 # 爬取每一個菜單的前5頁內容
13 class BiliSplider: 14 def __init__(self, save_dir, menu_list): 15 self.target = menu_list 16 self.url_temp = "https://www.bilibili.com/"
17 self.headers = { 18 "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36", 19 # "Cookie": "LIVE_BUVID=AUTO6715546997211617; buvid3=07192BD6-2288-4BA5-9259-8E0BF6381C9347193infoc; stardustvideo=1; CURRENT_FNVAL=16; sid=l0fnfa5e; rpdid=bfAHHkDF:cq6flbmZ:Ohzhw:1Hdog8", 20 } 21 self.proxies = { 22 "http": "http://61.190.102.50:15845"
23 } 24 self.father_dir = save_dir 25
26 def get_menu_list(self): 27 regex = re.compile("//") 28 response = requests.get(self.url_temp, headers=self.headers) 29 html_element = etree.HTML(response.content) 30 nav_menu_list = html_element.xpath("//div[@id='primary_menu']/ul[@class='nav-menu']/li/a") 31
32 menu_list = list() 33 for item in nav_menu_list: 34 menu = dict() 35 title = item.xpath("./*/text()") 36 menu["title"] = title[0] if len(title) > 0 else None 37 href = item.xpath("./@href") 38 menu["href"] = "https://" + regex.sub("", href[0]) if len(href) > 0 else None 39
40 # 子菜單
41 submenu_list = list() 42 sub_nav_list = item.xpath("./../ul[@class='sub-nav']/li") 43 if len(sub_nav_list) > 0: 44 for sub in sub_nav_list: 45 submenu = dict() 46 sub_title = sub.xpath("./a/span/text()") 47 submenu["title"] = sub_title[0] if len(sub_title) > 0 else None 48 sub_href = sub.xpath("./a/@href") 49 submenu["href"] = "https://" + regex.sub("", sub_href[0]) if len(sub_href) > 0 else None 50 submenu_list.append(submenu) 51 menu["submenu_list"] = submenu_list if len(submenu_list) > 0 else None 52 menu_list.append(menu) 53 return menu_list 54
55 # rid=tid
56 def parse_index_url(self, url): 57 result_list = list() 58 # 正則匹配
59 regex = re.compile("<script>window.__INITIAL_STATE__=(.*)</script>") 60 response = requests.get(url, headers=self.headers) 61 result = regex.findall(response.content.decode()) 62 temp = re.compile("(.*);\(function").findall(result[0]) if len(result) > 0 else None 63 sub_list = json.loads(temp[0])["config"]["sub"] if temp else list() 64 if len(sub_list) > 0: 65 for sub in sub_list: 66 # 一些子菜單沒有rid,須要請求不一樣的url,暫不處理
67 if "tid" in sub: 68 if sub["tid"]: 69 sub_menu = dict() 70 sub_menu["rid"] = sub["tid"] if sub["tid"] else None 71 sub_menu["title"] = sub["name"] if sub["name"] else None 72 result_list.append(sub_menu) 73 else: 74 pass
75
76 return result_list 77
78 # 最新動態 region?callback
79 # 數據 newlist?callback
80 def parse_sub_url(self, item): 81 self.headers["Referer"] = item["referer"] 82 url_pattern = "https://api.bilibili.com/x/web-interface/newlist?rid={}&type=0&pn={}&ps=20"
83
84 # 每一個菜單爬取前5頁
85 for i in range(1, 6): 86 data = dict() 87 url = url_pattern.format(item["rid"], i) 88 print(url) 89 try: 90 response = requests.get(url, headers=self.headers, proxies=self.proxies, timeout=10) 91 except: 92 return
93 if response.status_code == 200: 94 data["content"] = json.loads(response.content.decode())["data"] 95 data["title"] = item["title"] 96 data["index"] = i 97 data["menu"] = item["menu"] 98 # 保存數據
99 self.save_data(data) 100 else: 101 print("請求超時") # 通常是403,被封IP了
102
103 def save_data(self, data): 104 if len(data["content"]) == 0: 105 return
106 parent_path = self.father_dir + "/" + data["menu"] + "/" + data["title"] 107 if not os.path.exists(parent_path): 108 os.makedirs(parent_path) 109 file_dir = parent_path + "/" + "第" + str(data["index"]) + "頁.txt"
110
111 # 保存
112 with open(file_dir, "w", encoding="utf-8") as file: 113 file.write(json.dumps(data["content"], ensure_ascii=False, indent=2)) 114
115 def run(self): 116 # 清除以前保存的數據
117 if os.path.exists(self.father_dir): 118 shutil.rmtree(self.father_dir) 119
120 menu_list = self.get_menu_list() 121 menu_info = list() 122 # 得到目標菜單信息
123 # 特殊列表,一些菜單的rid必須從子菜單的url中得到
124 special_list = list() 125 for menu in menu_list: 126 for t in self.target: 127 if menu["title"] == t: 128 if menu["title"] == "番劇" or menu["title"] == "國創" or menu["title"] == "影視": 129 special_list.append(menu) 130 menu_info.append(menu) 131 break
132
133 # 目標菜單的主頁
134 if len(menu_info) > 0: 135 for info in menu_info: 136 menu_index_url = info["href"] 137 # 處理特殊列表
138 if info in special_list: 139 menu_index_url = info["submenu_list"][0]["href"] 140 # 得到rid
141 result_list = self.parse_index_url(menu_index_url) 142 print(result_list) 143 if len(result_list) > 0: 144 for item in result_list: 145 # 大菜單
146 item["menu"] = info["title"] 147 item["referer"] = menu_index_url 148 # 爬取子菜單
149 self.parse_sub_url(item) 150
151
def main(save_dir="f:/bili_splider"):
    """Crawl all major menus.

    :param save_dir: output root; defaults to the original hard-coded
                     Windows path but can now be overridden by callers.
    """
    target = ["動畫", "番劇", "國創", "音樂", "舞蹈", "遊戲", "科技", "數碼", "生活", "鬼畜", "時尚", "廣告", "娛樂", "影視"]
    splider = BiliSplider(save_dir, target)
    splider.run()


if __name__ == '__main__':
    main()
能夠看到番劇少了新番時間表與番劇索引,由於這兩個請求不遵循https://api.bilibili.com/x/web-interface/newlist?rid={}&type=0&pn={}&ps=20的格式,相似的再也不贅述。