pc版大概有500條記錄,mobile大概是50部,只有熱門的,因此少一點
url構造很簡單,主要參數就是page_limit與page_start,每翻一頁,start+=20便可,tag是"美劇"編碼後的結果,直接帶着也能夠,用unquote解碼也能夠,注意headers中必定要帶上Referer
1 import json 2 import requests 3 import math 4 import os 5 import shutil 6 from pprint import pprint 7 from urllib import parse 8
9
class DoubanSpliderPC:
    """Scrape the Douban "American TV" list through the desktop JSON API.

    Pages through the ``search_subjects`` endpoint 20 items at a time and
    appends each subject as one JSON line to a local text file.
    """

    def __init__(self):
        # unquote turns the %-escaped tag (tag=美剧) back into readable
        # characters; the API accepts either the quoted or unquoted form.
        self.url = parse.unquote(
            "https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%BE%8E%E5%89%A7&sort=recommend&page_limit=20&page_start={}")

        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
            # The endpoint refuses requests that lack a Referer header.
            "Referer": "https://movie.douban.com/tv/"
        }
        self.file_dir = "./douban_american_pc.txt"

    def parse_url(self):
        """Fetch successive pages, appending every subject as a JSON line.

        Stops when a page returns fewer than 20 records, i.e. the last page.
        """
        start = 0
        while True:
            url = self.url.format(start)
            print(url)
            # timeout keeps a stalled connection from hanging the scraper
            response = requests.get(url, headers=self.headers, timeout=10)
            # response.json() replaces manual content.decode() + json.loads()
            subjects_list = response.json()["subjects"]
            with open(self.file_dir, "a", encoding="utf-8") as file:
                for subject in subjects_list:
                    file.write(json.dumps(subject, ensure_ascii=False))
                    file.write("\r\n")
            # A short page means we have reached the end of the list.
            if len(subjects_list) < 20:
                break
            start += 20

    def run(self):
        """Entry point: delete any previous output file, then scrape."""
        if os.path.exists(self.file_dir):
            os.remove(self.file_dir)
            print("文件已清空")  # "file cleared"
        self.parse_url()
45
def main():
    """Build the desktop-page spider and let it run to completion."""
    DoubanSpliderPC().run()


if __name__ == '__main__':
    main()
mobile相似,不過抓包的時候找那個Item就能夠了
1 import json 2 import requests 3 import math 4 import os 5 import shutil 6 from pprint import pprint 7
8
9 # 爬取豆瓣的美劇頁面(手機版只有50條)
# Scrape the Douban "American TV" page (the mobile version only has ~50 items)
class DouBanSpliderMobile:
    """Scrape the Douban "American TV" collection via the mobile JSON API.

    The first request is used only to learn the total item count; after
    that every page is fetched and appended to a local file.
    """

    # Items per page; must stay in sync with the count=18 query parameter
    # embedded in first_url.
    pageCount = 18
    # Total item count reported by the API, filled in by parse_url().
    total = None

    def __init__(self):
        self.first_url = "https://m.douban.com/rexxar/api/v2/subject_collection/tv_american/items?os=ios&for_mobile=1&start={}&count=18&loc_id=108288&_=1552995446961"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
            "Referer": "https://m.douban.com/tv/american"
        }
        self.file_dir = "./douban_american_mobile.txt"

    def get_url_list(self):
        """Build one URL per page based on the total learned from the API."""
        url_list = []
        pages = math.ceil(DouBanSpliderMobile.total / DouBanSpliderMobile.pageCount)
        for page in range(pages):
            # Use pageCount instead of a second hard-coded 18 so the page
            # size is defined in exactly one place.
            url_list.append(self.first_url.format(page * DouBanSpliderMobile.pageCount))
        return url_list

    def parse_url(self, url):
        """Fetch one page, record the reported total, and append its items."""
        # timeout keeps a stalled connection from hanging the scraper
        response = requests.get(url, headers=self.headers, timeout=10)
        # response.json() replaces manual content.decode() + json.loads()
        response_dict = response.json()
        DouBanSpliderMobile.total = int(response_dict["total"])
        with open(self.file_dir, "a", encoding="utf-8") as file:
            json.dump(response_dict["subject_collection_items"], file, ensure_ascii=False, indent=2)

    def run(self):
        # Fetch the first page only to learn the total item count.
        self.parse_url(self.first_url.format(0))
        url_list = self.get_url_list()

        # Remove the probe output now, so each page (including page 0,
        # re-fetched in the loop below) is written exactly once.
        if os.path.exists(self.file_dir):
            os.remove(self.file_dir)

        for url in url_list:
            self.parse_url(url)
def main():
    """Build the mobile-page spider and let it run to completion."""
    spider = DouBanSpliderMobile()
    spider.run()


if __name__ == '__main__':
    main()