利用beautifulsoup爬取豆瓣電影top250，存儲在mongodb

時間 2020-07-26

標籤利用 beautifulsoup 豆瓣 top250 存儲 mongodb 欄目 MongoDB 简体版

原文鏈接

話不多說，上代碼：

 1 from requests import request  2 from bs4 import BeautifulSoup  3 import re  4 import pymongo  5 
 6 
 7 
 8 class SpiderDouBan:  9 
10 
11 
12     def __init__(self): 13         client = pymongo.MongoClient(host='localhost', port=27017) 14         db = client['spider_db'] 15         self.collection = db['douban_movie_top250'] 16 
17 
18 
19     def get_html(self, url): 20         '''
21  獲取一頁的html文本 22  :param url: 地址 23  :return: 24         '''
25         html = request('get', url).text 26         soup = BeautifulSoup(html, 'lxml') 27         return soup 28 
29 
30     def get_one_page(self, soup, order): 31         '''
32  獲取某一頁的內容 33  :param soup: soup實例化對象 34  :return: 35         '''
36         movie_names = [span.string for span in soup.find_all(name='span', attrs={'class': 'title'}) if not re.search('\/', span.string)] 37         movie_actors = [ re.sub('\n|\xa0', '', p.get_text().strip('" " |\n | \xa0')).split('/') for p in soup.find_all(name='p', attrs={'class': ''})] 38         movie_rates = [span.string for span in soup.find_all(name='span', attrs={'class': 'rating_num'})] 39         comment_num = [span_2.string for span in soup.find_all(attrs={'property': 'v:best'}) for span_2 in span.next_siblings if re.search('\w+', span_2.string)] 40         short_comments = [re.sub('。', '', span.string) for span in soup.find_all(class_='inq')] 41         for index, name in enumerate(movie_names): 42             print(f'正在爬取第{order + index + 1}條數據...') 43             movie_info = { 44                           'order': f'No.{order + index + 1}', 45                           'movie_name': name, 46                           'movie_type': f'{re.findall("[0-9]+", movie_actors[index][-3])[0]}年/{movie_actors[index][-2]}/{movie_actors[index][-1]}', 47                           'movie_rate': f'{movie_rates[index][0]}分', 48                           'short_comment': f'{short_comments[index]}'
49  } 50  self.collection.insert_one(movie_info) 51 
52 
53 
54     def main(self, url, order): 55         '''
56  主程序 57  :return: 58         '''
59         soup = self.get_html(url) 60  self.get_one_page(soup, order) 61 
62 
63 
64 
if __name__ == '__main__':
    # Create the spider once so that every page reuses the same MongoDB
    # client instead of opening a new connection per page.
    spider = SpiderDouBan()
    # Step through the list 25 entries at a time; the `start` offset in the
    # URL is also passed on as the ranking offset of the page.
    for offset in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={offset}'
        spider.main(url, offset)