1、爬蟲對象
豆瓣電影裏面喜劇片的排行榜:
2、代碼如下:
設置了多個 user-agent,模擬成真實的瀏覽器去提取內容:
# Pool of desktop-browser User-Agent strings. One is picked at random for each
# request so that consecutive requests do not all carry the same signature.
user = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
    "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
]


def get_user():
    """Return one User-Agent string chosen at random from ``user``."""
    return random.choice(user)
抓取電影的信息:
# Label of the "country/region of production" field on a detail page.
# Douban serves the Simplified-Chinese label; the Traditional form is kept as
# well so that either spelling of the page text is matched.
_REGION_LABELS = ["制片国家/地区:", "製片國家/地區:"]


# Fetch the index page
def get_index_page(url, max_attempts=3, timeout=10):
    """Download the ranking index and return the response body as text.

    Args:
        url: Address of the index endpoint.
        max_attempts: Number of tries made when the request itself raises
            ``RequestException``. Replaces the previous unbounded recursion.
        timeout: Seconds to wait for the server before a try counts as failed.

    Returns:
        The body decoded as UTF-8 when the status code is 200. ``None`` for
        any other status code, or when every attempt raised.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(
                url,
                headers={'user-agent': get_user()},
                timeout=timeout,
            )
        except RequestException:
            print('獲取索引頁錯誤')
            if attempt < max_attempts:
                # Random pause of up to three seconds before the next try.
                time.sleep(random.random() * 3)
            continue
        response.encoding = 'utf-8'
        if response.status_code == 200:
            return response.text
        return None
    return None


# Parse the index page
def parse_index_page(url):
    """Return the detail-page URLs listed in the index at *url*.

    The index body is parsed as JSON and the ``'url'`` entry of every item is
    collected. An empty list is returned when the index could not be fetched
    or contains no items; malformed JSON still raises.
    """
    page = get_index_page(url)
    if not page:
        # Nothing was downloaded; json.loads(None) would raise TypeError.
        return []
    entries = json.loads(page)
    if not entries:
        return []
    return [item['url'] for item in entries]


def _tag_text(tag):
    """Return ``tag.text``, or ``None`` when the element was not found."""
    return tag.text if tag is not None else None


def _parse_movie(soup):
    """Build the record of one movie from its parsed detail page.

    Every field whose element is missing from the page is set to ``None``
    instead of raising ``AttributeError``.
    """
    name = _tag_text(soup.find("span", {"property": "v:itemreviewed"}))
    votes = _tag_text(soup.find("span", {"property": "v:votes"}))
    score = _tag_text(soup.find("strong", {"property": "v:average"}))
    director = _tag_text(soup.find("a", {"rel": "v:directedBy"}))
    year = _tag_text(soup.find("span", {"class": "year"}))

    # The region value is the bare text node that follows its label <span>.
    label = soup.find("span", string=_REGION_LABELS)
    sibling = label.next_sibling if label is not None else None
    region = sibling.strip() if isinstance(sibling, str) else None

    return {
        'name': name,
        'evaluate': votes.strip() if votes is not None else None,
        'score': score.strip() if score is not None else None,
        'director': director,
        'region': region,
        # The year is rendered as "(YYYY)"; drop the surrounding parentheses.
        'year': year.lstrip("(").rstrip(")") if year is not None else None,
    }


# Fetch the detail pages
def get_movie_page(url, timeout=10):
    """Scrape every movie linked from the index at *url*.

    Each detail page is requested after a random pause of up to three seconds
    with a randomly chosen User-Agent. Pages that fail to download or do not
    answer with status 200 are skipped so that one bad page does not abort
    the whole run.

    Returns:
        A list of dicts with the keys ``name``, ``evaluate``, ``score``,
        ``director``, ``region`` and ``year``.
    """
    movies = []
    for detail_url in parse_index_page(url):
        time.sleep(random.random() * 3)
        try:
            res = requests.get(
                url=detail_url,
                headers={'user-agent': get_user()},
                timeout=timeout,
            )
        except RequestException:
            print(f'獲取詳情頁錯誤: {detail_url}')
            continue
        if res.status_code != 200:
            continue
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, "html.parser")
        movies.append(_parse_movie(soup))
    return movies
3、把爬取的電影數據保存起來
部分截圖(一共抓取了 600 部電影):
4、數據分析
通過此柱形圖可以發現美國地區居於首位,畢竟美國發展得比較快,電影事業崛起。
通過對前三個國家的喜劇片分析,近年來美國拍的喜劇片數量逐步上升,法國和日本就平平而過。
通過分析導演拍喜劇電影的評分和評價數,劉鎮偉導演的喜劇電影應該更多人喜歡看,評價又多評分也高,大家若想看喜劇電影的話,不妨找找劉鎮偉導演的喜劇電影,僅供大家參考。
推薦:這幾部電影評分比較高,看的人也比較多,喜歡看喜劇電影的不妨找這幾部看看。