最近很想看的一個電影,去知乎上看一下評論,恰好在學 Python 爬蟲,就做個小實例。
代碼基於第三方修改,原文連接:http://python.jobbole.com/88325/#comment-94754
# coding: utf-8
"""Scrape Douban short comments for the first "now playing" movie and draw a word cloud.

Pipeline:
  1. Fetch the "now playing" page for Hangzhou and extract (id, name) per movie.
  2. Download the first 10 pages (20 comments each) of short comments for the
     first movie in the list.
  3. Strip punctuation, segment the text with jieba, and drop stopwords.
  4. Count term frequencies with pandas and render a word cloud.

Ported from the original Python 2 version (urllib2 / print statements) to
Python 3; the accidental ``from lib2to3.pgen2.grammar import line`` import
was removed.
"""
import re
import urllib.request
import warnings

import jieba                      # Chinese word segmentation
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from bs4 import BeautifulSoup as bs
from wordcloud import WordCloud   # word-cloud rendering

warnings.filterwarnings("ignore")
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)


def getNowPlayingMovie_list():
    """Return a list of {'id': ..., 'name': ...} dicts for movies now playing.

    Scrapes https://movie.douban.com/nowplaying/hangzhou/; the subject id
    comes from each list item's ``data-subject`` attribute and the title
    from the poster ``<img>``'s ``alt`` attribute.
    """
    resp = urllib.request.urlopen('https://movie.douban.com/nowplaying/hangzhou/')
    html_data = resp.read().decode('utf-8')
    soup = bs(html_data, 'html.parser')
    nowplaying_movie = soup.find_all('div', id='nowplaying')
    nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')

    nowplaying_list = []
    for item in nowplaying_movie_list:
        nowplaying_dict = {'id': item['data-subject']}
        for tag_img_item in item.find_all('img'):
            nowplaying_dict['name'] = tag_img_item['alt']
        nowplaying_list.append(nowplaying_dict)
    return nowplaying_list


def getCommentsById(movieId, pageNum):
    """Return the concatenated short-comment text of one comments page.

    movieId -- Douban subject id (string).
    pageNum -- 1-based page number; each page holds 20 comments.
    Returns False for a non-positive pageNum (original contract preserved).
    """
    if pageNum <= 0:
        return False
    start = (pageNum - 1) * 20

    # BUG FIX: the original built the URL across two *statements*, so the
    # 'start'/'limit' query string was a discarded expression and every call
    # silently fetched page 1.  Build the full URL in one expression.
    requrl = ('https://movie.douban.com/subject/' + movieId
              + '/comments?start=' + str(start) + '&limit=20')
    print(requrl)

    resp = urllib.request.urlopen(requrl)
    html_data = resp.read()
    soup = bs(html_data, 'html.parser')
    comment_div_lits = soup.find_all('div', class_='comment')

    # Collect paragraphs and join once instead of quadratic '+=' on a str.
    parts = []
    for item in comment_div_lits:
        text = item.find_all('p')[0].string
        if text is not None:
            parts.append(text)
    return ''.join(parts).strip()


def main():
    """Fetch 10 comment pages for the first now-playing movie and plot a word cloud."""
    commentStr = ''
    NowPlayingMovie_list = getNowPlayingMovie_list()
    for i in range(10):
        commentList_temp = getCommentsById(NowPlayingMovie_list[0]['id'], i + 1)
        commentStr += commentList_temp.strip()

    # Strip ASCII and full-width punctuation before segmenting.
    cleaned_comments = re.sub(
        r"[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】《》<>,「」!,...。?、~@#¥%……&*()]+",
        "", commentStr)
    print(cleaned_comments)

    # Segment into words with jieba.
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # Drop stopwords (quoting=3 is QUOTE_NONE: take every line verbatim).
    stopwords = pd.read_csv(r"D:\pycode\stopwords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
    print(words_df)

    # Term frequencies.  The original groupby(...).agg({"計數": numpy.size})
    # dict-rename form was removed in pandas 1.0; value_counts is equivalent.
    words_stat = words_df['segment'].value_counts().reset_index()
    words_stat.columns = ['segment', '計數']

    # Render the 1000 most frequent words as a word cloud.
    wordcloud = WordCloud(font_path=r"D:\pycode\simhei.ttf",
                          background_color="white", max_font_size=80)
    word_frequence = {seg: cnt for seg, cnt in words_stat.head(1000).values}
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()


if __name__ == '__main__':
    main()