最近看了電影,也看了不少評論,想了解一下大多數人對相關電影的評價。正好也在學習 Python,就利用其強大的爬蟲能力來抓取並分析評論,這裏使用的是 Python 3.6.1。
下面是相關代碼:
#coding:utf-8
"""Scrape Douban short comments for a now-playing movie and show a word cloud.

Pipeline: fetch the "now playing" list, download the first pages of short
comments for one movie, keep only the Chinese characters, segment the text
with jieba, drop stop words, count word frequencies and display the result
as a word cloud.
"""
__author__ = 'hang'

import warnings
# Installed ahead of the remaining imports (as in the original script) so that
# warnings emitted while importing them are suppressed as well.
warnings.filterwarnings("ignore")

import codecs  # open() variant that decodes a file using a given encoding
import re
from urllib import request

import jieba  # Chinese word segmentation
import numpy
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup as bs
from wordcloud import WordCloud

# %matplotlib inline  (only when running inside IPython)
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)

NOW_PLAYING_URL = 'https://movie.douban.com/nowplaying/hangzhou/'
COMMENTS_PER_PAGE = 20
# Many sites reject urllib's default "Python-urllib/x.y" User-Agent, so a
# browser-like one is sent with every request.
_REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0'}


def _fetch_html(url):
    """Download *url* and return the response body decoded as UTF-8."""
    req = request.Request(url, headers=_REQUEST_HEADERS)
    # The context manager closes the HTTP response even if decoding fails.
    with request.urlopen(req) as resp:
        return resp.read().decode('utf-8')


class KetWord:
    """A word together with its occurrence count; instances order by count."""

    def __init__(self, name, count):
        self.name = name
        self.count = count

    def __cmp__(self, other):
        """Return 1, -1 or 0 comparing counts.

        Python 3 never calls ``__cmp__`` implicitly; it is kept for code that
        calls it directly. Returns ``NotImplemented`` for foreign types.
        """
        if not isinstance(other, KetWord):
            return NotImplemented
        return (self.count > other.count) - (self.count < other.count)

    def __lt__(self, other):
        """Order by count so that ``sorted()`` / ``list.sort()`` work on Python 3."""
        if not isinstance(other, KetWord):
            return NotImplemented
        return self.count < other.count

    def __str__(self):
        return '[name=' + self.name + ':count=' + str(self.count) + ']'


def getNowPlayingMovie_list():
    """Parse the now-playing page into a list of movie dicts.

    Returns:
        A list of ``{'id': ..., 'name': ...}`` dicts, one per ``li.list-item``
        inside ``div#nowplaying``. ``'name'`` is the ``alt`` text of the last
        ``<img>`` in the item and is absent when the item has no image.
        An empty list is returned when the page has no ``div#nowplaying``.
    """
    soup = bs(_fetch_html(NOW_PLAYING_URL), 'html.parser')
    container = soup.find('div', id='nowplaying')
    if container is None:
        return []

    movies = []
    for item in container.find_all('li', class_='list-item'):
        movie = {'id': item['data-subject']}
        for img in item.find_all('img'):
            movie['name'] = img['alt']
        movies.append(movie)
    return movies


def getCommentsById(movieId, pageNum):
    """Fetch one page of short comments for a movie.

    Args:
        movieId: Douban subject id of the movie.
        pageNum: 1-based page number; each page holds 20 comments.

    Returns:
        A list with the text of the first ``<p>`` of every ``div.comment``
        whose ``.string`` is not ``None``. An empty list is returned for a
        non-positive ``pageNum`` (the original returned ``False`` here; both
        are falsy, but a list is what callers iterate over).
    """
    if pageNum <= 0:
        return []

    start = (pageNum - 1) * COMMENTS_PER_PAGE
    requrl = (f'https://movie.douban.com/subject/{movieId}/comments'
              f'?start={start}&limit={COMMENTS_PER_PAGE}')
    print(requrl)

    soup = bs(_fetch_html(requrl), 'html.parser')
    comments = []
    for item in soup.find_all('div', class_='comment'):
        paragraph = item.find('p')
        # A comment block may lack a <p>, and .string is None when the <p>
        # holds nested tags rather than a single text node.
        if paragraph is not None and paragraph.string is not None:
            comments.append(paragraph.string)
    return comments


def main(movie_index=4, page_count=10):
    """Build and display a word cloud from one movie's short comments.

    Args:
        movie_index: Position of the movie in the now-playing list. The
            default 4 is the index the original script hard-coded; in the
            author's sample run that entry was
            ``{'id': '26363254', 'name': '戰狼2'}``.
        page_count: Number of comment pages (20 comments each) to download.

    Raises:
        IndexError: If the now-playing list has no entry at ``movie_index``.
    """
    movies = getNowPlayingMovie_list()
    print('common=', movies)
    if movie_index >= len(movies):
        raise IndexError(
            f'movie_index {movie_index} is out of range: '
            f'only {len(movies)} now-playing movies were found')
    movie_id = movies[movie_index]['id']

    # Collect the comments of every page into one flat list.
    all_comments = []
    for page in range(1, page_count + 1):
        all_comments.extend(getCommentsById(movie_id, page))
    comments = ''.join(all_comments)

    # Keep only runs of Chinese characters; this drops punctuation, digits
    # and Latin text.
    cleaned_comments = ''.join(re.findall(r'[\u4e00-\u9fa5]+', comments))

    # Segment the text into words with jieba.
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # Remove stop words; quoting=3 (QUOTE_NONE) treats quote characters in
    # the stop-word file as ordinary text.
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # Count occurrences per word. size() + reset_index(name=...) replaces the
    # dict-renaming form of SeriesGroupBy.agg, which pandas 1.0 removed.
    words_stat = words_df.groupby('segment').size().reset_index(name='計數')
    words_stat = words_stat.sort_values(by=['計數'], ascending=False)

    # Word -> count mapping of the 1000 most frequent words.
    top_words = words_stat.head(1000)
    word_frequence_list = {
        str(word): int(count)
        for word, count in zip(top_words['segment'], top_words['計數'])
    }
    if not word_frequence_list:
        # WordCloud cannot render an empty frequency table.
        print('No words left to plot after filtering.')
        return

    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white",
                          max_font_size=80)
    wordcloud = wordcloud.generate_from_frequencies(word_frequence_list)
    print(word_frequence_list)

    # "%matplotlib inline" is an IPython magic and is unavailable in a plain
    # interpreter such as PyCharm's, so the image is shown through pyplot.
    plt.imshow(wordcloud)
    plt.colorbar()
    plt.show()


if __name__ == '__main__':
    main()