爬蟲python

時間 2019-11-18
原文原文鏈接
最近看了幾部電影，也讀了不少影評，想了解大多數人對這些電影的評價。正好也在學習Python，就利用它強大的爬蟲能力來抓取並分析評論。這裏使用的是Python 3.6.1。
下面是相關代碼：
  1 #coding:utf-8
  2 __author__ = 'hang'
  3 
  4 import warnings
  5 warnings.filterwarnings("ignore")
  6 import jieba    #分詞包
  7 import numpy    #numpy計算包
  8 import codecs   #codecs提供的open方法來指定打開的文件的語言編碼，它會在讀取的時候自動轉換爲內部unicode
  9 import re
 10 import pandas as pd
 11 import matplotlib.pyplot as plt
 12 from urllib import request
 13 from bs4 import BeautifulSoup as bs
 14 # %matplotlib inline  (ipython中應用)
 15 # from skimage import data
 16 import matplotlib
 17 matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
 18 from wordcloud import WordCloud#詞雲包
 19 
 20 class KetWord:
 21     def __init__(self,name,count):
 22         self.name =name
 23         self.count = count
 24 
 25     def __cmp__(self, other):
 26 
 27         if isinstance(KetWord,other):
 28             if self.count > other.count:
 29                 return 1
 30             elif self.count < other.count:
 31                 return -1
 32             else:
 33                 return 0
 34 
 35     def __str__(self):
 36         return '[name='+ self.name +':count='+ str(self.count) +']'
 37 #分析網頁函數
 38 def getNowPlayingMovie_list():
 39     resp = request.urlopen('https://movie.douban.com/nowplaying/hangzhou/')
 40     html_data = resp.read().decode('utf-8')
 41     soup = bs(html_data, 'html.parser')
 42     nowplaying_movie = soup.find_all('div', id='nowplaying')
 43     nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')
 44     nowplaying_list = []
 45     for item in nowplaying_movie_list:
 46         nowplaying_dict = {}
 47         nowplaying_dict['id'] = item['data-subject']
 48         for tag_img_item in item.find_all('img'):
 49             nowplaying_dict['name'] = tag_img_item['alt']
 50             nowplaying_list.append(nowplaying_dict)
 51     return nowplaying_list
 52 
 53 #爬取評論函數
 54 def getCommentsById(movieId, pageNum):
 55     eachCommentList = [];
 56     if pageNum>0:
 57          start = (pageNum-1) * 20
 58     else:
 59         return False
 60     requrl = 'https://movie.douban.com/subject/' + movieId + '/comments' +'?' +'start=' + str(start) + '&limit=20'
 61     print(requrl)
 62     resp = request.urlopen(requrl)
 63     html_data = resp.read().decode('utf-8')
 64     soup = bs(html_data, 'html.parser')
 65     comment_div_lits = soup.find_all('div', class_='comment')
 66     for item in comment_div_lits:
 67         if item.find_all('p')[0].string is not None:
 68             eachCommentList.append(item.find_all('p')[0].string)
 69     return eachCommentList
 70 
 71 def main():
 72     #循環獲取第一個電影的前10頁評論
 73     commentList = []
 74     NowPlayingMovie_list = getNowPlayingMovie_list()
 75     print('common=',NowPlayingMovie_list)
 76     #獲取id電影[{'id': '11502973', 'name': '星際特工：千星之城'}, {'id': '25933890', 'name': '極盜車神'}, {'id': '25849480', 'name': '賽車總動員3：極速挑戰'},
 77     # {'id': '26607693', 'name': '敦刻爾克'}, {'id': '26363254', 'name': '戰狼2'}, {'id': '26826398', 'name': '殺破狼·貪狼'}, {'id': '26816086', 'name': '銀魂 真人版'},
 78     #  {'id': '26430107', 'name': '二十二'}, {'id': '26759539', 'name': '十萬個冷笑話2'}, {'id': '26752106', 'name': '黑白迷宮'}, {'id': '26647876', 'name': '地球：神奇的一天'},
 79     #  {'id': '26969037', 'name': '賽爾號大電影6：聖者無敵'}, {'id': '25980443', 'name': '海邊的曼徹斯特'}, {'id': '26760160', 'name': '破·局'},
 80     #  {'id': '27040349', 'name': '二次初戀'}, {'id': '22232939', 'name': '大耳朵圖圖之美食狂想曲'}, {'id': '25857966', 'name': '鮫珠傳'}, {'id': '26698000', 'name': '心理罪'},
 81     # {'id': '26692823', 'name': '建軍大業'}, {'id': '25823277', 'name': '三生三世十里桃花'}, {'id': '2999500', 'name': '七天'}, {'id': '27107261', 'name': '一路向愛'},
 82     # {'id': '25858758', 'name': '俠盜聯盟'}, {'id': '26790961', 'name': '閃光少女'}, {'id': '26991769', 'name': '恐怖畢業照2'}, {'id': '25812712', 'name': '神偷奶爸3'},
 83     #  {'id': '27107265', 'name': '杜麗娘'}]
 84     for i in range(10):
 85         num = i + 1
 86         commentList_temp = getCommentsById(NowPlayingMovie_list[4]['id'], num)
 87         commentList.append(commentList_temp)
 88 
 89     #將列表中的數據轉換爲字符串
 90     comments = ''
 91     for k in range(len(commentList)):
 92         comments = comments + (str(commentList[k])).strip()
 93 
 94     #使用正則表達式去除標點符號
 95     pattern = re.compile(r'[\u4e00-\u9fa5]+')
 96     filterdata = re.findall(pattern, comments)
 97     cleaned_comments = ''.join(filterdata)
 98 
 99     #使用結巴分詞進行中文分詞
100     segment = jieba.lcut(cleaned_comments)
101     words_df=pd.DataFrame({'segment':segment})
102 
103     #去掉停用詞
104     stopwords=pd.read_csv("stopwords.txt",index_col=False,quoting=3,sep="\t",names=['stopword'], encoding='utf-8')#quoting=3全不引用
105     words_df=words_df[~words_df.segment.isin(stopwords.stopword)]
106 
107     #統計詞頻
108     words_stat=words_df.groupby(by=['segment'])['segment'].agg({"計數":numpy.size})
109     words_stat=words_stat.reset_index().sort_values(by=["計數"],ascending=False)
110 
111     #用詞雲進行顯示
112     wordcloud=WordCloud(font_path="simhei.ttf",background_color="white",max_font_size=80)
113     word_frequence = {x[0]:x[1] for x in words_stat.head(1000).values}
114 
115     #利用字典存放
116     word_frequence_list = {}
117     x_val = []
118     y_val = []
119     for key in word_frequence:
120         word_frequence_list[str(key)] = word_frequence[key]
121 
122     wordcloud=wordcloud.generate_from_frequencies(word_frequence_list)
123     print(word_frequence_list)
124 
125     # print('x=',x_val)
126     # print('y=',y_val)
127     # map = dict()
128     # for i in range(len(y_val)):
129     #     # key_word = KetWord(x_val[i],y_val[i])
130     #     map[i] = KetWord(x_val[i],y_val[i])
131     # for key in map:
132     #     print('word=',map[key])
133     # plt.plot(x_val,y_val)
134     # plt.show()
135     plt.imshow(wordcloud)
136     #既然是IPython的內置magic函數，那麼在Pycharm中是不會支持的。可是咱們能夠在matplotlib中的pyplot身上下功夫，pyplot不會不提供展現圖像的功能。
137     plt.colorbar()
138     plt.show()
139 
# Script entry point: run the scraper only when this file is executed
# directly, so importing the module does not trigger network access or plots.
if __name__ == '__main__':
    main()
相關標籤/搜索
每日一句
每一个你不满意的现在，都有一个你没有努力的曾经。