前言
爬取豆瓣高分電影《何以為家》的短評,然後對短評進行 jieba 分詞操作,最後製作詞雲圖。
代碼
import requests
from lxml import etree
import jieba
import csv
from wordcloud import WordCloud


def get_data(url):
    """Fetch one page of short comments, log each to the CSV, return their text.

    The page at ``url`` is downloaded with a browser-like User-Agent and
    parsed with lxml. For every comment found, one row
    ``[name, cid, profile url, comment time, votes, comment]`` is passed to
    ``write``. The distinct comment texts of the page are then concatenated
    (in arbitrary set order, no separator) and returned stripped.

    Args:
        url: Address of a comments page.

    Returns:
        The de-duplicated comment texts of this page joined into one string.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'}
    # A timeout keeps the script from hanging forever on a stalled connection.
    res = requests.get(url, headers=headers, timeout=10).text
    html = etree.HTML(res)

    uname = html.xpath('//div[@class="avatar"]/a/@title')
    uurl = html.xpath('//div[@class="avatar"]/a/@href')
    votenum = html.xpath('//span[@class="votes"]/text()')
    # The trailing space inside the class value is part of the original query.
    comment_time = html.xpath('//span[@class="comment-time "]/@title')
    data_cid = html.xpath('//div[@class="comment-item"]/@data-cid')
    short_comment = html.xpath('//span[@class="short"]/text()')

    commentset = set()
    # zip() stops at the shortest list, so a page on which one of the queries
    # matches fewer nodes yields fewer rows instead of raising IndexError.
    # NOTE(review): the columns are paired purely by position; if a comment
    # lacks one of the fields the following rows could be misaligned.
    for row in zip(uname, data_cid, uurl, comment_time, votenum, short_comment):
        commentset.add(row[-1])
        write(list(row))
    return ''.join(commentset).strip()


def write(fl):
    """Append one row to the CSV through the module-level writer ``fw``.

    ``fw`` is created in the ``__main__`` section below, so this function
    only works after that writer has been set up.

    Args:
        fl: Sequence of field values forming one CSV row.
    """
    fw.writerow(fl)


def cloud(s):
    """Segment ``s`` with jieba and render, show and save a word cloud.

    Args:
        s: The raw text to visualise.
    """
    # WordCloud tokenises on word boundaries, so the jieba tokens must be
    # separated by spaces; joining them with '' would just rebuild (and, in
    # full mode, duplicate parts of) the unsegmented text.
    content_after = " ".join(jieba.cut(s, cut_all=True))
    print(content_after)
    wordcloud = WordCloud(
        font_path='C:/Windows/Fonts/simfang.ttf',
        background_color='white',  # background colour
        width=1000,
        height=600,
        max_font_size=50,  # largest font size
        min_font_size=10,
        max_words=1000,
    ).generate(content_after)
    image_produce = wordcloud.to_image()
    image_produce.show()  # display the generated cloud
    wordcloud.to_file('tn.png')  # save the word cloud
    print('詞雲生成完畢')


if __name__ == '__main__':
    # newline='' is what the csv module expects (avoids blank rows on
    # Windows); the with-block guarantees the file is flushed and closed.
    with open('hywj.csv', 'a+', encoding='utf-8', newline='') as f:
        fw = csv.writer(f)
        # The file is opened in append mode: only emit the header when the
        # file is still empty so repeated runs do not duplicate it.
        f.seek(0, 2)
        if f.tell() == 0:
            fw.writerow(['name', 'cid', 'url', 'comment_time', 'votenum', 'comment'])
        pages = []
        for i in range(8):
            url = ('https://movie.douban.com/subject/30170448/comments?start={}&limit=20&sort=new_score&status=P'.format(i * 20))
            pages.append(get_data(url))
    text = ''.join(pages)
    cloud(text)
效果截圖