爬取豆瓣電影《何以為家》評論

前言

  爬取豆瓣高分電影《何以為家》的短評,然後對短評進行 jieba 分詞操作,最後製作詞雲圖。

 

代碼

import requests
from lxml import etree
import jieba
import csv
from wordcloud import WordCloud


def get_data(url):
    """Scrape one page of Douban short comments and log each one to the CSV.

    The page at ``url`` is downloaded and parsed with lxml. For every comment
    found, a row ``[name, cid, profile url, comment time, votes, comment]`` is
    passed to the module-level ``write`` helper.

    Args:
        url: Address of a Douban comments page.

    Returns:
        The distinct comment texts of the page concatenated into one string
        (first-seen order), with surrounding whitespace stripped. An empty
        string is returned when the response cannot be parsed.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'}
    # A timeout keeps the scraper from hanging forever on a stalled connection.
    res = requests.get(url, headers=headers, timeout=10).text
    html = etree.HTML(res)
    if html is None:
        # Nothing parseable came back (e.g. an empty body).
        return ''

    uname = html.xpath('//div[@class="avatar"]/a/@title')
    uurl = html.xpath('//div[@class="avatar"]/a/@href')
    votenum = html.xpath('//span[@class="votes"]/text()')
    comment_time = html.xpath('//span[@class="comment-time "]/@title')
    data_cid = html.xpath('//div[@class="comment-item"]/@data-cid')
    short_comment = html.xpath('//span[@class="short"]/text()')

    # The six lists are scraped independently and may differ in length;
    # zip() stops at the shortest one instead of raising IndexError.
    # NOTE(review): if one list is shorter because a field is missing in the
    # middle of the page, later rows will be misaligned -- confirm against the
    # live markup.
    comments = {}
    rows = zip(uname, data_cid, uurl, comment_time, votenum, short_comment)
    for name, cid, profile, posted, votes, comment in rows:
        # A dict used as an ordered set: de-duplicates like the original set
        # but keeps a deterministic, first-seen order.
        comments[comment] = None
        write([name, cid, profile, posted, votes, comment])
    return ''.join(comments).strip()

def write(fl):
    """Write the row ``fl`` through the module-level CSV writer ``fw``."""
    csv_writer = fw
    csv_writer.writerow(fl)

def cloud(s, font_path='C:/Windows/Fonts/simfang.ttf', output_path='tn.png'):
    """Segment Chinese text with jieba and render it as a word cloud.

    The segmented text is printed, the generated image is shown, and the
    image is saved to ``output_path``.

    Args:
        s: Raw text to segment and visualise.
        font_path: Font file handed to WordCloud. The default is a Windows
            font path; pass another font file on other systems.
        output_path: File the rendered word cloud is saved to.
    """
    # WordCloud tokenises its input itself, so the jieba tokens must be
    # separated by spaces; joining them with '' would undo the segmentation.
    tokens = (tok for tok in jieba.cut(s, cut_all=True) if tok.strip())
    content_after = " ".join(tokens)
    print(content_after)
    wordcloud = WordCloud(
        font_path=font_path,
        background_color='white',  # background colour
        width=1000,
        height=600,
        max_font_size=50,  # largest font size
        min_font_size=10,
        max_words=1000,
    ).generate(content_after)
    image_produce = wordcloud.to_image()
    image_produce.show()  # display the generated word cloud
    wordcloud.to_file(output_path)  # save the word cloud
    print('詞雲生成完畢')

if __name__ == '__main__':
    # Scrape the first 8 pages (20 comments each), log them to hywj.csv and
    # build a word cloud from the collected comment text.
    # newline='' is what the csv module expects for files it writes; the
    # with-block guarantees the file is flushed and closed.
    with open('hywj.csv', 'a+', encoding='utf-8', newline='') as f:
        # fw stays a module-level name because write() looks it up globally.
        fw = csv.writer(f)
        # The file is opened in append mode, so only emit the header when the
        # file is still empty; otherwise every run would add another header.
        f.seek(0, 2)
        if f.tell() == 0:
            fw.writerow(['name', 'cid', 'url', 'comment_time', 'votenum', 'comment'])
        pages = []
        for i in range(8):
            url = ('https://movie.douban.com/subject/30170448/comments?start={}&limit=20&sort=new_score&status=P'.format(i * 20))
            pages.append(get_data(url))
    text = ''.join(pages)
    cloud(text)

 

效果截圖

相關文章
相關標籤/搜索