蒂姆·古德曼(賈斯提斯·史密斯 飾)爲尋找下落不明的父親來到萊姆市,意外與父親的前寶可夢搭檔大偵探皮卡丘(瑞恩·雷諾茲 配音)相遇,並驚訝地發現自己是唯一能聽懂皮卡丘說話的人類,他們決定組隊踏上揭開真相的刺激冒險之路。探案過程中他們邂逅了各式各樣的寶可夢,並意外發現了一個足以毀滅整個寶可夢宇宙的驚天陰謀。
爬取評論部分的用戶ID、用戶名、評論、評分、時間五項。
爬取的 JSON 數據切入口:http://m.maoyan.com/mmdb/comments/movie/346629.json?_v_=yes&offset=0&startTime=2019-05-09%2022%3A25%3A03
除去未知性別的評論者,在已知性別的評論者中男性所佔的比例較高,說明這部電影的男性
愛好者比較多。
根據上面的餅圖可得,滿分的佔了70%左右,4.5分以上的佔了7.4%左右,可知這部電影的
評價十分高,應該是非常好看的,值得去觀看。
對於這次影評的分析,可以看出在即將上映的前夕,大部分影迷對於這部電影懷抱着回憶童年的心態,皮卡丘的名字被大多數人提及,證明絕大部分羣體應該都觀看過寵物小精靈,絕大部分人對這部電影充滿了期待,從城市分佈可以看出觀影羣體主要以一二線城市爲主。
# Scrape audience comments for one film from the Maoyan mobile JSON API, save
# them to news.csv, and render three pyecharts (0.x API) charts: reviewer
# gender pie, score ring pie, and a geographic distribution map.

from collections import Counter
from datetime import datetime
import json
import random
import re
import sqlite3
import time

from bs4 import BeautifulSoup
import pandas
import pandas as pd
import requests

# Comment endpoint; `{}` is the paging offset.
COMMENTS_URL = (
    'http://m.maoyan.com/mmdb/comments/movie/346629.json'
    '?_v_=yes&offset={}&startTime=2019-05-09%2022%3A25%3A03'
)
PAGE_COUNT = 67       # number of pages requested
PAGE_SIZE = 15        # offset step between consecutive pages
REQUEST_TIMEOUT = 10  # seconds; keeps a stalled connection from hanging the run

# Session cookie sent with every request (the original repeated this exact
# value in each header set).
# NOTE(review): hard-coded session data; presumably expires -- confirm before reuse.
_COOKIE = (
    '_lxsdk_cuid=16a8d7b1613c8-0a2b4d109e58f-b781636-144000-16a8d7b1613c8; '
    '_lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; '
    'uuid_n_v=v1; '
    'iuuid=1BB9A320700C11E995DE7D45B75E59C6FC50A50D996543D0819E9EB2E6507E92; '
    'webp=true; '
    'ci=20%2C%E5%B9%BF%E5%B7%9E; '
    'selectci=; '
    '__mta=45946523.1557151818494.1557367174996.1557368154367.23; '
    '_lxsdk=1BB9A320700C11E995DE7D45B75E59C6FC50A50D996543D0819E9EB2E6507E92; '
    '__mta=45946523.1557151818494.1557368154367.1557368240554.24; '
    'from=canary; '
    '_lxsdk_s=16a9a2807fa-ea7-e79-c55%7C%7C199'
)

# One header set is picked at random per request.
_HEADERS = [
    {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
        'Cookie': _COOKIE,
    },
    {
        'User-Agent': 'Mozilla / 5.0(Linux;Android 6.0; Nexus 5 Build / MRA58N) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 73.0 .3683.103Mobile Safari / 537.36',
        'Cookie': _COOKIE,
    },
    {
        'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        'Cookie': _COOKIE,
    },
]

# Score buckets shown in the ring pie, paired with their legend labels.
_SCORE_BUCKETS = [
    (0, "0"), (0.5, "0.5"), (1, "1"), (1.5, "1.5"), (2, "2"), (2.5, "2.5"),
    (3, "3"), (3.5, "3.5"), (4, "4"), (4.5, "4.5"), (5, "5"),
]


def getData(url):
    """Fetch *url* with a randomly chosen header set.

    Returns the ``requests.Response`` with its encoding forced to UTF-8.
    Raises ``requests.RequestException`` on timeout, connection failure or a
    non-2xx status, so an error page is never handed to the JSON parser.
    """
    response = requests.get(
        url,
        headers=random.choice(_HEADERS),
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    response.encoding = 'utf-8'
    return response


def dataProcess(data):
    """Convert one API response into a list of flat comment records.

    *data* is a response object whose ``.text`` is the JSON payload. Each
    returned dict has the keys ``id``, ``nickName``, ``cityName``,
    ``content``, ``score``, ``startTime`` and ``gendar`` (the misspelling is
    kept because it is the CSV column name). A payload without ``cmts``
    yields an empty list.
    """
    comments = json.loads(data.text).get('cmts') or []
    return [
        {
            'id': item['id'],
            'nickName': item['nickName'],
            # Some comments carry no city.
            'cityName': item.get('cityName', ''),
            # Flatten every newline so each comment stays on one CSV row.
            'content': item['content'].replace('\n', ' '),
            'score': item['score'],
            'startTime': item['startTime'],
            # A missing gender is recorded as 0 (shown as "unknown").
            'gendar': item.get('gender', 0),
        }
        for item in comments
    ]


def sexProcess(gender):
    """Render the reviewer-gender pie chart to ``sex_pie.html``.

    *gender* is a list of gender codes: 0 unknown, 1 male, 2 female.
    """
    from pyecharts import Pie

    labels = ["未知", "男", "女"]
    counts = [gender.count(code) for code in (0, 1, 2)]
    pie = Pie("性別餅圖", title_pos="center")
    pie.add("", labels, counts, is_label_show=True)
    pie.render("sex_pie.html")


def scoreProcess(scores):
    """Render the score-distribution ring pie to ``score_pie.html``.

    *scores* is a list of numeric scores; only the values 0, 0.5, ..., 5 are
    counted.
    """
    from pyecharts import Pie

    labels = [label for _, label in _SCORE_BUCKETS]
    counts = [scores.count(value) for value, _ in _SCORE_BUCKETS]
    pie = Pie("評分等級環狀餅圖", title_pos="center")
    pie.add(
        "",
        labels,
        counts,
        is_label_show=True,
        label_text_color=None,
        radius=[40, 75],
        legend_orient="vertical",
        legend_pos="left",
        legend_top="100px",
        center=[50, 60],
    )
    pie.render("score_pie.html")


def cityProcess(citysTotal):
    """Render the audience distribution map to ``city_geo.html``.

    *citysTotal* maps a city name to its number of comments.
    """
    from pyecharts import Geo

    # The title previously named a different film than the one scraped here.
    geo = Geo(
        "《大偵探皮卡丘》觀衆分佈",
        title_color='#fff',
        title_pos='center',
        width=1200,
        height=600,
        background_color='#404a95',
    )
    attr, value = geo.cast(citysTotal)
    geo.add(
        "",
        attr,
        value,
        is_visualmap=True,
        visual_range=[0, 100],
        visual_text_color='#fff',
        legend_pos='right',
        is_geo_effect_show=True,
        maptype='china',
        symbol_size=10,
    )
    geo.render("city_geo.html")


def _count_cities(comments, known_cities):
    """Count comments per known city.

    A comment's ``cityName`` is matched to the first entry of *known_cities*
    that contains it as a substring. Comments with an empty city name, or
    with no match, are skipped; an empty name used to match every entry and
    was wrongly counted under the first one.
    """
    known = [str(name) for name in known_cities]
    per_name = Counter(str(comment['cityName']) for comment in comments)
    totals = {}
    # Resolve each distinct name once instead of rescanning for every comment.
    for city, count in per_name.items():
        if not city:
            continue
        match = next((name for name in known if city in name), None)
        if match is not None:
            totals[match] = totals.get(match, 0) + count
    return totals


def main():
    """Scrape all pages, save the CSV and render the three charts."""
    allData = []
    for page in range(PAGE_COUNT):
        response = getData(COMMENTS_URL.format(page * PAGE_SIZE))
        allData.extend(dataProcess(response))

    # Save the cleaned records as a CSV file.
    newsdf = pd.DataFrame(allData)
    newsdf.to_csv('news.csv', encoding='utf-8')

    # Optional: load the CSV into SQLite.
    # newsdf = pd.read_csv('news.csv')
    # with sqlite3.connect('sqlitetest.sqlite') as db:
    #     newsdf.to_sql('data', con=db)

    sexProcess([row['gendar'] for row in allData])
    scoreProcess([row['score'] for row in allData])

    # Iterating the DataFrame yields its column labels, which are used as the
    # known city names.
    coordinatesJson = pd.read_json('city_coordinates.json', encoding='utf-8')
    cityProcess(_count_cities(allData, coordinatesJson))


if __name__ == "__main__":
    main()