# default_exp digdata
# The line above declares this module's name for nbdev. It must be the first line of the first cell of the notebook.
#hide
from nbdev.showdoc import *
#export
from bs4 import BeautifulSoup
from parser import *  # regex_parser
import re
import json
import time
import logging
import datetime
import requests
import pprint
#export
#url = "https://3g.dxy.cn/newh5/view/pneumonia"
url = "https://ncov.dxy.cn/ncovh5/view/pneumonia?from=singlemessage&isappinstalled=0"
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
}
#export
session = requests.session()
session.headers.update(headers)
r = session.get(url)
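Before parsing, it is worth confirming that the request actually succeeded. This is a minimal robustness sketch, not part of the original notebook, and it assumes the DXY page is served as UTF-8.

# Sketch: fail fast on a bad response and force UTF-8 decoding (assumption: the page is UTF-8).
assert r.status_code == 200, f"unexpected HTTP status: {r.status_code}"
r.encoding = 'utf-8'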
#export
#pprint.pprint(r.text)
#export
soup = BeautifulSoup(r.content, 'lxml')
#soup
# export
# The data falls into four categories: overall statistics, per-province statistics,
# per-city statistics within each province, and news.
overall_information = re.search(r'\{("id".*?)\}', str(soup.find('script', attrs={'id': 'getStatisticsService'})))
province_information = re.search(r'\[(.*?)\]', str(soup.find('script', attrs={'id': 'getListByCountryTypeService1'})))
area_information = re.search(r'\[(.*)\]', str(soup.find('script', attrs={'id': 'getAreaStat'})))
news_information = re.search(r'\[(.*?)\]', str(soup.find('script', attrs={'id': 'getTimelineService'})))
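Any of these re.search calls can return None if DXY changes its page structure, which the commented-out old url above suggests has already happened at least once. The following guard is a minimal sketch, not in the original, that logs which blocks could not be found:

# Sketch: warn about any script block that could not be located on the page.
for name, match in [('overall', overall_information), ('province', province_information),
                    ('area', area_information), ('news', news_information)]:
    if match is None:
        logging.warning('script block for %s not found; the page layout may have changed', name)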
#pprint.pprint(overall_information.string)
#overall_information.group(0)
#jsall = json.loads(overall_information.group(0))
def overall_parser(overall_information):
    overall_information = json.loads(overall_information.group(0))
    overall_information.pop('id')
    overall_information.pop('createTime')
    overall_information.pop('modifyTime')
    overall_information.pop('imgUrl')
    overall_information.pop('deleted')
    overall_information['countRemark'] = (overall_information['countRemark']
        .replace(' 疑似', ',疑似').replace(' 治癒', ',治癒').replace(' 死亡', ',死亡').replace(' ', ''))
    return overall_information
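A usage sketch, assuming the overall block was found on the page and that overall_parser returns the cleaned dictionary:

# Sketch: parse the overall statistics and show the cleaned summary text.
if overall_information is not None:
    overall = overall_parser(overall_information)
    pprint.pprint(overall.get('countRemark'))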
#overall_information = json.loads(overall_information.group(0))
#provinces = json.loads(province_information.group(0))
#provinces
def province_parser(province_information):
    provinces = json.loads(province_information.group(0))
    crawl_timestamp = ""
    for province in provinces:
        province.pop('id')
        province['comment'] = province['comment'].replace(' ', '')
        province['crawlTime'] = crawl_timestamp
        #province['country'] = country_type.get(province['countryType'])
        province['tags'] = province['tags'].replace(' ', '')
        province = regex_parser(content=province, key='tags')
    return provinces
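A usage sketch, assuming province_parser returns the parsed list; the fields inspected in the commented-out loop below (provinceShortName, tags) come from the same entries.

# Sketch: run the parser on the extracted match and inspect the first province entry.
if province_information is not None:
    provinces = province_parser(province_information)
    pprint.pprint(provinces[0])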
#for province in provinces:
#    print(province['id'], '\t', province['provinceShortName'], '\t', province['tags'])
#area_information.string
area = json.loads(area_information.group(0))
print("省份\t確診\t疑似\t治癒\t死亡") for a in area: print(a['provinceName'],'\t',a['confirmedCount'],'\t',a['suspectedCount'],'\t',a['curedCount'],'\t',a['deadCount'])
cities = area[0]['cities']
#cities
print("城市\t確診\t疑似\t治癒\t死亡") for p in area: cities = p['cities'] print("===================================") print(p['provinceName'],'\t',p['confirmedCount'],'\t',p['suspectedCount'],'\t',p['curedCount'],'\t',p['deadCount']) print("-----------------------------------") for c in cities: print(c['cityName'],'\t',c['confirmedCount'],'\t',c['suspectedCount'],'\t',c['curedCount'],'\t',c['deadCount'])
news = json.loads(news_information.group(0))
#news
for n in news:
    print(n['id'], '\t', n['infoSource'].strip(), '\t', n['title'].strip())  #, n['summary'].strip())
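To keep a raw snapshot of what was crawled, the parsed blocks can be written to disk. The file name below is illustrative, and the original notebook does not save anything at this point:

# Sketch: dump a timestamped snapshot of the area and news data (file name is hypothetical).
snapshot = {'crawl_time': datetime.datetime.now().isoformat(), 'area': area, 'news': news}
with open('dxy_snapshot.json', 'w', encoding='utf-8') as f:
    json.dump(snapshot, f, ensure_ascii=False, indent=2)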
# Convert the notebook into Python *.py code, saved under the subdirectory named after the project.
from nbdev.export import *
notebook2script()
Converted 00_digdata.ipynb.
Converted 01_getdata.ipynb.
Converted 10_charts.ipynb.
Converted 10_china.ipynb.
Converted index.ipynb.
help(notebook2script)
Help on function notebook2script in module nbdev.export:

notebook2script(fname=None, silent=False, to_dict=False)
    Convert notebooks matching `fname` to modules