因爲能選擇一個感興趣的網站進行數據分析,因此此次選擇爬取的網站是新華網,其網址爲"http://www.xinhuanet.com/",而後對其進行數據分析並生成詞雲
運行整個程序相關的代碼包
import requests import re from bs4 import BeautifulSoup from datetime import datetime import pandas import sqlite3 import jieba from wordcloud import WordCloud import matplotlib.pyplot as plt
爬取網頁信息
# Fetch the Xinhua homepage and save every <li><a> headline text to css.txt.
url = "http://www.xinhuanet.com/"
res0 = requests.get(url, timeout=30)  # timeout so a hung request cannot block forever
res0.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
res0.encoding = "utf-8"
soup = BeautifulSoup(res0.text, "html.parser")
# Context manager guarantees the file is closed even if parsing raises.
# (Encoding is left at the platform default, matching the original reader.)
with open("css.txt", "w+") as f:
    for news in soup.select("li"):
        links = news.select("a")
        if links:  # skip <li> items that contain no link
            title = links[0].text
            print(title)
            f.write(title)
存入txt文件中,並進行字詞統計
# Tokenize the saved headlines with jieba and write the top-10 words
# (each repeated by its frequency) to diectory.txt for the word cloud.
with open('css.txt', 'r') as f0:
    qz = f0.read()
print(qz)
words = list(jieba.cut(qz))
# Punctuation / stop tokens excluded from the frequency count.
ul = {':', '的', '"', '、', '」', '「', '。', '!', ':', '?', ' ', '\u3000', ',', '\n'}
# Single O(n) counting pass instead of calling words.count() once per
# distinct word, which was O(n^2) over the token list.
dic = {}
for w in words:
    if w not in ul:
        dic[w] = dic.get(w, 0) + 1
c = sorted(dic.items(), key=lambda x: x[1], reverse=True)
# NOTE: filename typo 'diectory.txt' is kept — the word-cloud step reads
# this exact name.
with open('diectory.txt', 'w') as f1:
    for word, count in c[:10]:  # slice avoids IndexError when < 10 distinct words
        print((word, count))
        f1.write((word + ' ') * count)
存入數據庫
# Persist the full token list to a SQLite database via pandas.
df = pandas.DataFrame(words)
print(df.head())
# sqlite3's context manager commits/rolls back but does NOT close the
# connection, so close it explicitly in a finally block.
db = sqlite3.connect('newsdb3.sqlite')
try:
    with db:
        # if_exists='replace' lets the script be re-run without crashing
        # on "table newsdb3 already exists".
        df.to_sql('newsdb3', con=db, if_exists='replace')
finally:
    db.close()
製作詞雲
# Render the frequency-weighted word file as a word cloud.
# Context manager guarantees the file handle is released.
with open('diectory.txt', 'r') as f3:
    cy_file = f3.read()
cy = WordCloud().generate(cy_file)
plt.imshow(cy)
plt.axis("off")  # hide axis ticks — the image is the whole figure
plt.show()
最終成果
完整代碼
"""Scrape <li><a> headline texts from xinhuanet.com, count word
frequencies with jieba, persist the tokens to SQLite, and render a
word cloud of the ten most frequent words."""
import requests
import re                      # kept from original (currently unused)
from bs4 import BeautifulSoup
from datetime import datetime  # kept from original (currently unused)
import pandas
import sqlite3
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

HOME_URL = "http://www.xinhuanet.com/"
TITLES_FILE = "css.txt"
# NOTE: typo 'diectory' preserved from the original so output files match.
TOP_WORDS_FILE = "diectory.txt"
DB_FILE = "newsdb3.sqlite"
# Punctuation / stop tokens excluded from the frequency count.
STOP_TOKENS = {':', '的', '"', '、', '」', '「', '。', '!', ':', '?', ' ', '\u3000', ',', '\n'}


def fetch_titles(url=HOME_URL, out_path=TITLES_FILE):
    """Download the homepage and write every <li><a> link text to out_path."""
    res = requests.get(url, timeout=30)
    res.raise_for_status()  # fail loudly instead of parsing an error page
    res.encoding = "utf-8"
    soup = BeautifulSoup(res.text, "html.parser")
    with open(out_path, "w+") as f:
        for item in soup.select("li"):
            links = item.select("a")
            if links:  # skip list items that contain no link
                title = links[0].text
                print(title)
                f.write(title)


def count_words(in_path=TITLES_FILE, out_path=TOP_WORDS_FILE, top_n=10):
    """Tokenize in_path with jieba, write the top_n words (each repeated
    by its frequency) to out_path, and return the full token list."""
    with open(in_path, "r") as f:
        text = f.read()
    print(text)
    words = list(jieba.cut(text))
    # Single O(n) counting pass instead of words.count() per distinct word.
    freq = {}
    for w in words:
        if w not in STOP_TOKENS:
            freq[w] = freq.get(w, 0) + 1
    ranked = sorted(freq.items(), key=lambda kv: kv[1], reverse=True)
    with open(out_path, "w") as f:
        for word, count in ranked[:top_n]:  # slice avoids IndexError
            print((word, count))
            f.write((word + " ") * count)
    return words


def store_words(words, db_path=DB_FILE):
    """Persist the token list to a SQLite table via pandas."""
    df = pandas.DataFrame(words)
    print(df.head())
    db = sqlite3.connect(db_path)
    try:
        with db:  # commits on success, rolls back on error
            # if_exists='replace' lets the script be re-run without
            # crashing on an existing table.
            df.to_sql("newsdb3", con=db, if_exists="replace")
    finally:
        db.close()  # sqlite3's context manager does not close the connection


def show_wordcloud(in_path=TOP_WORDS_FILE):
    """Render the frequency-weighted word file as a word cloud."""
    with open(in_path, "r") as f:
        source = f.read()
    cloud = WordCloud().generate(source)
    plt.imshow(cloud)
    plt.axis("off")
    plt.show()


def main():
    """Run the full pipeline: scrape -> count -> store -> visualize."""
    fetch_titles()
    words = count_words()
    store_words(words)
    show_wordcloud()


if __name__ == "__main__":
    main()