1. Pick a topic you are personally interested in (no two people may choose the same one).
2. Write a crawler in Python to scrape data related to that topic from the web.
3. Run text analysis on the scraped data and generate a word cloud.
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
import jieba
import numpy as np
from PIL import Image

# Read in the background image used as the word-cloud mask
abel_mask = np.array(Image.open("lzl.jpg"))

# Read the text file the word cloud will be generated from
text_from_file_with_apath = open('outcome.txt', encoding='utf-8').read()

# Segment the text with jieba and join the tokens with spaces
wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all=True)
wl_space_split = " ".join(wordlist_after_jieba)

my_wordcloud = WordCloud(
    background_color='white',
    mask=abel_mask,
    max_words=500,
    stopwords={'nbsp', 'br'},
    max_font_size=450,
    random_state=30,
    scale=0.5
).generate(wl_space_split)

# Color the word cloud according to the background image
image_colors = ImageColorGenerator(abel_mask)
my_wordcloud.recolor(color_func=image_colors)

# Display the result
plt.imshow(my_wordcloud)
plt.axis("off")
plt.show()
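As a small follow-up to the block above (reusing the my_wordcloud object it builds), the rendered image can also be written to disk instead of only being shown on screen; the filename wordcloud.png is just an illustrative choice:

# Save the rendered word cloud alongside the scraped data (example filename)
my_wordcloud.to_file("wordcloud.png")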
4. Interpret and explain the results of the text analysis (a small word-frequency sketch that can support this step is shown below).
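To back the interpretation in step 4 with numbers rather than only the picture, a minimal sketch like the following can be used. It assumes the same outcome.txt input and the same jieba segmentation as the word-cloud script above; the stop-word set and the choice of the top 20 terms are illustrative, not fixed by the assignment:

import jieba
from collections import Counter

# Same input file as the word-cloud script above
text = open('outcome.txt', encoding='utf-8').read()

# Precise-mode segmentation; drop the same throwaway tokens as the word cloud
stopwords = {'nbsp', 'br'}
words = [w for w in jieba.cut(text) if w.strip() and w not in stopwords]

# Print the 20 most frequent terms so the word cloud can be explained with counts
for word, count in Counter(words).most_common(20):
    print(word, count)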
5. Write a complete blog post describing the implementation above, the problems encountered and how they were solved, and the data-analysis approach and conclusions.
For this assignment I simply picked a fairly popular article from a site I like and did a simple scrape and word-cloud analysis of it.
6. Finally, submit all of the scraped data together with the crawler and data-analysis source code.
import requests
from bs4 import BeautifulSoup

url = 'https://www.theatlantic.com/science/archive/2018/04/dont-be-afraid-of-the-multiverse/559169/'
res = requests.get(url)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')

# Extract the article body text
news = soup.select('.article-body')[0].text

def saveNews(content):
    # Append the scraped text to the file the word-cloud script reads
    with open("outcome.txt", 'a', encoding='utf-8') as f:
        f.write(content)

saveNews(news)
print(news)
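Some sites answer the default requests user agent with an error page or a block, in which case the word cloud ends up built from the wrong text. If that happens, a slightly more defensive version of the fetch step can help; this is only a sketch under that assumption, and the User-Agent string is an example value, not something the target site requires:

import requests
from bs4 import BeautifulSoup

url = 'https://www.theatlantic.com/science/archive/2018/04/dont-be-afraid-of-the-multiverse/559169/'

# Example header only; any reasonably browser-like User-Agent string works the same way
headers = {'User-Agent': 'Mozilla/5.0'}

res = requests.get(url, headers=headers, timeout=10)
res.raise_for_status()          # fail loudly on 4xx/5xx instead of parsing an error page
res.encoding = 'utf-8'

soup = BeautifulSoup(res.text, 'html.parser')
body = soup.select('.article-body')
if body:                        # the CSS class may change if the site is redesigned
    print(body[0].text[:500])   # short preview of the extracted text
else:
    print("article body not found")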