Web Crawler Final Project

1. Pick a topic you are personally interested in.

2. Write a crawler in Python that scrapes data on that topic from the web.

3. Run text analysis on the scraped data and generate a word cloud.

4. Explain and interpret the results of the text analysis.

5. Write a complete blog post describing the implementation, the problems encountered and how they were solved, and the data-analysis approach and conclusions.

6. Finally, submit all the scraped data together with the crawler and analysis source code.
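
The script below covers steps 2 through 4: it crawls NBA news pages from Hupu, extracts keywords with jieba, and renders them as a word cloud. The third-party packages it relies on (requests, beautifulsoup4, jieba, wordcloud, matplotlib) are all installable with pip.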

import requests
from bs4 import BeautifulSoup
import re
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Write the extracted keywords to keyword.txt
def writeFilekeynews(keywords):
    # 'with' closes the file even if a write fails
    with open('keyword.txt', 'a', encoding='utf-8') as f:
        for word in keywords:
            f.write(' ' + word)

# Write the article body text to content.txt
def writeFilecontent(content):
    with open('content.txt', 'a', encoding='utf-8') as f:
        f.write('\n' + content)

def getWordCloud():
    # Read back the accumulated keywords and render them as a word cloud
    keynewsTowordcloud = open('keyword.txt', 'r', encoding='utf-8').read()
    print(keynewsTowordcloud)
    # A raw string avoids backslash-escape surprises in the Windows font path
    wc = WordCloud(font_path=r'C:\Windows\Fonts\AdobeKaitiStd-Regular.otf',
                   background_color='white', max_words=150).generate(keynewsTowordcloud)
    wc.to_file('wordcloud.jpg')
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
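
The font path is machine-specific: without a font that covers Chinese glyphs, WordCloud renders empty boxes. If AdobeKaitiStd-Regular.otf is not installed, swapping in another Chinese-capable font should work; for example (assuming SimHei is present, as it is on most Windows installs):

wc = WordCloud(font_path=r'C:\Windows\Fonts\simhei.ttf',  # assumed alternative font
               background_color='white', max_words=150).generate(keynewsTowordcloud)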

def getKeynews(content):
    # Keep only Chinese characters, joined into one punctuation-free string
    content = ''.join(re.findall('[\u4e00-\u9fa5]', content))
    # Segment with jieba, then deduplicate the words via a set
    newSet = set(jieba.lcut(content))
    newDict = {}
    for i in newSet:
        newDict[i] = content.count(i)
    deleteList, keynews = [], []
    for i in newDict.keys():
        if len(i) < 2:
            deleteList.append(i)  # drop single-character tokens, which carry little meaning
    deleteList.append('編輯')  # drop the boilerplate byline word ("editor")
    for i in deleteList:
        newDict.pop(i, None)  # pop(..., None) avoids a KeyError when the word is absent
    dictList = list(newDict.items())
    dictList.sort(key=lambda item: item[1], reverse=True)  # sort keywords by frequency, descending
    for item in dictList:
        keynews.append(item[0])
    return keynews
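
A quick sanity check on a made-up sentence (hypothetical sample, not from the crawl; the exact segmentation depends on jieba's dictionary):

sample = '火箭今天贏下比賽,火箭球迷非常高興'
print(getKeynews(sample))  # expect multi-character words such as '火箭' ranked first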

def getNewsDetail(newsUrl):
    # Fetch a single article page and pull out its body text
    resd = requests.get(newsUrl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    content = soupd.select('.artical-main-content')[0].text
    # Persist the raw text, then extract and persist its keywords
    writeFilecontent(content)
    keynews = getKeynews(content)
    writeFilekeynews(keynews)
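
The bare [0] index assumes every article page contains a .artical-main-content element; if Hupu changes its markup, that line raises an IndexError. A guarded variant (a sketch of my own, not part of the original script) skips such pages instead of crashing:

def getNewsDetailSafe(newsUrl):
    resd = requests.get(newsUrl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    matches = soupd.select('.artical-main-content')
    if not matches:  # no article body found; skip rather than crash
        print('no article body at', newsUrl)
        return
    content = matches[0].text
    writeFilecontent(content)
    writeFilekeynews(getKeynews(content))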

def Get_page(url):
    # Fetch one listing page of the tag and visit every article linked on it
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for new in soup.select('.tag-list-box')[0].select('.list'):
        # each .list entry nests the article link under .list-content > .name > .n1 > a
        url = new.select('.list-content')[0].select('.name')[0].select('.n1')[0].select('a')[0]['href']
        getNewsDetail(url)
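
Each request here can hang or fail on a flaky connection. A more defensive variant (my own sketch; the name is an assumption) adds a timeout and skips listing pages that fail to load:

def get_page_safe(url):
    try:
        res = requests.get(url, timeout=10)  # give up after 10 seconds
        res.raise_for_status()  # treat HTTP 4xx/5xx responses as errors
    except requests.RequestException as e:
        print('skipping', url, e)
        return
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for new in soup.select('.tag-list-box')[0].select('.list'):
        link = new.select('.list-content')[0].select('.name')[0].select('.n1')[0].select('a')[0]['href']
        getNewsDetail(link)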

# Crawl the first three listing pages of Hupu NBA tag 3023
url = 'https://voice.hupu.com/nba/tag/3023-1.html'
Get_page(url)
for i in range(2, 4):
    Get_page('https://voice.hupu.com/nba/tag/3023-{}.html'.format(i))

getWordCloud()
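
When crawling more pages, it is polite to space out the requests; a minimal sketch of the same loop with a pause, using the standard-library time module (my addition, not part of the original script):

import time

for i in range(2, 4):
    Get_page('https://voice.hupu.com/nba/tag/3023-{}.html'.format(i))
    time.sleep(1)  # wait one second between listing pages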

Screenshot: the generated word cloud (wordcloud.jpg).
