1. Goal: do a simple crawl of the articles in the Jianshu collection "@IT·互聯網", then run jieba word segmentation over the scraped text and generate a word cloud for analysis.
2. Implementation:
Step 1: open Jianshu and go to the @IT·互聯網 collection.
Collection URL: https://www.jianshu.com/c/V2CqjW?utm_medium=index-collections&utm_source=desktop
Looking at the page, we can see that the articles are not paginated; instead, scrolling to the bottom triggers JavaScript that loads the next batch.
In the browser's developer tools we can observe that every time we scroll to the end of the page, one more request is fired, and on closer inspection these requests follow a clear pattern:
they all have the form https://www.jianshu.com/c/V2CqjW?order_by=added_at&page={}, and the only thing that changes is the number in page. Is that the page number we need? We can verify it by opening one of these links directly.
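As a quick sanity check, here is a minimal sketch (assuming requests, BeautifulSoup, and lxml are installed; the headers dict is the one defined below) that fetches the first few of these URLs and prints the article titles each page returns. If every page yields a different batch of titles, the page parameter is indeed the page number:

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}

for page in (1, 2, 3):
    url = 'https://www.jianshu.com/c/V2CqjW?order_by=added_at&page={}'.format(page)
    soup = BeautifulSoup(requests.get(url, headers=headers).text, 'lxml')
    titles = [a.text for a in soup.find_all('a', class_='title')]
    # a different batch of titles per page confirms the pagination parameter
    print(page, len(titles), titles[:2])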
Now that we have the link format we need, we can write the loop.
But we do not yet know how many pages there are. Inspecting the page and its source, we find that below the collection title it says how many articles are included, so we only need to read that total and divide it by the number of articles per page to get the page count. The figure is easy to locate in the source.
Then we can write the following code to get the page count.
Note: because of the site's access checks, a bare requests.get(url) will not return Jianshu's page source, so we have to send browser information along with the request.
How to obtain it: copy the User-Agent string from the request headers shown in the developer tools' Network panel.
Next, write the code:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}
def getPageN():
    url = 'https://www.jianshu.com/c/V2CqjW?utm_medium=index-collections&utm_source=desktop'
    resp = requests.get(url, headers=headers)
    html_content = resp.text  # page source
    soup = BeautifulSoup(html_content, 'lxml')  # parse it
    info = soup.select('.info')[0].text
    # "收錄了N篇文章" -> N = total number of articles in the collection
    pagenumber = int(info[info.find('收錄了'):].split()[0].lstrip('收錄了').rstrip('篇文章'))
    a = len(soup.find_all('a', class_='title'))  # articles per listing page
    page = pagenumber // a + 1
    return page
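One design note on the last step: pagenumber // a + 1 overshoots by one page whenever the total is an exact multiple of the per-page count; a ceiling division is exact (the counts below are made up for illustration):

import math

pagenumber, per_page = 1800, 9           # hypothetical counts
print(pagenumber // per_page + 1)        # 201 -- one page too many
print(math.ceil(pagenumber / per_page))  # 200 -- exact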
Step 2: extract all the articles on one listing page.
From the page source we can see that each article's link sits in an <a class="title"> tag as a relative href, so the full URL is https://www.jianshu.com plus that href.
Finally, a loop collects the links of all the articles on the page:
def getListPage(pageUrl):
    res = requests.get(pageUrl, headers=headers)
    html_content = res.text
    soup = BeautifulSoup(html_content, 'lxml')
    newslist = []
    for a in soup.find_all('a', class_='title'):
        newsUrl = "https://www.jianshu.com" + a.attrs['href']
        newslist.append(getNewsDetail(newsUrl))
    return newslist
Step 3: fetch the full content of one article and parse it.
def getNewsDetail(newsUrl):  # full content of one article
    resd = requests.get(newsUrl, headers=headers)
    html_content = resd.text
    soupd = BeautifulSoup(html_content, 'lxml')
    # dict keys: 標題 title, 做者 author, 時間 time, 字數 word count, 連接 link
    news = {}
    news['標題'] = soupd.select('.title')[0].text
    news['做者'] = soupd.select('.name')[0].text
    news['時間'] = datetime.strptime(soupd.select('.publish-time')[0].text.rstrip('*'), '%Y.%m.%d %H:%M')
    news['字數'] = soupd.select('.wordage')[0].text.lstrip('字數 ')
    # news['內容'] = soupd.select('.show-content-free')[0].text.strip()
    news['連接'] = newsUrl
    content = soupd.select('.show-content-free')[0].text.strip()
    writeNewsDetail(content)
    return news
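One caveat with these hard-coded select(...)[0] lookups: a deleted or otherwise unusual article page will raise an IndexError and abort the whole crawl. A hedged wrapper (not part of the original code; the name getNewsDetailSafe is made up) skips such pages instead:

def getNewsDetailSafe(newsUrl):
    # hypothetical wrapper: returns None instead of crashing on a malformed page
    try:
        return getNewsDetail(newsUrl)
    except (IndexError, ValueError):  # missing element / unexpected date format
        print('skipped:', newsUrl)
        return None

Callers would then filter the None entries out of newslist before building the DataFrame.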
At this point, the basic scraping work is done.
3. Save the data as text:
def writeNewsDetail(content):
    f = open('content.txt', 'a', encoding='utf-8')  # append each article's text
    f.write(content)
    f.close()
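An equivalent, slightly more robust form uses a with block, which closes the file even if the write raises:

def writeNewsDetail(content):
    # 'a' keeps appending, so all article texts accumulate in one file
    with open('content.txt', 'a', encoding='utf-8') as f:
        f.write(content)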
And export an Excel spreadsheet:
import pandas
df = pandas.DataFrame(newstotal)
df.to_excel('簡書數據.xlsx')
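Note that writing .xlsx files requires an Excel engine package such as openpyxl to be installed; passing index=False additionally drops pandas' row-index column, which is usually noise in an export:

df.to_excel('簡書數據.xlsx', index=False)  # needs an Excel engine, e.g. openpyxl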
4. Generate the word cloud:
file = codecs.open('content.txt', 'r', 'utf-8')
image = np.array(Image.open('ditu.jpg'))  # mask image that shapes the cloud
font = r'C:\Windows\Fonts\AdobeHeitiStd-Regular.otf'  # a font with Chinese glyphs
word = file.read()
file.close()

# strip English letters, digits and punctuation, keeping the Chinese text
resultword = re.sub("[A-Za-z0-9\[\`\~\!\@\#\$\^\&\*\(\)\=\|\{\}\'\:\;\'\,\[\]\.\<\>\/\?\~\!\@\#\\\&\*\%]", "", word)
wordlist_after_jieba = jieba.cut(resultword, cut_all=True)
wl_space_split = " ".join(wordlist_after_jieba)

# set up stopwords
stopwords = set(STOPWORDS)
stopwords.add("一個")

my_wordcloud = WordCloud(font_path=font, mask=image, stopwords=stopwords,
                         background_color='white', max_words=2000,
                         max_font_size=100, random_state=50).generate(wl_space_split)

# colour generator based on the mask image
image_colors = ImageColorGenerator(image)
# my_wordcloud.recolor(color_func=image_colors)

# display the generated word cloud
plt.imshow(my_wordcloud)
plt.axis("off")
plt.show()

# save the image; it is only written once the figure window is closed,
# and will not be saved if the program is interrupted
my_wordcloud.to_file('result.jpg')
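A note on the segmentation mode: cut_all=True is jieba's full mode, which emits every possible word, including overlapping fragments, so the cloud can contain odd partial words. The default precise mode is often cleaner; swapping it in is a one-line change:

# precise mode (the default, cut_all=False): no overlapping fragments
wl_space_split = " ".join(jieba.cut(resultword))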
The generated word cloud image:
The complete code:
import re
import requests
import pandas
from bs4 import BeautifulSoup
from datetime import datetime
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import codecs
import numpy as np
from PIL import Image

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}

def writeNewsDetail(content):
    f = open('content.txt', 'a', encoding='utf-8')
    f.write(content)
    f.close()

def getNewsDetail(newsUrl):  # full content of one article
    resd = requests.get(newsUrl, headers=headers)
    html_content = resd.text
    soupd = BeautifulSoup(html_content, 'lxml')
    news = {}
    news['標題'] = soupd.select('.title')[0].text
    news['做者'] = soupd.select('.name')[0].text
    news['時間'] = datetime.strptime(soupd.select('.publish-time')[0].text.rstrip('*'), '%Y.%m.%d %H:%M')
    news['字數'] = soupd.select('.wordage')[0].text.lstrip('字數 ')
    # news['內容'] = soupd.select('.show-content-free')[0].text.strip()
    news['連接'] = newsUrl
    content = soupd.select('.show-content-free')[0].text.strip()
    writeNewsDetail(content)
    return news

def getListPage(pageUrl):  # all articles on one listing page
    res = requests.get(pageUrl, headers=headers)
    html_content = res.text
    soup = BeautifulSoup(html_content, 'lxml')
    newslist = []
    for a in soup.find_all('a', class_='title'):
        newsUrl = "https://www.jianshu.com" + a.attrs['href']
        newslist.append(getNewsDetail(newsUrl))
    return newslist

def getPageN():  # total number of listing pages
    url = 'https://www.jianshu.com/c/V2CqjW?utm_medium=index-collections&utm_source=desktop'
    resp = requests.get(url, headers=headers)
    html_content = resp.text
    soup = BeautifulSoup(html_content, 'lxml')
    info = soup.select('.info')[0].text
    pagenumber = int(info[info.find('收錄了'):].split()[0].lstrip('收錄了').rstrip('篇文章'))
    a = len(soup.find_all('a', class_='title'))
    page = pagenumber // a + 1
    return page

newstotal = []
firstPageUrl = 'https://www.jianshu.com/c/V2CqjW?utm_medium=index-collections&utm_source=desktop'
newstotal.extend(getListPage(firstPageUrl))
n = getPageN()  # use the computed page count instead of a hard-coded bound
for i in range(2, n + 1):
    listPageUrl = 'https://www.jianshu.com/c/V2CqjW?order_by=added_at&page={}'.format(i)
    newstotal.extend(getListPage(listPageUrl))

df = pandas.DataFrame(newstotal)
df.to_excel('簡書數據.xlsx')

file = codecs.open('content.txt', 'r', 'utf-8')
image = np.array(Image.open('ditu.jpg'))
font = r'C:\Windows\Fonts\AdobeHeitiStd-Regular.otf'
word = file.read()
file.close()

# strip English letters, digits and punctuation, keeping the Chinese text
resultword = re.sub("[A-Za-z0-9\[\`\~\!\@\#\$\^\&\*\(\)\=\|\{\}\'\:\;\'\,\[\]\.\<\>\/\?\~\!\@\#\\\&\*\%]", "", word)
wordlist_after_jieba = jieba.cut(resultword, cut_all=True)
wl_space_split = " ".join(wordlist_after_jieba)

# set up stopwords
stopwords = set(STOPWORDS)
stopwords.add("一個")

my_wordcloud = WordCloud(font_path=font, mask=image, stopwords=stopwords,
                         background_color='white', max_words=2000,
                         max_font_size=100, random_state=50).generate(wl_space_split)

# colour generator based on the mask image
image_colors = ImageColorGenerator(image)
# my_wordcloud.recolor(color_func=image_colors)

# display the word cloud
plt.imshow(my_wordcloud)
plt.axis("off")
plt.show()

# save the image; it is written only after the display window is closed
my_wordcloud.to_file('result.jpg')
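One practical caveat, not in the original script: the crawl issues one request per article across roughly two hundred listing pages, all back to back. Adding a short pause between listing pages is gentler on the server and less likely to trip rate limiting; a minimal sketch of the adjusted loop:

import time

for i in range(2, n + 1):
    listPageUrl = 'https://www.jianshu.com/c/V2CqjW?order_by=added_at&page={}'.format(i)
    newstotal.extend(getListPage(listPageUrl))
    time.sleep(1)  # pause one second between listing pages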