1. Extract all the news items from one news list page, wrapped up as a function.
2. Get the total number of news articles and work out the total number of list pages (see the ceiling-division sketch after this list).
3. Fetch the full details of every news item on every list page.
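Step 2 boils down to ceiling division: at 10 news items per list page, a partly filled last page still counts as a full page. A minimal sketch of just that calculation (the helper name total_pages is mine, for illustration):

import math

def total_pages(newsnum, per_page=10):
    # a partly filled last page still counts as one page
    return math.ceil(newsnum / per_page)

print(total_pages(237))  # -> 24
print(total_pages(240))  # -> 24

The script below computes the same value with integer division and an explicit remainder check.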
The complete script for steps 1 to 3:

import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime

newsurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'

def getNewDetail(pageUrl):
    res = requests.get(pageUrl)           # fetch one list page
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            t = news.select('.news-list-title')[0].text    # title
            a = news.select('a')[0].attrs['href']          # link to the detail page
            description = news.select('.news-list-description')[0].text
            resd = requests.get(a)        # fetch the detail page (once is enough)
            resd.encoding = 'utf-8'
            soupd = BeautifulSoup(resd.text, 'html.parser')
            content = soupd.select('#content')[0].text
            info = soupd.select('.show-info')[0].text
            # the info line starts with '发布时间:' followed by a 19-char timestamp
            d = info.lstrip('发布时间:')[:19]
            dt = datetime.strptime(d, '%Y-%m-%d %H:%M:%S')
            author = info[info.find('作者:'):].split()[0].lstrip('作者:')
            source = info[info.find('来源:'):].split()[0].lstrip('来源:')
            photo = info[info.find('摄影:'):].split()[0].lstrip('摄影:')
            print("Title:", t)
            print("Link:", a)
            print("Published:", dt)
            print("Author:", author)
            print("Source:", source)
            print("Photographer:", photo)
            print("Description:", description)
            getClickCount(a)
            print("Body:", content)

def getClickCount(a):
    # the news id is the last 4 characters of the '_...' part of the URL
    newsid = re.search(r'_(.*).html', a).group(1)[-4:]
    clickcounturl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsid)
    # the endpoint returns a JS snippet ending in .html('N'); slice out N
    clickcount = int(requests.get(clickcounturl).text.split(".html(")[-1].lstrip("'").rstrip("');"))
    print('Clicks:', clickcount)

def getpagelist(path):
    res = requests.get(path)              # fetch the first list page
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsnum = int(soup.select('.a1')[0].text.rstrip('条'))  # total number of news items
    if newsnum % 10 == 0:
        totalpage = newsnum // 10
    else:
        totalpage = newsnum // 10 + 1     # total number of list pages (10 items per page)
    getNewDetail(path)                    # page 1 is the index page itself
    for i in range(2, totalpage + 1):     # later pages are 2.html, 3.html, ...
        pageUrl = path + '{}.html'.format(i)
        getNewDetail(pageUrl)

getpagelist(newsurl)
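The click-count endpoint used by getClickCount answers with a small JavaScript snippet rather than JSON, which is why the script slices the response text around ".html('" and "');". A regex does the same extraction a little more robustly. This is a sketch only; the response shape is inferred from the original string surgery, not from any API documentation, and the helper name is mine:

import re
import requests

def get_click_count(newsid):
    # hypothetical helper; newsid is the 4-character id sliced from the article URL
    url = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsid)
    # inferred response shape: ... .html('1234');
    m = re.search(r"\.html\('(\d+)'\)", requests.get(url).text)
    return int(m.group(1)) if m else 0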
4. Pick a topic you are interested in, scrape its data, and run word-segmentation analysis on it. It must not duplicate any classmate's topic.
# Scrape news from Huanqiu Tech (tech.huanqiu.com)
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import jieba

newsurl = 'http://tech.huanqiu.com/internet/'

def sort(text):
    punct = '''一!“”,。?;’"',.、:\n'''
    for s in punct:                       # replace punctuation with spaces
        text = text.replace(s, ' ')
    wordlist = list(jieba.cut(text))      # word segmentation
    exclude = {'这', '\u3000', '\r', '\xa0', '的', '_', ' ', '将', '在', '是', '了', '一', '还', '也', '《', '》', '(', ')'}
    words = set(wordlist) - exclude       # drop stopwords and whitespace tokens
    counts = {}
    for key in words:
        counts[key] = wordlist.count(key)
    countlist = list(counts.items())
    countlist.sort(key=lambda x: x[1], reverse=True)
    print("Top 5 keywords:")
    for i in range(5):
        print(countlist[i])

def getContent(url):
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup2 = BeautifulSoup(res.text, 'html.parser')
    for news in soup2.select('.l_a'):
        if len(news.select('.author')) > 0:
            author = news.select('.author')[0].text
            print("Author:", author)
    # strip the ad-loading script that gets scraped along with the body text
    content = soup2.select('.la_con')[0].text.replace('AD_SURVEY_Add_AdPos("7000531");', '')
    print("Body:", content)
    sort(content)

def getNewDetails(newsurl):
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('.item'):
        title = news.select('a')[0].attrs['title']
        a = news.select('a')[0].attrs['href']
        brief = news.select('h5')[0].text.rstrip('[详细]')
        time = news.select('h6')[0].text
        dt = datetime.strptime(time, '%Y-%m-%d %H:%M')
        print("Title:", title)
        print("Link:", a)
        print("Brief:", brief)
        print("Time:", dt)
        getContent(a)
        print('\n')
        # break

res = requests.get(newsurl)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
getNewDetails(newsurl)

# Paginate over every list page (60 items per page):
# for total in soup.select('#pages'):
#     all = int(total.select('a')[0].text.rstrip('条'))   # total item count -> total page count
#     if all % 60 == 0:
#         totalpages = all // 60
#     else:
#         totalpages = all // 60 + 1
#     print(totalpages)
#     for i in range(1, totalpages + 1):                  # news on every list page
#         PageUrl = newsurl + '{}.html'.format(i)
#         getNewDetails(PageUrl)
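One note on sort() above: it calls wordlist.count(key) once per distinct word, which is quadratic in the article length. collections.Counter builds the same frequency table in a single pass, and most_common already returns the pairs sorted by count, so the manual sort step disappears. A minimal equivalent sketch (the stopword set here is a trimmed illustration, not the full exclude set above):

import jieba
from collections import Counter

def top_keywords(text, n=5):
    exclude = {'的', '了', '在', '是', '一', ' ', '\u3000', '\xa0', '\r', '\n'}
    words = [w for w in jieba.cut(text) if w not in exclude]
    return Counter(words).most_common(n)   # [(word, count), ...], highest count first

print(top_keywords('这是一个演示关键词统计的简单例子,例子很短,例子只用来演示。'))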