獲取所有校園新聞

1.取出一個新聞列表頁的所有新聞,包裝成函數。

2.獲取總的新聞篇數,算出新聞總頁數。

3.獲取所有新聞列表頁的所有新聞詳情。

 

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re

# Get the click count for one news article.
def getNewsId(url):
    """Extract the numeric news id from *url* and query the site's
    click-count API; return the click count as an int.
    """
    # Fixed regex: dropped redundant escapes (\_ , \/) and the redundant
    # nested group, and escaped the literal '.' before "html".
    newsId = re.search(r'_\d{4}/(.*)\.html', url).group(1)
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    clickRes = requests.get(clickUrl)
    # The API answers with a JS snippet like $('#hits').html('123');
    # raw string avoids the invalid escape sequence warning for \) .
    clickCount = int(re.search(r"hits'\).html\('(.*)'\);", clickRes.text).group(1))
    return clickCount


# Fetch one campus-news detail page and print its metadata and body.
def getNewsDetail(newsUrl):
    """Download the article at *newsUrl*, parse publish time, author,
    reviewer, source, photographer and click count from the info bar,
    then print everything followed by the article body.
    """
    resd = requests.get(newsUrl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')

    content = soupd.select('#content')[0].text
    info = soupd.select('.show-info')[0].text
    # Click count comes from a separate counter API.
    count = getNewsId(newsUrl)
    # Publish time, e.g. "2018-04-01 12:00:00". All patterns below are
    # raw strings to avoid invalid-escape-sequence warnings (\d, \s, \w).
    date = re.search(r'(\d{4}.\d{2}.\d{2}\s\d{2}.\d{2}.\d{2})', info).group(1)
    # The following one-to-three fields may be absent, so probe first.
    if info.find('做者:') > 0:
        author = re.search(r'做者:((.{2,4}\s|.{2,4}、|\w*\s){1,3})', info).group(1)
    else:
        author = ''
    if info.find('審覈:') > 0:
        check = re.search(r'審覈:((.{2,4}\s){1,3})', info).group(1)
    else:
        check = ''
    if info.find('來源:') > 0:
        sources = re.search(r'來源:(.*)\s*攝|點', info).group(1)
    else:
        sources = ''
    if info.find('攝影:') > 0:
        photo = re.search(r'攝影:(.*)\s*點', info).group(1)
    else:
        photo = ''
    # Convert the time string into a datetime object.
    dateTime = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
    # Format and print all collected metadata.
    print('發佈時間:{0}\n做者:{1}\n審覈:{2}\n來源:{3}\n攝影:{4}\n點擊次數:{5}'.format(dateTime, author, check, sources, photo, count))
    print(content)


def getListPage(listUrl):
    """Fetch *listUrl*, print the title/description/link of the first
    news entry on it, and delegate to getNewsDetail() for the article.
    """
    response = requests.get(listUrl)
    response.encoding = 'utf-8'
    page = BeautifulSoup(response.text, 'html.parser')

    for item in page.select('li'):
        titleNodes = item.select('.news-list-title')
        # Skip <li> elements that are not news entries.
        if not titleNodes:
            continue
        title = titleNodes[0].text
        description = item.select('.news-list-description')[0].text
        newsUrl = item.select('a')[0]['href']

        print('標題:{0}\n內容:{1}\n連接:{2}'.format(title, description, newsUrl))
        # Fetch the full detail page for this entry.
        getNewsDetail(newsUrl)
        break  # only the first article; remove to process the whole page


# Entry point: crawl the front list page, derive the total number of
# list pages from the article counter, then crawl the remaining pages.
listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
getListPage(listUrl)
res = requests.get(listUrl)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
# BUG FIX: rstrip('') strips nothing, so int() would fail on the trailing
# '條' ("articles") suffix of the counter text — strip it explicitly.
# 10 articles per page, hence //10 + 1 pages in total.
listCount = int(soup.select('.a1')[0].text.rstrip('條')) // 10 + 1

for i in range(2, listCount):
    listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    getListPage(listUrl)

 

4.找一個本身感興趣的主題,進行數據爬取,並進行分詞分析。不能與其它同窗雷同。

爬取網易科技頻道IT專題:

 

代碼:

import requests, re, jieba
from bs4 import BeautifulSoup
from datetime import datetime

# Fetch one NetEase tech article and print its metadata, keywords, body.
def getNewsDetail(newsUrl):
    """Download the article at *newsUrl*, extract its publish time,
    source and top-3 keywords, then print them with the article body.
    """
    resd = requests.get(newsUrl)
    resd.encoding = 'gb2312'
    soupd = BeautifulSoup(resd.text, 'html.parser')

    content = soupd.select('#endText')[0].text
    info = soupd.select('.post_time_source')[0].text
    # Publish time, e.g. "2018-04-01 12:00:00". Raw strings avoid
    # invalid-escape-sequence warnings for \d and \s.
    date = re.search(r'(\d{4}.\d{2}.\d{2}\s\d{2}.\d{2}.\d{2})', info).group(1)
    # Convert the time string into a datetime object.
    dateTime = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
    sources = re.search(r'來源:\s*(.*)', info).group(1)
    # Top-3 keywords extracted by jieba word segmentation.
    keyWords = getKeyWords(content)
    print('發佈時間:{0}\n來源:{1}'.format(dateTime, sources))
    print('關鍵詞:{}、{}、{}'.format(keyWords[0], keyWords[1], keyWords[2]))
    print(content)

# Use jieba word segmentation to pick the news keywords.
def getKeyWords(content):
    """Segment the Chinese text with jieba and return up to the three
    most frequent multi-character words.
    """
    # Keep only CJK characters so punctuation does not pollute counts.
    content = ''.join(re.findall('[\u4e00-\u9fa5]', content))
    # FIX: use the public jieba.lcut() API instead of the private _lcut().
    wordSet = set(jieba.lcut(content))
    wordDict = {}
    for word in wordSet:
        # NOTE: substring counting — a short word embedded inside a
        # longer one is counted every time it appears anywhere.
        wordDict[word] = content.count(word)
    # Drop single-character tokens, which carry little meaning.
    deleteList = [word for word in wordDict.keys() if len(word) < 2]
    for word in deleteList:
        del wordDict[word]
    # Sort by frequency, highest first.
    dictList = list(wordDict.items())
    dictList.sort(key=lambda item: item[1], reverse=True)
    # FIX: guard against texts with fewer than three multi-char words,
    # which previously raised IndexError.
    keyWords = [dictList[i][0] for i in range(min(3, len(dictList)))]
    return keyWords

# Process one list page of the NetEase IT channel.
def getListPage(listUrl):
    """Print the title and link of the first news entry on *listUrl*
    and fetch its detail page via getNewsDetail().
    """
    response = requests.get(listUrl)
    response.encoding = 'gbk'
    page = BeautifulSoup(response.text, 'html.parser')
    for item in page.select('#news-flow-content')[0].select('li'):
        anchor = item.select('a')[0]
        newsUrl = anchor['href']
        title = anchor.text
        print('標題:{0}\n連接:{1}'.format(title, newsUrl))
        # Fetch the full detail page for this entry.
        getNewsDetail(newsUrl)
        break  # only the first item; remove to process the whole page

# Entry point: crawl the channel front page first, then the archived
# list pages. The channel keeps only 20 pages, numbered 02..19, so the
# range is hard-coded.
listUrl = 'http://tech.163.com/it/'
getListPage(listUrl)
for pageNo in range(2, 20):
    # Archive pages use a zero-padded two-digit page number.
    listUrl = 'http://tech.163.com/special/it_2016_%02d/' % pageNo
    getListPage(listUrl)

結果截圖:

相關文章
相關標籤/搜索
本站公眾號
   歡迎關注本站公眾號,獲取更多信息