1. 將新聞的正文內容保存到文本文件。
2. 將新聞數據結構化爲字典的列表:
3. 安裝pandas,用pandas.DataFrame(newstotal)建立一個DataFrame對象df。
4. 通過df將提取的數據保存到csv或excel文件。
import re
from datetime import datetime

# openpyxl is not referenced directly; presumably it is imported so that the
# .xlsx export below has its engine available -- confirm before removing.
import openpyxl
import pandas
import requests
from bs4 import BeautifulSoup


def getnewsclick(url):
    """Return the click count of the news article at *url*.

    The article id is cut out of the URL and sent to the counter endpoint;
    the count is then read out of the script snippet that endpoint returns.

    Raises:
        AttributeError: if the URL or the counter response does not match
            the expected pattern (``re.search`` returned ``None``).
        ValueError: if the captured count is not an integer.
    """
    newsId = re.search(r'\_\d{4}\/(.*).html', url).group(1)
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    clickRes = requests.get(clickUrl)
    # The response is a snippet of the form ...hits').html('<count>');
    clickCount = int(re.search(r"hits'\).html\('(.*)'\);", clickRes.text).group(1))
    return clickCount


def getnewsdetail(newsurl):
    """Fetch one news page and return its details as a dict.

    The returned dict has the keys ``title``, ``author``, ``check``,
    ``sources``, ``photo``, ``dateTime`` (a ``datetime``), ``click`` (an
    ``int``) and ``content``.  Optional fields whose label is absent from
    the info line are set to the string ``'none'``.

    Raises:
        IndexError: if an expected element is missing from the page.
        AttributeError: if a label is present but its pattern does not match.
        ValueError: if the timestamp is not in ``%Y-%m-%d %H:%M:%S`` form.
    """
    resd = requests.get(newsurl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')

    content = soupd.select('#content')[0].text
    info = soupd.select('.show-info')[0].text

    newsDict = {}
    newsDict['title'] = soupd.select('.show-title')[0].text

    # Timestamp inside the info line; the separators are matched loosely here
    # and validated strictly by strptime further down.
    date = re.search(r'(\d{4}.\d{2}.\d{2}\s\d{2}.\d{2}.\d{2})', info).group(1)

    # NOTE(review): the label literals below are kept exactly as found in the
    # file; some of them look like they may have been altered by a
    # simplified->traditional character conversion -- confirm against the
    # text of the live page.

    # Author: one to three names, each followed by whitespace or a '、'.
    if '做者' in info:
        newsDict['author'] = re.search(r'做者:((.{2,4}\s|.{2,4}、){1,3})', info).group(1)
    else:
        newsDict['author'] = 'none'

    # Reviewer: one to three names, each followed by whitespace.
    if '審覈' in info:
        newsDict['check'] = re.search(r'審覈:((.{2,4}\s){1,3})', info).group(1)
    else:
        newsDict['check'] = 'none'

    if '來源' in info:
        first = re.search(r'來源:(.*)\s*點', info).group(1)
        if '攝影' in first:
            # When a photographer follows the source, the greedy match above
            # swallows it too; keep only the part before it.
            newsDict['sources'] = re.search(r'(.*)\s*攝', first).group(1)
        else:
            newsDict['sources'] = first
    else:
        newsDict['sources'] = 'none'

    if '攝影' in info:
        newsDict['photo'] = re.search(r'攝影:(.*)\s*點', info).group(1)
    else:
        newsDict['photo'] = 'none'

    # Convert the timestamp string into a datetime object.
    newsDict['dateTime'] = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
    newsDict['click'] = getnewsclick(newsurl)
    newsDict['content'] = content
    return newsDict


def getlistpage(listurl):
    """Return the details of every news item linked from one list page.

    Each ``li`` inside the first ``.news-list`` element is followed via its
    first link and parsed with ``getnewsdetail``.
    """
    res = requests.get(listurl)
    res.encoding = 'utf-8'
    listsoup = BeautifulSoup(res.text, 'html.parser')
    pagelist = []
    for new in listsoup.select('.news-list')[0].select('li'):
        newsUrl = new.select('a')[0]['href']
        pagelist.append(getnewsdetail(newsUrl))
    return pagelist


def gettotalpage(listurl):
    """Return the number of list pages, computed from the total news count.

    The count is read from the first ``.a1`` element (with the trailing
    '條' stripped) and divided by ten items per page, rounding up so that a
    total that is an exact multiple of ten does not yield a phantom page.
    """
    res = requests.get(listurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsCount = int(soup.select('.a1')[0].text.rstrip('條'))
    return (newsCount + 9) // 10


# Crawl the first list page, then pages 2..listCount, accumulating every
# article into ``total`` before anything is exported.
total = []
listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
pagelist = getlistpage(listUrl)
total.extend(pagelist)
listCount = gettotalpage(listUrl)
for i in range(2, listCount + 1):
    listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    total.extend(getlistpage(listUrl))

# Build the DataFrame only after all pages have been collected, then export.
pan = pandas.DataFrame(total)
pan.to_excel('result.xlsx')  # export as an Excel workbook
pan.to_csv('result.csv')  # export as a CSV file
5. 用pandas提供的函數和方法進行數據分析:
# Show the click count, source and title of the first six rows.
print(pan[['click', 'sources', 'title']].head(6))

# Rows with more than 3000 clicks whose source equals the given office name.
print(pan[(pan['click'] > 3000) & (pan['sources'] == '學校綜合辦')])

# Rows whose source is one of the listed departments.
selectlist = ['國際學院', '學生工做處']
print(pan[pan['sources'].isin(selectlist)])