作業要求來自:https://edu.cnblogs.com/campus/gzcc/GZCC-16SE1/homework/3002
0. 從新聞 url 獲取點擊次數,並整理成函數
def click(url):
    """Return the click count of the news article at *url*.

    The numeric article id is taken from the ``/<digits>.html`` segment of
    *url* and sent to the counter endpoint on oa.gzcc.cn; the count is then
    extracted from the text of that endpoint's response.

    Args:
        url: address of a news article ending in ``/<digits>.html``.

    Returns:
        The click count as an ``int``.

    Raises:
        AttributeError: if *url* has no ``/<digits>.html`` segment, or the
            response text lacks the expected ``hits').html('<n>')`` snippet
            (``re.search`` returns ``None`` in both cases).
    """
    newsId = re.search(r'/(\d+)\.html', url).group(1)
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    resClick = requests.get(clickUrl)
    # re.search must be given the response body (a str), not the Response
    # object, and group(1) yields the captured digits as a str for int().
    newsClick = int(re.search(r"hits'[)]\.html[(]'(\d+)'[)]", resClick.text).group(1))
    return newsClick


def newsdt(showinfo):
    """Parse the publication date and time from an article's info line.

    *showinfo* is split on whitespace. The first token must look like
    ``<label>:<YYYY-MM-DD>`` and the second like ``HH:MM:SS``; anything after
    the second token is ignored.

    Args:
        showinfo: text of the article's info line.

    Returns:
        A naive ``datetime`` built from the date and time tokens.

    Raises:
        IndexError: if a token or the colon-separated date part is missing.
        ValueError: if the date/time text does not match
            ``%Y-%m-%d %H:%M:%S``.
    """
    fields = showinfo.split()
    # Accept either an ASCII or a full-width colon between label and date.
    newsDate = re.split('[::]', fields[0])[1]
    newsTime = fields[1]
    newsDT = newsDate + ' ' + newsTime
    dt = datetime.strptime(newsDT, '%Y-%m-%d %H:%M:%S')
    return dt
1. 從新聞 url 獲取新聞詳情:字典,anews
def anews(url):
    """Download one news article page and collect its details.

    Args:
        url: address of the article page.

    Returns:
        A dict with the keys ``newsTitle`` (text of the first ``.show-title``
        element), ``newsDT`` (result of ``newsdt`` on the first ``.show-info``
        element's text) and ``newsClick`` (result of ``click(url)``).
    """
    page = requests.get(url)
    page.encoding = 'utf-8'
    doc = BeautifulSoup(page.text, 'html.parser')
    title = doc.select('.show-title')[0].text
    info = doc.select('.show-info')[0].text
    return {
        'newsTitle': title,
        'newsDT': newsdt(info),
        'newsClick': click(url),
    }
2. 從列表頁的 url 獲取新聞 url:列表 append(字典),alist
def alist(url):
    """Collect the details of every news item on one list page.

    Args:
        url: address of the list page to fetch.

    Returns:
        A list of dicts, one per ``li`` element that contains a
        ``.news-list-title`` element. Each dict is the result of ``anews`` for
        the item's link, extended with ``newsUrl`` (the link itself) and
        ``description`` (text of the item's ``.news-list-description``).
    """
    # Fetch the page named by the parameter rather than the global listUrl,
    # so that each call really reads the page it was asked for.
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsList = []
    for news in soup.select('li'):
        # Only <li> elements carrying a news title are news entries.
        if not news.select('.news-list-title'):
            continue
        newsUrl = news.select('a')[0]['href']
        newsDest = news.select('.news-list-description')[0].text
        newsDict = anews(newsUrl)
        newsDict['newsUrl'] = newsUrl
        newsDict['description'] = newsDest
        newsList.append(newsDict)
    return newsList


listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
alist(listUrl)
3. 生成所有列表頁的 url 並獲取所有新聞:列表 extend(列表),allnews
* 每個同學爬取從學號尾數開始的 10 個列表頁
# Gather the news from list pages 16 through 25 into one flat list.
allnews = []
for pageNo in range(16, 26):
    listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(pageNo)
    pageNews = alist(listUrl)
    allnews += pageNews
4. 設置合理的爬取間隔
import time
import random

# Pause for a random duration between 0 and 3 seconds.
time.sleep(random.random() * 3)

# Five consecutive random pauses of up to 3 seconds each.
for i in range(5):
    time.sleep(random.random() * 3)
5. 用 pandas 做簡單的數據處理並保存
保存到csv或excel文件
# Write newsdf to a CSV file at a fixed Windows path (raw string, so the
# backslashes are kept literally).
newsdf.to_csv(r'F:\duym\爬蟲\gzccnews.csv')
# Turn the collected news records into a DataFrame and save it as CSV.
newsdf = pd.DataFrame(allnews)
# to_csv is a method and has to be called with the target path; assigning the
# path to it would overwrite the method and write nothing.
newsdf.to_csv(r'C:\Users\Czc\PycharmProjects\news.csv')
運行結果:
6.完整代碼:
import random
import re
import time
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup


def click(url):
    """Return the click count of the news article at *url*.

    The numeric article id is taken from the ``/<digits>.html`` segment of
    *url* and sent to the counter endpoint on oa.gzcc.cn.

    Raises:
        AttributeError: if *url* has no ``/<digits>.html`` segment.
        ValueError: if the extracted text is not an integer.
    """
    newsId = re.search(r'/(\d+)\.html', url).group(1)
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    resClick = requests.get(clickUrl)
    # Take whatever follows the last ".html" in the response and strip the
    # surrounding "('" and "');" characters; presumably the response ends
    # with .html('<count>'); -- verify against the live endpoint.
    newsClick = int(resClick.text.split('.html')[-1].lstrip("('").rstrip("');"))
    return newsClick


def newsdt(showinfo):
    """Parse the publication date and time from an article's info line.

    The first whitespace-separated token must look like
    ``<label>:<YYYY-MM-DD>`` and the second like ``HH:MM:SS``.

    Raises:
        IndexError: if a token or the colon-separated date part is missing.
        ValueError: if the text does not match ``%Y-%m-%d %H:%M:%S``.
    """
    fields = showinfo.split()
    # Accept either an ASCII or a full-width colon between label and date.
    newsDate = re.split('[::]', fields[0])[1]
    newsTime = fields[1]
    newsDT = newsDate + ' ' + newsTime
    dt = datetime.strptime(newsDT, '%Y-%m-%d %H:%M:%S')
    return dt


def anews(url):
    """Fetch one article page and return its title, datetime and clicks."""
    newsDetail = {}
    res = requests.get(url)
    res.encoding = 'utf-8'
    s = BeautifulSoup(res.text, 'html.parser')
    newsDetail['newsTitle'] = s.select('.show-title')[0].text
    showinfo = s.select('.show-info')[0].text
    newsDetail['newsDT'] = newsdt(showinfo)
    newsDetail['newsClick'] = click(url)
    return newsDetail


def alist(url):
    """Return a list of detail dicts for every news item on list page *url*.

    Each dict comes from ``anews`` and is extended with ``newsUrl`` and
    ``description``.
    """
    # Use the parameter, not the global listUrl, so every call reads the
    # page it was asked for.
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsList = []
    for news in soup.select('li'):
        # Only <li> elements carrying a news title are news entries.
        if not news.select('.news-list-title'):
            continue
        newsUrl = news.select('a')[0]['href']
        newsDest = news.select('.news-list-description')[0].text
        newsDict = anews(newsUrl)
        newsDict['newsUrl'] = newsUrl
        newsDict['description'] = newsDest
        newsList.append(newsDict)
    return newsList


# Crawl list pages 16 through 25 and gather all their news records.
allnews = []
for i in range(16, 26):
    listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    allnews.extend(alist(listUrl))
    # Random pause of up to 3 seconds between list pages, so the delay
    # actually spaces out the requests instead of running after the crawl.
    time.sleep(random.random() * 3)

newsdf = pd.DataFrame(allnews)
newsdf.to_csv(r'C:\Users\Czc\PycharmProjects\news.csv')
print(newsdf)