作業要求來自於:https://edu.cnblogs.com/campus/gzcc/GZCC-16SE2/homework/2894
給定一篇新聞的鏈接 newsUrl,獲取該新聞的所有信息
標題、作者、發佈單位、審覈、來源
發佈時間:轉換成 datetime 類型
點擊:獲取該新聞的點擊次數
整個過程包裝成一個簡單清晰的函數。
嘗試去爬取一個你感興趣的網頁。
# Fetch and print all the information of a single news article.
import re
from datetime import datetime

import requests
from bs4 import BeautifulSoup

# Seconds to wait for an HTTP response. requests applies no timeout by
# default, so without this a stalled server would hang the script forever.
REQUEST_TIMEOUT = 10


# Get the news id
def newsnum(url: str) -> str:
    """Return the news id: the file name of ``url`` without ``.html``.

    Works for any article directory, not just one hard-coded section.

    Raises:
        ValueError: if ``url`` contains no ``<id>.html`` path component.
    """
    found = re.search(r'/([^/?#]+)\.html', url)
    if found is None:
        raise ValueError('cannot extract a news id from URL: {!r}'.format(url))
    return found.group(1)


# Publication time: datetime type
def newstime(soup: BeautifulSoup) -> datetime:
    """Return the publication time found in the page's ``.show-info`` element.

    The timestamp is located by its ``YYYY-MM-DD HH:MM:SS`` shape rather than
    by token position, so the label in front of it (and whichever colon
    character follows that label) does not matter.

    Raises:
        ValueError: if there is no ``.show-info`` element or it holds no
            timestamp of the expected shape.
    """
    info_nodes = soup.select('.show-info')
    if not info_nodes:
        raise ValueError("no '.show-info' element found in the page")

    stamp = re.search(
        r'(\d{4}-\d{1,2}-\d{1,2})\s+(\d{1,2}:\d{1,2}:\d{1,2})',
        info_nodes[0].text,
    )
    if stamp is None:
        raise ValueError("no publication time found in '.show-info'")

    return datetime.strptime(
        stamp.group(1) + ' ' + stamp.group(2), '%Y-%m-%d %H:%M:%S'
    )


# Get the click count
def click(url: str) -> str:
    """Return the click count of the article at ``url`` as a string.

    The article id is taken to be the last run of digits in ``url`` and is
    sent to the counter API.

    Raises:
        ValueError: if ``url`` contains no digits.
        requests.RequestException: on network failure or an HTTP error status.
    """
    # Match whole digit runs: a fixed upper bound on the run length would
    # split a longer id and leave only its trailing fragment as the "last" one.
    digit_runs = re.findall(r'\d+', url)
    if not digit_runs:
        raise ValueError('cannot find a numeric id in URL: {!r}'.format(url))

    click_url = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(
        digit_runs[-1]
    )
    res = requests.get(click_url, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()

    # NOTE(review): presumably the response is a script whose last
    # ``.html('N');`` call carries the count -- confirm against the live API.
    # Surrounding whitespace is dropped first so a trailing newline cannot
    # stop the quote/paren stripping.
    tail = res.text.split('.html')[-1]
    return tail.strip().lstrip("('").rstrip("');")


# Main function
def main(url: str) -> None:
    """Download the article at ``url`` and print its details.

    Prints the news id, title, publication time, the three whitespace
    separated ``.show-info`` fields that follow the timestamp, and the text
    of the first paragraph of the content.

    Raises:
        requests.RequestException: on network failure or an HTTP error status.
        ValueError: if the id or the publication time cannot be extracted.
        IndexError: if the page lacks one of the expected elements or fields.
    """
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    # Split the info line once instead of re-querying the document per field.
    info_fields = soup.select('.show-info')[0].text.split()

    print("新聞編號:" + newsnum(url))  # news id
    print("標題:" + soup.select('.show-title')[0].text)  # title
    print("發佈時間:" + str(newstime(soup)))  # publication time
    print(info_fields[2])  # author
    print(info_fields[3])  # reviewer
    print(info_fields[4])  # source
    print("內容:" + soup.select('.show-content p')[0].text)  # content


url = "http://news.gzcc.cn/html/2019/meitishijie_0321/11033.html"

# Only hit the network when run as a script, not when imported.
if __name__ == "__main__":
    main(url)