1. Extract all the news items from one news list page, wrapped up as a function.
2. Get the total number of news articles and work out the total number of list pages (see the ceiling-division sketch after this list).
3. Fetch the full details of every news item on every list page.
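Step 2 boils down to ceiling division: at 10 news items per list page, a partly filled last page still counts as a full page. A minimal sketch of just that calculation (the helper name total_pages is mine, for illustration):

import math

def total_pages(newsnum, per_page=10):
    # a partly filled last page still counts as one page
    return math.ceil(newsnum / per_page)

print(total_pages(237))  # -> 24
print(total_pages(240))  # -> 24

The script below computes the same value with integer division and an explicit remainder check.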
The complete script for steps 1 to 3:

import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime

newsurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'

def getNewDetail(pageUrl):
    res = requests.get(pageUrl)           # fetch one list page
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            t = news.select('.news-list-title')[0].text    # title
            a = news.select('a')[0].attrs['href']          # link to the detail page
            description = news.select('.news-list-description')[0].text
            resd = requests.get(a)        # fetch the detail page (once is enough)
            resd.encoding = 'utf-8'
            soupd = BeautifulSoup(resd.text, 'html.parser')
            content = soupd.select('#content')[0].text
            info = soupd.select('.show-info')[0].text
            # the info line starts with '发布时间:' followed by a 19-char timestamp
            d = info.lstrip('发布时间:')[:19]
            dt = datetime.strptime(d, '%Y-%m-%d %H:%M:%S')
            author = info[info.find('作者:'):].split()[0].lstrip('作者:')
            source = info[info.find('来源:'):].split()[0].lstrip('来源:')
            photo = info[info.find('摄影:'):].split()[0].lstrip('摄影:')
            print("Title:", t)
            print("Link:", a)
            print("Published:", dt)
            print("Author:", author)
            print("Source:", source)
            print("Photographer:", photo)
            print("Description:", description)
            getClickCount(a)
            print("Body:", content)

def getClickCount(a):
    # the news id is the last 4 characters of the '_...' part of the URL
    newsid = re.search(r'_(.*).html', a).group(1)[-4:]
    clickcounturl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsid)
    # the endpoint returns a JS snippet ending in .html('N'); slice out N
    clickcount = int(requests.get(clickcounturl).text.split(".html(")[-1].lstrip("'").rstrip("');"))
    print('Clicks:', clickcount)

def getpagelist(path):
    res = requests.get(path)              # fetch the first list page
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsnum = int(soup.select('.a1')[0].text.rstrip('条'))  # total number of news items
    if newsnum % 10 == 0:
        totalpage = newsnum // 10
    else:
        totalpage = newsnum // 10 + 1     # total number of list pages (10 items per page)
    getNewDetail(path)                    # page 1 is the index page itself
    for i in range(2, totalpage + 1):     # later pages are 2.html, 3.html, ...
        pageUrl = path + '{}.html'.format(i)
        getNewDetail(pageUrl)

getpagelist(newsurl)
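The click-count endpoint used by getClickCount answers with a small JavaScript snippet rather than JSON, which is why the script slices the response text around ".html('" and "');". A regex does the same extraction a little more robustly. This is a sketch only; the response shape is inferred from the original string surgery, not from any API documentation, and the helper name is mine:

import re
import requests

def get_click_count(newsid):
    # hypothetical helper; newsid is the 4-character id sliced from the article URL
    url = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsid)
    # inferred response shape: ... .html('1234');
    m = re.search(r"\.html\('(\d+)'\)", requests.get(url).text)
    return int(m.group(1)) if m else 0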
4. Pick a topic you are interested in, scrape its data, and run word-segmentation analysis on it. It must not duplicate any classmate's topic.
# Scrape news from Huanqiu Tech (tech.huanqiu.com)
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import jieba

newsurl = 'http://tech.huanqiu.com/internet/'

def sort(text):
    punct = '''一!“”,。?;’"',.、:\n'''
    for s in punct:                       # replace punctuation with spaces
        text = text.replace(s, ' ')
    wordlist = list(jieba.cut(text))      # word segmentation
    exclude = {'这', '\u3000', '\r', '\xa0', '的', '_', ' ', '将', '在', '是', '了', '一', '还', '也', '《', '》', '(', ')'}
    words = set(wordlist) - exclude       # drop stopwords and whitespace tokens
    counts = {}
    for key in words:
        counts[key] = wordlist.count(key)
    countlist = list(counts.items())
    countlist.sort(key=lambda x: x[1], reverse=True)
    print("Top 5 keywords:")
    for i in range(5):
        print(countlist[i])

def getContent(url):
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup2 = BeautifulSoup(res.text, 'html.parser')
    for news in soup2.select('.l_a'):
        if len(news.select('.author')) > 0:
            author = news.select('.author')[0].text
            print("Author:", author)
    # strip the ad-loading script that gets scraped along with the body text
    content = soup2.select('.la_con')[0].text.replace('AD_SURVEY_Add_AdPos("7000531");', '')
    print("Body:", content)
    sort(content)

def getNewDetails(newsurl):
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('.item'):
        title = news.select('a')[0].attrs['title']
        a = news.select('a')[0].attrs['href']
        brief = news.select('h5')[0].text.rstrip('[详细]')
        time = news.select('h6')[0].text
        dt = datetime.strptime(time, '%Y-%m-%d %H:%M')
        print("Title:", title)
        print("Link:", a)
        print("Brief:", brief)
        print("Time:", dt)
        getContent(a)
        print('\n')
        # break

res = requests.get(newsurl)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
getNewDetails(newsurl)

# Paginate over every list page (60 items per page):
# for total in soup.select('#pages'):
#     all = int(total.select('a')[0].text.rstrip('条'))   # total item count -> total page count
#     if all % 60 == 0:
#         totalpages = all // 60
#     else:
#         totalpages = all // 60 + 1
#     print(totalpages)
#     for i in range(1, totalpages + 1):                  # news on every list page
#         PageUrl = newsurl + '{}.html'.format(i)
#         getNewDetails(PageUrl)
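One note on sort() above: it calls wordlist.count(key) once per distinct word, which is quadratic in the article length. collections.Counter builds the same frequency table in a single pass, and most_common already returns the pairs sorted by count, so the manual sort step disappears. A minimal equivalent sketch (the stopword set here is a trimmed illustration, not the full exclude set above):

import jieba
from collections import Counter

def top_keywords(text, n=5):
    exclude = {'的', '了', '在', '是', '一', ' ', '\u3000', '\xa0', '\r', '\n'}
    words = [w for w in jieba.cut(text) if w not in exclude]
    return Counter(words).most_common(n)   # [(word, count), ...], highest count first

print(top_keywords('这是一个演示关键词统计的简单例子,例子很短,例子只用来演示。'))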