1. 用requests庫和BeautifulSoup4庫,爬取校園新聞列表的時間、標題、鏈接、來源。
import requests  # HTTP client library
from bs4 import BeautifulSoup  # BeautifulSoup4: extract data from HTML/XML

# Fetch the campus news list page.
resp = requests.get("http://news.gzcc.cn/html/xiaoyuanxinwen/")
resp.encoding = "utf-8"
soup = BeautifulSoup(resp.text, 'html.parser')

# CSS selector syntax: prefix '#' for an id, '.' for a class,
# or a bare tag name, e.g. soup.select('p').
for news in soup.select('li'):
    # Only <li> elements containing a .news-list-title are real news entries.
    if news.select('.news-list-title'):
        news_time = news.select('.news-list-info')[0].contents[0].text  # publish time
        title = news.select('.news-list-title')[0].text                 # headline
        href = news.select('a')[0]['href']                              # article link

        # Fetch the article page itself to extract the body text.
        detail = requests.get(href)
        detail.encoding = "utf-8"
        detail_soup = BeautifulSoup(detail.text, 'html.parser')
        # Guard: some article pages may lack a .show-content element;
        # print an empty body rather than crashing with IndexError.
        body_el = detail_soup.select_one('.show-content')
        body_text = body_el.text if body_el is not None else ''
        print(news_time, title, href, body_text)
2. 選一個本身感興趣的主題,作相似的操作,爲「爬取網絡數據並進行文本分析」作準備。
import requests
from bs4 import BeautifulSoup

# Fetch the bilibili home page.
resp = requests.get('https://www.bilibili.com')
resp.encoding = "utf-8"
soup = BeautifulSoup(resp.text, 'html.parser')  # 'html.parser' is the HTML parser

# Print the text of every top-navigation entry.
for nav in soup.select('.nav-name'):
    print(nav.text)

# BUG FIX: the original defined `douga` but iterated `donga` (NameError),
# and soup.select() returns a *list*, which has no .select() method.
# select_one() returns a single element (or None), which is what we need.
douga = soup.select_one('#bili_douga')
if douga is not None:
    # BUG FIX: class selectors need a '.' on each class name —
    # '.rank-list hot-list' matched nonexistent <hot-list> tags, which is
    # why no ranking URLs were ever found.
    # NOTE(review): presumably '.rank-list .hot-list' / '.rank-item.highlight'
    # are the intended selectors — confirm against the live bilibili markup;
    # the page is heavily JS-rendered, so requests alone may not see these
    # nodes at all.
    for hot_list in douga.select('.rank-list .hot-list'):
        for item in hot_list.select('.rank-item.highlight'):
            print(item.text)
並不能爬到「動畫」裏這個「排行」的url。