1.爬取目標
爬取http://www.zuihaodaxue.com/rankings.html中的以下連接中的具體數據:
具體頁面中數據示例以下:
2.爬取步驟
(1)爬取http://www.zuihaodaxue.com/rankings.html頁面中的10個連接地址及其文字描述
(2)根據連接地址爬取詳細頁面中的信息,每個詳細頁面的信息存入列表中,列表中第一個數據存儲的是排名名稱,列表的第二個數據存儲的是每個詳細頁面中的表格中的表頭,例如上面的排名、學校名稱、省市等信息,後面的數據是表格下面每一行的數據,最後將每個詳細頁面的信息列表添加到結果列表中
(3)因時間關係未將數據存入數據庫,直接輸出顯示數據
3.使用工具
requests請求庫及BeautifulSoup解析庫
4.代碼
# __author__ = "wyb"
# date: 2018/5/22
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Mobile user agent string; the site serves a parseable page for it.
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36"
}

# Seconds before an HTTP request is abandoned.  Without a timeout a single
# stalled connection would hang the whole crawl indefinitely.
TIMEOUT = 10

# Index page listing the ranking detail pages.
INDEX_URL = "http://www.zuihaodaxue.com/rankings.html"


def get_link():
    """Return the absolute URLs of the ranking detail pages linked from
    http://www.zuihaodaxue.com/rankings.html.

    Returns:
        list[str]: one absolute URL per ranking link found on the page.
    """
    html = requests.get(INDEX_URL, headers=headers, timeout=TIMEOUT)
    # The site does not reliably declare its charset in the response
    # headers, so let requests sniff it from the payload.
    html.encoding = html.apparent_encoding
    soup = BeautifulSoup(html.text, "lxml")
    # Each ranking is an <a> tag inside the div.smallpic containers.
    links = soup.select(
        "div.row div.col-lg-12.col-md-12.col-sm-12.col-xs-12 div.smallpic a"
    )
    # urljoin resolves relative hrefs against the index URL correctly,
    # unlike the naive string concatenation it replaces (which would
    # mangle absolute or root-relative hrefs).
    return [urljoin(INDEX_URL, link["href"]) for link in links]


def get_info(links):
    """Scrape every ranking detail page in *links*.

    Args:
        links: iterable of absolute page URLs (as returned by get_link).

    Returns:
        list: one entry per page; each entry is a list whose first element
        is the ranking title (str), second is the list of table-header
        cell texts, followed by one list of cell texts per table row.
    """
    info_s = []
    for link in links:
        html = requests.get(link, headers=headers, timeout=TIMEOUT)
        html.encoding = html.apparent_encoding
        soup = BeautifulSoup(html.text, "lxml")
        # Ranking title, e.g. the page's <h3 class="post-title">.
        ranking_name = soup.select("h3.post-title")[0].text.strip()
        info = [ranking_name]
        # Table header cells (rank, school name, province, ...).
        info.append([th.text for th in soup.select("table.table-bordered thead th")])
        # One list of cell texts per body row.
        for row in soup.select("table.table-bordered tbody tr"):
            info.append([td.text for td in row.select("td")])
        info_s.append(info)
    return info_s


def show_info(result):
    """Print the scraped data: title, then header list, then one line of
    space-separated cells per table row."""
    for r in result:
        print(r[0])  # ranking title
        print(r[1])  # header cells, printed as a Python list
        for row in r[2:]:
            for cell in row:
                print(cell, end=" ")
            print()


if __name__ == '__main__':
    start_time = time.time()
    urls = get_link()
    res = get_info(urls)
    show_info(res)
    end_time = time.time()
    # Total crawl duration in seconds.
    print(end_time - start_time)