<爬蟲實例> 8684公交網-太原公交線路信息

 1 import requests
 2 from lxml import etree
 3 
 4 '''訪問「8684公交查詢網」,抓取太原市公交路線:'''
 5 
 6 headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
 7                          'AppleWebKit/537.36 (KHTML, like Gecko) '
 8                          'Chrome/73.0.3683.86 Safari/537.36'}
 9 
10 #用來保存公交路線信息
11 items = []
12 
def get_navigation(timeout=10):
    """Fetch the Taiyuan 8684 home page and return its route-index links.

    Args:
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A list of href strings: the links found under
        ``<div class="bus_kt_r1">`` (routes whose names start with a digit)
        followed by the links under ``<div class="bus_kt_r2">`` (routes
        whose names start with a letter).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = 'https://taiyuan.8684.cn/'
    print('正在獲取導航連接')
    # Without a timeout a stalled connection would block the crawl forever.
    r = requests.get(url, headers=headers, timeout=timeout)
    # Nothing can be crawled without the navigation page, so fail loudly
    # instead of parsing an error page into an empty link list.
    r.raise_for_status()

    # Parse the page and pull out the navigation hrefs.
    tree = etree.HTML(r.text)
    # Index links for routes starting with a digit.
    num_href_list = tree.xpath('//div[@class="bus_kt_r1"]/a/@href')
    # Index links for routes starting with a letter.
    char_href_list = tree.xpath('//div[@class="bus_kt_r2"]/a/@href')
    return num_href_list + char_href_list
27 
def get_route(navi_list, timeout=10):
    """Collect the route-page links listed on every navigation page.

    Args:
        navi_list: Iterable of site-relative hrefs (as returned by
            ``get_navigation``); each is appended to
            ``https://taiyuan.8684.cn`` to form the page URL.
        timeout: Seconds to wait for each response; passed to
            ``requests.get``.

    Returns:
        A flat list of the hrefs found under ``<div id="con_site_1">`` on
        every fetched page, in fetch order.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    route_list = []
    for i in navi_list:
        route_url = 'https://taiyuan.8684.cn' + i
        print('正在獲取以%s開頭的公交路線' % i)
        # Bound the wait so one stalled page cannot hang the whole crawl.
        r = requests.get(route_url, headers=headers, timeout=timeout)

        # Parse the index page and keep every route link it lists.
        tree = etree.HTML(r.text)
        route_list.extend(tree.xpath('//div[@id="con_site_1"]/a/@href'))
    return route_list
42 
def _first(nodes, default=''):
    """Return the first XPath result in *nodes*, or *default* if it is empty."""
    return nodes[0] if nodes else default


def get_info(route_list, timeout=10):
    """Fetch each route page and record its details.

    One dict per route is appended to the module-level ``items`` list (the
    list ``main`` writes to disk). The dicts created by this call are also
    returned so callers can use the result directly.

    Fields that cannot be found on a page are stored as ``''`` (or an empty
    list for station names) instead of raising ``IndexError``, so a single
    page with an unexpected layout does not abort the whole crawl.

    Args:
        route_list: Iterable of site-relative hrefs of route pages; each is
            appended to ``https://taiyuan.8684.cn``.
        timeout: Seconds to wait for each response; passed to
            ``requests.get``.

    Returns:
        A list of the route dicts created during this call, in fetch order.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    collected = []
    for route in route_list:
        info_url = 'https://taiyuan.8684.cn' + route
        # Bound the wait so one stalled page cannot hang the whole crawl.
        r = requests.get(info_url, headers=headers, timeout=timeout)

        # Parse the page and extract the individual fields.
        tree = etree.HTML(r.text)
        route_name = _first(tree.xpath('//div[@class="bus_i_t1"]/h1/text()'))
        print('正在獲取%s的路線信息' % route_name)
        run_time = _first(tree.xpath('//p[@class="bus_i_t4"][1]/text()'))
        ticket_price = _first(tree.xpath('//p[@class="bus_i_t4"][2]/text()'))
        update_time = _first(tree.xpath('//p[@class="bus_i_t4"][4]/text()'))
        # NOTE: the trailing space inside the class names below is part of
        # the attribute value being matched.
        station_num = tree.xpath('//div[@class="bus_line_top "]/span/text()')
        if len(station_num) == 2:
            # Two counters: the first is used for the up direction, the
            # second for the down direction, each with its own station list.
            up_num = station_num[0]
            up_station_name = tree.xpath('//div[@class="bus_line_site "][1]/div/div/a/text()')
            down_num = station_num[1]
            down_station_name = tree.xpath('//div[@class="bus_line_site "][2]/div/div/a/text()')
        else:
            # Any other count: reuse the first counter (if there is one) and
            # the same station list for both directions.
            up_num = down_num = _first(station_num)
            up_station_name = tree.xpath('//div[@class="bus_line_site "]/div/div/a/text()')
            down_station_name = list(up_station_name)

        # Assemble the record for this route.
        item = {'路線名': route_name,
                '運行時間': run_time,
                '票價': ticket_price,
                '更新時間': update_time,
                '上行站數': up_num,
                '上行站名': up_station_name,
                '下行站數': down_num,
                '下行站名': down_station_name}

        # Append immediately so already-scraped routes stay in `items`
        # even if a later request raises.
        items.append(item)
        collected.append(item)
    return collected
78 
def main():
    """Run the full crawl and write the results to a text file.

    Steps: fetch the navigation links, collect every route link from them,
    scrape each route page, then write one ``str(dict)`` line per route to
    ``8684_太原公交路線.txt`` (UTF-8, overwritten on each run).
    """
    # Fetch the navigation links for all bus routes.
    navi_list = get_navigation()
    print('導航連接爬取完畢')

    # Walk the navigation pages to find every route page.
    route_list = get_route(navi_list)
    print('公交路線爬取完畢')

    # Scrape each route page; get_info stores its records in the
    # module-level `items` list, which is what gets written below.
    get_info(route_list)
    print('具體信息爬取完畢')

    # Write the records; the context manager closes the file even if a
    # write fails part-way through.
    with open('8684_太原公交路線.txt', 'w', encoding='utf8') as fp:
        fp.writelines(str(item) + '\n' for item in items)
97 
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
相關文章
相關標籤/搜索