"""Scrape the bus routes of Taiyuan from the 8684 bus-lookup site.

Run as a script, the module walks the site's navigation index, collects the
route pages it links to, extracts the details of each route and writes one
record per line to ``8684_太原公交路線.txt``.
"""

import requests
from lxml import etree

# Root of the Taiyuan site; hrefs scraped from its pages are appended to it.
BASE_URL = 'https://taiyuan.8684.cn'

# Seconds to wait on a request; without a timeout a stalled connection would
# block the crawl forever.
REQUEST_TIMEOUT = 10

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/73.0.3683.86 Safari/537.36'}

# Collected route records; get_info() appends one dict per route.
items = []


def _fetch_tree(url):
    """Download *url* and return its body parsed with ``etree.HTML``."""
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    return etree.HTML(response.text)


def _first(values, default=''):
    """Return the first element of *values*, or *default* when it is empty."""
    return values[0] if values else default


def get_navigation():
    """Return the hrefs of the navigation links found on the front page.

    Links for routes starting with a digit come first, followed by the links
    for routes starting with a letter.
    """
    url = BASE_URL + '/'
    print('正在獲取導航連接')
    tree = _fetch_tree(url)

    # Navigation links for routes that start with a digit.
    num_href_list = tree.xpath('//div[@class="bus_kt_r1"]/a/@href')
    # Navigation links for routes that start with a letter.
    char_href_list = tree.xpath('//div[@class="bus_kt_r2"]/a/@href')
    return num_href_list + char_href_list


def get_route(navi_list):
    """Return the hrefs of every route page linked from the navigation pages.

    Args:
        navi_list: navigation hrefs, each appended to ``BASE_URL``.

    Returns:
        A list of route-page hrefs in the order they were encountered.
    """
    route_list = []
    for navi_href in navi_list:
        print('正在獲取以%s開頭的公交路線' % navi_href)
        tree = _fetch_tree(BASE_URL + navi_href)
        route_list.extend(tree.xpath('//div[@id="con_site_1"]/a/@href'))
    return route_list


def get_info(route_list):
    """Fetch each route page and append its details to ``items``.

    A field whose XPath matches nothing is stored as an empty string, so one
    incomplete page does not abort the whole crawl.

    Args:
        route_list: route-page hrefs, each appended to ``BASE_URL``.

    Returns:
        The module-level ``items`` list, extended with one dict per route.
    """
    for route in route_list:
        tree = _fetch_tree(BASE_URL + route)

        route_name = _first(tree.xpath('//div[@class="bus_i_t1"]/h1/text()'))
        print('正在獲取%s的路線信息' % route_name)
        run_time = _first(tree.xpath('//p[@class="bus_i_t4"][1]/text()'))
        ticket_price = _first(tree.xpath('//p[@class="bus_i_t4"][2]/text()'))
        update_time = _first(tree.xpath('//p[@class="bus_i_t4"][4]/text()'))

        station_num = tree.xpath('//div[@class="bus_line_top "]/span/text()')
        if len(station_num) == 2:
            # Two counts: the first and second station blocks are read
            # separately as the up and down directions.
            up_num, down_num = station_num
            up_station_name = tree.xpath(
                '//div[@class="bus_line_site "][1]/div/div/a/text()')
            down_station_name = tree.xpath(
                '//div[@class="bus_line_site "][2]/div/div/a/text()')
        else:
            # Otherwise both directions share the first count (empty string
            # when there is none) and the same station names.
            up_num = down_num = _first(station_num)
            up_station_name = tree.xpath(
                '//div[@class="bus_line_site "]/div/div/a/text()')
            # Copy so the two entries do not alias the same list object.
            down_station_name = list(up_station_name)

        items.append({
            '路線名': route_name,
            '運行時間': run_time,
            '票價': ticket_price,
            '更新時間': update_time,
            '上行站數': up_num,
            '上行站名': up_station_name,
            '下行站數': down_num,
            '下行站名': down_station_name,
        })
    return items


def main():
    """Crawl navigation, routes and route details, then write them to disk."""
    # Collect the navigation links.
    navi_list = get_navigation()
    print('導航連接爬取完畢')

    # Follow every navigation link to find all route pages.
    route_list = get_route(navi_list)
    print('公交路線爬取完畢')

    # Visit every route page and extract its details.
    info_list = get_info(route_list)
    print('具體信息爬取完畢')

    # Write one record per line; the context manager closes the file even if
    # a write fails.
    with open('8684_太原公交路線.txt', 'w', encoding='utf8') as fp:
        fp.writelines(str(item) + '\n' for item in info_list)


if __name__ == '__main__':
    main()