I recently joined a project where route planning came up: we needed to find the shortest road distance and the shortest travel time between pairs of locations. I decided to try the Baidu Maps API and put this reference together.
Environment:
Python 3.6
Main points:
1. From the official Baidu route-planning API documentation, route planning requires longitude/latitude coordinates, so the Baidu geocoding service is used first. Geocoding converts an address or place name into longitude/latitude coordinates, which can then be used for mapping or spatial analysis. (For national-security reasons, the published coordinates are generally offset.) Documentation: http://lbsyun.baidu.com/index.php?title=webapi/guide/webservice-geocoding
2. With origin and destination coordinates (lng, lat) in hand, the goal is to use the route-planning service of the Baidu Maps developer platform to obtain the planned distance and estimated travel time between the two points. Documentation: http://lbsyun.baidu.com/index.php?title=webapi/direction-api-abroad
3. The crawl may be interrupted by server errors or by requests whose parameters fail validation; make sure to handle this (a minimal retry sketch follows this list).
4. Also note: (a) the source file must be converted to UTF-8; (b) the file paths need to be adjusted; (c) you must apply for your own AK (API key) on the developer platform.
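For point 3, a small helper that retries a failed request a few times before giving up is usually enough. This is only a minimal sketch, assuming the same urlopen-based requests used in the scripts below; the function name, retry count, and wait time are illustrative:

import json
import time
from urllib.request import urlopen

def fetch_json(url, retries=3, wait=3):
    """Open a URL and decode its JSON body, retrying a few times before giving up."""
    for attempt in range(retries):
        try:
            with urlopen(url, timeout=10) as resp:
                return json.loads(resp.read().decode('utf-8'))
        except Exception:
            time.sleep(wait)   # back off briefly, then try again
    return None                # the caller decides whether to log or skip this record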
The code is as follows (the data files are not uploaded here, but the script works with minor modifications):
# -*- coding:utf-8 -*-
# ------------------------------
# @Time   : 2019/5/9 13:32
# @Author : jonie
# @Email  :
# @File   : code_get.py
# Description:
# ------------------------------
import csv
import json
from urllib.request import urlopen, quote

# Example coordinate: [113.63095213159264, 34.74830559988335]
origin_path = 'data/賽點.csv'        # source data file
new_path = 'data/地址對應座標.txt'    # output file for the crawled coordinates

machine_data = csv.reader(open(origin_path, 'r', encoding='utf-8'))  # read the source data
for addr in machine_data:  # geocode every record
    address = addr[1]
    ak = 'FA8atAaqd1wajikD56lPqtiaNCldeya'  # replace with the AK generated for your own application
    url = 'http://api.map.baidu.com/geocoder/v2/?address='
    output = 'json'
    add = quote(address)  # the address is Chinese; quote it to avoid encoding problems
    url2 = url + add + '&output=' + output + "&ak=" + ak
    req = urlopen(url2)
    res = req.read().decode()
    temp = json.loads(res)
    lng = temp['result']['location']['lng']  # longitude
    lat = temp['result']['location']['lat']  # latitude
    lng = ("%.5f" % lng)
    lat = ("%.5f" % lat)
    list1 = [lng, lat, addr[0]]
    print('Baidu coordinates:', list1)
    with open(new_path, 'a', encoding='utf-8') as f:
        f.write(str(list1))
        f.write('\n')
    with open("data/賽點信息.csv", 'a', newline='', encoding='utf-8') as t:
        writer = csv.writer(t)   # csv writer for the coordinate file
        writer.writerow(list1)   # write one row: [lng, lat, original index]
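For reference, the same geocoding call can also be written with the requests library and a parameter dict, which handles the URL-encoding of Chinese addresses automatically. This is only a sketch assuming the same v2 geocoder endpoint and a valid ak; the function name is illustrative:

import requests

def geocode(address, ak):
    """Return (lng, lat) for one address via the Baidu v2 geocoder, or None on failure."""
    params = {'address': address, 'output': 'json', 'ak': ak}
    resp = requests.get('http://api.map.baidu.com/geocoder/v2/', params=params, timeout=10)
    data = resp.json()
    if data.get('status') == 0:                  # status 0 means success
        loc = data['result']['location']
        return round(loc['lng'], 5), round(loc['lat'], 5)
    return None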
Code for calling the Baidu Maps API to obtain the planned route distance and estimated travel time between origin and destination:
import csv
import time
import json
from urllib.request import urlopen

# Source CSV format: origin lat + origin lng + index + destination lat + destination lng
origin_path = 'data/b.csv'              # source data file
result_path = 'data/result122901.txt'   # output file for the crawled results

# Baidu Maps API endpoints
url_drive = r"http://api.map.baidu.com/direction/v2/driving"               # driving (Direction API)
url_ride = r'http://api.map.baidu.com/routematrix/v2/riding?output=json'   # riding (route matrix)
url_walk = r'http://api.map.baidu.com/routematrix/v2/walking?output=json'  # walking (route matrix)
url_bus = r'http://api.map.baidu.com/direction/v2/transit?output=json'     # transit (Direction API)
cod = r"&coord_type=bd09ll"
# Coordinate system: bd09ll (Baidu lng/lat); bd09mc (Baidu Mercator); gcj02 (national encrypted grid); wgs84 (raw GPS)
# AKs applied for on the Baidu Maps site; append more keys to the list when the quota runs out
AK = ['FA8atAaqd1wajikD56lPqtiasdfleCeyz']
# Column headers (match the fields written below)
colnames = 'device_id origin destination drive_status drive_distance(m) drive_duration(s) walk_distance(m) walk_duration(s)'
with open(result_path, 'a', encoding='utf-8') as f:
    f.write(colnames)
    f.write('\n')

address = csv.reader(open(origin_path, 'r', encoding='utf-8'))  # read the source data

n = 0      # number of finished records
akn1 = 0   # index of the AK used for driving/walking
akn2 = 0   # index of the AK used for transit
while True:
    try:  # guard against file-encoding problems and server timeouts
        for ad in address:
            if (akn1 < len(AK)) and (akn2 < len(AK)):  # is there quota left?
                mac_code = str(ad[2])  # device serial number
                try:
                    ori = str(ad[0]) + ',' + str(ad[1])  # origin
                    des = str(ad[3]) + ',' + str(ad[4])  # destination
                    ak_drive = AK[akn1]
                    ak_bus = AK[akn2]
                    ak_drive2 = r'&ak=' + ak_drive
                    ak_bus2 = r'&ak=' + ak_bus
                    ori1 = r"?origin=" + ori
                    des1 = r"&destination=" + des
                    # --- driving ---
                    tac_type = r'&tactics=11'  # driving strategy: 11 = regular route
                    # 10 no highways; 11 regular; 12 shorter distance; 13 shorter distance ignoring traffic (driving only)
                    aurl_drive = url_drive + ori1 + des1 + cod + tac_type + ak_drive2  # driving request URL
                    res_drive = urlopen(aurl_drive)       # open the URL
                    cet_drive = res_drive.read()          # read the body
                    res_drive.close()
                    result_drive = json.loads(cet_drive)  # JSON -> dict
                    status = result_drive['status']
                    print('driving status', status)
                    if status == 0:  # status 0: no error
                        m_drive = result_drive['result']["routes"][0]['distance']        # distance (m)
                        m_drive2 = float(m_drive)                                        # str -> float
                        timesec_drive = result_drive['result']["routes"][0]['duration']  # duration (s)
                        diss_drive = 'status' + str(status) + ' ' + str(m_drive) + ' ' + str(timesec_drive)
                    elif status == 302 or status == 210 or status == 201:  # 302: quota exhausted; 210: IP check failed
                        m_drive2 = 10000  # set > 5 km so the walking route is skipped
                        akn1 += 1
                        diss_drive = 'status' + str(status) + ' break break'
                    else:
                        m_drive2 = 10000  # set > 5 km so the walking route is skipped
                        diss_drive = 'status' + str(status) + ' na na'
                    try:  # this if test fails when m_drive2 was never assigned
                        if 0 < m_drive2 < 5000:  # fetch the walking route only when driving distance is under 5 km
                            aurl_walk = url_walk + ori1 + des1 + cod + ak_drive2  # walking request URL
                            res_walk = urlopen(aurl_walk)
                            cet_walk = res_walk.read()
                            result_walk = json.loads(cet_walk)
                            res_walk.close()
                            status_walk = result_walk['status']
                            if status_walk == 0:  # normal status
                                m_walk = result_walk['result']["routes"][0]['distance']     # walking distance
                                time_walk = result_walk['result']["routes"][0]['duration']  # walking duration
                                diss_walk = str(m_walk) + ' ' + str(time_walk)
                            else:  # abnormal status
                                diss_walk = 'na na'
                        else:  # driving distance over 5 km, so skip the walking route
                            diss_walk = 'na na'
                    except:  # on any error the walking fields are also set to na
                        diss_walk = 'na na'
                    # --- transit ---
                    tac_bus = r'&tactics_incity=0'
                    # in-city transfer strategy, default 0: 0 recommended; 1 fewer transfers; 2 less walking; 3 no subway; 4 fastest; 5 subway first
                    city_bus = r'&tactics_intercity=0'
                    # intercity strategy, default 0: 0 fastest; 1 earliest departure; 2 cheapest
                    city_type = r'&trans_type_intercity=2'
                    # intercity mode, default 0: 0 train first; 1 plane first; 2 coach first
                    ori2 = r"&origin=" + ori
                    des2 = r"&destination=" + des
                    aurl_bus = url_bus + ori2 + des2 + tac_bus + city_bus + city_type + ak_bus2
                    res_bus = urlopen(aurl_bus)
                    cet_bus = res_bus.read()
                    res_bus.close()
                    result_bus = json.loads(cet_bus)
                    status = result_bus['status']
                    print('transit status', status)
                    # --------------------------------------
                    # if status == 0:
                    #     rsls = result_bus['result']['routes']
                    #     if rsls == []:  # status is 0 even when there is no transit plan; the routes list is just empty
                    #         diss_bus = 'status' + str(status) + ' no transit plan'
                    #     else:
                    #         m_bus = result_bus['result']['routes'][0]['distance']   # transit distance (m)
                    #         time_bus = result_bus['result']['routes'][0]['duration']  # transit duration (s)
                    #         cost_bus = result_bus['result']['routes'][0]['price']   # fare (CNY)
                    #         diss_bus = 'status' + str(status) + ' ' + str(m_bus) + ' ' + str(time_bus) + ' ' + str(cost_bus)
                    # elif status == 302 or status == 210 or status == 201:
                    #     akn2 = akn2 + 1
                    #     diss_bus = 'status' + str(status) + ' switch AK and resume'
                    # else:  # any other status code (server error)
                    #     diss_bus = 'status' + str(status) + ' server error'
                    # -----------------------------------------------
                    # assemble and write one result row
                    diss = mac_code + ' ' + str(ori) + ' ' + str(des) + ' ' + diss_drive + ' ' + diss_walk  # + ' ' + diss_bus
                    with open(result_path, 'a', encoding='utf-8') as f:
                        f.write(diss)
                        f.write('\n')
                    n += 1
                    print('record ' + str(n) + ' done')
                except:
                    time.sleep(3)
                    diss_wrong = str(mac_code) + ' unknown error'
                    with open(result_path, 'a', encoding='utf-8') as f:
                        f.write(diss_wrong)
                        f.write('\n')
                    continue
            else:
                print('Quota exhausted!')
                break
    except:
        time.sleep(3)
        print('unknown error')
        with open(result_path, 'a', encoding='utf-8') as f:
            f.write('unknown error')
            f.write('\n')
        continue
    print('Finished running.')
    break  # break the while loop once the for loop has consumed all rows
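For a quick test of a single origin/destination pair, the driving request above can be reduced to one function using requests. This is a sketch under the same assumptions (bd09ll coordinates, tactics=11, your own ak), not part of the batch script:

import requests

def drive_route(origin, destination, ak):
    """One driving-route query; origin/destination are 'lat,lng' strings in bd09ll coordinates.
    Returns (distance_m, duration_s), or None when the API reports an error."""
    params = {
        'origin': origin,
        'destination': destination,
        'coord_type': 'bd09ll',
        'tactics': 11,       # 11 = regular route
        'ak': ak,
    }
    resp = requests.get('http://api.map.baidu.com/direction/v2/driving', params=params, timeout=10)
    data = resp.json()
    if data.get('status') != 0 or not data['result']['routes']:
        return None
    route = data['result']['routes'][0]
    return route['distance'], route['duration']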
Finally, the generated data was plotted as shown below:
Appendix:
1. Data acquisition (scraping all star-rated hotel listings for Zhengzhou from Ctrip, sorted by review score)
import requests
import random
from bs4 import BeautifulSoup
import time
import csv
import json
import re
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_colwidth', 10000)
pd.set_option('display.width', 1000)

# Zhengzhou star-rated hotel list endpoint
five_star_url = "http://hotels.ctrip.com/Domestic/Tool/AjaxHotelList.aspx"
filename = "data/star hotel list.csv"

def Scrap_hotel_lists():
    """
    Crawl the Zhengzhou star-rated hotel lists and save them to a CSV file.
    """
    headers = {
        "Connection": "keep-alive",
        "origin": "http://hotels.ctrip.com",
        "Host": "hotels.ctrip.com",
        "referer": "https://hotels.ctrip.com/hotel/zhengzhou559",
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8"
    }

    id = []
    name = []
    hotel_url = []
    address = []
    score = []

    # 8 pages of results
    for page in range(1, 8):
        data = {
            "StartTime": "2019-09-08",  # depends on the dates you want to scrape
            "DepTime": "2019-09-18",
            "RoomGuestCount": "0,1,2",
            "cityId": 559,
            "cityPY": " zhengzhou",
            "cityCode": "0371",
            "cityLat": 34.758044,
            "cityLng": 113.673121,
            "page": page,
            "star": "3",
            "orderby": 3
        }
        html = requests.post(five_star_url, headers=headers, data=data)

        # print(html.text)
        j = json.loads(html.text.replace("\洛陽", "洛陽"))
        # hotel_list = html.json()["totalMsg"]
        hotel_list = j["hotelPositionJSON"]

        for item in hotel_list:
            id.append(item['id'])
            name.append(item['name'])
            hotel_url.append(item['url'])
            address.append(item['address'])
            score.append(item['score'])

        time.sleep(random.randint(3, 5))

    hotel_array = np.array((id, name, score, hotel_url, address)).T
    list_header = ['id', 'name', 'score', 'url', 'address']
    array_header = np.array(list_header)
    hotellists = np.vstack((array_header, hotel_array))
    with open(filename, 'a', encoding="utf-8-sig", newline="") as f:
        csvwriter = csv.writer(f, dialect='excel')
        csvwriter.writerows(hotellists)


def hotel_detail(hotel_id):
    """
    Scrape the detailed room information for one specific hotel.
    """
    headers = {"Connection": "keep-alive",
               "Accept-Language": "zh-CN,zh;q=0.9",
               "Cache-Control": "max-age=0",
               "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
               "Host": "hotels.ctrip.com",
               "If-Modified-Since": "Thu, 01 Jan 1970 00:00:00 GMT",
               "Referer": "http://hotels.ctrip.com/hotel/2231618.html",
               "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                             "Chrome/69.0.3497.92 Safari/537.36"
               }

    basic_url = "http://hotels.ctrip.com/Domestic/tool/AjaxHote1RoomListForDetai1.aspx?hotel="
    url = basic_url + str(hotel_id)

    r = requests.get(url, headers=headers)
    # The response is a JSON object.
    html = r.json()['html']
    soup = BeautifulSoup(html, "lxml")
    rooms = soup.findAll('td', attrs={"class": "child_name J_Col_RoomName"})

    RoomID = []
    RoomName = []
    LowPrice = []
    RoomSize = []
    RoomLevel = []
    IsAddBed = []
    BedSize = []
    CustomerNum = []

    # regex for stripping HTML tags
    baseroom_pattern = re.compile(r'<[^>]+>')

    for idx in range(len(rooms)):
        if rooms[idx].has_attr('data-baseroominfo'):
            room_info_str = rooms[idx]['data-baseroominfo']
            room_info_json = json.loads(room_info_str)
            RoomID.append(str(room_info_json["RoomID"]))
            RoomName.append(room_info_json["RoomName"])
            LowPrice.append(room_info_json["LowPrice"])

            baseroom_info = room_info_json["BaseRoomInfo"]  # an HTML string
            remove_tag = baseroom_pattern.sub("", baseroom_info)
            RoomDetailInfo = remove_tag.split("|")
            if len(RoomDetailInfo) == 4:
                # the "extra bed" field is sometimes missing; pad it so the indices below line up
                RoomDetailInfo.insert(3, None)

            RoomSize.append(RoomDetailInfo[0])
            RoomLevel.append(RoomDetailInfo[1])
            BedSize.append(RoomDetailInfo[2])
            IsAddBed.append(RoomDetailInfo[3])
            CustomerNum.append(RoomDetailInfo[4])
        else:
            continue

    RoomInfo = np.array((RoomID, RoomName, LowPrice, RoomSize, RoomLevel, BedSize, IsAddBed, CustomerNum)).T
    # build a DataFrame from the collected fields
    column_name = ['RoomID', 'RoomName', 'LowPrice', 'RoomSize', 'RoomLevel', 'BedSize', 'IsAddBed', 'CustomerNum']
    df = pd.DataFrame(data=RoomInfo, columns=column_name)
    print(df)


if __name__ == "__main__":

    # 1. Scrape the Zhengzhou star-rated hotel list
    Scrap_hotel_lists()

    # 2. Scrape the detailed room information for a chosen hotel
    df = pd.read_csv(filename, encoding='utf8')
    print("1. Zhengzhou Star Hotel Lists")
    print(df)
    hotelID = df["id"]
    print('\n')

    while True:
        print("2.1 To see the detailed information of a hotel, input its index in the DataFrame.")
        print("2.2 To quit, input 'q'.")

        print("Please input the parameter: ")
        input_param = input()
        if input_param.isnumeric():
            hotel_index = int(input_param)
            if 0 <= hotel_index <= 170:
                print("3. The detailed information of the hotel:")
                hotel_detail(hotelID[hotel_index])
            else:
                print('Hotel index out of range!')
                print('Remember: 0 <= hotel index <= 170')
                print('Please input again.')
                continue
        elif input_param == 'q':
            print('See you later!')
            break
        else:
            print('Invalid input!')
            print('\n')
            continue
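hotel_detail only prints the room DataFrame; if you also want to keep it, the frame can be written out as well. A minimal sketch, with the output path purely illustrative:

# Hypothetical helper: persist one hotel's room details instead of only printing them.
def save_rooms(df, hotel_id, out_dir='data'):
    df.to_csv('{}/rooms_{}.csv'.format(out_dir, hotel_id), index=False, encoding='utf-8-sig')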
2. Drawing a word cloud of the hotel information from the generated data
from pyecharts import WordCloud
import random

name1 = hotel_list2              # list of hotel names scraped earlier (see the sketch below)
random_list = [296, 630, 。。。]  # word weights; the elided values are the rest of the list
# The random integer list can also be generated with the three lines below:
# for i in range(len(name1)):
#     # generate len(name1) random integers in [300, 800]
#     random_list.append(random.randint(300, 800))
# print('Generated random integer list:\n', random_list)
value = random_list

wordcloud = WordCloud(width=1300, height=800)
wordcloud.add("Hotel information", name1, value, word_size_range=[10, 20], shape='pentagon')
wordcloud.show_config()
wordcloud.render()
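hotel_list2 above is assumed to be the list of hotel names scraped in the appendix. A minimal sketch of building the name and value inputs from the hotel-list CSV written by Scrap_hotel_lists() (column names as in that script):

import random
import pandas as pd

# Assumed input: the CSV produced by Scrap_hotel_lists() in the appendix.
df = pd.read_csv('data/star hotel list.csv', encoding='utf-8-sig')
hotel_list2 = df['name'].tolist()                              # word-cloud labels
random_list = [random.randint(300, 800) for _ in hotel_list2]  # illustrative weights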
Note: the above is purely for fun and learning.