在網上找了不少關於爬取百度POI的文章,但都沒有找到「全量」抓取的最終解決方案,於是自己寫了一個。它仍然不能實現全量POI抓取,但至少可以抓取到50%的信息。
注意:這裏所指「全量」是可以達到100%的POI信息獲取。
以下是自己寫的代碼,可直接複製粘貼使用,只針對重慶主城區。
# -*- coding: utf-8 -*-
#-----------------採集全重慶範圍內的POI興趣點-------------------
import json
import time

import pandas as pd
import pymysql
import requests
import xlrd
def sleeptime(day=1):
    """Return the number of seconds in *day* days.

    Args:
        day: Number of days to convert; defaults to 1.

    Returns:
        ``day * 24 * 3600``.
    """
    return day * 24 * 3600
def get_Region():
    """Return the districts to crawl and the bounding rectangle of each one.

    Returns:
        A ``(region, lng_lat)`` tuple. ``region`` is the ordered list of
        district names. ``lng_lat`` maps each district name to a
        comma-separated string holding the bottom-left corner followed by
        the top-right corner of the district's rectangle, each corner
        written latitude first (``'lat,lng,lat,lng'``).
    """
    region = [
        '重慶市渝中區', '重慶市江北區', '重慶市渝北區', '重慶市沙坪壩區', '重慶市九龍坡區',
        '重慶市南岸區', '重慶市巴南區', '重慶市北碚區', '重慶市大渡口區',
    ]
    # See the Baidu documentation on rectangular-area search for the meaning
    # of the two corners.
    lng_lat = {
        '重慶市渝中區': '29.525375,106.494032,29.582745,106.602727',
        '重慶市江北區': '29.571878,106.440282,29.689841,106.896587',
        '重慶市渝北區': '29.575192,106.434211,30.051651,106.993128',
        '重慶市沙坪壩區': '29.497572,106.261671,29.755899,106.51607',
        '重慶市九龍坡區': '29.248442,106.275683,29.574961,106.553261',
        '重慶市南岸區': '29.476615,106.542511,29.614757,106.803882',
        '重慶市巴南區': '29.118686,106.520931,29.735054,107.062741',
        '重慶市北碚區': '29.66131,106.29192,30.109513,106.720972',
        '重慶市大渡口區': '29.344094,106.403174,29.509104,106.532859',
    }
    return region, lng_lat
#
# 左下角: [29.547183, 106.545379]
# 右上角: [29.5560735, 106.559716]
def get_Keyword():
    """Return the search keywords to query.

    Returns:
        A list of keyword strings.
    """
    # The keywords used to be loaded with pandas.read_excel from sheet
    # 'keyword2' of a local workbook; they are hard-coded here so the script
    # runs without that file.
    data = ['酒店', '公司']
    return data
def get_ak():
    """Return the API keys (``ak``) that the crawler rotates through.

    The values below are masked placeholders; replace them with your own
    keys before running.

    Returns:
        A list of key strings, in the order they are tried.
    """
    ak = [
        'xYhFWBgDCF12nPU5v2pMue5uN14d****',
        'nryXXcwxdwGws64qmB9jFE29yY6P****',
        'tP4RTTuWu0Rds81Hvm0Mn9QXajBs****',
        'GP12Wh7S1pM1oa4oNyib7xByol5X****',
    ]
    return ak
def _keyword_list(data):
    """Return the search keywords held by *data* as a plain list."""
    if isinstance(data, (list, tuple)):
        return list(data)
    # Anything else is expected to expose the keywords under 'keyword'
    # (a dict, or a table with a 'keyword' column).
    return list(data['keyword'])


def _fetch_page(url, payload, headers, timeout):
    """GET one result page and return the decoded JSON body."""
    return requests.get(url, params=payload, headers=headers, timeout=timeout).json()


def get_Url(data, region, lng_lat, aks, connect, part_n=20, timeout=10):
    """Crawl POIs for every region/keyword pair and store them in MySQL.

    Each region rectangle is cut into ``part_n * part_n`` grid cells and
    every cell is queried page by page, because a single query returns at
    most 20 pages of 20 results. Non-empty pages are handed to
    ``storeTomysql``.

    Args:
        data: The keywords to search for: either a list/tuple of strings or
            an object whose ``data['keyword']`` yields them.
        region: Iterable of region names.
        lng_lat: Mapping from region name to its rectangle string
            (bottom-left corner then top-right corner, latitude first).
        aks: Non-empty list of API keys, tried in order.
        connect: Open database connection, passed through to
            ``storeTomysql``.
        part_n: Number of grid divisions per axis (default 20, i.e. 400
            cells per region). Tune it to the size of the region.
        timeout: Seconds to wait for each HTTP request, so a stalled
            connection cannot hang the crawl.

    Raises:
        ValueError: If ``part_n`` is smaller than 1.
        IndexError: If ``aks`` is empty.
    """
    if part_n < 1:
        raise ValueError('part_n must be at least 1')

    url_POI = 'http://api.map.baidu.com/place/v2/search?'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }
    keywords = _keyword_list(data)
    ak_num = 0
    payload = {
        'output': 'json',
        'ak': aks[ak_num],
        'city_limit': 'true',  # only return results inside the given region
        'scope': '2',          # detailed results; '1' or empty means basic
        'page_size': '20',     # at most 20 POIs per page
    }

    for region_name in region:
        payload['region'] = region_name
        corners = [float(value) for value in lng_lat[region_name].split(',')]
        lat_min, lng_min, lat_max, lng_max = corners[:4]
        x_item = (lat_max - lat_min) / part_n  # latitude span of one cell
        print(x_item)
        y_item = (lng_max - lng_min) / part_n  # longitude span of one cell
        print(y_item)

        for keyword in keywords:
            payload['query'] = keyword
            for ii in range(part_n):
                for jj in range(part_n):
                    # Cell (ii, jj): its bottom-left corner is offset from the
                    # region's bottom-left, its top-right corner is offset
                    # back from the region's top-right.
                    cell_bottom_left = [lat_min + ii * x_item, lng_min + jj * y_item]
                    cell_top_right = [lat_max - (part_n - 1 - ii) * x_item,
                                      lng_max - (part_n - 1 - jj) * y_item]
                    payload['bounds'] = ','.join(
                        str(value) for value in cell_bottom_left + cell_top_right
                    )
                    for page_num in range(20):  # the API returns at most 20 pages
                        payload['page_num'] = page_num
                        try:
                            content = _fetch_page(url_POI, payload, headers, timeout)
                            # Status 302 means the current key's quota is used
                            # up. Keep rotating until a key answers, so the
                            # page is not dropped when several keys in a row
                            # are exhausted.
                            while content['status'] == 302:
                                print('第' + str(ak_num) + '個ak配額用完,自動調用下一個····')
                                ak_num = ak_num + 1
                                if ak_num > (len(aks) - 1):
                                    print('今天ak已用完,明天再繼續吧。。。')
                                    time.sleep(sleeptime())  # wait 24 hours
                                    ak_num = 0
                                payload['ak'] = aks[ak_num]
                                content = _fetch_page(url_POI, payload, headers, timeout)
                            if content['status'] != 0:
                                continue  # other API error: skip this page
                            if not content['results']:
                                break  # no more results for this cell
                            storeTomysql(content, keyword, connect, region_name)
                        except Exception as e:
                            # Best effort: report the failure and move on to
                            # the next page instead of aborting the crawl.
                            print(e)
def mysql_Detail():
    """Open and return a connection to the MySQL database ``addr_trans``.

    Connects to ``localhost:3306`` as ``root``. The ``passwd`` value below
    is a placeholder and must be replaced with the real password.

    Returns:
        The connection object created by ``pymysql.Connect``.
    """
    connect = pymysql.Connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='更改成你的密碼',
        db='addr_trans',
        charset='utf8',
    )
    return connect
def storeTomysql(content, Key_word, connect, region):
    """Insert the POIs of one API response into the ``poi_full`` table.

    Only results whose ``area`` equals the district part of *region* are
    stored. Fields missing from a result are stored as NULL.

    Args:
        content: Decoded API response; its ``'results'`` list is iterated.
        Key_word: The search keyword that produced this response.
        connect: Open DB-API connection (``cursor()`` and ``commit()`` are
            used).
        region: Region name; its first three characters are dropped before
            comparing with each result's ``area``.
    """
    # Placeholders are bound by the driver rather than interpolated into the
    # SQL text, so quotes in names cannot break the statement and None
    # becomes NULL instead of the literal string 'None'.
    sql = (
        "insert into poi_full(keyword,name0,lng,lat,address,province,city,area,"
        "telephone,uid,street_id,type0,tag) "
        "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    # Region names in this script look like '重慶市渝中區'; dropping the
    # three-character city prefix leaves the district name.
    district = region[3:]

    cursor = connect.cursor()
    try:
        for poi in content.get('results') or []:
            area = poi.get('area')
            if area != district:
                continue  # only store POIs that belong to this district

            location = poi.get('location') or {}
            if poi.get('detail') == 1:
                detail_info = poi.get('detail_info') or {}
            else:
                detail_info = {}

            data0 = (
                Key_word,
                poi.get('name'),
                location.get('lng'),
                location.get('lat'),
                poi.get('address'),
                poi.get('province'),
                poi.get('city'),
                area,
                poi.get('telephone'),
                poi.get('uid'),
                poi.get('street_id'),
                detail_info.get('type'),
                detail_info.get('tag'),
            )
            try:
                cursor.execute(sql, data0)
                connect.commit()
            except Exception as e:
                # Best effort: report the failed row and keep going.
                print(e)
    finally:
        cursor.close()
if __name__ == "__main__":
    # Gather the inputs, open the database, then run the crawl.
    aks = get_ak()
    keyword = get_Keyword()
    region, lng_lat = get_Region()
    connect = mysql_Detail()
    try:
        get_Url(keyword, region, lng_lat, aks, connect)
    finally:
        # Release the connection even if the crawl aborts.
        connect.close()