Python爬蟲安居客房價信息（並利用百度地圖API查詢座標）

時間 2020-06-09

標籤 python 爬蟲安居房價信息利用百度地圖 api 查詢座標欄目 Python 简体版

原文鏈接

　　代碼功能簡述：基於Python爬取安居客小區平均房價數據，網址：https://shantou.anjuke.com/community/p40/。獲取到小區名稱後，利用百度地圖API查詢其地理座標。最後需要獲得的是城市名、小區名稱、經緯度、平均房價。

安居客網站頁面如上圖，只需獲取紅框內信息。

完整代碼如下：
# -*- coding:utf-8 -*- #這個爬蟲是獲取安居客小區數據(須要header)
import requests from bs4 import BeautifulSoup from urllib2 import urlopen, quote import json import sys import time from xpinyin import Pinyin　　#導入拼音模塊 reload(sys) sys.setdefaultencoding("utf-8") class ajkxq:　　#建立一個類，類中有兩個函數，分別是獲取小區信息的getInfo()和獲得小區名稱後查詢其經緯度的getlnglat()，其中getInfo()會調用getlnglat()函數

    def getlnglat(self,address): """根據傳入地名參數獲取經緯度""" url = 'http://api.map.baidu.com/geocoder/v2/' output = 'json'　　#輸出結果能夠是json也能夠是其餘類型 ak = '你本身的密鑰' add = quote(str(address))#有時候quote會出錯KeyError，要先把quote裏面轉成字符串類型
        uri = url + '?' + 'address=' + add + '&output=' + output + '&ak=' + ak　　
　　　　#構建百度地圖API查詢須要的uri，uri與URL的區別見：https://www.zhihu.com/question/21950864 
　　　　#下面這塊代碼是爲了防止由於請求次數過多而產生的錯誤：urllib2.URLError: <urlopen error [Errno 10060] >
　　　　#若是出現該錯誤，就讓程序暫停10秒（time.sleep(10)），繼續從新獲取該小區的座標。
　　　　#timeout=30是防止由於網絡緣由出現延遲錯誤
 maxNum=5
        for tries in range(maxNum): try: req = urlopen(uri,timeout=30)#設置timeout30秒，防止百度屏蔽,若是被屏蔽就等30秒
            except: if tries < (maxNum - 1): time.sleep(10) continue
                else: print("Has tried %d times, all failed!", maxNum) break res = req.read().decode() temp = json.loads(res) lat = temp['result']['location']['lat'] lng = temp['result']['location']['lng'] return lat, lng 
　　#獲取小區信息的函數 def getInfo(self, page, fh, city, citypy): url = 'https://'+citypy+'.anjuke.com/community/p' + str(page)+'/' headers = {　　#因爲安居客網站的反爬蟲，這裏必需要設置header 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.8', 'Referer': 'https: // wuhan.anjuke.com / sale /?from=navigation', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' } response = requests.get(url, headers=headers,timeout=30)
　　　　#如下是解析網頁的過程 bs = BeautifulSoup(response.text, 'html.parser') lis = bs.find_all('div', class_='li-itemmod') for li in lis: infos = li.find_all('div', class_='li-info') for info in infos: titles = info.find_all('h3') for title in titles: strTitle = title.get_text() print city,page,strTitle  li_sides = li.find_all('div', class_='li-side') strPrice = '暫無均價'
            for li_side in li_sides: ps = li_side.find_all('p') for p in ps: strongs=p.find_all('strong') for price in strongs: strPrice = price.get_text() print strPrice address = city + strTitle  # 在前面加上城市變量，限定查詢城市
                try: lat, lng = test.getlnglat(address) print lat, lng fh.write(city.strip()+','+strTitle.strip() + ',' + str(lat).strip() + ','  + str(lng).strip() +','+ strPrice.strip() + '\n') # strip()是去掉每行後面的換行符，只有str類型才能用strip()
                except KeyError:  # 處理未知異常
                    print("Get AError") if __name__ == '__main__': print "開始爬數據，請稍等..." start_time = time.time() tup1 = [　　#設置元組，循環取元組中的元素
            '廣州','深圳', '佛山', '東莞', '珠海', '中山', '惠州', '肇慶', '江門', '清遠', '汕頭','梅州', '河源', '揭陽', '潮州', '汕尾', '韶關', '陽江', '茂名', '湛江', '雲浮'
 ]; for i in tup1: city = str(i)#構建拼音，這樣每次就只要改這裏的城市名就能夠了
        city = unicode(city, 'utf-8') citypy = Pinyin()　　#實例化拼音模塊的函數 citypy=str(citypy.get_pinyin(city, ''))　　#還有其餘轉換拼音的方法，好比在兩個字之間加符號等 print citypy stradd='C://Users//Administrator//PycharmProjects//···//guangdong.txt' fh = open(stradd, "a") #輸入的文件編碼格式必需要與上面type=sys.setdefaultencoding( "utf-8" )一致，即也必須是utf-8
        for page in range(1, 101): test=ajkxq()#實例化對象
            test.getInfo(page, fh, city, citypy)#方法必須由實例調用
 fh.close() #time.sleep(3)#爬完一個城市後暫停3秒
    end_time = time.time() print "數據爬取完畢，用時%.2f秒" % (end_time - start_time)