Python-新手爬取安居客新房房源

時間 2019-11-09

標籤 python 新手安居新房房源欄目 Python 简体版

原文鏈接

新手，整個程序還有不少瑕疵。

1.房源訪問的網址爲「城市的拼音」+ 後面統一的地址，須要用到 xpinyin 庫。

2.用了 2 種解析網頁數據的庫：bs4 和 xpath（先學習的 bs4，學了 xpath 後部分代碼改爲 xpath）。

遇到的問題：

1.在解析頁面時，鼠標點擊過位置的 div 的 class 屬性值有變化，沒有注意，致使浪費很長時間。下圖，點擊後的 div 的 class 屬性值的空格沒有了。

2.基礎學習還要增強，字符串和列表基本命令和轉換使用不熟練。

3.沒有真正理解 `new_list = [i for i in list if i != '']`（刪除列表裏的空值）這條語句的意思，只是照搬過來用。

import json
import time
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
from lxml import etree
from xpinyin import Pinyin


class anjuk_spider(object):
    """Scrape new-home listings for one city from fang.anjuke.com.

    The listing URL is built as ``https://<city pinyin>`` + ``url``.  Every
    parsed listing is stored as a dict in ``self.items`` and the whole list
    is dumped as JSON to ``安居客房價.txt``.
    """

    # Host suffix and path shared by all cities; the city's pinyin is
    # prepended to form the full address.
    url = '.fang.anjuke.com/loupan/all/'

    # Number of listings per result page, used to derive the page count.
    PAGE_SIZE = 60

    # Browser-like User-Agent sent with every request.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
    }

    # Deletes newlines, tabs, non-breaking spaces, slashes and spaces from a
    # layout text fragment in a single pass.
    _HUXING_STRIP = str.maketrans("", "", "\n\t\xa0/ ")

    def __init__(self, city):
        """Remember the city name and start with an empty result list."""
        self.city = city
        # One dict per listing, accumulated across all pages.
        self.items = []

    def citypinyin(self, city):
        """Return ``city`` converted to pinyin with no separator."""
        return Pinyin().get_pinyin(city, '')

    def handle_request(self, city_url, page_num):
        """Build the request for a result page.

        ``page_num`` is zero-based; the site's page path is one-based, so
        page 0 maps to ``<city_url>p1/``.
        """
        page_url = city_url + 'p' + str(page_num + 1) + '/'
        return urllib.request.Request(page_url, headers=self.HEADERS)

    def price(self, item):
        """Return the price text of one listing element.

        Raises IndexError when either price node is missing.
        """
        label = item.xpath('.//a[@class="favor-pos"]/p[1]/text()')[0]
        amount = item.xpath('.//a[@class="favor-pos"]/p/span/text()')[0]
        # NOTE(review): this literal must match the page text exactly;
        # confirm the site really serves this spelling (a page using
        # simplified characters would never match).
        if label == '售價待定':
            # In this case the number is reported as the surrounding
            # average price rather than the listing's own price.
            label = label + " 周邊均價："
        return label + amount + "元/㎡"

    def huxing(self, item):
        """Return the layout fragments of one listing joined by ``/``."""
        fragments = item.xpath('.//a[@class="huxing"]//text()')
        cleaned = (text.translate(self._HUXING_STRIP) for text in fragments)
        # Fragments that were pure whitespace/separators are now empty.
        return "/".join(text for text in cleaned if text)

    def address(self, item):
        """Return the address text that follows the leading ``[...]`` tag.

        Raises IndexError when the address node is missing.
        """
        text = item.xpath('.//a[@class="address"]//span[@class="list-map"]/text()')[0]
        _, bracket, rest = text.partition("]")
        if not bracket:
            # No bracketed prefix to strip: keep the text intact instead of
            # chopping off its first character.
            return text
        # Skip the single character that follows the closing bracket.
        return rest[1:]

    def parse_content(self, content):
        """Extract every listing from one page of HTML into ``self.items``."""
        tree = etree.HTML(content)
        listings = tree.xpath('//div[@class="key-list imglazyload"]//div[@class="item-mod "]')
        for item in listings:
            try:
                item_name = item.xpath('.//h3//span[@class="items-name"]/text()')[0]
                item_address = self.address(item)
                item_huxing = self.huxing(item)
                item_price = self.price(item)
            except IndexError:
                # An expected node is missing from this listing; skip it.
                print("讀取出現問題！")
                continue
            self.items.append({
                "小區名稱": item_name,
                "小區位置": item_address,
                "戶型面積": item_huxing,
                "價格": item_price,
            })
            print("小區名稱：%s | 位置：%s" % (item_name, item_address))
            print("%s｜價格：%s" % (item_huxing, item_price))
            print("******************************************************************************")
            time.sleep(1)

    def run(self):
        """Fetch every result page for ``self.city`` and save the listings.

        The first request reads the total listing count to work out how many
        pages exist.  The output file is rewritten after each page, so the
        listings collected so far are on disk if a later page fails.

        Raises IndexError if the listing-count element is not found.
        """
        city_url = 'https://' + self.citypinyin(self.city) + self.url
        first_request = urllib.request.Request(city_url, headers=self.HEADERS)
        soup = BeautifulSoup(self._fetch(first_request), 'lxml')
        total = int(soup.select('.list-results > .key-sort > .sort-condi > span > em')[0].text)

        for page_num in range(self._page_count(total)):
            content = self._fetch(self.handle_request(city_url, page_num))
            self.parse_content(content)
            time.sleep(2)
            self._save()

    def _page_count(self, total):
        """Return how many pages hold ``total`` listings (at least one)."""
        # Ceiling division kept in integers so the result is usable by range().
        return max(1, -(-total // self.PAGE_SIZE))

    @staticmethod
    def _fetch(request):
        """Send ``request`` and return the decoded response body."""
        with urllib.request.urlopen(request) as response:
            return response.read().decode()

    def _save(self):
        """Write all collected listings to the output file as JSON."""
        string = json.dumps(self.items, ensure_ascii=False)
        with open('安居客房價.txt', 'w', encoding='utf-8') as f:
            f.write(string)
        print("存入文本！")


def main():
    """Ask for a city name and scrape its listings."""
    city = input("請輸入城市名字：")
    spider = anjuk_spider(city)
    spider.run()


if __name__ == '__main__':
    main()

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。