The text and images in this article are sourced from the internet and are for learning and exchange only, not for any commercial use. Copyright belongs to the original author; please contact us promptly if there is any issue.

Author: 你想要 極客貓喵
1. The request function

Sets a random request header (a random proxy is also sketched, but commented out) and returns the `response.text` object.
```python
import random
import requests

def get_requests(url):
    """Request `url` with a random User-Agent and return response.text."""
    # Pool of User-Agent strings to rotate through
    user_agent_list = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727)"
    ]
    # # Proxy IP list (optional)
    # ip_list = ['60.216.101.46:59351', '117.69.201.116:9999', '113.128.10.77:9999']
    #
    # # Pick a random proxy
    # random_proxies = {
    #     'http': random.choice(ip_list)
    # }

    # Build a random request header
    random_header = {
        # Take a random entry from the list above
        "User-Agent": random.choice(user_agent_list),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
    }
    # Pass `proxies=random_proxies` as well to use a random IP
    response = requests.get(url, headers=random_header)
    return response.text
```
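The header-rotation idea can be isolated into a small helper, which makes it easy to test without any network traffic. This is a sketch; `make_random_header` and the placeholder UA strings are illustrative, not part of the original script:

```python
import random

def make_random_header(user_agent_list):
    # Pick a random User-Agent so consecutive requests look like different clients
    return {
        "User-Agent": random.choice(user_agent_list),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }

agents = ["UA-1", "UA-2", "UA-3"]  # placeholder UA strings
header = make_random_header(agents)
print(header["User-Agent"] in agents)  # → True
```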
2. The get_fakeurl(start, end) function

Returns a list of fake_urls taken from the listing pages' href attributes. We then loop over this list, requesting each entry to obtain the redirected real URL, which carries the required parameters.
```python
from lxml import etree

def get_fakeurl(start, end):
    """Collect detail-page hrefs from listing pages start..end-1."""
    fakeurl_list = []
    for i in range(start, end):
        # Listing pages are paginated as /house/i3<page>/
        # (`domain` is a module-level variable set in __main__)
        new_url = domain + '/house/i3' + str(i) + '/'
        response_re = get_requests(new_url)
        selector = etree.HTML(response_re, etree.HTMLParser())
        href_list = selector.xpath('//dl[@class="clearfix"]//dd//h4/a/@href')
        for href_url in href_list:
            fake_detail_url = domain + href_url
            fakeurl_list.append(fake_detail_url)
    return fakeurl_list
```
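The pagination pattern above is simple enough to verify in isolation. A minimal sketch (the helper name `build_listing_urls` is mine, not from the original):

```python
def build_listing_urls(domain, start, end):
    # fang.com paginates secondhand listings as /house/i3<n>/
    return [domain + '/house/i3' + str(i) + '/' for i in range(start, end)]

urls = build_listing_urls('http://huaibei.esf.fang.com', 1, 4)
print(urls[0])  # → http://huaibei.esf.fang.com/house/i31/
print(len(urls))  # → 3
```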
3. Obtain the redirected detail-page URL
```python
import re

def get_realurl(fakeurl):
    try:
        detail_text = requests.get(fakeurl).text
        # Extract the URL parameter with a regex and splice together the real URL
        real_url = fakeurl + '?' + re.findall(r"t3='(.*?)'", detail_text)[0]
        return real_url
    except (requests.RequestException, IndexError):
        print("Information missing")
```
4. Parse the detail page

Match the fields with XPath, collect them into a dict, and append each dict to a list so pandas can process it and write it to Excel.
```python
def parse_infor(real_url):
    """Parse one detail page into a dict of listing attributes."""
    dic = {}
    dic["url"] = real_url
    response = get_requests(real_url)
    selector = etree.HTML(response, etree.HTMLParser())
    title = selector.xpath('//div[@class="title rel"]/h1/text()')[0]
    price = selector.xpath('//div[@class="trl-item_top"]/div/i/text()')[0]
    # "".join(title.split()) strips all whitespace, including internal runs
    dic["標題"] = "".join(title.split())  # title
    dic["價格"] = price                   # price
    # First info block: pair the first six labels with their values;
    # the last value needs its whitespace stripped
    infors1 = selector.xpath('//div[@class="tr-line clearfix"]//div/div[@class="tt"]/text()')
    lab1 = selector.xpath('//div[@class="tr-line clearfix"]//div/div[@class="font14"]/text()')
    dic.update(dict(zip(lab1[:6], infors1[:6])))
    dic[lab1[-1]] = "".join(infors1[-1].split())
    # Second info block: remaining label/value pairs
    infors2 = selector.xpath('//div[@class="cont clearfix"]/div[@class="text-item clearfix"]/span[@class="rcont"]/text()')
    lab2 = selector.xpath('//div[@class="cont clearfix"]/div[@class="text-item clearfix"]/span[@class="lab"]/text()')
    dic.update(dict(zip(lab2, infors2)))
    dic[lab2[-1]] = "".join(infors2[-1].split())
    return dic
```
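The `dict(zip(labels, values))` pairing and the whitespace cleanup are the two small tricks doing the work here. A sketch with made-up label/value strings (the real ones come from the page's XPath results):

```python
# Pair scraped labels with their values in one step
labels = ['戶型', '建築面積', '單價']       # illustrative labels
values = ['3室2廳', '120平米', '8000元/平米']
record = dict(zip(labels, values))
print(record['戶型'])  # → 3室2廳

# split() + join() removes ALL whitespace, including newlines inside the value
raw = '  8000 元/平米\n'
cleaned = "".join(raw.split())
print(cleaned)  # → 8000元/平米
```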
5. Orchestrate the functions defined above to collect the data
```python
import time

def get_allinfor(fakeurl_list):
    house_arry = []
    for fakeurl in fakeurl_list:
        try:
            real_url = get_realurl(fakeurl)
            dic = parse_infor(real_url)
            house_arry.append(dic)
            print("------------ Congratulations, this listing was scraped successfully! ------------")
        except Exception:
            print("!!!!! Not scraped, information missing !!!!!")
        # Throttle requests to avoid being blocked
        time.sleep(2)
    return house_arry

def papline():
    # Placeholder, unused
    pass
```
6. Main function, the crawler's launcher
```python
import pandas as pd

if __name__ == '__main__':
    # No trailing slash: the paths concatenated above all start with '/'
    domain = 'http://huaibei.esf.fang.com'
    start = 1
    end = 16
    fakeurl_list = get_fakeurl(start, end)
    houseinfor = get_allinfor(fakeurl_list)
    df = pd.DataFrame(houseinfor)
    df.to_excel("huaibei_house.xlsx", index=False)
```
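Building the DataFrame from a list of dicts is forgiving: rows missing a key simply get NaN in that column, so partially parsed listings don't break the export. A sketch with fabricated rows (the `to_excel` call is commented out since it requires the `openpyxl` engine):

```python
import pandas as pd

rows = [
    {"url": "http://example.com/1", "標題": "房源A", "價格": "85"},
    {"url": "http://example.com/2", "標題": "房源B"},  # missing 價格 → NaN
]
df = pd.DataFrame(rows)
print(df.shape)  # → (2, 3)
# df.to_excel("huaibei_house.xlsx", index=False)  # needs openpyxl installed
```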
The data is now scraped to a local Excel file, which makes it easy to read back with pandas/numpy later for cleaning, analysis, and statistics.