Python3 爬蟲:通過 requests 獲取安居客房屋信息

import requests
from fake_useragent import UserAgent
from lxml import etree
from http import cookiejar
import re, time
import pymysql
import random
from requests.exceptions import Timeout

# fake_useragent generator; `ua.random` is read wherever a User-Agent header is built.
ua = UserAgent()

# Module-level HTTP session; AnKeJu.__init__ assigns this same object to every instance.
session = requests.Session()


class MyException(Exception):
    """Crawler-specific error carrying a numeric status code and a message.

    Status codes raised in this module: 10000 (a city's listing pages are
    exhausted) and 10003 (the proxy table returned no row).

    :param status: numeric status code, exposed as ``self.status``
    :param msg: human-readable message, exposed as ``self.msg``
    """

    def __init__(self, status, msg):
        self.status = status
        self.msg = msg
        # Forward the details to Exception so that exc.args, str(exc) and
        # tracebacks are informative instead of empty.
        super().__init__(status, msg)

    def __str__(self):
        return "[{}] {}".format(self.status, self.msg)


class AnKeJu:
    '''
    Scraper for Anjuke housing listings.

    Beijing new homes          https://bj.fang.anjuke.com/
    Beijing second-hand homes  https://beijing.anjuke.com/sale/
    Beijing rentals            https://bj.zu.anjuke.com/
    To scrape a different city, replace "bj" with that city's shorthand.
    '''

    # Login support was originally planned, but ordinary users appear to have no
    # password login, only SMS verification codes.
    is_login = False
    # Maps city name -> sub-domain shorthand; filled by __get_all_city().
    city_dict = {}
    # pymysql connection to the "ankeju" database; created in __init__.
    conn = None
    # pymysql connection to the "proxies" database; created lazily by __init_proxies().
    proxies = None

    def __init__(self) -> None:
        """Prepare the HTTP session, open the MySQL connection and load the city map.

        Uses the module-level ``session`` object, replaces its headers with a
        single random User-Agent and installs an LWP cookie jar pointing at
        ``./cookies.txt`` (this method neither loads nor saves that file).
        """
        self.session = session
        self.session.headers = {
            "user-agent": ua.random
        }
        self.session.cookies = cookiejar.LWPCookieJar(filename="./cookies.txt")

        # `conn` starts out as the class-level None, so the connection is opened here.
        if not self.conn:
            self.conn = pymysql.connect(host="127.0.0.1",
                                        port=3306,
                                        user="root",
                                        db="ankeju")
            # Replaces the connection's `cursor` factory method with one DictCursor
            # instance; the rest of the class calls `self.conn.cursor.execute(...)`.
            self.conn.cursor = self.conn.cursor(cursor=pymysql.cursors.DictCursor)
        # Downloads the city index page and fills self.city_dict.
        self.__get_all_city()

    def __response_to_xml(self, response):
        """Parse ``response.text`` with ``etree.HTML`` and return the resulting tree."""
        return etree.HTML(response.text)

    def __get_all_city(self):
        api = "https://www.anjuke.com/sy-city.html"
        headers = self.session.headers.copy()
        response = self.session.get(api, headers=headers)
        xml = self.__response_to_xml(response)
        city_xpath_list = xml.xpath("//div[@class='city_list']")[0:-1]
        city_name_list = [city_xpath.xpath("a/text()") for city_xpath in city_xpath_list]
        city_url_list = [city_xpath.xpath("a/@href") for city_xpath in city_xpath_list]
        city_dict_value = []
        city_dict_key = []

        # 這裏真不知道怎麼取變量名了
        # city_url_list它的格式是list套多個list,由於這個頁面是按照A,B,C,D...這樣排的
        for letter_url_list in city_url_list:
            for city_url in letter_url_list:
                shorthand_city = re.findall(r"//(.*?)\.", city_url)[0]
                city_dict_value.append(shorthand_city)

        for aa_list in city_name_list:
            for city_name in aa_list:
                city_dict_key.append(city_name)

        self.city_dict = {k: v for k, v in zip(city_dict_key, city_dict_value)}

    def __is_exist_next_page(self, response):
        '''判斷二手房當前頁面是否存在下一頁'''
        xml = self.__response_to_xml(response)
        next_page_url = xml.xpath("//*[@class='aNxt']/@href")
        if next_page_url:
            return next_page_url[0]
        return False

    def __get_html_information_v2(self, response):
        '''獲取二手房當前頁面的房子信息'''
        xml = self.__response_to_xml(response)

        # 檢測是否是訪問驗證的頁面

        if xml.xpath("//*[@id='verify_page']"):
            # 出現了爬蟲檢測
            # 只要你的ip地址,都會出現訪問驗證這個頁面,我也不清楚我用了代理,仍是被檢測出了ip問題
            # 那只有調用selenium去進行破解了
            pass

        # 獲取到房子的信息
        li_xpath_list = xml.xpath("//*[@id='houselist-mod-new']//li[@class='list-item']")
        for li_xpath in li_xpath_list:
            house_info = []
            # 獲取房子的img地址
            house_img_url = li_xpath.xpath("div[@class='item-img']/img/@src")[0]
            house_info.append(house_img_url)

            # 獲取介紹房子的title
            house_title = li_xpath.xpath("div[@class='house-details']/div[1]/a/text()")[0].strip()
            house_info.append(house_title)
            # 獲取房子詳情信息
            house_details = li_xpath.xpath("div[@class='house-details']/div[2]")[0].xpath("string(.)").strip().split(
                "")[0]
            house_info.append(house_details)
            # 獲取房子地址 可能會存在地址沒有的請求
            try:
                house_address = li_xpath.xpath("div[@class='house-details']/div[3]/span/@title")[
                                    0].strip() or "暫時沒有地址信息"
            except IndexError:
                house_address = "暫時沒有地址信息"
            house_info.append(house_address)
            # 獲取房子的總價錢
            house_total_price = li_xpath.xpath("div[@class='pro-price']/span[1]")[0].xpath("string(.)").strip()
            house_info.append(house_total_price)
            # 獲取房子的房價
            house_price = li_xpath.xpath("div[@class='pro-price']/span[2]/text()")[0]
            house_info.append(house_price)
            # 獲取房子標籤
            house_tags = li_xpath.xpath("div[@class='house-details']/div[@class='tags-bottom']")[0].xpath(
                "string(.)").strip() or "暫無房子標籤信息"

            house_info.append(house_tags)
            yield house_info

    def __get_html_information_v1(self, response):
        '''獲取新房當前頁面的房子信息'''
        xml = self.__response_to_xml(response)
        if xml.xpath("//*[@id='verify_page']"):
            pass

        div_xpath_list = xml.xpath("//div[@class='key-list imglazyload']//div[@class='item-mod ']")

        for div_xpath in div_xpath_list:
            house_info_list = []
            # 獲取房子的img地址
            house_img_url = div_xpath.xpath("a[@class='pic']/img/@src")[0]
            house_info_list.append(house_img_url)
            # 獲取介紹房子的title
            house_title = div_xpath.xpath("div[@class='infos']/a[@class='lp-name']/h3/span/text()")[0].strip()
            house_info_list.append(house_title)
            # 獲取房子詳情信息
            try:
                house_details = div_xpath.xpath("div[@class='infos']/a[@class='huxing']")[0].xpath("string(.)").strip()
                house_details = re.sub("\s", "", house_details)
            except IndexError:
                house_details = div_xpath.xpath("div[@class='infos']/a[@class='kp-time']/text()")[0]
            house_info_list.append(house_details)
            # 獲取房子地址
            house_address = div_xpath.xpath("div[@class='infos']/a[@class='address']/span/text()")[0].strip()
            house_info_list.append(house_address)
            # 獲取房子標籤
            house_tags = ",".join(div_xpath.xpath("div[@class='infos']/a[@class='tags-wrap']/div/span/text()"))
            house_info_list.append(house_tags)
            # 獲取房子的類型
            # 有些房子它是沒有類型的
            try:
                house_type = \
                    div_xpath.xpath("div[@class='infos']/a[@class='tags-wrap']/div[@class='tag-panel']/i[2]/text()")[0]
            except IndexError:
                house_type = ""
            house_info_list.append(house_type)
            # 獲取房子是否還在售賣
            house_is_sale = div_xpath.xpath("div[@class='infos']/a[@class='tags-wrap']/div/i[1]/text()")[0]
            house_info_list.append(house_is_sale)
            # 獲取房子價格
            # 有兩種狀況,一種價格肯定,一種價格待定
            # 價格待定也有兩種,一種是周圍價格,一種就是沒有價格
            try:
                house_price = div_xpath.xpath("a[@class='favor-pos']/p[@class='price']")[0].xpath("string(.)").strip()
            except IndexError:
                try:
                    house_price = div_xpath.xpath("a[@class='favor-pos']/p[2]")[0].xpath("string(.)").strip()
                except IndexError:
                    house_price = "暫無"
            house_info_list.append(house_price)
            yield house_info_list

    def __is_exist_next_page_v1(self, response):
        '''檢測新房的當前頁面是否有下一頁'''
        xml = self.__response_to_xml(response)
        next_page_url = xml.xpath("//a[@class='next-page next-link']/@href")
        if next_page_url:
            return next_page_url[0]
        return False

    def __save_to_db(self, house_info_tuple, table_name):
        '''將數據保存在數據庫,我這裏只寫了租房,新房,二手房,這樣寫的話,那麼數據表的名字必需要對應上呀'''
        if table_name == "secondary_house":
            sql = "insert into secondary_house (house_img_url,house_title,house_details,house_address,house_total_price,house_price,house_tags) values (%s,%s,%s,%s,%s,%s,%s)"
        elif table_name == "new_house":
            sql = "insert into new_house (house_img_url,house_title,house_details,house_address,house_tags,house_type,house_is_sale,house_price) values (%s,%s,%s,%s,%s,%s,%s,%s)"

        else:
            sql = "insert into zu_house (house_img_url,house_title,house_details,house_address,house_tags,house_price) values (%s,%s,%s,%s,%s,%s)"
        self.conn.cursor.execute(sql, house_info_tuple)
        self.conn.commit()

    def __get_proxies(self):
        '''從代理池獲取代理'''
        if not self.proxies:
            self.__init_proxies()
        while True:
            # 這裏字段較少,並且全部的數據我都須要,因此用 "*"
            offset = random.randint(1, 100)
            sql = "select * from proxies ORDER BY id LIMIT %s,1 "
            row = self.proxies.cursor.execute(sql, (offset,))
            if not row:
                raise MyException(10003, "代理池錯誤")
            res = self.proxies.cursor.fetchone()
            proxies = {res["type"].lower(): "{}://{}:{}".format(res["type"].lower(), res["ip"], res["port"])}
            # 檢測代理是否可使用
            if self.__check_proxies(proxies):
                return proxies
            else:
                # 刪除不可用的代理的記錄
                del_sql = "DELETE FROM table_name where id = %s"
                self.proxies.cursor.execute(del_sql, (res["id"],))
                self.proxies.commit()

    def __check_proxies(self, proxies):
        """Report whether a GET sent with ``proxies`` to the probe URL answers HTTP 200 within 3 seconds.

        Any exception raised while making the request counts as "not usable".
        """
        probe_url = "https://www.cnblogs.com/"
        try:
            reply = requests.get(probe_url, headers={"user-Agent": ua.random}, proxies=proxies, timeout=3)
        except Exception:
            return False
        return reply.status_code == 200

    def __init_proxies(self):
        """Connect to the local ``proxies`` database and attach a dict cursor to the connection."""
        settings = dict(host="127.0.0.1", port=3306, user="root", db="proxies")
        self.proxies = pymysql.connect(**settings)
        # The connection's `cursor` factory is replaced by a single DictCursor
        # instance, which is what __get_proxies() executes statements on.
        dict_cursor = self.proxies.cursor(cursor=pymysql.cursors.DictCursor)
        self.proxies.cursor = dict_cursor

    def __start_secondary_spider(self, url, city):
        """Crawl second-hand listings page by page, starting at ``url``, storing every record.

        Waits 3 seconds before each request and follows the page's own
        "next page" link rather than assuming a page count.

        :raises MyException: status 10000 once a page has no "next page" link
        """
        table = "secondary_house"
        request_headers = self.session.headers
        current_page = 1
        current_url = url
        while True:
            time.sleep(3)
            print("正在爬取 {} 第 {} 頁...".format(city, current_page))
            response = self.session.get(
                current_url, headers=request_headers, proxies=self.__get_proxies(), timeout=10)

            # Store every listing of the current page.
            # Author's note: all cities share one table that has no city column;
            # adding one would make later processing much easier.
            print("正在寫入數據庫...")
            for record in self.__get_html_information_v2(response):
                self.__save_to_db(record, table)

            following_url = self.__is_exist_next_page(response)
            if following_url:
                current_url = following_url
                current_page += 1
                continue
            raise MyException(10000, "{}二手房--數據爬取完畢...".format(city))

    def __start_new_house_spider(self, url, city):
        """Crawl new-home listings page by page, starting at ``url``, storing every record.

        Waits 3 seconds before each request.

        :raises MyException: status 10000 once a page has no "next page" link
        """
        table = "new_house"
        request_headers = self.session.headers
        current_page = 1
        current_url = url
        while True:
            time.sleep(3)
            print("正在爬取 {} 第 {} 頁...".format(city, current_page))
            response = self.session.get(
                current_url, headers=request_headers, proxies=self.__get_proxies(), timeout=10)
            print("正在寫入數據庫...")
            for record in self.__get_html_information_v1(response):
                self.__save_to_db(record, table)

            following_url = self.__is_exist_next_page_v1(response)
            if following_url:
                current_url = following_url
                current_page += 1
                continue
            raise MyException(10000, "{}新房--數據爬取完畢...".format(city))

    def __get_html_information_v3(self, response):
        '''獲取租房頁面的房子信息'''
        xml = self.__response_to_xml(response)
        if xml.xpath("//*[@id='verify_page']"):
            pass

        div_xpath_list = xml.xpath("//div[@class='zu-itemmod']")
        for div_xpath in div_xpath_list:
            house_info_list = []

            house_img_url = div_xpath.xpath("a/img/@src")[0]
            house_info_list.append(house_img_url)

            house_title = div_xpath.xpath("div[@class='zu-info']/h3/a/text()")[0].strip()
            house_info_list.append(house_title)

            house_details = div_xpath.xpath("div[@class='zu-info']/p[@class='details-item tag']")[0].xpath(
                "string(.)").strip().split("")[0]
            house_details = re.sub("\s", "", house_details)
            house_info_list.append(house_details)

            house_address = div_xpath.xpath("div[@class='zu-info']/address[@class='details-item']")[0].xpath(
                "string(.)").strip().replace("\xa0", "")
            house_address = re.sub("\s", "", house_address)
            house_info_list.append(house_address)

            house_tags = ",".join(div_xpath.xpath("div[@class='zu-info']/p[@class='details-item bot-tag']/span/text()"))
            house_info_list.append(house_tags)

            house_price = div_xpath.xpath("div[@class='zu-side']/p")[0].xpath("string(.)").strip()
            house_info_list.append(house_price)

            yield house_info_list

    def __is_exist_next_page_v3(self, response):
        '''判斷租房頁面是否有下一頁'''
        xml = self.__response_to_xml(response)
        next_page_url = xml.xpath("//a[@class='aNxt']/@href")
        if next_page_url:
            return next_page_url[0]
        return False

    def __start_zu_house_spider(self, url, city):
        """Crawl rental listings page by page, starting at ``url``, storing every record.

        Waits 3 seconds before each request. A request that times out is
        repeated once with a freshly chosen proxy; a failure of that second
        attempt propagates.

        :raises MyException: status 10000 once a page has no "next page" link
        """
        table = "zu_house"
        request_headers = self.session.headers
        current_page = 1
        current_url = url
        while True:
            time.sleep(3)
            print("正在爬取 {} 第 {} 頁...".format(city, current_page))
            try:
                response = self.session.get(
                    current_url, headers=request_headers, proxies=self.__get_proxies(), timeout=10)
            except Timeout:
                # Single retry through another proxy.
                response = self.session.get(
                    current_url, headers=request_headers, proxies=self.__get_proxies(), timeout=10)
            print("正在寫入數據庫...")
            for record in self.__get_html_information_v3(response):
                self.__save_to_db(record, table)

            following_url = self.__is_exist_next_page_v3(response)
            if following_url:
                current_url = following_url
                current_page += 1
                continue
            raise MyException(10000, "{}租房--數據爬取完畢...".format(city))

    def spider_zufang(self, city: str = "北京", allow_all: bool = False):
        '''爬取租房信息'''
        while True:
            format_city = self.city_dict.pop(city)
            assert bool(format_city) is True, "請輸入正確的地區"
            start_url = "https://{}.zu.anjuke.com/".format(format_city)
            try:
                self.__start_zu_house_spider(start_url, city)
            except MyException as e:
                if e.status == 10000:
                    print(e.msg)
                    if allow_all:
                        try:
                            city = list(self.city_dict.keys()).pop(0)
                        except IndexError:
                            print("所有爬取完畢")
                            return
                    else:
                        return

    def spider_new_house(self, city: str = "北京", allow_all: bool = False):
        '''爬取新房'''
        while True:
            format_city = self.city_dict.pop(city)
            assert bool(format_city) is True, "請輸入正確的地區"
            start_url = "https://{}.fang.anjuke.com/".format(format_city)
            try:
                self.__start_new_house_spider(start_url, city)
            except MyException as e:
                if e.status == 10000:
                    print(e.msg)
                    if allow_all:
                        try:
                            city = list(self.city_dict.keys()).pop(0)
                        except IndexError:
                            print("所有爬取完畢")
                            return
                    else:
                        return

    def spider_secondary(self, city: str = "北京", allow_all: bool = False):
        '''
        :param city: 默認是北京
        :return:
        '''
        # 這裏直接是要bj也是能夠的,他會幫咱們重定向beijing
        while True:
            format_city = self.city_dict.pop(city)
            assert bool(format_city) is True, "請輸入正確的地區"
            start_url = "https://{}.anjuke.com/sale/".format(format_city)
            try:
                self.__start_secondary_spider(start_url, city)
            except MyException as e:
                if e.status == 10000:
                    print(e.msg)
                    if allow_all:
                        try:
                            city = list(self.city_dict.keys()).pop(0)
                        except IndexError:
                            print("所有爬取完畢")
                            return
                    else:
                        return

    def __del__(self):
        self.conn.close()
        if self.proxies:
            self.proxies.close()

    def test(self):
        """Debug helper: fetch one rental page and print each parsed record after its running number."""
        page = self.session.get("https://al.zu.anjuke.com/", headers=self.session.headers)
        for index, record in enumerate(self.__get_html_information_v3(page), 1):
            print(index)
            print(record)


if __name__ == '__main__':
    # Script entry point. Constructing the spider connects to MySQL and downloads
    # the city list (see AnKeJu.__init__).
    anjuke = AnKeJu()
    # Uncomment one of the calls below to start a crawl, or test() to debug the parser.
    # anjuke.spider_secondary(allow_all=True)
    # anjuke.spider_new_house(allow_all=True)
    # anjuke.spider_zufang(allow_all=True)
    # anjuke.test()

補上數據庫獲取到的數據。建立數據庫的時候,最好添加一個 city 字段,要不然太亂了。

相關文章
相關標籤/搜索