Contents
1. Introduction
2. Basic environment setup
3. Annotated code walkthrough
4. Supporting screenshots
5. Full code
6. Running results
1. Introduction
This article scrapes second-hand housing listings from Lianjia. After reading it, you should be able to scrape Lianjia's other modules on your own, such as rentals, new homes, and other housing data.
Without further ado, open the Lianjia homepage and click Beijing.
You arrive at the following page, which lists the cities of every province in the country; clicking a city jumps to a page scoped to that city.
Click 二手房 (second-hand housing) to reach the listings page. Notice that the link simply appends /ershoufang/ to the original URL, so later we can build the listing URL by concatenation as well.
Note, however, that entries like the one below are promotional items we do not need and must be filtered out.
For crawling multiple pages, the URL pattern is easy to spot: page N just appends pgN to the listing URL, as sketched below.
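As a quick illustration of that pattern, the snippet below builds the first few listing-page URLs exactly the way the spider does later; the Beijing subdomain is only an example, and any city homepage works the same way.

# Minimal sketch of the pagination pattern described above; the Beijing URL is just an example.
base_url = "https://bj.lianjia.com"        # a city's homepage
page_url = base_url + "/ershoufang/"       # that city's second-hand listings
for i in range(3):                         # first three result pages
    print(page_url + "pg" + str(i + 1))    # .../ershoufang/pg1, pg2, pg3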
2. Basic environment setup
Create the database.
Table creation statement:
CREATE TABLE `lianjia` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `city` varchar(100) DEFAULT NULL,
  `money` varchar(100) DEFAULT NULL,
  `address` varchar(100) DEFAULT NULL,
  `house_pattern` varchar(100) DEFAULT NULL,
  `house_size` varchar(100) DEFAULT NULL,
  `house_degree` varchar(100) DEFAULT NULL,
  `house_floor` varchar(100) DEFAULT NULL,
  `price` varchar(50) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=212 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
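If you prefer to run the schema setup from Python rather than a MySQL client, the sketch below (the script name and the IF NOT EXISTS guards are my additions, not from the original post) creates the database and table with pymysql, using the same credentials that pipelines.py expects later; adjust host, user and password to your own MySQL server. The utf8mb4_0900_ai_ci collation in the dump above only exists on MySQL 8, so the sketch simply omits the COLLATE clause.

# setup_db.py -- one-off schema setup (hypothetical helper, not part of the original post)
import pymysql

CREATE_TABLE = """
CREATE TABLE IF NOT EXISTS `lianjia` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `city` varchar(100) DEFAULT NULL,
  `money` varchar(100) DEFAULT NULL,
  `address` varchar(100) DEFAULT NULL,
  `house_pattern` varchar(100) DEFAULT NULL,
  `house_size` varchar(100) DEFAULT NULL,
  `house_degree` varchar(100) DEFAULT NULL,
  `house_floor` varchar(100) DEFAULT NULL,
  `price` varchar(50) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
"""

conn = pymysql.connect(host="127.0.0.1", port=3306, user="root", password="root", charset="utf8mb4")
try:
    with conn.cursor() as cursor:
        # Create the database and table only if they do not exist yet.
        cursor.execute("CREATE DATABASE IF NOT EXISTS lianjia DEFAULT CHARACTER SET utf8mb4")
        cursor.execute("USE lianjia")
        cursor.execute(CREATE_TABLE)
    conn.commit()
finally:
    conn.close()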
Create the Scrapy project (for example with scrapy startproject Lianjia followed by scrapy genspider lianjia lianjia.com), then add a start.py launcher script in the project root:
start.py
from scrapy import cmdline
cmdline.execute("scrapy crawl lianjia".split())
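With this entry script in place, running python start.py from the project root does the same thing as typing scrapy crawl lianjia on the command line, which is convenient when launching the crawl from an IDE.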
3. Annotated code walkthrough
lianjia.py
# -*- coding: utf-8 -*-
import scrapy
import time
from Lianjia.items import LianjiaItem


class LianjiaSpider(scrapy.Spider):
    name = 'lianjia'
    allowed_domains = ['lianjia.com']
    # Starting URL that lists every province and city
    start_urls = ['https://www.lianjia.com/city/']

    def parse(self, response):
        # See Figure 1: find the ul tag whose class is city_list_ul, then get all li tags under it
        ul = response.xpath("//ul[@class='city_list_ul']/li")
        # Iterate over ul; each province is one li tag
        for li in ul:
            # See Figure 2: get the li tags of all cities under this province
            data_ul = li.xpath(".//ul/li")
            # Iterate to get each city
            for li_data in data_ul:
                # See Figure 3: get each city's URL and name
                city = li_data.xpath(".//a/text()").get()
                # Concatenate into the second-hand housing link
                page_url = li_data.xpath(".//a/@href").get() + "/ershoufang/"
                # Crawl multiple pages
                for i in range(3):
                    url = page_url + "pg" + str(i + 1)
                    print(url)
                    yield scrapy.Request(url=url, callback=self.pageData, meta={"info": city})

    def pageData(self, response):
        print("=" * 50)
        # Get the city name passed along in meta
        city = response.meta.get("info")
        # See Figure 4: find the ul tag whose class is sellListContent, then get all li tags under it
        detail_li = response.xpath("//ul[@class='sellListContent']/li")
        # Iterate
        for page_li in detail_li:
            # See Figure 5: check the class value to skip the extra ad entries
            if page_li.xpath("@class").get() == "list_app_daoliu":
                continue
            # See Figure 6: get the total price of the house
            money = page_li.xpath(".//div[@class='totalPrice']/span/text()").get()
            money = str(money) + "万"
            # See Figure 7
            address = page_li.xpath(".//div[@class='positionInfo']/a/text()").get()
            # See Figure 8: get the full house description and split it
            house_data = page_li.xpath(".//div[@class='houseInfo']/text()").get().split("|")

            # Layout
            house_pattern = house_data[0]
            # Floor area
            house_size = house_data[1].strip()
            # Degree of decoration
            house_degree = house_data[3].strip()
            # Floor
            house_floor = house_data[4].strip()
            # Unit price, see Figure 9
            price = page_li.xpath(".//div[@class='unitPrice']/span/text()").get().replace("单价", "")
            time.sleep(0.5)
            item = LianjiaItem(city=city, money=money, address=address, house_pattern=house_pattern,
                               house_size=house_size, house_degree=house_degree,
                               house_floor=house_floor, price=price)
            yield item
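A fragile spot in pageData is the hard-coded indexing into house_data: the houseInfo text is split on "|" and positions 0, 1, 3 and 4 are assumed to hold the layout, area, decoration and floor. As a hedged hardening sketch (my addition, not part of the original spider), a small helper could skip any listing whose description has too few fields:

# Defensive variant of the houseInfo parsing, assuming the same "a | b | c | ..." layout.
def parse_house_info(info_text):
    """Return (pattern, size, degree, floor), or None if the text has too few fields."""
    if not info_text:
        return None
    fields = [part.strip() for part in info_text.split("|")]
    if len(fields) < 5:  # indexes 0..4 are used by the spider
        return None
    return fields[0], fields[1], fields[3], fields[4]

In pageData this could replace the direct indexing, with a continue whenever the helper returns None.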
4. Supporting screenshots
Figure 1
Figure 2
Figure 3
Figure 4
Figure 5
Figure 6
Figure 7
Figure 8
Figure 9
5. Full code
lianjia.py
# -*- coding: utf-8 -*-
import scrapy
import time
from Lianjia.items import LianjiaItem


class LianjiaSpider(scrapy.Spider):
    name = 'lianjia'
    allowed_domains = ['lianjia.com']
    start_urls = ['https://www.lianjia.com/city/']

    def parse(self, response):
        ul = response.xpath("//ul[@class='city_list_ul']/li")
        for li in ul:
            data_ul = li.xpath(".//ul/li")
            for li_data in data_ul:
                city = li_data.xpath(".//a/text()").get()
                page_url = li_data.xpath(".//a/@href").get() + "/ershoufang/"
                for i in range(3):
                    url = page_url + "pg" + str(i + 1)
                    print(url)
                    yield scrapy.Request(url=url, callback=self.pageData, meta={"info": city})

    def pageData(self, response):
        print("=" * 50)
        city = response.meta.get("info")
        detail_li = response.xpath("//ul[@class='sellListContent']/li")
        for page_li in detail_li:
            if page_li.xpath("@class").get() == "list_app_daoliu":
                continue
            money = page_li.xpath(".//div[@class='totalPrice']/span/text()").get()
            money = str(money) + "万"
            address = page_li.xpath(".//div[@class='positionInfo']/a/text()").get()

            # Get the full house description and split it
            house_data = page_li.xpath(".//div[@class='houseInfo']/text()").get().split("|")

            # Layout
            house_pattern = house_data[0]
            # Floor area
            house_size = house_data[1].strip()
            # Degree of decoration
            house_degree = house_data[3].strip()
            # Floor
            house_floor = house_data[4].strip()
            # Unit price
            price = page_li.xpath(".//div[@class='unitPrice']/span/text()").get().replace("单价", "")
            time.sleep(0.5)
            item = LianjiaItem(city=city, money=money, address=address, house_pattern=house_pattern,
                               house_size=house_size, house_degree=house_degree,
                               house_floor=house_floor, price=price)
            yield item
items.py
# -*- coding: utf-8 -*-
import scrapy


class LianjiaItem(scrapy.Item):
    # City
    city = scrapy.Field()
    # Total price
    money = scrapy.Field()
    # Address
    address = scrapy.Field()
    # Layout
    house_pattern = scrapy.Field()
    # Floor area
    house_size = scrapy.Field()
    # Degree of decoration
    house_degree = scrapy.Field()
    # Floor
    house_floor = scrapy.Field()
    # Unit price
    price = scrapy.Field()
pipelines.py
import pymysql
class LianjiaPipeline:
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',          # database user
            'password': 'root',      # database password
            'database': 'lianjia',   # database name
            'charset': 'utf8'
        }
        # Initialize the database connection
        self.conn = pymysql.connect(**dbparams)
        self.cursor = self.conn.cursor()
        self._sql = None

    def process_item(self, item, spider):
        # Execute the SQL statement
        self.cursor.execute(self.sql, (item['city'], item['money'], item['address'], item['house_pattern'],
                                       item['house_size'], item['house_degree'],
                                       item['house_floor'], item['price']))
        self.conn.commit()  # commit
        return item

    @property
    def sql(self):
        if not self._sql:
            # Insert statement
            self._sql = """
                insert into lianjia(id,city,money,address,house_pattern,house_size,house_degree,house_floor,price)
                values(null,%s,%s,%s,%s,%s,%s,%s,%s)
            """
        return self._sql
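The pipeline above opens a MySQL connection in __init__ but never closes it. A small optional addition (my sketch, not in the original post) is a close_spider hook, which Scrapy calls on the pipeline when the crawl finishes, so the cursor and connection get released cleanly:

    # Add inside LianjiaPipeline: release the cursor and connection when the spider closes.
    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()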
settings.py
# -*- coding: utf-8 -*-
BOT_NAME = 'Lianjia'
SPIDER_MODULES = ['Lianjia.spiders']
NEWSPIDER_MODULE = 'Lianjia.spiders'
LOG_LEVEL="ERROR"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Lianjia (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 Edg/84.0.522.63"
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'Lianjia.middlewares.LianjiaSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'Lianjia.middlewares.LianjiaDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Lianjia.pipelines.LianjiaPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
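One more note on throttling: the spider currently calls time.sleep(0.5) for every item, which blocks Scrapy's event loop. A gentler alternative (my suggestion, not part of the original settings file) is to let Scrapy space the requests out itself and drop the sleep from the spider:

# Optional: have Scrapy pause between requests instead of calling time.sleep() in the spider.
DOWNLOAD_DELAY = 0.5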
6. Running results
The full dataset is far larger than the 518 rows shown here; I stopped the crawl after a short while, since this is only a demonstration.