Practice setup
Windows 10 (with Scrapy already installed and configured); Python 2.7; http://hr.tencent.com/position.php?&start=0#a (Tencent social recruitment page)
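As a quick sketch of the project setup (not part of the original notes; the project name tencent simply matches the code below), the skeleton can be generated with scrapy startproject, and the files edited in steps a to d below all live inside it (the exact layout may vary slightly between Scrapy versions):

scrapy startproject tencent

tencent/
    scrapy.cfg          # deploy configuration
    tencent/            # the project's Python module
        __init__.py
        items.py        # Item container (step a)
        pipelines.py    # pipeline (step c)
        settings.py     # project settings (step d)
        spiders/        # custom spiders go here (step b)
            __init__.py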
(Recall that Spider Middlewares are the functional components that handle the communication between the Engine and the Spider, e.g. the Responses going into the Spider and the Requests going out of the Spider.)

a, Define the Item container (items.py), one Field per piece of data to collect:

import scrapy

class TencentItem(scrapy.Item):
    # define the fields for your item here like:
    # position name
    posName = scrapy.Field()
    # position link
    posLink = scrapy.Field()
    # position category
    posClass = scrapy.Field()
    # number of openings
    posPerson = scrapy.Field()
    # recruiting city
    posCity = scrapy.Field()
    # publication date
    posTime = scrapy.Field()

b, Custom Spiders (create a new .py file in the spiders folder for the custom crawler)
①, First method: a plain scrapy.Spider, controlling the pagination by hand:

# -*- coding: utf-8 -*-
import scrapy
# import the Item container defined above, so the data can be stored in it
from tencent.items import TencentItem

class tencentSpiders(scrapy.Spider):
    # name of the spider (fixed attribute name required by the framework)
    name = "tencent"
    # domains the spider is allowed to crawl (fixed attribute name allowed_domains)
    allowed_domains = ["hr.tencent.com"]
    # The listing spans many pages, and the pages differ only in the trailing
    # number, so we build the URLs ourselves and bump that number to control
    # the page.
    # URL pattern: http://hr.tencent.com/position.php?&start=0
    url = "http://hr.tencent.com/position.php?&start="
    # variable appended to the end of the url
    offset = 0
    # the first page to crawl (fixed attribute name start_urls)
    start_urls = [url + str(offset)]

    # parse() handles the data returned for each request
    # (fixed method name, custom body)
    def parse(self, response):
        # root nodes: one <tr> per position, alternating classes even/odd
        node_list = response.xpath("//tr[@class='even']|//tr[@class='odd']")
        # iterate over the root nodes
        for each in node_list:
            # create a container
            item = TencentItem()
            # position name, added to the container;
            # extract() converts the result to unicode strings, and since
            # xpath returns a list, [0] takes the first element
            item["posName"] = each.xpath("./td[1]/a/text()").extract()[0]
            # position link
            item["posLink"] = each.xpath("./td[1]/a/@href").extract()[0]
            # position category
            item["posClass"] = each.xpath("./td[2]/text()").extract()[0]
            # number of openings
            item["posPerson"] = each.xpath("./td[3]/text()").extract()[0]
            # recruiting city
            item["posCity"] = each.xpath("./td[4]/text()").extract()[0]
            # publication date
            item["posTime"] = each.xpath("./td[5]/text()").extract()[0]
            # hand the data over to the pipeline
            yield item
        # request the next page: the trailing number advances by 10 per page,
        # currently up to 2510 in total
        if self.offset < 2510:
            self.offset += 10
            # after adjusting the page number, send a new request to the
            # engine, calling back into parse()
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

②, Second method: create a CrawlSpider subclass instead, as follows. (The two methods differ only in this part of the setup; everything else is handled the same way. CrawlSpider has built-in rules for what to crawl, so we only need to describe the link pattern we want and it will follow the links in depth for us.)
# -*- coding: utf-8 -*-
import scrapy
# link-rule matcher, used to extract the links that match our pattern
from scrapy.linkextractors import LinkExtractor
# the CrawlSpider class and its Rule class
from scrapy.spiders import CrawlSpider, Rule
from tencent.items import TencentItem

class TencentSpider(CrawlSpider):
    name = 'tencent'
    allowed_domains = ['hr.tencent.com']
    # the first page to crawl
    start_urls = ['http://hr.tencent.com/position.php?&start=0#a']
    # extractor describing which links in the returned Response to pick up
    # (allow= takes a regular expression)
    link_list = LinkExtractor(allow=("start=\d"))
    # for every matching link, send a request, keep following further links,
    # and call the custom callback to handle the response
    rules = (
        Rule(link_list, callback="parse_item", follow=True),
    )

    # custom callback; it must not be named parse, which would clash with
    # CrawlSpider's own parse method
    def parse_item(self, response):
        # root nodes: one <tr> per position, alternating classes even/odd
        node_list = response.xpath("//tr[@class='even']|//tr[@class='odd']")
        # iterate over the root nodes
        for each in node_list:
            # create a container
            item = TencentItem()
            # position name, added to the container
            item["posName"] = each.xpath("./td[1]/a/text()").extract()[0]
            # position link; extract() converts the result to unicode strings
            item["posLink"] = each.xpath("./td[1]/a/@href").extract()[0]
            # position category; xpath returns a list, [0] takes the first element
            item["posClass"] = each.xpath("./td[2]/text()").extract()[0]
            # number of openings
            item["posPerson"] = each.xpath("./td[3]/text()").extract()[0]
            # recruiting city
            item["posCity"] = each.xpath("./td[4]/text()").extract()[0]
            # publication date
            item["posTime"] = each.xpath("./td[5]/text()").extract()[0]
            # hand the data over to the pipeline
            yield item
        # unlike the first method, there is no need to adjust the URL and send
        # new requests here: the Rule above already follows the pagination links
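As an aside (a sketch, not part of the original notes), the XPath expressions used above can be tried out interactively with scrapy shell before running the full crawl; inside the shell a response object for the fetched page is already available:

# launched from a cmd window:
#     scrapy shell "http://hr.tencent.com/position.php?&start=0"
rows = response.xpath("//tr[@class='even']|//tr[@class='odd']")
len(rows)                                    # number of positions on this page
rows[0].xpath("./td[1]/a/text()").extract()  # first position name
rows[0].xpath("./td[1]/a/@href").extract()   # first position link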
c, Write the pipeline (pipelines.py) that stores the data:

import json

class TencentPipeline(object):
    # These three method names are fixed and must not be changed,
    # but only the second one, process_item(), is mandatory.
    def __init__(self):
        # on start-up, open a new file to save the data in
        self.file = open("tencent.json", "w")

    def process_item(self, item, spider):
        # convert each incoming item to JSON and store it as one line
        text = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(text.encode("utf-8"))
        return item

    def close_spider(self, spider):
        # close the file
        self.file.close()
d, Set up the configuration file, settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for tencent project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'tencent'

SPIDER_MODULES = ['tencent.spiders']
NEWSPIDER_MODULE = 'tencent.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tencent (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# delay after each request is sent
DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# set the request headers, mainly "User-Agent", to pose as a browser
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'tencent.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'tencent.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# enable the pipeline file used for data processing
ITEM_PIPELINES = {
    'tencent.pipelines.TencentPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
e, Run the crawler
From a cmd (command) window, run the crawl inside the project directory: scrapy crawl tencent
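Optionally (a sketch, not part of the original notes; the file name run.py is just an assumption), the same crawl can be started from a small Python script via scrapy.cmdline, which is handy when debugging in an IDE:

# run.py, placed next to scrapy.cfg in the project root
from scrapy import cmdline

# equivalent to typing "scrapy crawl tencent" in a cmd window
cmdline.execute("scrapy crawl tencent".split())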
View the crawled data
a, The output file tencent.json appears in the corresponding project directory.
b, Open it with Notepad, or copy its contents into an online JSON parser (search the web for one) to inspect the JSON data.
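Alternatively (a small sketch, assuming the tencent.json written by the pipeline above, with one JSON object per line), the file can be checked with a few lines of Python:

# -*- coding: utf-8 -*-
import json

with open("tencent.json") as f:
    for i, line in enumerate(f):
        position = json.loads(line)   # each line is one position record
        print("%s  %s  %s" % (position["posName"], position["posCity"], position["posTime"]))
        if i >= 4:                    # only show the first five records
            break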
The above are some notes from my own study of Scrapy.