Purpose: CrawlSpider is used for crawling data across an entire site.
- CrawlSpider is a subclass of Spider
- Creating a CrawlSpider-based spider file: scrapy genspider -t crawl xxx www.xxx.com
- LinkExtractor (link extractor): extracts links from a page according to the specified rule (a regular expression); a quick way to test an allow pattern interactively is sketched below this list
- Rule (rule parser): sends a request for every link collected by the link extractor, then parses the resulting page with the specified callback
- Each link extractor is paired with exactly one rule parser
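Before wiring an extractor into a spider, it can help to check the allow pattern against a real page. The following is a minimal sketch run inside `scrapy shell` (the URL and regex are taken from the first example below; treat them as placeholders for your own target):

```python
# Run `scrapy shell https://dig.chouti.com/all/hot/recent/1` first;
# the shell provides a `response` object for the fetched page.
from scrapy.linkextractors import LinkExtractor

le = LinkExtractor(allow=r'/all/hot/recent/\d+')   # same regex as in the spider
for link in le.extract_links(response):            # returns scrapy.link.Link objects
    print(link.url)
```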
① Crawling multiple pages of data from 抽屜網 (dig.chouti.com)
```python
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ChoutiSpider(CrawlSpider):
    name = 'chouti'
    # allowed_domains = ['www.ccc.coim']
    start_urls = ['https://dig.chouti.com/all/hot/recent/1']

    # Link extractor: extracts links matching the rule (allow = regex) from the pages of the start urls
    link = LinkExtractor(allow=r'/all/hot/recent/\d+')

    rules = (
        # Rule parser: sends a request for every extracted link, then parses the page source with the given callback
        Rule(link, callback='parse_item', follow=True),
        # follow=True: keep applying the link extractor to the pages reached through the extracted links
        # follow=False: only extract the links matched on the current page
    )

    def parse_item(self, response):
        print(response)
```
② Crawling 陽光熱線 (Sunshine Hotline): paginated list pages plus detail pages, with persistent storage
# In the spider file:
```python
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunLinePro.items import SunlineproItem, ContentItem


class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    link = LinkExtractor(allow=r'type=4&page=\d+')               # extract pagination links
    link1 = LinkExtractor(allow=r'question/2019\d+/\d+\.shtml')  # extract detail-page links

    rules = (
        Rule(link, callback='parse_item', follow=False),
        Rule(link1, callback='parse_detail'),
    )

    # parse the title and the poster's name
    def parse_item(self, response):
        tr_list = response.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            title = tr.xpath('./td[2]/a[2]/text()').extract_first()
            netFriend = tr.xpath('./td[4]/text()').extract_first()

            item = SunlineproItem()
            item['title'] = title
            item['netFriend'] = netFriend
            yield item

    # parse the body text of the post
    def parse_detail(self, response):
        content = response.xpath('/html/body/div[9]/table[2]//tr[1]/td/div[2]//text()').extract()
        content = ''.join(content)

        item = ContentItem()
        item['content'] = content
        yield item
```
# In items.py:
```python
import scrapy


class SunlineproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    netFriend = scrapy.Field()


class ContentItem(scrapy.Item):
    # define the fields for your item here like:
    content = scrapy.Field()
```
# In the pipeline file:
```python
class SunlineproPipeline(object):
    def process_item(self, item, spider):
        # which item class did we receive?
        if item.__class__.__name__ == 'SunlineproItem':
            print(item['title'], item['netFriend'])
        else:
            print(item['content'])
        return item
```
- Concept: distributed crawling means running a group of crawler programs on multiple machines (a distributed cluster) so that they crawl the data jointly.
- Can the native scrapy framework implement distributed crawling by itself?
    - No:
        - the scheduler cannot be shared across the machines of a cluster
        - the pipeline cannot be shared across the machines of a cluster
- The scrapy-redis module is used to help scrapy implement distributed crawling
    - what scrapy-redis provides:
        - a pipeline and a scheduler that can be shared across the cluster
- Install the module: pip install scrapy-redis
- Implementation steps for distributed crawling:
    - import: from scrapy_redis.spiders import RedisCrawlSpider
- Modify the spider file:
    - change the spider's parent class to RedisCrawlSpider
    - delete start_urls
    - add a new attribute redis_key = 'ts': the name of the queue in the shared scheduler
    - configure the pipeline:
        ITEM_PIPELINES = {
            'scrapy_redis.pipelines.RedisPipeline': 400,
            # 'scrapyRedisPro.pipelines.ScrapyredisproPipeline': 300,
        }
- Configure the scheduler (in settings.py):
    # A dedup container class that stores request fingerprints in a Redis set, making request deduplication persistent
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
    # Use the scheduler shipped with scrapy-redis
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    # Whether the scheduler state is persistent, i.e. whether the request queue and the dedup fingerprint set in Redis
    # are kept when the crawl ends: True means keep (do not clear) them, False means clear them
    SCHEDULER_PERSIST = True
- Point Scrapy at the redis server:
    REDIS_HOST = '192.168.12.154'
    REDIS_PORT = 6379
- Configure redis itself (edit the Redis config file redis.windows.conf):
    - comment out the bind line: #bind 127.0.0.1
    - protected-mode no
- Start the redis server with that config file:
    - redis-server ./redis.windows.conf
- Start a redis client (redis-cli)
    - Run the project: scrapy runspider xxx.py (run the spider's .py file directly)
    - Manually push the start url into the shared scheduler's queue (in redis-cli): lpush ts www.xxx.com
    - The scraped items are written back to redis by the shared pipeline (by default under the xxx:items key, where xxx is the spider name) and can be inspected with redis-cli
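The seeding and inspection steps can also be done from Python with redis-py instead of typing redis-cli commands by hand. A minimal sketch, assuming the redis host from the settings below and the queue name 'ts' / spider name 'test' used in the example project (adjust them to your own setup):

```python
from redis import Redis

conn = Redis(host='192.168.12.154', port=6379)

# push the start url into the shared scheduler queue (same as `lpush ts <url>` in redis-cli)
conn.lpush('ts', 'http://wz.sun0769.com/index.php/question/questionType?type=4&page=')

# the shared RedisPipeline stores scraped items under "<spider_name>:items";
# read back a few of them once the cluster has been running for a while
for raw in conn.lrange('test:items', 0, 9):
    print(raw)
```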
# In the spider file:
```python
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapyRedisPro.items import ScrapyredisproItem
from scrapy_redis.spiders import RedisCrawlSpider
from scrapy_redis.spiders import RedisSpider


class TestSpider(RedisCrawlSpider):
    name = 'test'
    # allowed_domains = ['www.xxx.com']
    # start_urls = ['http://www.xxx.com/']
    redis_key = 'ts'  # name of the queue in the shared scheduler

    rules = (
        Rule(LinkExtractor(allow=r'type=4&page=\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        tr_list = response.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            title = tr.xpath('./td[2]/a[2]/text()').extract_first()
            netFriend = tr.xpath('./td[4]/text()').extract_first()

            item = ScrapyredisproItem()
            item['title'] = title
            item['net'] = netFriend
            yield item  # items must be submitted to the shared pipeline
```
# In items.py:
```python
import scrapy


class ScrapyredisproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    net = scrapy.Field()
```
# In settings.py:
```python
# -*- coding: utf-8 -*-

# Scrapy settings for scrapyRedisPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapyRedisPro'

SPIDER_MODULES = ['scrapyRedisPro.spiders']
NEWSPIDER_MODULE = 'scrapyRedisPro.spiders'

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapyRedisPro (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scrapyRedisPro.middlewares.ScrapyredisproSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'scrapyRedisPro.middlewares.ScrapyredisproDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400,
    # 'scrapyRedisPro.pipelines.ScrapyredisproPipeline': 300,
}

# A dedup container class that stores request fingerprints in a Redis set, making request deduplication persistent
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scheduler shipped with scrapy-redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether the scheduler state is persistent, i.e. whether the request queue and the dedup fingerprint set in Redis
# are kept when the crawl ends: True means keep (do not clear) them, False means clear them
SCHEDULER_PERSIST = True

REDIS_HOST = '192.168.12.154'
REDIS_PORT = 6379

# Concurrency can be raised on a fast connection (makes little difference here)
CONCURRENT_REQUESTS = 2
```
Concept of incremental crawling: the crawler monitors a website for data updates so that only the newly published data is crawled.
How to make the crawl incremental:
Before sending a request, check whether the URL has already been crawled.
After parsing the content, check whether this piece of content has already been crawled.
When writing to the storage medium, check whether the content already exists in the medium.
Analysis:
It is not hard to see that the core of incremental crawling is deduplication; at which step the dedup check is applied is a matter of trade-offs. In my view, one of the first two approaches should be chosen according to the actual situation (possibly both). The first approach suits sites where new pages keep appearing, such as new chapters of a novel or each day's latest news; the second suits sites whose existing page content gets updated. The third approach is essentially the last line of defense. Combining them achieves deduplication to the greatest possible extent.
Deduplication methods:
Store the URLs produced during the crawl in a redis set. On the next crawl, before sending a request, check whether its URL is already in the stored set: if it is, skip the request; otherwise send it.
Define a unique identifier (fingerprint) for the crawled page content and store that identifier in a redis set. On the next crawl, before persisting a record, check whether its identifier already exists in the redis set and use that to decide whether to persist it.
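Both methods boil down to a couple of small helpers around a redis set. A minimal sketch (the key names 'crawled_urls' and 'record_fingerprints' and the local Redis connection are placeholders, not part of the examples below):

```python
import hashlib

from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)


def is_new_url(url):
    """True if the url has not been seen before (sadd returns 1 on first insert)."""
    return conn.sadd('crawled_urls', url) == 1


def is_new_record(record_text):
    """True if the record's sha256 fingerprint has not been seen before."""
    fingerprint = hashlib.sha256(record_text.encode()).hexdigest()
    return conn.sadd('record_fingerprints', fingerprint) == 1
```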
- Concept: monitor a website for data updates.
1. Deduplicating URLs
    - use redis sets (sadd)
2. Deduplicating data
    - use a data fingerprint
# In the spider file:
```python
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from moviePro.items import MovieproItem


class MovieSpider(CrawlSpider):
    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567tv.tv/frim/index1.html']

    link = LinkExtractor(allow=r'/frim/index1-\d+.html')
    rules = (
        Rule(link, callback='parse_item', follow=False),
    )

    conn = Redis(host='127.0.0.1', port=6379)

    # parse the movie title and the detail-page url
    def parse_item(self, response):
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            title = li.xpath('./div/a/@title').extract_first()
            detail_url = 'https://www.4567tv.tv' + li.xpath('./div/a/@href').extract_first()

            item = MovieproItem()
            item['title'] = title

            # decide whether a request should be sent for this detail page
            ex = self.conn.sadd('movie_detail_urls', detail_url)
            if ex == 1:  # detail_url was not yet in the redis set
                print('New data found, crawling......')
                yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})
            else:
                print('No new data to crawl!!!')

    def parse_detail(self, response):
        item = response.meta['item']
        desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        item['desc'] = desc
        yield item
```
# In items.py:
```python
import scrapy


class MovieproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    desc = scrapy.Field()
```
# In the pipeline file:
```python
class MovieproPipeline(object):
    def process_item(self, item, spider):
        dic = {
            'title': item['title'],
            'desc': item['desc']
        }
        conn = spider.conn
        # store the record; redis-py 3.x cannot store a dict directly, so serialize it first
        conn.lpush('movie_data', str(dic))
        return item
```
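To check what the pipeline has stored, the list can be read back with the same connection. A minimal sketch, assuming a local Redis and the 'movie_data' key used above (str(dic) round-trips as a plain string; json.dumps/json.loads would be the more robust choice for structured retrieval):

```python
from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)

# the pipeline pushes records onto the 'movie_data' list; read them newest-first
for raw in conn.lrange('movie_data', 0, -1):
    print(raw.decode('utf-8'))
```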
Use hashlib.sha256 to generate a unique data fingerprint.
Store the fingerprint in a redis set (sadd).
```python
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from incrementByDataPro.items import IncrementbydataproItem
from redis import Redis
import hashlib


class QiubaiSpider(CrawlSpider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    rules = (
        Rule(LinkExtractor(allow=r'/text/page/\d+/'), callback='parse_item', follow=True),
        Rule(LinkExtractor(allow=r'/text/$'), callback='parse_item', follow=True),
    )

    # create the redis connection object
    conn = Redis(host='127.0.0.1', port=6379)

    def parse_item(self, response):
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            item = IncrementbydataproItem()
            item['author'] = div.xpath('./div[1]/a[2]/h2/text() | ./div[1]/span[2]/h2/text()').extract_first()
            item['content'] = div.xpath('.//div[@class="content"]/span/text()').extract_first()

            # build a unique identifier from the parsed data for storage in redis
            source = item['author'] + item['content']
            source_id = hashlib.sha256(source.encode()).hexdigest()

            # store the identifier in the redis set 'data_id'
            ex = self.conn.sadd('data_id', source_id)
            if ex == 1:
                print('This record has not been crawled yet, crawling......')
                yield item
            else:
                print('This record has already been crawled, no need to crawl it again!!!')
```
# In the pipeline file:
```python
from redis import Redis


class IncrementbydataproPipeline(object):
    conn = None

    def open_spider(self, spider):
        self.conn = Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        dic = {
            'author': item['author'],
            'content': item['content']
        }
        # print(dic)
        # store the record; redis-py 3.x cannot store a dict directly, so serialize it first
        self.conn.lpush('qiubaiData', str(dic))
        return item
```