增量式爬蟲經過爬蟲程序監測某網站數據更新的狀況,以即可以爬取到該網站更新出的新數據。增量式爬蟲核心在於跳過以前已經爬去過的數據,也就是實現請求去重!
去重方法
增量式爬蟲案例
4567電影網電影及描述信息(http://www.4567kan.com/index.php/vod/show/id/5.html)
1.建立爬蟲項目(基於CrawlSpider類)
scrapy startproject zls_moviePro
cd zls_moviePro
scrapy genspider -t crawl zls_movieTest www.xxx.com
2.編寫爬蟲文件zls_movieTest.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from zls_moviePro.items import ZlsMovieproItem


# Incremental crawler: scrape movie names and descriptions from 4567kan,
# using a Redis set of detail-page URLs to skip entries crawled on earlier runs.
class ZlsMovietestSpider(CrawlSpider):
    """CrawlSpider that follows pagination links and deduplicates detail
    requests against the Redis set ``movie_detail_urls``."""

    # Redis connection; also reused by the pipeline via ``spider.conn``.
    conn = Redis(host='127.0.0.1', port=6379)
    name = 'zls_movieTest'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.4567kan.com/index.php/vod/show/id/5.html']

    rules = (
        # Escaped dot before "html": the original pattern r'\d+.html' let the
        # "." match ANY character, accepting URLs such as ".../page/2Xhtml".
        Rule(LinkExtractor(allow=r'index\.php/vod/show/id/5/page/\d+\.html'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Parse one listing page; queue a detail request only for URLs not
        already present in the Redis set (incremental crawling)."""
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            item = ZlsMovieproItem()
            item['name'] = li.xpath('.//div[@class="stui-vodlist__detail"]/h4/a/text()').extract_first()
            url = 'http://www.4567kan.com' + li.xpath('.//div[@class="stui-vodlist__detail"]/h4/a/@href').extract_first()
            # sadd returns 1 when the URL is newly added (not crawled before), 0 otherwise.
            ex = self.conn.sadd('movie_detail_urls', url)
            if ex == 1:
                print('數據有更新,正在下載...')
                yield scrapy.Request(url=url, callback=self.parse_detail, meta={'item': item})
            else:
                print('暫無數據更新!')

    def parse_detail(self, response):
        """Parse a detail page for the movie description and emit the item."""
        movie_desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        item = response.meta['item']
        item['desc'] = movie_desc
        yield item
3.items.py字段屬性定義
import scrapy


class ZlsMovieproItem(scrapy.Item):
    """Item carrying the two fields scraped for one movie."""
    # define the fields for your item here like:
    name = scrapy.Field()  # movie title from the listing page
    desc = scrapy.Field()  # movie description from the detail page
    # (redundant trailing `pass` removed: the class body already has statements)
4.pipelines.py管道配置
class ZlsMovieproPipeline(object):
    """Persist each scraped item into the Redis list ``movie_data``.

    Reuses the Redis connection created on the spider class (``spider.conn``),
    so no separate connection setup is needed here.
    """

    def process_item(self, item, spider):
        # redis-py cannot serialize a dict/Item value directly — the original
        # ``lpush('movie_data', item)`` raises redis.exceptions.DataError.
        # Serialize to JSON first; ensure_ascii=False keeps Chinese text readable.
        import json
        spider.conn.lpush('movie_data', json.dumps(dict(item), ensure_ascii=False))
        return item  # hand the item on to any later pipeline stage
5.settings.py配置文件
1 #UA假裝 2 USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36" 3 #robots協議 4 ROBOTSTXT_OBEY = False 5 #日誌輸出等級 6 LOG_LEVEL="ERROR" 7 8 #開啓管道 9 ITEM_PIPELINES = { 10 'zls_moviePro.pipelines.ZlsMovieproPipeline': 300, 11 }
6.啓動爬蟲項目
scrapy crawl zls_movieTest