概念:檢測網站數據更新的狀況,只爬取更新的數據
核心:去重!!!
增量式爬蟲
深度爬取類型的網站中須要對詳情頁的url進行記錄和檢測
記錄:將爬取過的詳情頁的url進行記錄保存
檢測:對某一個詳情頁的url發起請求之前,先要去記錄表中查看該url是否存在;存在的話意味着這個url已經被爬取過了。
代碼示例
spider.py文件
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis

from zjs_moviePro.items import ZjsMovieproItem


class MovieSpider(CrawlSpider):
    """Incremental movie crawler that skips detail pages it has already seen.

    Every detail-page URL is added to the Redis set ``movie_detail_urls``.
    A detail page is requested only when the URL was not in the set yet,
    so repeated runs fetch only newly listed movies.
    """

    name = 'movie'
    # Shared Redis connection; the item pipeline reads it via ``spider.conn``.
    conn = Redis(host='127.0.0.1', port=6379)
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567tv.tv/index.php/vod/show/id/6.html']

    rules = (
        # Pagination links look like /index.php/vod/show/id/6/page/2.html
        Rule(
            LinkExtractor(allow=r'id/6/page/\d+\.html'),
            callback='parse_item',
            follow=False,
        ),
    )

    def parse_item(self, response):
        """Parse one list page and request each detail page not seen before.

        Yields a ``scrapy.Request`` for every new detail URL, carrying a
        partially filled item (``name`` only) in ``meta['item']``.
        """
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            name = li.xpath('./div/div/h4/a/text()').extract_first()
            href = li.xpath('./div/div/h4/a/@href').extract_first()
            if href is None:
                # extract_first() returns None when nothing matches; without
                # this guard the string concatenation below raises TypeError
                # and the rest of the page is lost.
                self.logger.warning(
                    'Skipping list entry without a detail link on %s',
                    response.url,
                )
                continue

            # Keep the URL format unchanged so it keeps matching the entries
            # already stored in the Redis set by earlier runs.
            detail_url = 'https://www.4567tv.tv' + href

            # sadd returns 1 only when detail_url was newly inserted into the set.
            # NOTE(review): the URL is recorded before the detail request is
            # made, so a failed request is not retried on later runs.
            ex = self.conn.sadd('movie_detail_urls', detail_url)
            if ex == 1:
                print('有最新數據可爬......')
                item = ZjsMovieproItem()
                item['name'] = name
                yield scrapy.Request(
                    url=detail_url,
                    callback=self.parse_detail,
                    meta={'item': item},
                )
            else:
                print('該數據已經被爬取過了!')

    def parse_detail(self, response):
        """Fill in ``desc`` from the detail page and yield the finished item."""
        item = response.meta['item']
        desc = response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()'
        ).extract_first()
        item['desc'] = desc
        yield item
item中建立屬性
import scrapy


class ZjsMovieproItem(scrapy.Item):
    """Item holding the two fields collected for each movie."""

    # Title text of the movie.
    name = scrapy.Field()
    # Description text of the movie.
    desc = scrapy.Field()
管道文件中
import json


class ZjsMovieproPipeline(object):
    """Item pipeline that stores every scraped item in a Redis list."""

    def process_item(self, item, spider):
        """Push ``item`` onto the ``movie_data`` list and pass it on.

        Args:
            item: The scraped item (any mapping convertible with ``dict()``).
            spider: The running spider; its ``conn`` attribute is used as
                the Redis connection.

        Returns:
            The same ``item``, unchanged, for any later pipeline stages.
        """
        conn = spider.conn
        # redis-py accepts only bytes, str, int or float values, so the item
        # is serialised to a JSON string instead of being pushed as an object.
        # ensure_ascii=False keeps non-ASCII text readable in Redis.
        payload = json.dumps(dict(item), ensure_ascii=False)
        conn.lpush('movie_data', payload)
        return item