"""CloseSpider is an extension that forces spiders to be closed after certain conditions are met. See documentation in docs/topics/extensions.rst """ class CloseSpider(object): def __init__(self, crawler): self.crawler = crawler self.close_on = { 'timeout': crawler.settings.getfloat('CLOSESPIDER_TIMEOUT'), 'itemcount': crawler.settings.getint('CLOSESPIDER_ITEMCOUNT'), 'pagecount': crawler.settings.getint('CLOSESPIDER_PAGECOUNT'), 'errorcount': crawler.settings.getint('CLOSESPIDER_ERRORCOUNT'), } if not any(self.close_on.values()): raise NotConfigured self.counter = defaultdict(int) if self.close_on.get('errorcount'): crawler.signals.connect(self.error_count, signal=signals.spider_error) if self.close_on.get('pagecount'): crawler.signals.connect(self.page_count, signal=signals.response_received) if self.close_on.get('timeout'): crawler.signals.connect(self.spider_opened, signal=signals.spider_opened) if self.close_on.get('itemcount'): crawler.signals.connect(self.item_scraped, signal=signals.item_scraped) crawler.signals.connect(self.spider_closed, signal=signals.spider_closed) @classmethod def from_crawler(cls, crawler): return cls(crawler) def error_count(self, failure, response, spider): self.counter['errorcount'] += 1 if self.counter['errorcount'] == self.close_on['errorcount']: self.crawler.engine.close_spider(spider, 'closespider_errorcount') def page_count(self, response, request, spider): self.counter['pagecount'] += 1 if self.counter['pagecount'] == self.close_on['pagecount']: self.crawler.engine.close_spider(spider, 'closespider_pagecount') def spider_opened(self, spider): self.task = reactor.callLater(self.close_on['timeout'], \ self.crawler.engine.close_spider, spider, \ reason='closespider_timeout') def item_scraped(self, item, spider): self.counter['itemcount'] += 1 if self.counter['itemcount'] == self.close_on['itemcount']: self.crawler.engine.close_spider(spider, 'closespider_itemcount') def spider_closed(self, spider): task = getattr(self, 'task', False) if task and task.active(): task.cancel()
1 The code above is the Scrapy extension that closes a spider. As the code shows, it implements four stop conditions: timeout, itemcount, pagecount and errorcount. You can configure these four settings in settings.py, and the spider is stopped automatically once the corresponding condition is triggered.
Set the following in settings.py:

CLOSESPIDER_TIMEOUT     # close after the given number of seconds
CLOSESPIDER_ITEMCOUNT   # close after the given number of items have been scraped
CLOSESPIDER_PAGECOUNT   # close after the given number of responses have been received
CLOSESPIDER_ERRORCOUNT  # close after the given number of errors have occurred
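For example, a minimal settings.py sketch (the threshold values below are arbitrary examples; any setting left at 0 is simply ignored by the extension):

CLOSESPIDER_TIMEOUT = 3600     # stop after one hour
CLOSESPIDER_ITEMCOUNT = 1000   # stop after 1000 items have been scraped
CLOSESPIDER_PAGECOUNT = 0      # 0 disables this condition
CLOSESPIDER_ERRORCOUNT = 10    # stop after 10 errors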
# enable the CloseSpider extension via EXTENSIONS
EXTENSIONS = {
'scrapy.extensions.closespider.CloseSpider': 500,
}
2 The CloseSpider class shows that the crawl is stopped with the self.crawler.engine.close_spider() method, so when our own condition is met we can call this method ourselves to stop Scrapy.
# inside a Spider
self.crawler.engine.close_spider(self, 'close reason')
# inside a middleware
spider.crawler.engine.close_spider(spider, 'close reason')
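As a rough sketch of the spider case (the spider name, URL and the 50-link threshold below are made up for illustration):

import scrapy

class DemoSpider(scrapy.Spider):
    name = 'demo'
    start_urls = ['http://example.com']
    scraped = 0   # simple counter for our custom stop condition

    def parse(self, response):
        for href in response.css('a::attr(href)').extract():
            if self.scraped >= 50:
                # same call the CloseSpider extension uses internally;
                # the second argument becomes the close reason
                self.crawler.engine.close_spider(self, 'reached 50 links')
                return
            self.scraped += 1
            yield {'url': response.urljoin(href)}

The reason string you pass ends up as finish_reason in the crawl stats, just like the built-in closespider_* reasons.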