Scrapy is a fast, high-level screen-scraping and web-crawling framework written in Python, used to crawl web sites and extract structured data from their pages. Scrapy is general-purpose and can be used for data mining, monitoring, and automated testing.
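For context, a minimal spider might look like the sketch below; the site and CSS selectors are illustrative only, not part of the original notes.

```python
import scrapy

class QuotesSpider(scrapy.Spider):
    # Hypothetical example spider; quotes.toscrape.com and the selectors are assumptions.
    name = "quotes"
    start_urls = ["http://quotes.toscrape.com/"]

    def parse(self, response):
        # Extract structured data from each quote block on the page.
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
            }
        # Follow the pagination link, if any.
        next_page = response.css("li.next a::attr(href)").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
```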
```python
request: scrapy.http.request.Request

# HtmlResponse inherits from TextResponse, which inherits from Response
response: scrapy.http.response.html.HtmlResponse
response: scrapy.http.response.text.TextResponse
response: scrapy.http.response.Response
```
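The hierarchy can be checked directly; a small sketch (the assertions are illustrative):

```python
from scrapy.http import HtmlResponse, TextResponse, Response

# HtmlResponse -> TextResponse -> Response
assert issubclass(HtmlResponse, TextResponse)
assert issubclass(TextResponse, Response)
```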
```python
# Print every setting; expand nested BaseSettings objects one level deep.
for k in self.settings:
    print(k, self.settings.get(k))
    if isinstance(self.settings.get(k), scrapy.settings.BaseSettings):
        for kk in self.settings.get(k):
            print('\t', kk, self.settings.get(k).get(kk))
```
(How to get the number of requests in queue in scrapy?)
```python
# scrapy.core.scheduler.Scheduler

# inside the spider
len(self.crawler.engine.slot.scheduler)

# inside a pipeline
len(spider.crawler.engine.slot.scheduler)
```
```python
# scrapy.core.engine.Slot.inprogress is just a set

# inside the spider
len(self.crawler.engine.slot.inprogress)

# inside a pipeline
len(spider.crawler.engine.slot.inprogress)
```
(How to get the pipeline object in Scrapy spider)
```python
# pipeline
import pymongo

class MongoDBPipeline(object):
    def __init__(self, mongodb_db=None, mongodb_collection=None):
        # `settings` here is the old-style global settings object
        self.connection = pymongo.Connection(settings['MONGODB_SERVER'],
                                             settings['MONGODB_PORT'])

    def get_date(self):
        pass

    def open_spider(self, spider):
        # hand the pipeline instance to the spider
        spider.myPipeline = self

    def process_item(self, item, spider):
        pass


# spider
class MySpider(Spider):
    def __init__(self):
        self.myPipeline = None

    def start_requests(self):
        # the spider can store data directly through the pipeline
        self.myPipeline.process_item(item, self)

    def parse(self, response):
        self.myPipeline.get_date()
```
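The snippet above reads connection parameters from a module-level `settings` object and uses the long-deprecated `pymongo.Connection`. In current Scrapy the usual pattern is to build the pipeline in `from_crawler`; a minimal sketch, where the setting names mirror the ones above and the defaults are assumptions:

```python
import pymongo

class MongoDBPipeline:
    def __init__(self, server, port):
        # MongoClient replaces the removed pymongo.Connection.
        self.connection = pymongo.MongoClient(server, port)

    @classmethod
    def from_crawler(cls, crawler):
        # Pull connection parameters from the project settings.
        return cls(
            server=crawler.settings.get("MONGODB_SERVER", "localhost"),
            port=crawler.settings.getint("MONGODB_PORT", 27017),
        )
```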
(Multiple cookie sessions per spider)
```python
# Scrapy supports keeping multiple cookie sessions per spider
# through the `cookiejar` Request meta key.
# By default it uses a single cookie jar (session), but you can
# pass an identifier to use several of them.
for i, url in enumerate(urls):
    yield scrapy.Request("http://www.example.com",
                         meta={'cookiejar': i},
                         callback=self.parse_page)

# Note that the cookiejar meta key is not "sticky":
# you need to keep passing it along in subsequent requests.
def parse_page(self, response):
    # do some processing
    return scrapy.Request("http://www.example.com/otherpage",
                          meta={'cookiejar': response.meta['cookiejar']},
                          callback=self.parse_other_page)
```
Closing spider (finished)
```python
# scrapy.core.engine.ExecutionEngine
def spider_is_idle(self, spider):
    if not self.scraper.slot.is_idle():
        # scraper is not idle
        return False

    if self.downloader.active:
        # downloader has pending requests
        return False

    if self.slot.start_requests is not None:
        # not all start requests are handled
        return False

    if self.slot.scheduler.has_pending_requests():
        # scheduler has pending requests
        return False

    return True


# print the individual conditions from inside the spider
self.logger.debug('engine.scraper.slot.is_idle: %s' % repr(self.crawler.engine.scraper.slot.is_idle()))
self.logger.debug('\tengine.scraper.slot.active: %s' % repr(self.crawler.engine.scraper.slot.active))
self.logger.debug('\tengine.scraper.slot.queue: %s' % repr(self.crawler.engine.scraper.slot.queue))
self.logger.debug('engine.downloader.active: %s' % repr(self.crawler.engine.downloader.active))
self.logger.debug('engine.slot.start_requests: %s' % repr(self.crawler.engine.slot.start_requests))
self.logger.debug('engine.slot.scheduler.has_pending_requests: %s' % repr(self.crawler.engine.slot.scheduler.has_pending_requests()))
```
(Scrapy: How to manually insert a request from a spider_idle event callback?)
```python
class FooSpider(BaseSpider):
    yet = False

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        from_crawler = super(FooSpider, cls).from_crawler
        spider = from_crawler(crawler, *args, **kwargs)
        # call self.idle whenever the engine reports the spider as idle
        crawler.signals.connect(spider.idle, signal=scrapy.signals.spider_idle)
        return spider

    def idle(self):
        if not self.yet:
            # create_request() is the user-defined request factory
            self.crawler.engine.crawl(self.create_request(), self)
            self.yet = True
```
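One point worth noting, not shown in the snippet above: if the idle handler should keep the spider alive after scheduling a request, the usual pattern is to raise `DontCloseSpider`, otherwise the engine may still close the spider in the same idle cycle. A sketch under those assumptions (the `engine.crawl` signature mirrors the snippet above and may differ between Scrapy versions):

```python
from scrapy.exceptions import DontCloseSpider

def idle(self):
    if not self.yet:
        self.crawler.engine.crawl(self.create_request(), self)
        self.yet = True
        # Tell the engine not to close the spider during this idle check;
        # the newly scheduled request keeps it running.
        raise DontCloseSpider
```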
Default: False
|       | non-200 response | timeout |
|-------|------------------|---------|
| True  | callback         | errback |
| False | errback          | errback |
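For completeness, a request that supplies both handlers might look like the sketch below; the spider, URL, and method names are illustrative, and which flag the table above refers to is not stated in this excerpt.

```python
import scrapy

class ErrbackDemoSpider(scrapy.Spider):
    # Hypothetical spider showing where callback vs. errback fire.
    name = "errback_demo"
    start_urls = ["http://httpbin.org/status/404"]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, errback=self.on_error)

    def parse(self, response):
        self.logger.info("callback got status %s", response.status)

    def on_error(self, failure):
        # Fired for timeouts and, depending on the flag above, non-200 responses.
        self.logger.error("errback got failure: %s", repr(failure))
```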
walker: the new diagram looks like only a refinement of the old one, with no substantive difference.
This article is from walker snapshot.