# Source location: scrapy.core.engine.ExecutionEngine
class ExecutionEngine(object):

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                                    request=request, spider=spider)
        # Calls the scheduler's enqueue_request method to decide whether the
        # request gets enqueued; see step 2 below
        if not self.slot.scheduler.enqueue_request(request):
            self.signals.send_catch_log(signal=signals.request_dropped,
                                        request=request, spider=spider)
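As the engine code shows, a request rejected by the scheduler triggers the request_dropped signal, so you can hook that signal to watch which requests the dupefilter discarded. Below is a minimal sketch, assuming a hypothetical spider name and example.com URLs:

import scrapy
from scrapy import signals


class DedupeDemoSpider(scrapy.Spider):
    # Hypothetical spider used only to illustrate the request_dropped signal
    name = "dedupe_demo"
    start_urls = ["https://example.com/"]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # request_dropped fires when the scheduler refuses a request,
        # e.g. because the dupefilter has already seen its fingerprint
        crawler.signals.connect(spider.on_request_dropped,
                                signal=signals.request_dropped)
        return spider

    def parse(self, response):
        # Yield the same URL twice: the second copy should be dropped
        yield scrapy.Request("https://example.com/page", callback=self.parse_page)
        yield scrapy.Request("https://example.com/page", callback=self.parse_page)

    def parse_page(self, response):
        pass

    def on_request_dropped(self, request, spider):
        spider.logger.info("Dropped by scheduler: %s", request.url)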
# Source location: scrapy.core.scheduler.Scheduler
class Scheduler(object):

    def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None,
                 logunser=False, stats=None, pqclass=None):
        # dupefilter is the concrete duplicate filter; see step 3
        self.df = dupefilter
        self.dqdir = self._dqdir(jobdir)
        self.pqclass = pqclass
        self.dqclass = dqclass
        self.mqclass = mqclass
        self.logunser = logunser
        self.stats = stats

    ...

    def enqueue_request(self, request):
        # self.df.request_seen performs the actual dedup check.
        # If the request does not have dont_filter set (i.e. it should be
        # filtered) and the dupefilter reports it has already been seen,
        # log it and return False so the request is dropped.
        if not request.dont_filter and self.df.request_seen(request):
            self.df.log(request, self.spider)
            return False
        if self.stats:
            self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider)
        self.queue.push(request)
        return True
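Note that enqueue_request only consults the dupefilter when the request's dont_filter flag is False; setting dont_filter=True makes the request bypass request_seen entirely. A quick illustration, assuming example.com URLs:

import scrapy

# dont_filter=False (the default): the scheduler asks the dupefilter whether
# this request was already seen, and may drop it
normal_req = scrapy.Request("https://example.com/item/1")

# dont_filter=True: enqueue_request skips the request_seen check, so the same
# URL can be scheduled again (useful for retries or pages that change)
forced_req = scrapy.Request("https://example.com/item/1", dont_filter=True)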
# Source location: scrapy/dupefilters.py
# Base dupefilter class; defines the methods a dupefilter must implement
class BaseDupeFilter(object):

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        return False

    def open(self):  # can return deferred
        pass

    def close(self, reason):  # can return a deferred
        pass

    def log(self, request, spider):  # log that a request has been filtered
        pass
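Because the scheduler only talks to the dupefilter through this small interface, you can swap in your own implementation via the DUPEFILTER_CLASS setting. Below is a minimal sketch that deduplicates on the raw URL string only; the module path and class name are assumptions:

# myproject/dupefilters.py (hypothetical module)
from scrapy.dupefilters import BaseDupeFilter


class URLDupeFilter(BaseDupeFilter):
    """Toy dupefilter that deduplicates on the exact URL string."""

    def __init__(self):
        self.seen_urls = set()

    def request_seen(self, request):
        if request.url in self.seen_urls:
            return True
        self.seen_urls.add(request.url)
        return False

# settings.py
# DUPEFILTER_CLASS = "myproject.dupefilters.URLDupeFilter"
# DUPEFILTER_DEBUG = True   # log every filtered request, not just the first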
# The default dupefilter in Scrapy
class RFPDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""

    def __init__(self, path=None, debug=False):
        self.file = None
        # A set of fingerprints; a set guarantees no duplicates
        self.fingerprints = set()
        self.logdupes = True
        self.debug = debug
        self.logger = logging.getLogger(__name__)
        # If JOBDIR is configured, the fingerprint set is persisted to a file
        # under that directory; see 《scrapy進階開發(二):暫停與重啓》 (pausing
        # and resuming crawls) for details
        if path:
            self.file = open(os.path.join(path, 'requests.seen'), 'a+')
            self.file.seek(0)
            self.fingerprints.update(x.rstrip() for x in self.file)

    @classmethod
    def from_settings(cls, settings):
        # Setting DUPEFILTER_DEBUG to True in settings enables debug logging
        # for every filtered request
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(job_dir(settings), debug)

    def request_seen(self, request):
        # Generate a fingerprint for the request
        fp = self.request_fingerprint(request)
        # Check whether the fingerprint is already in the set
        if fp in self.fingerprints:
            # If it is, return True: this request has already been handled
            # and should be filtered out
            return True
        # Otherwise add it to the set
        self.fingerprints.add(fp)
        # If the JOBDIR file exists, append the fingerprint to it
        if self.file:
            self.file.write(fp + os.linesep)

    def request_fingerprint(self, request):
        # request_fingerprint lives in scrapy.utils.request and uses sha1 to
        # generate a fixed-length hash for each request
        return request_fingerprint(request)

    def close(self, reason):
        # Close the fingerprint file, if any
        if self.file:
            self.file.close()

    def log(self, request, spider):
        # Logging helper for filtered requests
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False
        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
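The fingerprint is built from the canonicalized URL (query parameters are sorted, among other normalizations) plus the method and body, then hashed with sha1, so two requests whose URLs differ only in query-string order map to the same fingerprint. A quick check, using made-up example.com URLs:

from scrapy import Request
from scrapy.utils.request import request_fingerprint

r1 = Request("https://example.com/search?q=scrapy&page=2")
r2 = Request("https://example.com/search?page=2&q=scrapy")

fp1 = request_fingerprint(r1)
fp2 = request_fingerprint(r2)

# Both fingerprints are 40-character sha1 hex digests and, because the query
# string is canonicalized before hashing, they are identical
print(fp1)
print(fp1 == fp2)  # True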