Scrapy's core code all lives under the scrapy/core folder of the scrapy library
(the downloader supports several types of downloads); the spider, pipelines, and middlewares are the parts you write yourself.
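To make those roles concrete, here is a minimal sketch of a user-written spider; the spider name, site, and selectors are illustrative placeholders, not part of the Scrapy source:

import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # items yielded here flow through the user-written pipelines
        for quote in response.css('div.quote'):
            yield {'text': quote.css('span.text::text').get()}
        # new Requests go back to the Engine, and from there to the Scheduler
        next_page = response.css('li.next a::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)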
...
...
# Steps 1-2 of the execution flow: the Engine receives a request and hands it to the Scheduler
def schedule(self, request, spider):
    self.signals.send_catch_log(signal=signals.request_scheduled,
                                request=request, spider=spider)
    # Call the scheduler's enqueue_request method to put the request into the Scheduler;
    # if it is rejected (e.g. filtered as a duplicate), fire request_dropped instead
    if not self.slot.scheduler.enqueue_request(request):
        self.signals.send_catch_log(signal=signals.request_dropped,
                                    request=request, spider=spider)
...
...
# Step 3 of the execution flow: the Engine pulls the next request from the Scheduler
def _next_request_from_scheduler(self, spider):
    slot = self.slot
    request = slot.scheduler.next_request()
    # When the crawler first starts, this _next_request_from_scheduler method runs first,
    # but the Scheduler has no requests yet, so the Engine falls back to
    # reading start_urls from the Spider
    if not request:
        return
    d = self._download(request, spider)
    d.addBoth(self._handle_downloader_output, request, spider)
    d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    d.addBoth(lambda _: slot.remove_request(request))
    d.addErrback(lambda f: logger.info('Error while removing request from slot',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    d.addBoth(lambda _: slot.nextcall.schedule())
    d.addErrback(lambda f: logger.info('Error while scheduling new request',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    return d
...
...
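The request_scheduled and request_dropped signals sent by schedule() above can be observed from user code. Here is a small sketch of an extension that listens for them (the class name is hypothetical; it would be enabled via the EXTENSIONS setting):

from scrapy import signals

class SchedulingLogger:
    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        crawler.signals.connect(ext.on_scheduled, signal=signals.request_scheduled)
        crawler.signals.connect(ext.on_dropped, signal=signals.request_dropped)
        return ext

    def on_scheduled(self, request, spider):
        spider.logger.debug('scheduled: %s', request.url)

    def on_dropped(self, request, spider):
        # enqueue_request returned False, e.g. the request was filtered as a duplicate
        spider.logger.debug('dropped: %s', request.url)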
class Request(object_ref):
    # url: the URL to request
    # callback: callback function invoked with the response
    # method: HTTP method of the request
    # headers: request headers
    # body: request body
    # cookies: browser cookies; after an automated login, Scrapy adds the cookies
    #          to subsequent requests automatically. This is done by the built-in
    #          middleware scrapy.downloadermiddlewares.cookies.CookiesMiddleware
    # meta: metadata that can be passed along with the Request
    # encoding: page encoding, UTF-8 by default
    # priority: scheduling priority inside the Scheduler
    # dont_filter: whether to skip de-duplication of identical requests
    # errback: callback function invoked on failure
    def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                 cookies=None, meta=None, encoding='utf-8', priority=0,
                 dont_filter=False, errback=None, flags=None):
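A short sketch exercising the parameters documented above (the URL and callback names are illustrative):

import scrapy

class DemoSpider(scrapy.Spider):
    name = 'demo'

    def start_requests(self):
        yield scrapy.Request(
            url='http://example.com/items',   # illustrative URL
            callback=self.parse_item,
            method='GET',
            headers={'User-Agent': 'demo-bot'},
            meta={'page': 1},     # carried over to the matching Response
            priority=10,          # scheduled ahead of priority-0 requests
            dont_filter=True,     # skip the duplicate filter for this request
            errback=self.on_error,
        )

    def parse_item(self, response):
        # meta set on the Request is readable from the Response
        self.logger.info('page=%s', response.meta['page'])

    def on_error(self, failure):
        self.logger.error('request failed: %r', failure)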
class Response(object_ref):
    # url: the URL of the page
    # status: HTTP status code returned, 200 (success) by default
    # headers: response headers returned by the server
    # body: the response body
    # request: the Request yielded earlier that this response corresponds to
    def __init__(self, url, status=200, headers=None, body=b'', flags=None, request=None):
        self.headers = Headers(headers or {})
        self.status = int(status)
        self._set_body(body)
        self._set_url(url)
        self.request = request
        self.flags = [] if flags is None else list(flags)
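Inside a callback, the fields above are read straight off the response object; a brief sketch:

def parse(self, response):
    self.logger.info('status=%s url=%s', response.status, response.url)
    # headers is a Headers object; values come back as bytes
    content_type = response.headers.get('Content-Type')
    # response.request is the Request that produced this response
    self.logger.info('requested with method %s', response.request.method)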
Its subclasses include HtmlResponse, TextResponse, and XmlResponse:
from scrapy.http.response.text import TextResponse
class HtmlResponse(TextResponse):
    pass
class TextResponse(Response):
...
...
    # Response already wires in a Selector to back the xpath and css method calls
    @property
    def selector(self):
        from scrapy.selector import Selector
        if self._cached_selector is None:
            self._cached_selector = Selector(self)
        return self._cached_selector

    # XPath selector shortcut
    def xpath(self, query, **kwargs):
        return self.selector.xpath(query, **kwargs)

    # CSS selector shortcut
    def css(self, query):
        return self.selector.css(query)
...
...
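Because Response wires in the Selector, xpath() and css() can be called directly on a response. A small standalone sketch with illustrative HTML:

from scrapy.http import HtmlResponse

html = b'<html><body><h1 class="title">Hello</h1></body></html>'
response = HtmlResponse(url='http://example.com', body=html, encoding='utf-8')

# both calls delegate to the same cached Selector
print(response.xpath('//h1/text()').get())   # 'Hello'
print(response.css('h1.title::text').get())  # 'Hello'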