from_crawler(crawler, *args, **kwargs)
: This method is executed before __init__ and is typically used to pull configuration (for example database settings) out of the Scrapy settings. Example code:

    @classmethod
    def from_crawler(cls, crawler):
        # each of these names is whatever you defined in settings.py
        HOST = crawler.settings.get('HOST')
        PORT = crawler.settings.get('PORT')
        USER = crawler.settings.get('USER')
        PWD = crawler.settings.get('PWD')
        DB = crawler.settings.get('DB')
        TABLE = crawler.settings.get('TABLE')
        return cls(HOST, PORT, USER, PWD, DB, TABLE)

    def __init__(self, HOST, PORT, USER, PWD, DB, TABLE):
        self.HOST = HOST
        self.PORT = PORT
        self.USER = USER
        self.PWD = PWD
        self.DB = DB
        self.TABLE = TABLE

One look at the code and it is clear.
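For context, here is a minimal sketch of the matching settings.py entries; the names HOST, PORT, USER, PWD, DB and TABLE are simply the ones assumed by the example above, not built-in Scrapy settings. Scrapy exposes everything in settings.py through crawler.settings, which is why from_crawler can read them and hand them to __init__.

    # settings.py -- hypothetical values matching the example above
    HOST = '127.0.0.1'
    PORT = 3306
    USER = 'root'
    PWD = 'secret'
    DB = 'scrapy_db'
    TABLE = 'items'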
start_requests(self)
: This method issues the first requests and must return an iterable of Request objects. Scrapy calls it when the spider is opened, and only calls it once. For example:
If you do not write a start_requests method, Scrapy will send requests for both URLs in start_urls:
    import scrapy

    class BaiduSpider(scrapy.Spider):
        name = 'test'
        allowed_domains = ['httpbin.org']  # domains only, not full URLs
        start_urls = ['http://httpbin.org/get', 'http://httpbin.org/get']

        def parse(self, response):
            print('received one response')
If you do write a start_requests method, Scrapy sends the Request objects you specify; they must be produced as an iterator (typically by yielding them), as in the sketch below.
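A minimal sketch of an explicit start_requests, reusing the spider name and URLs from the example above (they are placeholders, nothing special):

    import scrapy

    class BaiduSpider(scrapy.Spider):
        name = 'test'

        def start_requests(self):
            # yield one Request per URL; Scrapy consumes this iterator
            urls = ['http://httpbin.org/get', 'http://httpbin.org/get']
            for url in urls:
                yield scrapy.Request(url=url, callback=self.parse)

        def parse(self, response):
            print('received one response')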
parse(self, response)
: This is the default callback; Scrapy invokes it for each response whose Request did not specify its own callback.
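For illustration, a short parse sketch; the CSS selectors and item fields here are assumptions, not part of the original example:

    def parse(self, response):
        # extract data from the response and emit it as items
        for title in response.css('title::text').getall():
            yield {'title': title}
        # a follow-up request without an explicit callback also goes to parse
        next_page = response.css('a.next::attr(href)').get()
        if next_page:
            yield response.follow(next_page)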
log(self, message, level=logging.DEBUG, **kw)
: Writes a message to the spider's log at the given level.
close(self, reason)
: Called when the spider is closed, letting you run cleanup when crawling finishes.
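A small sketch tying these together, assuming a spider that logs progress and reacts to being closed; in practice subclasses usually implement closed(self, reason), which the default close hook calls. The spider name, URL and messages are illustrative:

    import logging
    import scrapy

    class ExampleSpider(scrapy.Spider):
        name = 'example'
        start_urls = ['http://httpbin.org/get']

        def parse(self, response):
            # log() forwards the message to the spider's logger at the given level
            self.log('got %s' % response.url, level=logging.INFO)
            yield {'status': response.status}

        def closed(self, reason):
            # called once when the spider finishes, e.g. reason == 'finished'
            self.log('spider closed: %s' % reason, level=logging.INFO)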