1. Installing Scrapy

On Windows:
    1. Download Twisted from http://www.lfd.uci.edu/~gohlke/pythonlibs/
    2. pip3 install wheel
    3. pip3 install Twisted-18.4.0-cp36-cp36m-win_amd64.whl   # pick the wheel matching your Python version
    4. pip3 install pywin32
    5. pip3 install scrapy

On Linux/macOS:
    pip3 install scrapy
2. Basic usage of Scrapy: comparing the Django and Scrapy workflows

Django:

# create a Django project
django-admin startproject mysite
cd mysite
# create apps
python manage.py startapp app01
python manage.py startapp app02
# start the project
python manage.py runserver

Scrapy:

# create a Scrapy project
scrapy startproject cjk
cd cjk
# create spiders
scrapy genspider chouti chouti.com
scrapy genspider cnblogs cnblogs.com
# start a spider
scrapy crawl chouti
Last login: Sat Jan  5 18:14:13 on ttys000
chenjunkandeMBP:~ chenjunkan$ scrapy
Scrapy 1.5.1 - no active project

Usage:
  scrapy <command> [options] [args]

Available commands:
  bench         Run quick benchmark test
  fetch         Fetch a URL using the Scrapy downloader
  genspider     Generate new spider using pre-defined templates
  runspider     Run a self-contained spider (without creating a project)
  settings      Get settings values
  shell         Interactive scraping console
  startproject  Create new project
  version       Print Scrapy version
  view          Open URL in browser, as seen by Scrapy

  [ more ]      More commands available when run from project directory

Use "scrapy <command> -h" to see more info about a command
Create a project:

scrapy startproject <project name>

<project name>
    <project name>/
        - spiders           # spider files
            - chouti.py
            - cnblogs.py
        - items.py          # persistence
        - pipelines.py      # persistence
        - middlewares.py    # middleware
        - settings.py       # configuration (crawling)
    scrapy.cfg              # configuration (deployment)

# -*- coding: utf-8 -*-
import scrapy


class ChoutiSpider(scrapy.Spider):
    # name of the spider
    name = 'chouti'
    # directed crawling: only pages under the dig.chouti.com domain are crawled
    allowed_domains = ['dig.chouti.com']
    # starting URLs
    start_urls = ['http://dig.chouti.com/']

    # callback invoked automatically once the starting URL has been downloaded
    def parse(self, response):
        # <200 https://dig.chouti.com/> <class 'scrapy.http.response.html.HtmlResponse'>
        print(response, type(response))
3. A first look at how Scrapy works (covered in detail later)

With plain requests, the URLs are downloaded one after another, serially:

import requests

url_list = ['http://www.baidu.com', 'http://www.baidu.com', 'http://www.baidu.com']
for item in url_list:
    response = requests.get(item)
    print(response.text)

With Twisted, the requests can be issued concurrently:

from twisted.web.client import getPage, defer
from twisted.internet import reactor


# Part 1: the agent starts accepting tasks
def callback(contents):
    print(contents)


deferred_list = []

# the list of URLs to request
url_list = ['http://www.bing.com', 'https://segmentfault.com/', 'https://stackoverflow.com/']
for url in url_list:
    # the request is not sent right away; a Deferred object is created instead
    deferred = getPage(bytes(url, encoding='utf8'))
    # callback invoked when the request finishes
    deferred.addCallback(callback)
    # collect all the Deferred objects in one list
    deferred_list.append(deferred)

# Part 2: once the agent has finished every task, stop
dlist = defer.DeferredList(deferred_list)


def all_done(arg):
    reactor.stop()


dlist.addBoth(all_done)

# Part 3: let the agent start working
reactor.run()
4. Persistence

The simplest approach is to write the file directly inside parse:

# -*- coding: utf-8 -*-
import scrapy


class ChoutiSpider(scrapy.Spider):
    # name of the spider
    name = 'chouti'
    # directed crawling: only pages under the dig.chouti.com domain are crawled
    allowed_domains = ['dig.chouti.com']
    # starting URLs
    start_urls = ['http://dig.chouti.com/']

    # callback invoked automatically once the starting URL has been downloaded
    def parse(self, response):
        f = open('news.log', mode='a+')
        item_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
        for item in item_list:
            text = item.xpath('.//a/text()').extract_first()
            href = item.xpath('.//a/@href').extract_first()
            print(href, text.strip())
            f.write(href + '\n')
        f.close()
The better way is to let a pipeline do the persistence; register it in settings:

ITEM_PIPELINES = {
    'cjk.pipelines.CjkPipeline': 300,
}

class CjkscrapyPipeline(object):
    def process_item(self, item, spider):
        print("chenjunkan")
        return item
c. First add two fields to items.py; they constrain the item so that only these two fields can be passed in:

import scrapy


class CjkscrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    href = scrapy.Field()
    title = scrapy.Field()

# -*- coding: utf-8 -*-
import scrapy
from cjkscrapy.items import CjkscrapyItem


class ChoutiSpider(scrapy.Spider):
    # name of the spider
    name = 'chouti'
    # directed crawling: only pages under the dig.chouti.com domain are crawled
    allowed_domains = ['dig.chouti.com']
    # starting URLs
    start_urls = ['http://dig.chouti.com/']

    # callback invoked automatically once the starting URL has been downloaded
    def parse(self, response):
        item_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
        for item in item_list:
            text = item.xpath('.//a/text()').extract_first()
            href = item.xpath('.//a/@href').extract_first()
            yield CjkscrapyItem(href=href, title=text)

d. From then on, process_item in the pipeline is triggered automatically: every yield triggers process_item once, and the item argument is the object we created with CjkItem. Why do we need CjkItem at all? Because the Item class lets CjkPipeline constrain which data gets persisted: whatever fields the item defines are the fields we read. And what is the spider argument of process_item(self, item, spider)? It is the instantiated spider: ChoutiSpider(scrapy.Spider) must be instantiated before it can run, so spider is simply the current spider object, carrying attributes such as name and allowed_domains.

e. In short, when a spider yields a CjkItem, the object is handed to the pipelines for processing; when it yields a Request, the request goes back to be downloaded.

Summary:

a. Write the pipeline class

class XXXPipeline(object):
    def process_item(self, item, spider):
        return item

b. Write the Item class

class XdbItem(scrapy.Item):
    href = scrapy.Field()
    title = scrapy.Field()

c. Configure it

ITEM_PIPELINES = {
    'xdb.pipelines.XdbPipeline': 300,
}

d. In the spider, every yield of an Item object triggers process_item once.

    yield an Item object

def process_item(self, item, spider):
    f = open("xx.log", "a+")
    f.write(item["href"] + "\n")
    f.close()
    return item
Opening the file on every item is wasteful, so open it once in open_spider and close it in close_spider:

class CjkscrapyPipeline(object):
    def open_spider(self, spider):
        print("spider started")
        self.f = open("new.log", "a+")

    def process_item(self, item, spider):
        self.f.write(item["href"] + "\n")
        return item

    def close_spider(self, spider):
        self.f.close()
        print("spider finished")

class CjkscrapyPipeline(object):
    def __init__(self):
        self.f = None

    def open_spider(self, spider):
        print("spider started")
        self.f = open("new.log", "a+")

    def process_item(self, item, spider):
        self.f.write(item["href"] + "\n")
        return item

    def close_spider(self, spider):
        self.f.close()
        print("spider finished")

To read the output path from the settings file, add a from_crawler classmethod:

@classmethod
def from_crawler(cls, crawler):
    print('File.from_crawler')
    path = crawler.settings.get('HREF_FILE_PATH')
    return cls(path)

class CjkscrapyPipeline(object):
    def __init__(self, path):
        self.f = None
        self.path = path

    @classmethod
    def from_crawler(cls, crawler):
        print('File.from_crawler')
        path = crawler.settings.get('HREF_FILE_PATH')
        return cls(path)

    def open_spider(self, spider):
        print("spider started")
        self.f = open(self.path, "a+")

    def process_item(self, item, spider):
        self.f.write(item["href"] + "\n")
        return item

    def close_spider(self, spider):
        self.f.close()
        print("spider finished")

"""
What the framework does internally:
    1. Check whether the CjkPipeline class defines from_crawler
        yes: obj = CjkPipeline.from_crawler(....)
        no:  obj = CjkPipeline()
    2. obj.open_spider()
    3. obj.process_item() / obj.process_item() / obj.process_item() / ...  (once per item)
    4. obj.close_spider()
"""

Inside process_item you can also branch on the spider argument, e.g.:

# if spider.name == 'chouti':
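A minimal sketch of that idea (the output file name is made up for illustration):

class CjkscrapyPipeline(object):
    def process_item(self, item, spider):
        # only persist items produced by the 'chouti' spider; other spiders pass through untouched
        if spider.name == 'chouti':
            with open("chouti.log", "a+") as f:   # hypothetical output file
                f.write(item["href"] + "\n")
        return item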
5. Deduplication rules

# -*- coding: utf-8 -*-
import scrapy
from cjk.items import CjkItem


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['dig.chouti.com']
    start_urls = ['http://dig.chouti.com/']

    def parse(self, response):
        print(response.request.url)
        # item_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
        # for item in item_list:
        #     text = item.xpath('.//a/text()').extract_first()
        #     href = item.xpath('.//a/@href').extract_first()

        page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            from scrapy.http import Request
            page = "https://dig.chouti.com" + page
            yield Request(url=page, callback=self.parse)  # https://dig.chouti.com/all/hot/recent/2

class RFPDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""

    def __init__(self, path=None, debug=False):
        self.file = None
        self.fingerprints = set()
        self.logdupes = True
        self.debug = debug
        self.logger = logging.getLogger(__name__)
        if path:
            self.file = open(os.path.join(path, 'requests.seen'), 'a+')
            self.file.seek(0)
            self.fingerprints.update(x.rstrip() for x in self.file)

    @classmethod
    def from_settings(cls, settings):
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(job_dir(settings), debug)

    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        if self.file:
            self.file.write(fp + os.linesep)

    def request_fingerprint(self, request):
        return request_fingerprint(request)

    def close(self, reason):
        if self.file:
            self.file.close()

    def log(self, request, spider):
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False

        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)

def request_seen(self, request):
    fp = self.request_fingerprint(request)
    if fp in self.fingerprints:
        return True
    self.fingerprints.add(fp)
    if self.file:
        self.file.write(fp + os.linesep)

1. from_settings runs first and looks the path up in the settings file; to log the filtered duplicates, set DUPEFILTER_DEBUG to True in settings:

    @classmethod
    def from_settings(cls, settings):
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(job_dir(settings), debug)

2. It returns cls(job_dir(settings), debug), and job_dir is:

    import os

    def job_dir(settings):
        path = settings['JOBDIR']
        if path and not os.path.exists(path):
            os.makedirs(path)
        return path

   The path is taken from the JOBDIR setting; if the directory does not exist yet it is created. What comes back is the path, i.e. the value of JOBDIR in the settings file.
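So to make the fingerprint set persist to disk, something like this could go into settings.py (the directory name is only an illustration):

# settings.py
JOBDIR = 'jobdir/chouti'    # fingerprints end up in jobdir/chouti/requests.seen
DUPEFILTER_DEBUG = True     # log every filtered duplicate request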
Writing the fingerprints to a file is usually of limited value, though; later I will show how to store them in Redis instead.

Custom deduplication rules: the built-in RFPDupeFilter inherits from BaseDupeFilter, so a custom filter can inherit from the same base class.

from scrapy.dupefilter import BaseDupeFilter
from scrapy.utils.request import request_fingerprint


class CjkDupeFilter(BaseDupeFilter):

    def __init__(self):
        self.visited_fd = set()

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        fd = request_fingerprint(request=request)
        if fd in self.visited_fd:
            return True
        self.visited_fd.add(fd)

    def open(self):  # can return deferred
        print('start')

    def close(self, reason):  # can return a deferred
        print('finish')

    # def log(self, request, spider):  # log that a request has been filtered
    #     print('log')

The default deduplication setting is: DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'

Register the custom filter in the settings file instead: DUPEFILTER_CLASS = 'cjkscrapy.dupefilters.CjkDupeFilter'

Note: later we will implement the deduplication rule in Redis.

Note: a spider can also opt out of deduplication per request. Request has a dont_filter parameter: the default False means the request goes through the dedup filter, while dont_filter=True bypasses it.
class Request(object_ref):

    def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                 cookies=None, meta=None, encoding='utf-8', priority=0,
                 dont_filter=False, errback=None, flags=None):
6. Depth control (explained in detail later)

# -*- coding: utf-8 -*-
import scrapy
from scrapy.dupefilter import RFPDupeFilter


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['dig.chouti.com']
    start_urls = ['http://dig.chouti.com/']

    def parse(self, response):
        print(response.request.url, response.meta.get("depth", 0))
        # item_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
        # for item in item_list:
        #     text = item.xpath('.//a/text()').extract_first()
        #     href = item.xpath('.//a/@href').extract_first()

        page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            from scrapy.http import Request
            page = "https://dig.chouti.com" + page
            yield Request(url=page, callback=self.parse, dont_filter=False)  # https://dig.chouti.com/all/hot/recent/2

Depth is how many links deep the spider follows; to cap it, set DEPTH_LIMIT = 3 in the settings file.
7. Handling cookies manually

The parameters of the Request object:

class Request(object_ref):

    def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                 cookies=None, meta=None, encoding='utf-8', priority=0,
                 dont_filter=False, errback=None, flags=None):

For now we only care about url, callback, method, headers, body and cookies: sending a request boils down to request headers + request body + cookies.

Example: logging in to Chouti automatically

The first request to Chouti gives us a cookie. How do we get the cookie out of the response?

First import the cookie-handling class: from scrapy.http.cookies import CookieJar

# -*- coding: utf-8 -*-
import scrapy
from scrapy.dupefilter import RFPDupeFilter
from scrapy.http.cookies import CookieJar


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['dig.chouti.com']
    start_urls = ['http://dig.chouti.com/']

    def parse(self, response):
        cookie_dict = {}
        # take the cookie from the response headers; it is stored on the CookieJar object
        cookie_jar = CookieJar()
        # extract the cookie from response / response.request
        cookie_jar.extract_cookies(response, response.request)

        # unpack the cookie from the object into a dict
        for k, v in cookie_jar._cookies.items():
            for i, j in v.items():
                for m, n in j.items():
                    cookie_dict[m] = n.value
        print(cookie_dict)
Running the code above gives us the cookie:

chenjunkandeMBP:cjkscrapy chenjunkan$ scrapy crawl chouti --nolog
/Users/chenjunkan/Desktop/scrapytest/cjkscrapy/cjkscrapy/spiders/chouti.py:3: ScrapyDeprecationWarning: Module `scrapy.dupefilter` is deprecated, use `scrapy.dupefilters` instead
  from scrapy.dupefilter import RFPDupeFilter
{'gpsd': 'd052f4974404d8c431f3c7c1615694c4', 'JSESSIONID': 'aaaUxbbxMYOWh4T7S7rGw'}

Complete example: log in to Chouti automatically and upvote posts:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.dupefilter import RFPDupeFilter
from scrapy.http.cookies import CookieJar
from scrapy.http import Request


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['dig.chouti.com']
    start_urls = ['http://dig.chouti.com/']
    cookie_dict = {}

    def parse(self, response):

        # take the cookie from the response headers; it is stored on the CookieJar object
        cookie_jar = CookieJar()
        # extract the cookie from response / response.request
        cookie_jar.extract_cookies(response, response.request)

        # unpack the cookie from the object into a dict
        for k, v in cookie_jar._cookies.items():
            for i, j in v.items():
                for m, n in j.items():
                    self.cookie_dict[m] = n.value
        print(self.cookie_dict)

        yield Request(
            url='https://dig.chouti.com/login',
            method='POST',
            body="phone=8618357186730&password=cjk139511&oneMonth=1",
            cookies=self.cookie_dict,
            headers={
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
            },
            callback=self.check_login
        )

    def check_login(self, response):
        print(response.text)
        yield Request(
            url='https://dig.chouti.com/all/hot/recent/1',
            cookies=self.cookie_dict,
            callback=self.index
        )

    def index(self, response):
        news_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
        for new in news_list:
            link_id = new.xpath('.//div[@class="part2"]/@share-linkid').extract_first()
            yield Request(
                url='http://dig.chouti.com/link/vote?linksId=%s' % (link_id,),
                method='POST',
                cookies=self.cookie_dict,
                callback=self.check_result
            )
        # upvote on every page
        page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            page = "https://dig.chouti.com" + page
            yield Request(url=page, callback=self.index)  # https://dig.chouti.com/all/hot/recent/2

    def check_result(self, response):
        print(response.text)
Output:

chenjunkandeMBP:cjkscrapy chenjunkan$ scrapy crawl chouti --nolog
/Users/chenjunkan/Desktop/scrapytest/cjkscrapy/cjkscrapy/spiders/chouti.py:3: ScrapyDeprecationWarning: Module `scrapy.dupefilter` is deprecated, use `scrapy.dupefilters` instead
  from scrapy.dupefilter import RFPDupeFilter
{'gpsd': '78613f08c985435d5d0eedc08b0ed812', 'JSESSIONID': 'aaaTmxnFAJJGn9Muf8rGw'}
{"result":{"code":"9999", "message":"", "data":{"complateReg":"0","destJid":"cdu_53587312848"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692112533000","lvCount":"6","nick":"chenjunkan","uvCount":"26","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692112720000","lvCount":"28","nick":"chenjunkan","uvCount":"27","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692112862000","lvCount":"24","nick":"chenjunkan","uvCount":"33","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692112849000","lvCount":"29","nick":"chenjunkan","uvCount":"33","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692112872000","lvCount":"48","nick":"chenjunkan","uvCount":"33","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692112877000","lvCount":"23","nick":"chenjunkan","uvCount":"33","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692112877000","lvCount":"69","nick":"chenjunkan","uvCount":"33","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692112877000","lvCount":"189","nick":"chenjunkan","uvCount":"33","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692112926000","lvCount":"98","nick":"chenjunkan","uvCount":"35","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692112951000","lvCount":"61","nick":"chenjunkan","uvCount":"35","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692113086000","lvCount":"13","nick":"chenjunkan","uvCount":"37","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692113097000","lvCount":"17","nick":"chenjunkan","uvCount":"38","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692113118000","lvCount":"21","nick":"chenjunkan","uvCount":"41","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692113155000","lvCount":"86","nick":"chenjunkan","uvCount":"41","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692113140000","lvCount":"22","nick":"chenjunkan","uvCount":"41","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692113148000","lvCount":"25","nick":"chenjunkan","uvCount":"41","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692113416000","lvCount":"22","nick":"chenjunkan","uvCount":"47","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692113386000","lvCount":"13","nick":"chenjunkan","uvCount":"46","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692113381000","lvCount":"70","nick":"chenjunkan","uvCount":"46","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692113408000","lvCount":"27","nick":"chenjunkan","uvCount":"47","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692113402000","lvCount":"17","nick":"chenjunkan","uvCount":"47","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692113428000","lvCount":"41","nick":"chenjunkan","uvCount":"47","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692113528000","lvCount":"55","nick":"chenjunkan","uvCount":"49","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692113544000","lvCount":"16","nick":"chenjunkan","uvCount":"49","voteTime":"小於1分鐘前"}}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53587312848","likedTime":"1546692113643000","lvCount":"22","nick":"chenjunkan","uvCount":"50","voteTime":"小於1分鐘前"}}}
8. Customizing the initial requests

Before talking about the start URLs, let's review iterables and iterators. What is an iterable? A list is an iterable.

# an iterable
# li = [11, 22, 33]
# convert the iterable into an iterator
# iter(li)

A generator can be converted to an iterator as well; a generator is a special kind of iterator.

def func():
    yield 11
    yield 22
    yield 33


# generator
li = func()
# convert the generator into an iterator
iter(li)
v = li.__next__()
print(v)

The scheduler only accepts Request objects, which it then hands to the downloader. So when a spider starts, the framework does not run parse first; it first wraps every URL in start_urls in a Request object and puts those Requests into the scheduler. The flow is:

"""
How the Scrapy engine fetches the starting URLs from the spider:
    1. Call start_requests and take its return value
    2. v = iter(return value)
       It does not matter whether the return value is a generator or an iterator, because it gets converted to an iterator anyway
    3. req1 = v.__next__()
       req2 = v.__next__()
       req3 = v.__next__()
       ...
    4. All the requests are put into the scheduler
"""

Note: the scheduler does not read start_urls directly; it calls the spider's start_requests method, falling back to the parent class if the spider itself does not define one.
Two ways to write start_requests:

Option 1: start_requests is a generator function; internally the generator is converted to an iterator.

def start_requests(self):
    # option 1
    for url in self.start_urls:
        yield Request(url=url)

Option 2: start_requests returns an iterable object.

def start_requests(self):
    # option 2
    req_list = []
    for url in self.start_urls:
        req_list.append(Request(url=url))
    return req_list
If we do not define start_requests ourselves, the base class already provides one: look inside scrapy.Spider.

Source:

def start_requests(self):
    cls = self.__class__
    if method_is_overridden(cls, Spider, 'make_requests_from_url'):
        warnings.warn(
            "Spider.make_requests_from_url method is deprecated; it "
            "won't be called in future Scrapy releases. Please "
            "override Spider.start_requests method instead (see %s.%s)." % (
                cls.__module__, cls.__name__
            ),
        )
        for url in self.start_urls:
            yield self.make_requests_from_url(url)
    else:
        for url in self.start_urls:
            yield Request(url, dont_filter=True)

If we define start_requests ourselves it takes precedence over this default. The default sends GET requests, so this is also the place to switch to POST.

For example, the starting URLs could even be fetched from Redis.
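A rough sketch of both ideas; the Redis key name and the POST body are made up for illustration, not taken from the original project:

import redis
import scrapy
from scrapy.http import Request


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    start_urls = ['https://dig.chouti.com/']

    def start_requests(self):
        # hypothetical: pull extra start URLs from a Redis list named 'chouti:start_urls'
        conn = redis.Redis(host='140.143.227.206', port=8888, password='beta')
        url = conn.lpop('chouti:start_urls')
        while url:
            yield Request(url=url.decode('utf-8'), callback=self.parse)
            url = conn.lpop('chouti:start_urls')

        # the default start_requests sends GET; here the start URLs go out as POST instead
        for u in self.start_urls:
            yield Request(url=u, method='POST', body='k1=v1', callback=self.parse)

    def parse(self, response):
        print(response.status)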
9. Depth and priority

# -*- coding: utf-8 -*-
import scrapy
from scrapy.dupefilter import RFPDupeFilter
from scrapy.http.cookies import CookieJar
from scrapy.http import Request


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['dig.chouti.com']
    start_urls = ['http://dig.chouti.com/']

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse)

    def parse(self, response):
        print(response.meta.get("depth", 0))
        page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            page = "https://dig.chouti.com" + page
            yield Request(url=page, callback=self.parse)  # https://dig.chouti.com/all/hot/recent/2

When we yield a Request, the request goes into the scheduler and is then handed to the downloader, passing through quite a few middlewares on the way, so the middlewares are a convenient place to do extra work. Import: from scrapy.spidermiddlewares.depth import DepthMiddleware
DepthMiddleware:
class DepthMiddleware(object):

    def __init__(self, maxdepth, stats=None, verbose_stats=False, prio=1):
        self.maxdepth = maxdepth
        self.stats = stats
        self.verbose_stats = verbose_stats
        self.prio = prio

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        maxdepth = settings.getint('DEPTH_LIMIT')
        verbose = settings.getbool('DEPTH_STATS_VERBOSE')
        prio = settings.getint('DEPTH_PRIORITY')
        return cls(maxdepth, crawler.stats, verbose, prio)

    def process_spider_output(self, response, result, spider):
        def _filter(request):
            if isinstance(request, Request):
                depth = response.meta['depth'] + 1
                request.meta['depth'] = depth
                if self.prio:
                    request.priority -= depth * self.prio
                if self.maxdepth and depth > self.maxdepth:
                    logger.debug(
                        "Ignoring link (depth > %(maxdepth)d): %(requrl)s ",
                        {'maxdepth': self.maxdepth, 'requrl': request.url},
                        extra={'spider': spider}
                    )
                    return False
                elif self.stats:
                    if self.verbose_stats:
                        self.stats.inc_value('request_depth_count/%s' % depth,
                                             spider=spider)
                    self.stats.max_value('request_depth_max', depth,
                                         spider=spider)
            return True

        # base case (depth=0)
        if self.stats and 'depth' not in response.meta:
            response.meta['depth'] = 0
            if self.verbose_stats:
                self.stats.inc_value('request_depth_count/0', spider=spider)

        return (r for r in result or () if _filter(r))
As the spider's results pass back through this middleware, the method that runs is process_spider_output:

Notes:

1. if self.stats and 'depth' not in response.meta:
   On the very first request, Request.meta is None, so the meta of the returned response is empty too. response.request is exactly the request we wrapped at the start. Looking at scrapy.http.Response, response.meta is defined as:

       @property
       def meta(self):
           try:
               return self.request.meta
           except AttributeError:
               raise AttributeError(
                   "Response.meta not available, this response "
                   "is not tied to any request"
               )

   So response.meta is the same thing as response.request.meta.

2. When we yield a Request, process_spider_output fires; if the condition above holds, the depth is initialized with response.meta['depth'] = 0.

3. return (r for r in result or () if _filter(r)): result is the stream of Request objects coming out of the spider; each one is run through _filter.

4. If _filter returns True, the request is allowed and handed to the scheduler; if it returns False, the request is dropped.

       if isinstance(request, Request):
           depth = response.meta['depth'] + 1
           request.meta['depth'] = depth

   If the object is a Request, the depth is incremented by 1.

5. if self.maxdepth and depth > self.maxdepth:
   maxdepth = settings.getint('DEPTH_LIMIT'); if the depth exceeds the limit we configured, the request is dropped.

Priority policy: the deeper the request, the lower its priority; the default priority is 0.

if self.prio:
    request.priority -= depth * self.prio
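Both knobs live in settings.py; a minimal sketch with illustrative values:

# settings.py
DEPTH_LIMIT = 3       # drop requests that are more than 3 levels deep
DEPTH_PRIORITY = 1    # each extra level lowers the priority: request.priority -= depth * 1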
10. Proxies

Built-in proxy support: when the scheduler hands a request to the downloader, it passes through the downloader middlewares, and the built-in proxy support lives in one of them.

Path: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/scrapy/downloadermiddlewares/httpproxy.py

The HttpProxyMiddleware class:
import base64
from six.moves.urllib.request import getproxies, proxy_bypass
from six.moves.urllib.parse import unquote
try:
    from urllib2 import _parse_proxy
except ImportError:
    from urllib.request import _parse_proxy
from six.moves.urllib.parse import urlunparse

from scrapy.utils.httpobj import urlparse_cached
from scrapy.exceptions import NotConfigured
from scrapy.utils.python import to_bytes


class HttpProxyMiddleware(object):

    def __init__(self, auth_encoding='latin-1'):
        self.auth_encoding = auth_encoding
        self.proxies = {}
        for type, url in getproxies().items():
            self.proxies[type] = self._get_proxy(url, type)

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        auth_encoding = crawler.settings.get('HTTPPROXY_AUTH_ENCODING')
        return cls(auth_encoding)

    def _basic_auth_header(self, username, password):
        user_pass = to_bytes(
            '%s:%s' % (unquote(username), unquote(password)),
            encoding=self.auth_encoding)
        return base64.b64encode(user_pass).strip()

    def _get_proxy(self, url, orig_type):
        proxy_type, user, password, hostport = _parse_proxy(url)
        proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))

        if user:
            creds = self._basic_auth_header(user, password)
        else:
            creds = None

        return creds, proxy_url

    def process_request(self, request, spider):
        # ignore if proxy is already set
        if 'proxy' in request.meta:
            if request.meta['proxy'] is None:
                return
            # extract credentials if present
            creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
            request.meta['proxy'] = proxy_url
            if creds and not request.headers.get('Proxy-Authorization'):
                request.headers['Proxy-Authorization'] = b'Basic ' + creds
            return
        elif not self.proxies:
            return

        parsed = urlparse_cached(request)
        scheme = parsed.scheme

        # 'no_proxy' is only supported by http schemes
        if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
            return

        if scheme in self.proxies:
            self._set_proxy(request, scheme)

    def _set_proxy(self, request, scheme):
        creds, proxy = self.proxies[scheme]
        request.meta['proxy'] = proxy
        if creds:
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
def process_request(self, request, spider):
    # ignore if proxy is already set
    if 'proxy' in request.meta:
        if request.meta['proxy'] is None:
            return
        # extract credentials if present
        creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
        request.meta['proxy'] = proxy_url
        if creds and not request.headers.get('Proxy-Authorization'):
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
        return
    elif not self.proxies:
        return

    parsed = urlparse_cached(request)
    scheme = parsed.scheme

    # 'no_proxy' is only supported by http schemes
    if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
        return

    if scheme in self.proxies:
        self._set_proxy(request, scheme)

Notes:

if scheme in self.proxies:
    self._set_proxy(request, scheme)

What is self.proxies? It starts out as an empty dict (self.proxies = {}); adding a proxy essentially just means putting a bit of extra data on the request headers.

for type, url in getproxies().items():
    self.proxies[type] = self._get_proxy(url, type)

Next: getproxies = getproxies_environment

def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    # in order to prefer lowercase variables, process environment in
    # two passes: first matches any, second pass matches lowercase only
    for name, value in os.environ.items():
        name = name.lower()
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    # CVE-2016-1000110 - If we are running as CGI script, forget HTTP_PROXY
    # (non-all-lowercase) as it may be set from the web server by a "Proxy:"
    # header from the client
    # If "proxy" is lowercase, it will still be used thanks to the next block
    if 'REQUEST_METHOD' in os.environ:
        proxies.pop('http', None)
    for name, value in os.environ.items():
        if name[-6:] == '_proxy':
            name = name.lower()
            if value:
                proxies[name[:-6]] = value
            else:
                proxies.pop(name[:-6], None)
    return proxies

Notes: a. The proxies are read from the environment via os.environ; any variable whose name ends in _proxy counts. The values can also be read back with os.environ.get:

import os

v = os.environ.get('XXX')
print(v)
b. This function ends up producing something like:

proxies = {
    'http': '192.168.3.3',
    'https': '192.168.3.4',
}

c. Then each entry is processed with self.proxies[type] = self._get_proxy(url, type):

def _get_proxy(self, url, orig_type):
    proxy_type, user, password, hostport = _parse_proxy(url)
    proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))

    if user:
        creds = self._basic_auth_header(user, password)
    else:
        creds = None

    return creds, proxy_url
Using the built-in proxy support:
Set the proxies in os.environ before the spider starts:

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']
    cookie_dict = {}

    def start_requests(self):
        import os
        os.environ['HTTPS_PROXY'] = "http://root:root@192.168.11.11:9999/"
        os.environ['HTTP_PROXY'] = '19.11.2.32'
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse)

There is a second way to use the built-in proxy support: set meta on the request inside the spider.

Source:

if 'proxy' in request.meta:
    if request.meta['proxy'] is None:
        return
    # extract credentials if present
    creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
    request.meta['proxy'] = proxy_url
    if creds and not request.headers.get('Proxy-Authorization'):
        request.headers['Proxy-Authorization'] = b'Basic ' + creds
    return
elif not self.proxies:
    return

The meta form:

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']
    cookie_dict = {}

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse,
                          meta={'proxy': "http://root:root@192.168.11.11:9999/"})
Custom proxy middleware:

Example:
import base64
import random
from six.moves.urllib.parse import unquote
try:
    from urllib2 import _parse_proxy
except ImportError:
    from urllib.request import _parse_proxy
from six.moves.urllib.parse import urlunparse
from scrapy.utils.python import to_bytes


class CjkProxyMiddleware(object):

    def _basic_auth_header(self, username, password):
        user_pass = to_bytes(
            '%s:%s' % (unquote(username), unquote(password)),
            encoding='latin-1')
        return base64.b64encode(user_pass).strip()

    def process_request(self, request, spider):
        PROXIES = [
            "http://root:root@192.168.11.11:9999/",
            "http://root:root@192.168.11.12:9999/",
            "http://root:root@192.168.11.13:9999/",
            "http://root:root@192.168.11.14:9999/",
            "http://root:root@192.168.11.15:9999/",
            "http://root:root@192.168.11.16:9999/",
        ]
        url = random.choice(PROXIES)

        orig_type = ""
        proxy_type, user, password, hostport = _parse_proxy(url)
        proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))

        if user:
            creds = self._basic_auth_header(user, password)
        else:
            creds = None

        request.meta['proxy'] = proxy_url
        if creds:
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
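The custom middleware still has to be registered in settings.py. A sketch, assuming the class above is saved as cjkscrapy/proxy.py; the module path and priority are assumptions, not from the original project:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    # disable the built-in proxy middleware so only the custom one applies
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
    'cjkscrapy.proxy.CjkProxyMiddleware': 750,
}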
11. Selectors (covered in detail later)

Example:

html = """<!DOCTYPE html>
<html>
    <head lang="en">
        <meta charset="UTF-8">
        <title></title>
    </head>
    <body>
        <ul>
            <li class="item-"><a id='i1' href="link.html">first item</a></li>
            <li class="item-0"><a id='i2' href="llink.html">first item</a></li>
            <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
        </ul>
        <div><a href="llink2.html">second item</a></div>
    </body>
</html>
"""
from scrapy.http import HtmlResponse
from scrapy.selector import Selector

response = HtmlResponse(url='http://example.com', body=html, encoding='utf-8')
# hxs = Selector(response)
# hxs.xpath()
response.xpath('')
12. Downloader middleware

Source:
class CjkscrapyDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
Custom downloader middleware: create an md.py file, then register it in settings:

DOWNLOADER_MIDDLEWARES = {
    # 'cjkscrapy.middlewares.CjkscrapyDownloaderMiddleware': 543,
    'cjkscrapy.md.Md1': 666,
    'cjkscrapy.md.Md2': 667,
}

md.py:

class Md1(object):
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        return s

    def process_request(self, request, spider):
        print('md1.process_request', request)
        pass

    def process_response(self, request, response, spider):
        print('m1.process_response', request, response)
        return response

    def process_exception(self, request, exception, spider):
        pass


class Md2(object):
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        return s

    def process_request(self, request, spider):
        print('md2.process_request', request)

    def process_response(self, request, response, spider):
        print('m2.process_response', request, response)
        return response

    def process_exception(self, request, exception, spider):
        pass

Output:

chenjunkandeMBP:cjkscrapy chenjunkan$ scrapy crawl chouti --nolog
md1.process_request <GET http://dig.chouti.com/>
md2.process_request <GET http://dig.chouti.com/>
m2.process_response <GET http://dig.chouti.com/> <301 http://dig.chouti.com/>
m1.process_response <GET http://dig.chouti.com/> <301 http://dig.chouti.com/>
md1.process_request <GET https://dig.chouti.com/>
md2.process_request <GET https://dig.chouti.com/>
m2.process_response <GET https://dig.chouti.com/> <200 https://dig.chouti.com/>
m1.process_response <GET https://dig.chouti.com/> <200 https://dig.chouti.com/>
response <200 https://dig.chouti.com/>
a. Returning a Response object

from scrapy.http import HtmlResponse
from scrapy.http import Request


class Md1(object):
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        return s

    def process_request(self, request, spider):
        print('md1.process_request', request)
        # 1. return a Response
        import requests
        result = requests.get(request.url)
        return HtmlResponse(url=request.url, status=200, headers=None, body=result.content)

    def process_response(self, request, response, spider):
        print('m1.process_response', request, response)
        return response

    def process_exception(self, request, exception, spider):
        pass


class Md2(object):
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        return s

    def process_request(self, request, spider):
        print('md2.process_request', request)

    def process_response(self, request, response, spider):
        print('m2.process_response', request, response)
        return response

    def process_exception(self, request, exception, spider):
        pass
Output:

chenjunkandeMBP:cjkscrapy chenjunkan$ scrapy crawl chouti --nolog
md1.process_request <GET http://dig.chouti.com/>
m2.process_response <GET http://dig.chouti.com/> <200 http://dig.chouti.com/>
m1.process_response <GET http://dig.chouti.com/> <200 http://dig.chouti.com/>
response <200 http://dig.chouti.com/>
Note: as the output shows, process_request can fake the download result and pass it straight on as the response for the next stage to handle.

b. Returning a Request

from scrapy.http import HtmlResponse
from scrapy.http import Request


class Md1(object):
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        return s

    def process_request(self, request, spider):
        print('md1.process_request', request)
        # 2. return a Request
        return Request('https://dig.chouti.com/r/tec/hot/1')

    def process_response(self, request, response, spider):
        print('m1.process_response', request, response)
        return response

    def process_exception(self, request, exception, spider):
        pass


class Md2(object):
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        return s

    def process_request(self, request, spider):
        print('md2.process_request', request)

    def process_response(self, request, response, spider):
        print('m2.process_response', request, response)
        return response

    def process_exception(self, request, exception, spider):
        pass
Output:

chenjunkandeMBP:cjkscrapy chenjunkan$ scrapy crawl chouti --nolog
md1.process_request <GET http://dig.chouti.com/>
md1.process_request <GET https://dig.chouti.com/r/tec/hot/1>
c. Raising an exception: process_exception must be defined

from scrapy.http import HtmlResponse
from scrapy.http import Request


class Md1(object):
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        return s

    def process_request(self, request, spider):
        print('md1.process_request', request)
        # 3. raise an exception
        from scrapy.exceptions import IgnoreRequest
        raise IgnoreRequest

    def process_response(self, request, response, spider):
        print('m1.process_response', request, response)
        return response

    def process_exception(self, request, exception, spider):
        pass


class Md2(object):
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        return s

    def process_request(self, request, spider):
        print('md2.process_request', request)

    def process_response(self, request, response, spider):
        print('m2.process_response', request, response)
        return response

    def process_exception(self, request, exception, spider):
        pass

Output:
chenjunkandeMBP:cjkscrapy chenjunkan$ scrapy crawl chouti --nolog
md1.process_request <GET http://dig.chouti.com/>
d. The cases above are rarely used in practice; the usual job of a downloader middleware is to enrich the request (headers, cookies and the like):

from scrapy.http import HtmlResponse
from scrapy.http import Request


class Md1(object):
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        return s

    def process_request(self, request, spider):
        print('md1.process_request', request)
        request.headers['user-agent'] = ("Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
                                         "AppleWebKit/537.36 (KHTML, like Gecko) "
                                         "Chrome/63.0.3239.132 Safari/537.36")

    def process_response(self, request, response, spider):
        print('m1.process_response', request, response)
        return response

    def process_exception(self, request, exception, spider):
        pass


class Md2(object):
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        return s

    def process_request(self, request, spider):
        print('md2.process_request', request)

    def process_response(self, request, response, spider):
        print('m2.process_response', request, response)
        return response

    def process_exception(self, request, exception, spider):
        pass

Output:

chenjunkandeMBP:cjkscrapy chenjunkan$ scrapy crawl chouti --nolog
md1.process_request <GET http://dig.chouti.com/>
md2.process_request <GET http://dig.chouti.com/>
m2.process_response <GET http://dig.chouti.com/> <301 http://dig.chouti.com/>
m1.process_response <GET http://dig.chouti.com/> <301 http://dig.chouti.com/>
md1.process_request <GET https://dig.chouti.com/>
md2.process_request <GET https://dig.chouti.com/>
m2.process_response <GET https://dig.chouti.com/> <200 https://dig.chouti.com/>
m1.process_response <GET https://dig.chouti.com/> <200 https://dig.chouti.com/>
response <200 https://dig.chouti.com/>
But we do not have to write this ourselves; Scrapy already ships with it:

Path: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/scrapy/downloadermiddlewares/useragent.py

Source:

"""Set User-Agent header per spider or use a default value from settings"""

from scrapy import signals


class UserAgentMiddleware(object):
    """This middleware allows spiders to override the user_agent"""

    def __init__(self, user_agent='Scrapy'):
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(crawler.settings['USER_AGENT'])
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider):
        self.user_agent = getattr(spider, 'user_agent', self.user_agent)

    def process_request(self, request, spider):
        if self.user_agent:
            request.headers.setdefault(b'User-Agent', self.user_agent)
13. Spider middleware

Source:

class CjkscrapySpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

Custom spider middleware:

Register it in settings:

SPIDER_MIDDLEWARES = {
    # 'cjkscrapy.middlewares.CjkscrapySpiderMiddleware': 543,
    'cjkscrapy.sd.Sd1': 667,
    'cjkscrapy.sd.Sd2': 668,
}
sd.py:
class Sd1(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    # runs only once, when the spider starts
    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r


class Sd2(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        return s

    def process_spider_input(self, response, spider):
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    # runs only once, when the spider starts
    def process_start_requests(self, start_requests, spider):
        # Must return only requests (not items).
        for r in start_requests:
            yield r

Normally we leave this alone and use the built-in spider middlewares. Note the ordering: all process_spider_input methods run first, then all process_spider_output methods.
14. Custom commands

So far every spider has been launched from the command line, which is tedious; instead, we can put a small script in the project root that runs the command for us.
start.py:
import sys
from scrapy.cmdline import execute

if __name__ == '__main__':
    # run a single spider
    execute(["scrapy", "crawl", "chouti", "--nolog"])
Defining a custom command:

crawlall.py:

from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings


class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def run(self, args, opts):
        spider_list = self.crawler_process.spiders.list()
        for name in spider_list:
            self.crawler_process.crawl(name, **opts.__dict__)
        self.crawler_process.start()
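For Scrapy to discover the command, the file has to live in a package registered via COMMANDS_MODULE. A minimal sketch, assuming crawlall.py is placed in cjkscrapy/commands/ next to an empty __init__.py (the package name is an assumption based on this project's layout):

# settings.py
COMMANDS_MODULE = 'cjkscrapy.commands'   # package that contains crawlall.py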
Run scrapy --help and the new command shows up:

chenjunkandeMBP:cjkscrapy chenjunkan$ scrapy --help
Scrapy 1.5.1 - project: cjkscrapy

Usage:
  scrapy <command> [options] [args]

Available commands:
  bench         Run quick benchmark test
  check         Check spider contracts
  crawl         Run a spider
  crawlall      Runs all of the spiders
  edit          Edit spider
  fetch         Fetch a URL using the Scrapy downloader
  genspider     Generate new spider using pre-defined templates
  list          List available spiders
  parse         Parse URL (using its spider) and print the results
  runspider     Run a self-contained spider (without creating a project)
  settings      Get settings values
  shell         Interactive scraping console
  startproject  Create new project
  version       Print Scrapy version
  view          Open URL in browser, as seen by Scrapy
15. Scrapy signals

Create a new file, ext.py:

from scrapy import signals


class MyExtend(object):
    def __init__(self):
        pass

    @classmethod
    def from_crawler(cls, crawler):
        self = cls()
        crawler.signals.connect(self.x1, signal=signals.spider_opened)
        crawler.signals.connect(self.x2, signal=signals.spider_closed)
        return self

    def x1(self, spider):
        print('open')

    def x2(self, spider):
        print('close')

Register it in settings:

EXTENSIONS = {
    'scrapy.extensions.telnet.TelnetConsole': None,
    'cjkscrapy.ext.MyExtend': 666,
}

Note: crawler.signals.connect(self.x1, signal=signals.spider_opened) registers the x1 function on the spider_opened signal.

# engine-related
engine_started = object()
engine_stopped = object()

# spider-related
spider_opened = object()
spider_idle = object()          # fired when the spider goes idle
spider_closed = object()
spider_error = object()

# request-related
request_scheduled = object()    # fired when a request is put into the scheduler
request_dropped = object()      # fired when a request is dropped

# response-related
response_received = object()
response_downloaded = object()

# item-related
item_scraped = object()
item_dropped = object()

Output:

chenjunkandeMBP:cjkscrapy chenjunkan$ scrapy crawl chouti --nolog
open爬蟲
response <200 https://dig.chouti.com/>
close爬蟲
16.scrapy_redis
What is scrapy_redis for? It is the component that helps us build distributed crawlers.
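It is a separate package, so presumably it first has to be installed along with the redis client, e.g.:

pip3 install redis scrapy-redis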
redis操做:
1 import redis 2 3 conn = redis.Redis(host='140.143.227.206', port=8888, password='beta') 4 keys = conn.keys() 5 print(keys) 6 print(conn.smembers('dupefilter:xiaodongbei')) 7 # print(conn.smembers('visited_urls')) 8 # v1 = conn.sadd('urls','http://www.baidu.com') 9 # v2 = conn.sadd('urls','http://www.cnblogs.com') 10 # print(v1) 11 # print(v2) 12 # v3 = conn.sadd('urls','http://www.bing.com') 13 # print(v3) 14 15 # result = conn.sadd('urls','http://www.bing.com') 16 # if result == 1: 17 # print('以前未訪問過') 18 # else: 19 # print('以前訪問過') 20 # print(conn.smembers('urls'))
(1). Deduplication with scrapy_redis

Example:

Option 1: fully custom

Write the custom dedup filter in any file, e.g. xxx.py:

from scrapy.dupefilter import BaseDupeFilter
import redis
from scrapy.utils.request import request_fingerprint
import scrapy_redis


class DupFilter(BaseDupeFilter):
    def __init__(self):
        self.conn = redis.Redis(host='140.143.227.206', port=8888, password='beta')

    def request_seen(self, request):
        """
        Check whether the current request has already been visited.
        :param request:
        :return: True if it was visited before; False if not
        """
        fid = request_fingerprint(request)
        result = self.conn.sadd('visited_urls', fid)
        if result == 1:
            return False
        return True

Register it in settings:
DUPEFILTER_CLASS = 'dbd.xxx.DupFilter'
Option 2: build on scrapy_redis

Let's analyze the source. Import: import redis

Path: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/scrapy_redis/dupefilter.py

The RFPDupeFilter class: it also inherits from BaseDupeFilter, much like our Option 1:
import logging
import time

from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint

from . import defaults
from .connection import get_redis_from_settings


logger = logging.getLogger(__name__)


# TODO: Rename class to RedisDupeFilter.
class RFPDupeFilter(BaseDupeFilter):
    """Redis-based request duplicates filter.

    This class can also be used with default Scrapy's scheduler.

    """

    logger = logger

    def __init__(self, server, key, debug=False):
        """Initialize the duplicates filter.

        Parameters
        ----------
        server : redis.StrictRedis
            The redis server instance.
        key : str
            Redis key Where to store fingerprints.
        debug : bool, optional
            Whether to log filtered requests.

        """
        self.server = server
        self.key = key
        self.debug = debug
        self.logdupes = True

    @classmethod
    def from_settings(cls, settings):
        """Returns an instance from given settings.

        This uses by default the key ``dupefilter:<timestamp>``. When using the
        ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
        it needs to pass the spider name in the key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RFPDupeFilter
            A RFPDupeFilter instance.

        """
        server = get_redis_from_settings(settings)
        # XXX: This creates one-time key. needed to support to use this
        # class as standalone dupefilter with scrapy's default scheduler
        # if scrapy passes spider on open() method this wouldn't be needed
        # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
        key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(server, key=key, debug=debug)

    @classmethod
    def from_crawler(cls, crawler):
        """Returns instance from crawler.

        Parameters
        ----------
        crawler : scrapy.crawler.Crawler

        Returns
        -------
        RFPDupeFilter
            Instance of RFPDupeFilter.

        """
        return cls.from_settings(crawler.settings)

    def request_seen(self, request):
        """Returns True if request was already seen.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        bool

        """
        fp = self.request_fingerprint(request)
        # This returns the number of values added, zero if already exists.
        added = self.server.sadd(self.key, fp)
        return added == 0

    def request_fingerprint(self, request):
        """Returns a fingerprint for a given request.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        str

        """
        return request_fingerprint(request)

    def close(self, reason=''):
        """Delete data on close. Called by Scrapy's scheduler.

        Parameters
        ----------
        reason : str, optional

        """
        self.clear()

    def clear(self):
        """Clears fingerprints data."""
        self.server.delete(self.key)

    def log(self, request, spider):
        """Logs given request.

        Parameters
        ----------
        request : scrapy.http.Request
        spider : scrapy.spiders.Spider

        """
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False
This class, too, deduplicates through request_seen:

def request_seen(self, request):
    """Returns True if request was already seen.

    Parameters
    ----------
    request : scrapy.http.Request

    Returns
    -------
    bool

    """
    fp = self.request_fingerprint(request)
    # This returns the number of values added, zero if already exists.
    added = self.server.sadd(self.key, fp)
    return added == 0

Notes:
1. fp = self.request_fingerprint(request): build the unique fingerprint.

2. added = self.server.sadd(self.key, fp):
   self.server = server is the Redis connection.

   (1).
        @classmethod
        def from_settings(cls, settings):
            """Returns an instance from given settings.

            This uses by default the key ``dupefilter:<timestamp>``. When using the
            ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
            it needs to pass the spider name in the key.

            Parameters
            ----------
            settings : scrapy.settings.Settings

            Returns
            -------
            RFPDupeFilter
                A RFPDupeFilter instance.

            """
            server = get_redis_from_settings(settings)
            # XXX: This creates one-time key. needed to support to use this
            # class as standalone dupefilter with scrapy's default scheduler
            # if scrapy passes spider on open() method this wouldn't be needed
            # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
            key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
            debug = settings.getbool('DUPEFILTER_DEBUG')
            return cls(server, key=key, debug=debug)

   (2).
        def get_redis_from_settings(settings):
            """Returns a redis client instance from given Scrapy settings object.

            This function uses ``get_client`` to instantiate the client and uses
            ``defaults.REDIS_PARAMS`` global as defaults values for the parameters. You
            can override them using the ``REDIS_PARAMS`` setting.

            Parameters
            ----------
            settings : Settings
                A scrapy settings object. See the supported settings below.

            Returns
            -------
            server
                Redis client instance.

            Other Parameters
            ----------------
            REDIS_URL : str, optional
                Server connection URL.
            REDIS_HOST : str, optional
                Server host.
            REDIS_PORT : str, optional
                Server port.
            REDIS_ENCODING : str, optional
                Data encoding.
            REDIS_PARAMS : dict, optional
                Additional client parameters.

            """
            params = defaults.REDIS_PARAMS.copy()
            params.update(settings.getdict('REDIS_PARAMS'))
            # XXX: Deprecate REDIS_* settings.
            for source, dest in SETTINGS_PARAMS_MAP.items():
                val = settings.get(source)
                if val:
                    params[dest] = val

            # Allow ``redis_cls`` to be a path to a class.
            if isinstance(params.get('redis_cls'), six.string_types):
                params['redis_cls'] = load_object(params['redis_cls'])

            return get_redis(**params)

   (3).
        def get_redis(**kwargs):
            """Returns a redis client instance.

            Parameters
            ----------
            redis_cls : class, optional
                Defaults to ``redis.StrictRedis``.
            url : str, optional
                If given, ``redis_cls.from_url`` is used to instantiate the class.
            **kwargs
                Extra parameters to be passed to the ``redis_cls`` class.

            Returns
            -------
            server
                Redis client instance.

            """
            redis_cls = kwargs.pop('redis_cls', defaults.REDIS_CLS)
            url = kwargs.pop('url', None)
            if url:
                return redis_cls.from_url(url, **kwargs)
            else:
                return redis_cls(**kwargs)

   (4). redis_cls = kwargs.pop('redis_cls', defaults.REDIS_CLS), where REDIS_CLS = redis.StrictRedis.

   So the settings file needs:

        # ############### scrapy redis connection ####################

        REDIS_HOST = '140.143.227.206'                  # host
        REDIS_PORT = 8888                               # port
        REDIS_PARAMS = {'password': 'beta'}             # Redis connection parameters; default: REDIS_PARAMS = {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
        REDIS_ENCODING = "utf-8"                        # redis encoding; default: 'utf-8'

        # REDIS_URL = 'redis://user:pass@hostname:9001' # connection URL (takes precedence over the settings above)
        DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'

   And self.key = key, where key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}.

3. return added == 0
   If sadd returns 0 the fingerprint already existed, i.e. the request was seen before, so request_seen returns True; if it returns 1 the request had not been seen, so request_seen returns False.
Code example:
from scrapy_redis.dupefilter import RFPDupeFilter
from scrapy_redis.connection import get_redis_from_settings
from scrapy_redis import defaults


class RedisDupeFilter(RFPDupeFilter):

    @classmethod
    def from_settings(cls, settings):
        """Returns an instance from given settings.

        This uses by default the key ``dupefilter:<timestamp>``. When using the
        ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
        it needs to pass the spider name in the key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RFPDupeFilter
            A RFPDupeFilter instance.

        """
        server = get_redis_from_settings(settings)
        # Instead of the default one-time key based on the current timestamp,
        # hard-code the key so every run shares the same dedup record.
        key = defaults.DUPEFILTER_KEY % {'timestamp': 'chenjunkan'}
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(server, key=key, debug=debug)
By default the key uses a timestamp; here we hard-code it instead: key = defaults.DUPEFILTER_KEY % {'timestamp': 'chenjunkan'}
Finally, in the settings file:
# ############### scrapy-redis connection ####################
REDIS_HOST = '140.143.227.206'                   # host
REDIS_PORT = 8888                                # port
REDIS_PARAMS = {'password':'beta'}               # Redis connection parameters
# Default: REDIS_PARAMS = {'socket_timeout': 30,'socket_connect_timeout': 30,'retry_on_timeout': True,'encoding': REDIS_ENCODING,}
REDIS_ENCODING = "utf-8"                         # redis encoding. Default: 'utf-8'

# REDIS_URL = 'redis://user:pass@hostname:9001'  # connection URL (takes priority over the settings above)
DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'

# Point the dedup class at our subclass instead of the default one
# DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
DUPEFILTER_CLASS = 'cjkscrapy.xxx.RedisDupeFilter'
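After a crawl with this filter enabled, you can check the fingerprints it recorded; a quick sketch, assuming the same redis connection (the key follows DUPEFILTER_KEY with the hard-coded 'chenjunkan' value):

import redis

conn = redis.Redis(host='140.143.227.206', port=8888, password='beta')
# The custom filter stores fingerprints in a redis set under this key
print(conn.scard('dupefilter:chenjunkan'))     # number of fingerprints recorded
print(conn.smembers('dupefilter:chenjunkan'))  # the fingerprints themselves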
(2).scrapy_redis queues
Implementing a queue and a stack with redis:
To read the source, import scrapy_redis.
Path: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/scrapy_redis/queue.py
queue.py:
1 from scrapy.utils.reqser import request_to_dict, request_from_dict 2 3 from . import picklecompat 4 5 6 class Base(object): 7 """Per-spider base queue class""" 8 9 def __init__(self, server, spider, key, serializer=None): 10 """Initialize per-spider redis queue. 11 12 Parameters 13 ---------- 14 server : StrictRedis 15 Redis client instance. 16 spider : Spider 17 Scrapy spider instance. 18 key: str 19 Redis key where to put and get messages. 20 serializer : object 21 Serializer object with ``loads`` and ``dumps`` methods. 22 23 """ 24 if serializer is None: 25 # Backward compatibility. 26 # TODO: deprecate pickle. 27 serializer = picklecompat 28 if not hasattr(serializer, 'loads'): 29 raise TypeError("serializer does not implement 'loads' function: %r" 30 % serializer) 31 if not hasattr(serializer, 'dumps'): 32 raise TypeError("serializer '%s' does not implement 'dumps' function: %r" 33 % serializer) 34 35 self.server = server 36 self.spider = spider 37 self.key = key % {'spider': spider.name} 38 self.serializer = serializer 39 40 def _encode_request(self, request): 41 """Encode a request object""" 42 obj = request_to_dict(request, self.spider) 43 return self.serializer.dumps(obj) 44 45 def _decode_request(self, encoded_request): 46 """Decode an request previously encoded""" 47 obj = self.serializer.loads(encoded_request) 48 return request_from_dict(obj, self.spider) 49 50 def __len__(self): 51 """Return the length of the queue""" 52 raise NotImplementedError 53 54 def push(self, request): 55 """Push a request""" 56 raise NotImplementedError 57 58 def pop(self, timeout=0): 59 """Pop a request""" 60 raise NotImplementedError 61 62 def clear(self): 63 """Clear queue/stack""" 64 self.server.delete(self.key) 65 66 67 class FifoQueue(Base): 68 """Per-spider FIFO queue""" 69 70 def __len__(self): 71 """Return the length of the queue""" 72 return self.server.llen(self.key) 73 74 def push(self, request): 75 """Push a request""" 76 self.server.lpush(self.key, self._encode_request(request)) 77 78 def pop(self, timeout=0): 79 """Pop a request""" 80 if timeout > 0: 81 data = self.server.brpop(self.key, timeout) 82 if isinstance(data, tuple): 83 data = data[1] 84 else: 85 data = self.server.rpop(self.key) 86 if data: 87 return self._decode_request(data) 88 89 90 class PriorityQueue(Base): 91 """Per-spider priority queue abstraction using redis' sorted set""" 92 93 def __len__(self): 94 """Return the length of the queue""" 95 return self.server.zcard(self.key) 96 97 def push(self, request): 98 """Push a request""" 99 data = self._encode_request(request) 100 score = -request.priority 101 # We don't use zadd method as the order of arguments change depending on 102 # whether the class is Redis or StrictRedis, and the option of using 103 # kwargs only accepts strings, not bytes. 
104 self.server.execute_command('ZADD', self.key, score, data) 105 106 def pop(self, timeout=0): 107 """ 108 Pop a request 109 timeout not support in this queue class 110 """ 111 # use atomic range/remove using multi/exec 112 pipe = self.server.pipeline() 113 pipe.multi() 114 pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0) 115 results, count = pipe.execute() 116 if results: 117 return self._decode_request(results[0]) 118 119 120 class LifoQueue(Base): 121 """Per-spider LIFO queue.""" 122 123 def __len__(self): 124 """Return the length of the stack""" 125 return self.server.llen(self.key) 126 127 def push(self, request): 128 """Push a request""" 129 self.server.lpush(self.key, self._encode_request(request)) 130 131 def pop(self, timeout=0): 132 """Pop a request""" 133 if timeout > 0: 134 data = self.server.blpop(self.key, timeout) 135 if isinstance(data, tuple): 136 data = data[1] 137 else: 138 data = self.server.lpop(self.key) 139 140 if data: 141 return self._decode_request(data) 142 143 144 # TODO: Deprecate the use of these names. 145 SpiderQueue = FifoQueue 146 SpiderStack = LifoQueue 147 SpiderPriorityQueue = PriorityQueue
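You can also exercise scrapy_redis's own queue classes directly before rewriting them by hand below; a minimal sketch, assuming the redis instance above is reachable and using a throwaway spider instance just so the queue can build its key and (de)serialize requests:

import redis
from scrapy import Spider, Request
from scrapy_redis.queue import FifoQueue

spider = Spider(name='demo')
server = redis.Redis(host='140.143.227.206', port=8888, password='beta')

# key is expanded with the spider name, so this queue lives under 'demo:requests'
q = FifoQueue(server=server, spider=spider, key='%(spider)s:requests')
q.push(Request(url='https://dig.chouti.com/'))
print(len(q))          # 1
print(q.pop().url)     # https://dig.chouti.com/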
Queue: first in, first out (FIFO)
Code example:
import redis


class FifoQueue(object):
    def __init__(self):
        self.server = redis.Redis(host='140.143.227.206', port=8888, password='beta')

    def push(self, request):
        """Push a request"""
        self.server.lpush('USERS', request)

    def pop(self, timeout=0):
        """Pop a request"""
        data = self.server.rpop('USERS')
        return data


# In redis the list looks like [33, 22, 11]
q = FifoQueue()
q.push(11)
q.push(22)
q.push(33)
print(q.pop())  # 11
print(q.pop())  # 22
print(q.pop())  # 33
Note: lpush adds elements on the left and rpop takes them from the right, i.e. first in first out, which corresponds to breadth-first crawling.
Stack: last in, first out (LIFO):
import redis


class LifoQueue(object):
    """Per-spider LIFO queue."""

    def __init__(self):
        self.server = redis.Redis(host='140.143.227.206', port=8888, password='beta')

    def push(self, request):
        """Push a request"""
        self.server.lpush("USERS", request)

    def pop(self, timeout=0):
        """Pop a request"""
        data = self.server.lpop('USERS')
        return data


# In redis the list looks like [33, 22, 11]
Note: lpush pushes on the left and lpop pops from the left, i.e. last in first out, which corresponds to depth-first crawling.
The zadd and zrange commands:
import redis

conn = redis.Redis(host='140.143.227.206', port=8888, password='beta')

# zadd: store cjk=79, pyy=33, cc=73 in a sorted set named 'score'
# (with redis-py >= 3.0 the signature is conn.zadd('score', {'cjk': 79, 'pyy': 33, 'cc': 73}))
conn.zadd('score', cjk=79, pyy=33, cc=73)
print(conn.keys())

# zrange: read a range of the sorted set; desc=True sorts by score from high to low
v = conn.zrange('score', 0, 8, desc=True)
print(v)

# pipeline: pack several commands and execute them in one round trip
pipe = conn.pipeline()
pipe.multi()
# zrange takes the first element (lowest score by default); zremrangebyrank then removes it by rank
pipe.zrange("score", 0, 0).zremrangebyrank('score', 0, 0)
results, count = pipe.execute()
print(results, count)
The zadd command stores the three members cjk=79, pyy=33 and cc=73 in redis under the sorted-set name score.
The zrange command reads a range of score, sorted from low to high by default; with v = conn.zrange('score', 0, 8, desc=True), desc=True sorts by score from high to low instead.
Priority queue:
import redis


class PriorityQueue(object):
    """Per-spider priority queue abstraction using redis' sorted set"""

    def __init__(self):
        self.server = redis.Redis(host='140.143.227.206', port=8888, password='beta')

    def push(self, request, score):
        """Push a request"""
        # We don't use the zadd method as the order of arguments changes depending on
        # whether the class is Redis or StrictRedis, and the option of using
        # kwargs only accepts strings, not bytes.
        self.server.execute_command('ZADD', 'xxxxxx', score, request)

    def pop(self, timeout=0):
        """
        Pop a request
        timeout is not supported in this queue class
        """
        # use atomic range/remove using multi/exec
        pipe = self.server.pipeline()
        pipe.multi()
        pipe.zrange('xxxxxx', 0, 0).zremrangebyrank('xxxxxx', 0, 0)
        results, count = pipe.execute()
        if results:
            return results[0]


q = PriorityQueue()
q.push('alex', 99)
q.push('oldboy', 56)
q.push('eric', 77)

v1 = q.pop()
print(v1)  # lowest score first: b'oldboy' (56)
v2 = q.pop()
print(v2)  # b'eric' (77)
v3 = q.pop()
print(v3)  # b'alex' (99)
Note: if two members have the same score, redis falls back to ordering them by the member itself. Popping in ascending score order gives breadth-first behaviour; popping in descending order gives depth-first.
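A quick way to see the tie-breaking behaviour (same redis connection as above; with redis-py >= 3.0 pass a mapping to zadd instead of keyword arguments):

import redis

conn = redis.Redis(host='140.143.227.206', port=8888, password='beta')
conn.delete('demo:score')
# Three members with the same score: redis orders them lexicographically by member
conn.zadd('demo:score', a2=10, a1=10, a3=10)
print(conn.zrange('demo:score', 0, -1))             # [b'a1', b'a2', b'a3']
print(conn.zrange('demo:score', 0, -1, desc=True))  # [b'a3', b'a2', b'a1']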
(3).scrapy_redis scheduler
Scheduler-related settings:
# ###################### Scheduler ######################
from scrapy_redis.scheduler import Scheduler
# Scheduling is handled by the scrapy_redis scheduler
# enqueue_request: add a task to the scheduler
# next_request: fetch a task from the scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Order in which tasks are stored
# Priority
DEPTH_PRIORITY = 1    # breadth-first
# DEPTH_PRIORITY = -1 # depth-first

# Priority queue (default); alternatives: PriorityQueue (sorted set), FifoQueue (list), LifoQueue (list)
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
# Breadth-first
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
# Depth-first
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'

"""
redis = {
    chouti:requests: [
        pickle.dumps(Request(url='Http://wwwww', callback=self.parse)),
        pickle.dumps(Request(url='Http://wwwww', callback=self.parse)),
        pickle.dumps(Request(url='Http://wwwww', callback=self.parse)),
    ],
    cnblogs:requests: [
    ]
}
"""
SCHEDULER_QUEUE_KEY = '%(spider)s:requests'           # redis key under which the scheduler stores requests
SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"    # serializer for data saved to redis; pickle by default
SCHEDULER_PERSIST = False                             # keep the scheduler queue and dedup records on close (True = keep, False = flush)
SCHEDULER_FLUSH_ON_START = True                       # flush the scheduler queue and dedup records on start (True = flush, False = keep)
# SCHEDULER_IDLE_BEFORE_CLOSE = 10                    # max time to wait when fetching from the scheduler and the queue is empty (give up if nothing arrives)
SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'    # redis key for the dedup records
SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'  # class that implements the dedup rule
Notes:
a. from scrapy_redis.scheduler import Scheduler: scheduling is handed over to the scrapy_redis Scheduler.
1 import importlib 2 import six 3 4 from scrapy.utils.misc import load_object 5 6 from . import connection, defaults 7 8 9 # TODO: add SCRAPY_JOB support. 10 class Scheduler(object): 11 """Redis-based scheduler 12 13 Settings 14 -------- 15 SCHEDULER_PERSIST : bool (default: False) 16 Whether to persist or clear redis queue. 17 SCHEDULER_FLUSH_ON_START : bool (default: False) 18 Whether to flush redis queue on start. 19 SCHEDULER_IDLE_BEFORE_CLOSE : int (default: 0) 20 How many seconds to wait before closing if no message is received. 21 SCHEDULER_QUEUE_KEY : str 22 Scheduler redis key. 23 SCHEDULER_QUEUE_CLASS : str 24 Scheduler queue class. 25 SCHEDULER_DUPEFILTER_KEY : str 26 Scheduler dupefilter redis key. 27 SCHEDULER_DUPEFILTER_CLASS : str 28 Scheduler dupefilter class. 29 SCHEDULER_SERIALIZER : str 30 Scheduler serializer. 31 32 """ 33 34 def __init__(self, server, 35 persist=False, 36 flush_on_start=False, 37 queue_key=defaults.SCHEDULER_QUEUE_KEY, 38 queue_cls=defaults.SCHEDULER_QUEUE_CLASS, 39 dupefilter_key=defaults.SCHEDULER_DUPEFILTER_KEY, 40 dupefilter_cls=defaults.SCHEDULER_DUPEFILTER_CLASS, 41 idle_before_close=0, 42 serializer=None): 43 """Initialize scheduler. 44 45 Parameters 46 ---------- 47 server : Redis 48 The redis server instance. 49 persist : bool 50 Whether to flush requests when closing. Default is False. 51 flush_on_start : bool 52 Whether to flush requests on start. Default is False. 53 queue_key : str 54 Requests queue key. 55 queue_cls : str 56 Importable path to the queue class. 57 dupefilter_key : str 58 Duplicates filter key. 59 dupefilter_cls : str 60 Importable path to the dupefilter class. 61 idle_before_close : int 62 Timeout before giving up. 63 64 """ 65 if idle_before_close < 0: 66 raise TypeError("idle_before_close cannot be negative") 67 68 self.server = server 69 self.persist = persist 70 self.flush_on_start = flush_on_start 71 self.queue_key = queue_key 72 self.queue_cls = queue_cls 73 self.dupefilter_cls = dupefilter_cls 74 self.dupefilter_key = dupefilter_key 75 self.idle_before_close = idle_before_close 76 self.serializer = serializer 77 self.stats = None 78 79 def __len__(self): 80 return len(self.queue) 81 82 @classmethod 83 def from_settings(cls, settings): 84 kwargs = { 85 'persist': settings.getbool('SCHEDULER_PERSIST'), 86 'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'), 87 'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'), 88 } 89 90 # If these values are missing, it means we want to use the defaults. 91 optional = { 92 # TODO: Use custom prefixes for this settings to note that are 93 # specific to scrapy-redis. 94 'queue_key': 'SCHEDULER_QUEUE_KEY', 95 'queue_cls': 'SCHEDULER_QUEUE_CLASS', 96 'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY', 97 # We use the default setting name to keep compatibility. 98 'dupefilter_cls': 'DUPEFILTER_CLASS', 99 'serializer': 'SCHEDULER_SERIALIZER', 100 } 101 for name, setting_name in optional.items(): 102 val = settings.get(setting_name) 103 if val: 104 kwargs[name] = val 105 106 # Support serializer as a path to a module. 107 if isinstance(kwargs.get('serializer'), six.string_types): 108 kwargs['serializer'] = importlib.import_module(kwargs['serializer']) 109 110 server = connection.from_settings(settings) 111 # Ensure the connection is working. 
112 server.ping() 113 114 return cls(server=server, **kwargs) 115 116 @classmethod 117 def from_crawler(cls, crawler): 118 instance = cls.from_settings(crawler.settings) 119 # FIXME: for now, stats are only supported from this constructor 120 instance.stats = crawler.stats 121 return instance 122 123 def open(self, spider): 124 self.spider = spider 125 126 try: 127 self.queue = load_object(self.queue_cls)( 128 server=self.server, 129 spider=spider, 130 key=self.queue_key % {'spider': spider.name}, 131 serializer=self.serializer, 132 ) 133 except TypeError as e: 134 raise ValueError("Failed to instantiate queue class '%s': %s", 135 self.queue_cls, e) 136 137 try: 138 self.df = load_object(self.dupefilter_cls)( 139 server=self.server, 140 key=self.dupefilter_key % {'spider': spider.name}, 141 debug=spider.settings.getbool('DUPEFILTER_DEBUG'), 142 ) 143 except TypeError as e: 144 raise ValueError("Failed to instantiate dupefilter class '%s': %s", 145 self.dupefilter_cls, e) 146 147 if self.flush_on_start: 148 self.flush() 149 # notice if there are requests already in the queue to resume the crawl 150 if len(self.queue): 151 spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue)) 152 153 def close(self, reason): 154 if not self.persist: 155 self.flush() 156 157 def flush(self): 158 self.df.clear() 159 self.queue.clear() 160 161 def enqueue_request(self, request): 162 if not request.dont_filter and self.df.request_seen(request): 163 self.df.log(request, self.spider) 164 return False 165 if self.stats: 166 self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider) 167 self.queue.push(request) 168 return True 169 170 def next_request(self): 171 block_pop_timeout = self.idle_before_close 172 request = self.queue.pop(block_pop_timeout) 173 if request and self.stats: 174 self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider) 175 return request 176 177 def has_pending_requests(self): 178 return len(self) > 0
The most important methods in this class are enqueue_request and next_request:
When the crawler is running there is a single scheduler. Adding a task means calling enqueue_request, which pushes the request into the queue; when the downloader comes to fetch a task, next_request is called to pop one. Both paths therefore go through the queue.
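A stripped-down sketch of that interplay (not the real Scheduler, just the same push/pop pattern with an in-memory list as the queue and a set as the dupefilter):

class MiniScheduler(object):
    """Toy model of enqueue_request / next_request: dedup first, then queue."""

    def __init__(self):
        self.queue = []   # stand-in for the redis-backed queue
        self.seen = set() # stand-in for the redis dupefilter

    def enqueue_request(self, request):
        if request in self.seen:
            return False              # already visited: drop it
        self.seen.add(request)
        self.queue.insert(0, request)  # lpush
        return True

    def next_request(self):
        return self.queue.pop() if self.queue else None  # rpop


sched = MiniScheduler()
sched.enqueue_request('https://dig.chouti.com/')
sched.enqueue_request('https://dig.chouti.com/')          # filtered as a duplicate
sched.enqueue_request('https://dig.chouti.com/r/pic/hot/1')
print(sched.next_request())  # https://dig.chouti.com/
print(sched.next_request())  # https://dig.chouti.com/r/pic/hot/1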
b. SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue': the priority queue is used by default.
The queue is simply a structure in redis, stored under a key: SCHEDULER_QUEUE_KEY = '%(spider)s:requests' (the key under which the scheduler keeps its requests, built from the current spider's name). Because redis can only store strings/bytes, requests are serialized via SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat", which by default pickles the data saved to redis.
"""A pickle wrapper module with protocol=-1 by default."""

try:
    import cPickle as pickle  # PY2
except ImportError:
    import pickle


def loads(s):
    return pickle.loads(s)


def dumps(obj):
    return pickle.dumps(obj, protocol=-1)
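For example, a round-trip through picklecompat the way the scheduler serializes requests (a sketch; the dict below just mimics the shape that request_to_dict produces):

from scrapy_redis import picklecompat

# Roughly the shape request_to_dict produces for a Request
obj = {'url': 'https://dig.chouti.com/', 'callback': 'parse', 'method': 'GET'}

data = picklecompat.dumps(obj)   # bytes, safe to lpush into a redis list
print(type(data))                # <class 'bytes'>
print(picklecompat.loads(data))  # back to the original dict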
c. SCHEDULER_IDLE_BEFORE_CLOSE = 10 # maximum time to wait when fetching from the scheduler and the queue is empty (give up if nothing arrives); in other words, tasks are fetched with a blocking pop.
import redis

conn = redis.Redis(host='140.143.227.206', port=8888, password='beta')
# conn.flushall()
print(conn.keys())  # e.g. chouti:dupefilter / chouti:requests

# conn.lpush('xxx:request', 'http://wwww.xxx.com')
# conn.lpush('xxx:request', 'http://wwww.xxx1.com')
# print(conn.lpop('xxx:request'))
# print(conn.blpop('xxx:request', timeout=10))  # blocking pop with a 10-second timeout
The overall execution flow:
1. Run scrapy crawl chouti --nolog

2. Find the SCHEDULER = "scrapy_redis.scheduler.Scheduler" setting and instantiate the scheduler object
    - Scheduler.from_crawler is executed
        @classmethod
        def from_crawler(cls, crawler):
            instance = cls.from_settings(crawler.settings)
            # FIXME: for now, stats are only supported from this constructor
            instance.stats = crawler.stats
            return instance
    - Scheduler.from_settings is executed
        @classmethod
        def from_settings(cls, settings):
            kwargs = {
                'persist': settings.getbool('SCHEDULER_PERSIST'),
                'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'),
                'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'),
            }

            # If these values are missing, it means we want to use the defaults.
            optional = {
                # TODO: Use custom prefixes for this settings to note that are
                # specific to scrapy-redis.
                'queue_key': 'SCHEDULER_QUEUE_KEY',
                'queue_cls': 'SCHEDULER_QUEUE_CLASS',
                'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY',
                # We use the default setting name to keep compatibility.
                'dupefilter_cls': 'DUPEFILTER_CLASS',
                'serializer': 'SCHEDULER_SERIALIZER',
            }
            for name, setting_name in optional.items():
                val = settings.get(setting_name)
                if val:
                    kwargs[name] = val

            # Support serializer as a path to a module.
            if isinstance(kwargs.get('serializer'), six.string_types):
                kwargs['serializer'] = importlib.import_module(kwargs['serializer'])

            server = connection.from_settings(settings)
            # Ensure the connection is working.
            server.ping()

            return cls(server=server, **kwargs)
    - Settings that are read:
        SCHEDULER_PERSIST            # keep the scheduler queue and dedup records on close (True = keep, False = flush)
        SCHEDULER_FLUSH_ON_START     # flush the scheduler queue and dedup records on start (True = flush, False = keep)
        SCHEDULER_IDLE_BEFORE_CLOSE  # max time to wait when the queue is empty and nothing arrives
    - Settings that are read:
        SCHEDULER_QUEUE_KEY          # %(spider)s:requests
        SCHEDULER_QUEUE_CLASS        # scrapy_redis.queue.FifoQueue
        SCHEDULER_DUPEFILTER_KEY     # '%(spider)s:dupefilter'
        DUPEFILTER_CLASS             # 'scrapy_redis.dupefilter.RFPDupeFilter'
        SCHEDULER_SERIALIZER         # "scrapy_redis.picklecompat"
    - Settings that are read:
        REDIS_HOST = '140.143.227.206'      # host
        REDIS_PORT = 8888                   # port
        REDIS_PARAMS = {'password':'beta'}  # Redis connection parameters. Default: REDIS_PARAMS = {'socket_timeout': 30,'socket_connect_timeout': 30,'retry_on_timeout': True,'encoding': REDIS_ENCODING,}
        REDIS_ENCODING = "utf-8"
    - The Scheduler object is instantiated

3. The spider starts crawling the start URLs
    - scheduler.enqueue_request() is called
        def enqueue_request(self, request):
            # Does this request need to be filtered?
            # Is it already in the dedup records (i.e. already visited)? If not, add it to the records.
            if not request.dont_filter and self.df.request_seen(request):
                self.df.log(request, self.spider)
                # Already visited, do not visit it again
                return False
            if self.stats:
                self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider)
            # print('not visited yet, pushed into the scheduler', request)
            self.queue.push(request)
            return True

4. The downloader fetches a task from the scheduler and downloads it
    - scheduler.next_request() is called
        def next_request(self):
            block_pop_timeout = self.idle_before_close
            request = self.queue.pop(block_pop_timeout)
            if request and self.stats:
                self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider)
            return request
A second way to switch between depth-first and breadth-first:

# Order in which tasks are stored
# Priority
DEPTH_PRIORITY = 1    # breadth-first
# DEPTH_PRIORITY = -1 # depth-first
Note: while crawling, the priority is adjusted so that deeper requests get a lower priority: request.priority -= depth * self.prio. With DEPTH_PRIORITY = 1, the priority keeps decreasing as depth grows; scrapy_redis then stores score = -request.priority, so the score keeps increasing. Popping from the smallest score upwards therefore serves shallow requests first, which is breadth-first; flipping the sign (DEPTH_PRIORITY = -1) gives depth-first.
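A small numeric sketch of that interaction, using the two formulas quoted above (pure arithmetic, nothing scrapy-specific):

DEPTH_PRIORITY = 1   # try -1 to see the depth-first ordering

for depth in range(4):
    priority = 0 - depth * DEPTH_PRIORITY  # request.priority -= depth * prio
    score = -priority                       # score = -request.priority
    print(depth, priority, score)

# With DEPTH_PRIORITY = 1 the score grows with depth, so popping from the
# lowest score upwards serves shallow requests first (breadth-first);
# with DEPTH_PRIORITY = -1 the score shrinks with depth (depth-first).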
What is the relationship between the scheduler, the queue and the dupefilter in scrapy?
The scheduler coordinates adding requests and handing them out.
The queue stores the requests.
The dupefilter keeps the record of what has already been visited.
Note:

# DUPEFILTER_CLASS is used first; only if it is not set does SCHEDULER_DUPEFILTER_CLASS apply
SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'  # class that implements the dedup rule
(4).RedisSpider (scrapy_redis.spiders)
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from scrapy_redis.spiders import RedisSpider


class ChoutiSpider(RedisSpider):
    name = 'chouti'
    allowed_domains = ['chouti.com']

    def parse(self, response):
        print(response)
Note: we import RedisSpider from scrapy_redis.spiders and make the spider inherit from it; there is no start_urls attribute, because a RedisSpider reads its start URLs from a redis queue when it is idle.
class RedisSpider(RedisMixin, Spider):
    """Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from..
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
        Default Redis key where to fetch start URLs from..
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: False)
        Use SET operations to retrieve messages from the redis queue. If False,
        the messages are retrieve using the LPOP command.
    REDIS_ENCODING : str (default: "utf-8")
        Default encoding to use when decoding messages from redis queue.

    """

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        obj = super(RedisSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj
This controls whether start URLs are taken from a redis list or a redis set:
fetch_one = self.server.spop if use_set else self.server.lpop
In the settings file:
REDIS_START_URLS_AS_SET = False
import redis

conn = redis.Redis(host='140.143.227.206', port=8888, password='beta')
# Push a start URL; the idle spider picks it up and crawls it, so you can keep
# feeding start URLs in to drive the crawl continuously.
conn.lpush('chouti:start_urls', 'https://dig.chouti.com/r/pic/hot/1')
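If REDIS_START_URLS_AS_SET is set to True, start URLs are read with spop instead of lpop, so they would be fed in as a set instead; a sketch with the same connection:

import redis

conn = redis.Redis(host='140.143.227.206', port=8888, password='beta')
# With REDIS_START_URLS_AS_SET = True the spider reads start URLs with spop,
# so push them with sadd; duplicate URLs are collapsed automatically.
conn.sadd('chouti:start_urls', 'https://dig.chouti.com/r/pic/hot/1')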
(5).The settings file
1.BOT_NAME = 'cjk'                          # name of the crawler project
2.SPIDER_MODULES = ['cjk.spiders']          # directory where the spiders live
3.NEWSPIDER_MODULE = 'cjk.spiders'          # directory where newly generated spiders are placed
4.USER_AGENT = 'dbd (+http://www.yourdomain.com)'   # User-Agent request header
5.ROBOTSTXT_OBEY = False                    # True means respect the target site's robots.txt; False means ignore it
6.CONCURRENT_REQUESTS = 32                  # total concurrent requests across all spiders; with two spiders one might get 15 and the other 17
7.DOWNLOAD_DELAY = 3                        # delay 3 seconds between page downloads
8.CONCURRENT_REQUESTS_PER_DOMAIN = 16       # 16 concurrent requests per domain, i.e. roughly per spider
9.CONCURRENT_REQUESTS_PER_IP = 16           # a domain may resolve to several IPs; with 2 IPs the total becomes 16*2=32
10.COOKIES_ENABLED = False                  # whether scrapy handles cookies for us internally
11.TELNETCONSOLE_ENABLED = True             # lets you connect to the running crawler so it can be paused and resumed
   # TELNETCONSOLE_HOST = '127.0.0.1'
   # TELNETCONSOLE_PORT = [6023,]
12.DEFAULT_REQUEST_HEADERS                  # default request headers
13.# AUTOTHROTTLE_ENABLED = True
   # The initial download delay
   # AUTOTHROTTLE_START_DELAY = 5
   # The maximum download delay to be set in case of high latencies
   # AUTOTHROTTLE_MAX_DELAY = 60
   # The average number of requests Scrapy should be sending in parallel to
   # each remote server
   # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
   # Enable showing throttling stats for every response received:
   # AUTOTHROTTLE_DEBUG = False

   from scrapy.contrib.throttle import AutoThrottle
   """
   Auto-throttling algorithm (AutoThrottle):
   1. Get the minimum delay: DOWNLOAD_DELAY
   2. Get the maximum delay: AUTOTHROTTLE_MAX_DELAY
   3. Set the initial download delay: AUTOTHROTTLE_START_DELAY
   4. When a request finishes downloading, take its "connection" latency, i.e. the time
      from sending the request until the response headers are received
   5. Compute the new delay using AUTOTHROTTLE_TARGET_CONCURRENCY:
      target_delay = latency / self.target_concurrency
      new_delay = (slot.delay + target_delay) / 2.0      # slot.delay is the previous delay
      new_delay = max(target_delay, new_delay)
      new_delay = min(max(self.mindelay, new_delay), self.maxdelay)
      slot.delay = new_delay
   """
14.# HTTPCACHE_ENABLED = True               # whether to enable the HTTP cache
   # HTTPCACHE_EXPIRATION_SECS = 0
   # HTTPCACHE_DIR = 'httpcache'
   # HTTPCACHE_IGNORE_HTTP_CODES = []
   # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
   # For example, if you are about to be offline (say, on the subway), you can download the pages
   # to disk beforehand and then read them back from the local httpcache directory instead of the network.
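As a worked example of the throttling formulas above (pure arithmetic, with made-up values for the latency and the settings):

# Assumed settings (hypothetical values, for illustration only)
DOWNLOAD_DELAY = 3.0                  # minimum delay
AUTOTHROTTLE_MAX_DELAY = 60.0         # maximum delay
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

slot_delay = 5.0   # AUTOTHROTTLE_START_DELAY, i.e. the previous delay
latency = 2.0      # time from sending the request to receiving the response headers

target_delay = latency / AUTOTHROTTLE_TARGET_CONCURRENCY                 # 2.0
new_delay = (slot_delay + target_delay) / 2.0                            # 3.5
new_delay = max(target_delay, new_delay)                                 # 3.5
new_delay = min(max(DOWNLOAD_DELAY, new_delay), AUTOTHROTTLE_MAX_DELAY)  # 3.5
print(new_delay)   # the delay used for the next request on this slot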