Redis is a key-value store. It is similar to Memcached, but supports a richer set of value types, including string, list, set, zset (sorted set), and hash. These data types support push/pop, add/remove, set intersection, union and difference, and many other operations, all of which are atomic. On top of that, Redis offers several ways of sorting. Like memcached, data is kept in memory for performance; the difference is that Redis periodically writes updated data to disk, or appends modification operations to a log file, and on this basis implements master-slave replication.
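For reference, a minimal redis-py sketch of those data types (assumes a local Redis server and the redis package; the key names are made up for illustration):

import redis

r = redis.Redis(host='127.0.0.1', port=6379)

r.set('site:name', 'chouti')                 # string
r.lpush('news:list', 'a', 'b', 'c')          # list (push/pop)
r.sadd('tags', 'python', 'scrapy')           # set (add/remove, union/intersection/difference)
r.zadd('rank', {'item1': 1, 'item2': 2})     # sorted set with scores
r.hset('user:1', 'name', 'alice')            # hash field

print(r.lpop('news:list'))                   # atomic pop from the head, e.g. b'c'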
Reference blog: https://www.cnblogs.com/wupeiqi/articles/5132791.html
settings configuration
# ############ Redis connection info #################
REDIS_HOST = '127.0.0.1'                            # host
REDIS_PORT = 6379                                   # port
# REDIS_URL = 'redis://user:pass@hostname:9001'     # connection URL (takes precedence over the settings above)
REDIS_PARAMS = {}                                   # Redis connection parameters; default: {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
# REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient'  # Python class used to connect to Redis; default: redis.StrictRedis
REDIS_ENCODING = "utf-8"

DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"  # Redis-based dedup filter

# Custom scheduler, driven by the engine
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'  # priority queue by default (breadth-first); alternatives: PriorityQueue (sorted set), FifoQueue (list), LifoQueue (list)
SCHEDULER_QUEUE_KEY = '%(spider)s:requests'         # Redis key under which the scheduler stores pending requests
SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"  # serializer for data saved to Redis; pickle by default
SCHEDULER_PERSIST = False                           # keep the scheduler queue and dedup records on close; True = keep, False = flush
SCHEDULER_FLUSH_ON_START = True                     # flush the scheduler queue and dedup records on start; True = flush, False = keep
SCHEDULER_IDLE_BEFORE_CLOSE = 10                    # when the scheduler queue is empty, maximum time to wait before giving up
SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'  # Redis key for the dedup records, e.g. chouti:dupefilter
SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'  # class that implements the dedup rule
DUPEFILTER_DEBUG = False

# depth / priority
DEPTH_PRIORITY = 1

# REDIS_START_URLS_BATCH_SIZE = 1
# REDIS_START_URLS_AS_SET = True   # store start URLs in a Redis set
REDIS_START_URLS_AS_SET = False    # store start URLs in a Redis list
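Because these settings decide where the scheduler and the dedup filter keep their state, a quick hedged check like the one below (assuming the spider named 'spider1' defined next, so the keys become spider1:requests and spider1:dupefilter) can confirm that a running crawl is actually writing to Redis:

import redis

conn = redis.Redis(host='127.0.0.1', port=6379)
print(conn.zcard('spider1:requests'))      # pending requests (PriorityQueue is a sorted set)
print(conn.scard('spider1:dupefilter'))    # request fingerprints recorded by RFPDupeFilter

The spider itself subclasses RedisSpider and reads its start URLs from the chouti:start_urls key: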
# -*- coding: utf-8 -*-
from scrapy_redis.spiders import RedisSpider
from scrapy.http.request import Request
from ..items import BigfileItem


class Spider1Spider(RedisSpider):
    name = 'spider1'
    redis_key = 'chouti:start_urls'
    allowed_domains = ['chouti.com']

    def parse(self, response):
        print(response)
        req = Request(
            url='https://dig.chouti.com/login',
            method='POST',
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                     'referer': 'https://dig.chouti.com/'},
            body='phone=************&password=************&oneMonth=1',
            meta={"cookiejar": True},
            callback=self.check_login,
        )
        yield req

    def check_login(self, response):
        # login-response callback; its body is not shown in the original post
        print(response.text)
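Because this is a RedisSpider, starting it with scrapy crawl spider1 does not fetch anything right away: the spider idles and waits for a start URL to appear under the chouti:start_urls key, which the start.py script below provides.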
Write a start.py file that connects to the Redis server and pushes the initial URL.
import redis

conn = redis.Redis(host='127.0.0.1', port=6379)

# key the spider reads its start URLs from: chouti:start_urls
conn.lpush("chouti:start_urls", 'https://dig.chouti.com')
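To confirm the URL actually landed in the list (the spider pops it off once it starts, so check before the crawl picks it up), a quick read-back using the same conn object works:

# returns e.g. [b'https://dig.chouti.com'] while the URL is still queued
print(conn.lrange("chouti:start_urls", 0, -1))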
items
import scrapy


class BigfileItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    url = scrapy.Field()
    type = scrapy.Field()
    file_name = scrapy.Field()
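The pipeline below looks at item['type'], item['url'] and item['file_name'], so a spider callback would yield the item roughly like this (a hedged sketch; the original post does not show this part, and the callback name and URL are placeholders):

    # Hypothetical callback inside the spider class above: it hands a direct
    # file URL to BigfilePipeline via the fields defined in BigfileItem.
    def parse_file_page(self, response):
        yield BigfileItem(
            type='file',                               # 'file' triggers the streaming-download branch
            url='https://example.com/files/big.pdf',   # must carry the http/https scheme
            file_name='big.pdf',                       # local filename the pipeline writes to
        )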
pipelines: download the file piece by piece, modeled on the source code's approach
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from twisted.web.client import Agent, ResponseDone, PotentialDataLoss
from twisted.python import log, failure, components
from twisted.internet import defer, reactor, protocol
from twisted.internet import interfaces, error
from .middlewares import to_bytes

connectionDone = failure.Failure(error.ConnectionDone())
connectionDone.cleanFailure()


class _ResponseReader(protocol.Protocol):
    def __init__(self, finished, txresponse, file_name):
        self._finished = finished
        self._txresponse = txresponse
        self._bytes_received = 0
        self.f = open(file_name, mode='wb')

    def dataReceived(self, bodyBytes):
        self._bytes_received += len(bodyBytes)
        # write the body to disk chunk by chunk
        self.f.write(bodyBytes)
        self.f.flush()

    def connectionLost(self, reason=connectionDone):
        if self._finished.called:
            return
        if reason.check(ResponseDone):
            # download finished
            self._finished.callback((self._txresponse, 'success'))
        elif reason.check(PotentialDataLoss):
            # partial download
            self._finished.callback((self._txresponse, 'partial'))
        else:
            # download failed
            self._finished.errback(reason)
        self.f.close()


class BigfilePipeline(object):
    def process_item(self, item, spider):
        # create a download task for the file
        """
        The url must include the http or https scheme, otherwise the request fails.
        """
        if item['type'] == 'file':
            agent = Agent(reactor)
            print("Start downloading...")
            d = agent.request(
                method=b'GET',
                uri=bytes(item['url'], encoding='ascii')
            )
            # once the response headers arrive, self._cb_bodyready is called automatically
            d.addCallback(self._cb_bodyready, file_name=item['file_name'])
            return d
        else:
            return item

    def _cb_bodyready(self, txresponse, file_name):
        # create a Deferred that fires only after the body download finishes,
        # so the connection is not closed early
        d = defer.Deferred()
        d.addBoth(self.download_result)  # callback run on completion, partial download, or error
        txresponse.deliverBody(_ResponseReader(d, txresponse, file_name))
        return d

    def download_result(self, response):
        pass
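The pipeline only runs once it is registered in ITEM_PIPELINES in settings.py; the module path below is an assumption (adjust it to the actual project name):

ITEM_PIPELINES = {
    'myproject.pipelines.BigfilePipeline': 300,   # 'myproject' is a placeholder
}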
Proxy setup, method 1:
Set the proxy in start_requests by adding it to the environment variables.
os.environ['HTTP_PROXY'] = "http://192.168.11.11"
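A minimal sketch of that idea, assuming Scrapy's built-in HttpProxyMiddleware is enabled and reusing the placeholder address above (the spider name and URL are illustrative only):

import os
import scrapy


class EnvProxySpider(scrapy.Spider):
    name = 'env_proxy_spider'          # hypothetical spider used only for illustration
    start_urls = ['https://dig.chouti.com']

    def start_requests(self):
        # Scrapy's HttpProxyMiddleware reads HTTP_PROXY/HTTPS_PROXY from the environment
        os.environ['HTTP_PROXY'] = "http://192.168.11.11"
        os.environ['HTTPS_PROXY'] = "http://192.168.11.11"
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        print(response.status)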
Method 2:
Write a custom downloader middleware.
import random
import base64

import six


def to_bytes(text, encoding=None, errors='strict'):
    """Return the binary representation of `text`. If `text`
    is already a bytes object, return it as-is."""
    if isinstance(text, bytes):
        return text
    if not isinstance(text, six.string_types):
        raise TypeError('to_bytes must receive a unicode, str or bytes '
                        'object, got %s' % type(text).__name__)
    if encoding is None:
        encoding = 'utf-8'
    return text.encode(encoding, errors)


class MyProxyDownloaderMiddleware(object):
    def process_request(self, request, spider):
        proxy_list = [
            {'ip_port': '111.11.228.75:80', 'user_pass': ''},
            {'ip_port': '120.198.243.22:80', 'user_pass': ''},
            {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
            {'ip_port': '101.71.27.120:80', 'user_pass': ''},
            {'ip_port': '122.96.59.104:80', 'user_pass': ''},
        ]
        proxy = random.choice(proxy_list)
        request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
        if proxy['user_pass']:
            # only send credentials when the proxy actually requires them
            encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass']))
            request.headers['Proxy-Authorization'] = b'Basic ' + encoded_user_pass
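Like any downloader middleware, it only takes effect once it is enabled in settings.py; the module path here is an assumption based on the file living in middlewares.py:

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.MyProxyDownloaderMiddleware': 543,   # 'myproject' is a placeholder
}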