Simple applications of Scrapy and Redis

PART 1  Introduction to Redis

Redis is a key-value storage system. It is similar to Memcached, but supports a richer set of value types, including string, list, set, zset (sorted set) and hash. These data types support push/pop, add/remove, set intersection, union and difference, and other richer operations, and all of these operations are atomic. On top of that, Redis supports several different ways of sorting. As with Memcached, data is kept in memory for efficiency. The difference is that Redis periodically writes updated data to disk, or appends modification operations to a log file, and on this basis implements master-slave replication.
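As a quick illustration, here is a minimal sketch using the redis-py client that touches each of the data types mentioned above (the key names are made up; it assumes a local Redis server on the default port):

import redis

# Assumes a Redis server running on localhost:6379.
conn = redis.Redis(host='127.0.0.1', port=6379)

conn.set('site', 'chouti')                       # string
conn.lpush('pages', 'p1', 'p2')                  # list (push/pop)
conn.sadd('tags', 'python', 'scrapy')            # set (union/intersection/difference)
conn.zadd('rank', {'a': 1, 'b': 2})              # zset (sorted set)
conn.hset('user:1', 'name', 'alice')             # hash

print(conn.get('site'))
print(conn.zrange('rank', 0, -1, withscores=True))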

Reference blog: https://www.cnblogs.com/wupeiqi/articles/5132791.html

PART 2  Starting a crawler based on Redis

settings.py configuration:

# ############ Redis connection info #################
REDIS_HOST = '127.0.0.1'                            # Host
REDIS_PORT = 6379                                   # Port
# REDIS_URL = 'redis://user:pass@hostname:9001'       # Connection URL (takes precedence over the settings above)
REDIS_PARAMS = {}                                   # Redis connection parameters. Default: {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
# REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient' # Redis client class to use. Default: redis.StrictRedis
REDIS_ENCODING = "utf-8"

DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"  # Redis-based dedup filter
# Custom scheduler, driven by the engine
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'  # Queue type. Default: PriorityQueue (sorted set); alternatives: FifoQueue (list, breadth-first) and LifoQueue (list, depth-first)
SCHEDULER_QUEUE_KEY = '%(spider)s:requests'  # Redis key under which the scheduler stores pending requests
SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"  # Serializer for data written to Redis; pickle by default
SCHEDULER_PERSIST = False  # Whether to keep the scheduler queue and dedup records on close. True = keep, False = flush
SCHEDULER_FLUSH_ON_START = True  # Whether to flush the scheduler queue and dedup records on start. True = flush, False = keep
SCHEDULER_IDLE_BEFORE_CLOSE = 10  # Seconds to wait for data when the scheduler queue is empty before giving up
SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'  # Redis key for the dedup records, e.g. chouti:dupefilter
SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'  # Class implementing the dedup rule
DUPEFILTER_DEBUG = False


# Depth and priority
DEPTH_PRIORITY = 1

REDIS_START_URLS_BATCH_SIZE = 1
# REDIS_START_URLS_AS_SET = True   # store the start URLs in a Redis set
REDIS_START_URLS_AS_SET = False    # store the start URLs in a Redis list
The spider inherits from RedisSpider and reads its start URLs from redis_key:

# -*- coding: utf-8 -*-
from scrapy_redis.spiders import RedisSpider
from scrapy.http.request import Request
from ..items import BigfileItem


class Spider1Spider(RedisSpider):
    name = 'spider1'
    redis_key = 'chouti:start_urls'
    allowed_domains = ['chouti.com']
    
    def parse(self, response):
        print(response)
        req = Request(
            url='https://dig.chouti.com/login',
            method='POST',
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                     'referer': 'https://dig.chouti.com/'},
            body='phone=************&password=************&oneMonth=1',
            meta={"cookiejar": True},
            callback=self.check_login,
        )
        yield req
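
    def check_login(self, response):
        # Hypothetical callback: the original post does not show its body.
        # Inspect the login response here and yield any follow-up requests.
        print(response.text)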

Write a start.py script that connects to the Redis server and pushes the initial URL:

import redis

conn = redis.Redis(host='127.0.0.1', port=6379)

# Key for the start URLs: chouti:start_urls
conn.lpush("chouti:start_urls", 'https://dig.chouti.com')
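A possible run order, assuming redis-server is already running locally: start the crawler first with scrapy crawl spider1 (it idles, waiting on chouti:start_urls), then run start.py to push the start URL. Afterwards you can peek at the keys scrapy-redis maintains, for example:

import redis

conn = redis.Redis(host='127.0.0.1', port=6379)

# Keys created by scrapy-redis for the spider named 'spider1'
# (they follow SCHEDULER_QUEUE_KEY / SCHEDULER_DUPEFILTER_KEY above).
print(conn.keys('spider1:*'))            # e.g. spider1:requests, spider1:dupefilter
print(conn.zcard('spider1:requests'))    # pending requests in the priority queue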

PART 3  Downloading large files

items.py:

import scrapy
class BigfileItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    url = scrapy.Field()
    type = scrapy.Field()
    file_name = scrapy.Field()
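
For context, a minimal sketch of how a spider callback could yield this item so the pipeline below picks it up (the URL and file name are placeholders, not from the original post):

def parse(self, response):
    # Hypothetical example: queue a large file for BigfilePipeline.
    yield BigfileItem(
        url='https://example.com/big.zip',  # must include the http/https prefix
        type='file',                        # any other type is passed through unchanged
        file_name='big.zip',
    )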

pipelines.py, downloading the file bit by bit, based on the approach used in Scrapy's source code:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from twisted.web.client import Agent, ResponseDone, PotentialDataLoss
from twisted.python import failure
from twisted.internet import defer, reactor, protocol, error

connectionDone = failure.Failure(error.ConnectionDone())
connectionDone.cleanFailure()


class _ResponseReader(protocol.Protocol):

    def __init__(self, finished, txresponse, file_name):
        self._finished = finished
        self._txresponse = txresponse
        self._bytes_received = 0
        self.f = open(file_name, mode='wb')

    def dataReceived(self, bodyBytes):
        self._bytes_received += len(bodyBytes)

        # Write each chunk to disk as it arrives
        self.f.write(bodyBytes)

        self.f.flush()

    def connectionLost(self, reason=connectionDone):
        if self._finished.called:
            return
        if reason.check(ResponseDone):
            # Download finished
            self._finished.callback((self._txresponse, 'success'))
        elif reason.check(PotentialDataLoss):
            # Partial download (potential data loss)
            self._finished.callback((self._txresponse, 'partial'))
        else:
            # Download failed
            self._finished.errback(reason)

        self.f.close()


class BigfilePipeline(object):
    def process_item(self, item, spider):
        # Create a file-download task
        """
        The url must start with http or https, otherwise the request fails.
        """
        if item['type'] == 'file':
            agent = Agent(reactor)
            print("開始下載....")
            d = agent.request(
                method=b'GET',
                uri=bytes(item['url'], encoding='ascii')
            )
            # Once the response starts arriving, self._cb_bodyready runs automatically
            d.addCallback(self._cb_bodyready, file_name=item['file_name'])

            return d
        else:
            return item

    def _cb_bodyready(self, txresponse, file_name):
        # Create a Deferred so the connection stays open until the download has finished
        d = defer.Deferred()
        d.addBoth(self.download_result)  # Callback run after completion, partial download or error
        txresponse.deliverBody(_ResponseReader(d, txresponse, file_name))
        return d

    def download_result(self, response):
        pass
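
The pipeline also has to be enabled in settings.py; a short sketch, where the dotted path is an assumption to adjust to your project name:

ITEM_PIPELINES = {
    'myproject.pipelines.BigfilePipeline': 300,
}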

PART 4  Adding a proxy via downloader middleware

Method 1:

Set the proxy as an environment variable in start_requests:

os.environ['HTTP_PROXY'] = "http://192.168.11.11"
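
A minimal sketch of Method 1 (spider name and target URL are placeholders); since Scrapy's built-in HttpProxyMiddleware reads these environment variables when it starts up, the sketch sets them at import time rather than inside start_requests:

import os
import scrapy

# Proxy address taken from the snippet above; HTTPS_PROXY is an added assumption.
os.environ['HTTP_PROXY'] = "http://192.168.11.11"
os.environ['HTTPS_PROXY'] = "http://192.168.11.11"


class ProxyDemoSpider(scrapy.Spider):
    name = 'proxy_demo'                      # placeholder spider name
    start_urls = ['https://dig.chouti.com']  # placeholder URL

    def parse(self, response):
        print(response.status)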
Method 2:
Override the downloader middleware (middlewares.py):
import random
import base64
import six


def to_bytes(text, encoding=None, errors='strict'):
    """Return the binary representation of `text`. If `text`
    is already a bytes object, return it as-is."""
    if isinstance(text, bytes):
        return text
    if not isinstance(text, six.string_types):
        raise TypeError('to_bytes must receive a unicode, str or bytes '
                        'object, got %s' % type(text).__name__)
    if encoding is None:
        encoding = 'utf-8'
    return text.encode(encoding, errors)


class MyProxyDownloaderMiddleware(object):
    def process_request(self, request, spider):
        proxy_list = [
            {'ip_port': '111.11.228.75:80', 'user_pass': ''},
            {'ip_port': '120.198.243.22:80', 'user_pass': ''},
            {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
            {'ip_port': '101.71.27.120:80', 'user_pass': ''},
            {'ip_port': '122.96.59.104:80', 'user_pass': ''},
        ]
        proxy = random.choice(proxy_list)
        request.meta['proxy'] = "http://%s" % proxy['ip_port']
        if proxy['user_pass']:
            # Attach HTTP Basic auth credentials for the proxy
            encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass']))
            request.headers['Proxy-Authorization'] = b'Basic ' + encoded_user_pass
The middleware above goes in middlewares.py.
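It also needs to be enabled in settings.py; a sketch, with the dotted path assumed to match your project name:

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.MyProxyDownloaderMiddleware': 543,
}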
