8.scrapy配置文件

scrapy配置文件

1. settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for step8_king project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

# 1. 爬蟲名稱
BOT_NAME = 'step8_king'

# 2. 爬蟲應用路徑
SPIDER_MODULES = ['step8_king.spiders']
NEWSPIDER_MODULE = 'step8_king.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# 3. 客戶端 user-agent請求頭
# USER_AGENT = 'step8_king (+http://www.yourdomain.com)'

# Obey robots.txt rules
# 4. 禁止爬蟲配置
# ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# 5. 併發請求數
# CONCURRENT_REQUESTS = 4

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# 6. 延遲下載秒數
# DOWNLOAD_DELAY = 2


# The download delay setting will honor only one of:
# 7. 單域名訪問併發數,而且延遲下次秒數也應用在每一個域名
# CONCURRENT_REQUESTS_PER_DOMAIN = 2
# 單IP訪問併發數,若是有值則忽略:CONCURRENT_REQUESTS_PER_DOMAIN,而且延遲下次秒數也應用在每一個IP
# CONCURRENT_REQUESTS_PER_IP = 3


# Disable cookies (enabled by default)
# 8. 是否支持cookie,cookiejar進行操做cookie
# COOKIES_ENABLED = True
# COOKIES_DEBUG = True


# Disable Telnet Console (enabled by default)
# 9. Telnet用於查看當前爬蟲的信息,操做爬蟲等...
#    使用telnet ip port ,而後經過命令操做
# TELNETCONSOLE_ENABLED = True
# TELNETCONSOLE_HOST = '127.0.0.1'
# TELNETCONSOLE_PORT = [6023,]
# 命令est()



# 10. 默認請求頭(優先級低於request對象中的請求頭)
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }



# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# 11. 定義pipeline處理請求 值越小優先級越高 0-1000
# ITEM_PIPELINES = {
#    'step8_king.pipelines.JsonPipeline': 700,
#    'step8_king.pipelines.FilePipeline': 500,
# }




# 12. 自定義擴展,基於信號進行調用
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     # 'step8_king.extensions.MyExtension': 500,
# }




# 13. 爬蟲容許的最大深度,能夠經過meta查看當前深度;0表示無深度
# DEPTH_LIMIT = 3



# 14. 爬取時,0表示深度優先Lifo(默認);1表示廣度優先FiFo



# 後進先出,深度優先
# DEPTH_PRIORITY = 0
# 基於硬盤的 DISK
# SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleLifoDiskQueue'
# 基於內存的 MEMORY
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.LifoMemoryQueue'


# 先進先出,廣度優先
# DEPTH_PRIORITY = 1
# SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleFifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.FifoMemoryQueue'

# 15. 調度器隊列
# SCHEDULER = 'scrapy.core.scheduler.Scheduler'  這是一個類
# from scrapy.core.scheduler import Scheduler


# 16. 訪問URL去重
# DUPEFILTER_CLASS = 'step8_king.duplication.RepeatUrl'


# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html

"""
18. 啓用緩存  通常不太用
    目的用於將已經發送的請求或相應緩存下來,以便之後使用,
    
    from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
    from scrapy.extensions.httpcache import DummyPolicy
    from scrapy.extensions.httpcache import FilesystemCacheStorage
"""
# 是否啓用緩存策略
# HTTPCACHE_ENABLED = True

# 緩存策略:全部請求均緩存,下次在請求直接訪問原來的緩存便可
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"
# 緩存策略:根據Http響應頭:Cache-Control、Last-Modified 等進行緩存的策略
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.RFC2616Policy"

# 緩存超時時間
# HTTPCACHE_EXPIRATION_SECS = 0

# 緩存保存路徑
# HTTPCACHE_DIR = 'httpcache'

# 緩存忽略的Http狀態碼
# HTTPCACHE_IGNORE_HTTP_CODES = []

# 緩存存儲的插件
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

2. 自動限速算法

"""
17. 自動限速算法
    from scrapy.contrib.throttle import AutoThrottle
    自動限速設置
    1. 獲取最小延遲 DOWNLOAD_DELAY
    2. 獲取最大延遲 AUTOTHROTTLE_MAX_DELAY
    3. 設置初始下載延遲 AUTOTHROTTLE_START_DELAY
    4. 當請求下載完成後,獲取其"鏈接"時間 latency,即:請求鏈接到接受到響應頭之間的時間
    5. 用於計算的... AUTOTHROTTLE_TARGET_CONCURRENCY
    target_delay = latency / self.target_concurrency
    new_delay = (slot.delay + target_delay) / 2.0 # 表示上一次的延遲時間
    new_delay = max(target_delay, new_delay)
    new_delay = min(max(self.mindelay, new_delay), self.maxdelay)
    slot.delay = new_delay
"""



# 開始自動限速
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# 初始下載延遲
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# 最大下載延遲
# AUTOTHROTTLE_MAX_DELAY = 10
# The average number of requests Scrapy should be sending in parallel to each remote server
# 平均每秒併發數
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

# Enable showing throttling stats for every response received:
# 是否顯示
# AUTOTHROTTLE_DEBUG = True

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings

"""

3. 設置代理

3.1 scrapy默認代理

在環境變量中設置代理html

# 通常不用,取代理費事

from scrapy.contrib.downloadermiddleware.httpproxy import HttpProxyMiddleware

方式一:使用默認
os.environ = 
{
    http_proxy:http://root:woshiniba@192.168.11.11:9999/
    https_proxy:http://192.168.11.11:9999/
}

3.2 自定義scrapy代理

def to_bytes(text, encoding=None, errors='strict'):
        if isinstance(text, bytes):
            return text
        if not isinstance(text, six.string_types):
            raise TypeError('to_bytes must receive a unicode, str or bytes '
                            'object, got %s' % type(text).__name__)
        if encoding is None:
            encoding = 'utf-8'
        return text.encode(encoding, errors)
        
    class ProxyMiddleware(object):
        def process_request(self, request, spider):
            # 這裏是寫死的代理,能夠經過一個函數獲取
            PROXIES = [
                {'ip_port': '111.11.228.75:80', 'user_pass': ''},
                {'ip_port': '120.198.243.22:80', 'user_pass': ''},
                {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
                {'ip_port': '101.71.27.120:80', 'user_pass': ''},
                {'ip_port': '122.96.59.104:80', 'user_pass': ''},
                {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
            ]
            proxy = random.choice(PROXIES)
            if proxy['user_pass'] is not None:
                request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
                encoded_user_pass = base64.encodestring(to_bytes(proxy['user_pass']))
                request.headers['Proxy-Authorization'] = to_bytes('Basic ' + encoded_user_pass)
                print "**************ProxyMiddleware have pass************" + proxy['ip_port']
            else:
                print "**************ProxyMiddleware no pass************" + proxy['ip_port']
                request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
    
    # 在配置文件中註冊中間件
    DOWNLOADER_MIDDLEWARES = {
       'step8_king.middlewares.ProxyMiddleware': 500,
    }

4. https證書

20. Https訪問
    Https訪問時有兩種狀況:
    1. 要爬取網站使用的可信任證書(默認支持)
        DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
        DOWNLOADER_CLIENTCONTEXTFACTORY = "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory"
        
        
        
        
    2. 要爬取網站使用的自定義證書
        DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
        DOWNLOADER_CLIENTCONTEXTFACTORY = "step8_king.https.MySSLFactory"
        
        # https.py
        from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
        from twisted.internet.ssl import (optionsForClientTLS, CertificateOptions, PrivateCertificate)
        
        class MySSLFactory(ScrapyClientContextFactory):
            def getCertificateOptions(self):
                from OpenSSL import crypto
                v1 = crypto.load_privatekey(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.key.unsecure', mode='r').read())
                v2 = crypto.load_certificate(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.pem', mode='r').read())
                return CertificateOptions(
                    privateKey=v1,  # pKey對象
                    certificate=v2,  # X509對象
                    verify=False,
                    method=getattr(self, 'method', getattr(self, '_ssl_method', None))
                )

5. 小結

那麼多東西,最經常使用的實際也就那幾個而已python

  • 爬蟲名稱 啓動爬蟲,以及網站接收到的請求的名稱web

  • robots協議算法

  • 默認請求頭緩存

    • useragent
  • 限速 智能限速cookie

  • 單ip、單域名的併發數併發

  • 代理(之後用的最多)app

相關文章
相關標籤/搜索