# -*- coding: utf-8 -*-
# Scrapy settings for GitHub project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# The name of the Scrapy project. It is used to construct the default
# User-Agent and for logging; the startproject command sets it automatically.
BOT_NAME = 'GitHub'
# List of modules where Scrapy looks for spiders. Default: ['xxx.spiders']
SPIDER_MODULES = ['GitHub.spiders']
# Module where new spiders created with the genspider command are placed. Default: 'xxx.spiders'
NEWSPIDER_MODULE = 'GitHub.spiders'
# Logging configuration: the log level.
# Scrapy supports five levels, from most to least severe:
# CRITICAL, ERROR, WARNING, INFO, DEBUG
LOG_LEVEL = "INFO"
# To keep log output off the console, write it to a file instead.
LOG_FILE = "GitHub.log"
# Set the User-Agent (commonly configured).
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
# Obey robots.txt rules (commonly configured, and scrapers often set this to False)
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by the Scrapy downloader
# (default: 16). You can raise it, but mind your machine's resources.
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# How long the downloader waits between consecutive requests to the same
# website. Use it to throttle the crawl and ease the load on the server.
# Decimals are supported, e.g. 0.25 (in seconds).
#DOWNLOAD_DELAY = 3
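# A hedged sketch: pairing a small delay with Scrapy's built-in jitter.
# RANDOMIZE_DOWNLOAD_DELAY is on by default and spreads each wait over
# 0.5x-1.5x of DOWNLOAD_DELAY, which makes the request timing less robotic.
#DOWNLOAD_DELAY = 0.25
#RANDOMIZE_DOWNLOAD_DELAY = True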
# The download delay setting will honor only one of the two limits below
# (they serve the same purpose as the global maximum above, but only one
# of them takes effect at a time).
# Maximum number of concurrent requests to any single domain:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
# Maximum number of concurrent requests to any single IP. If non-zero, the
# CONCURRENT_REQUESTS_PER_DOMAIN setting is ignored and this one is used
# instead, i.e. concurrency is limited per IP rather than per domain. It also
# affects DOWNLOAD_DELAY: if CONCURRENT_REQUESTS_PER_IP is non-zero, the
# download delay is enforced per IP rather than per domain.
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'GitHub.middlewares.MyCustomSpiderMiddleware': 543,
#}
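# A minimal spider-middleware sketch (hypothetical; it would live in
# GitHub/middlewares.py, and the class name below is illustrative).
# process_spider_output() can filter what the spider yields:
#
#     class DropEmptyItemsMiddleware:
#         def process_spider_output(self, response, result, spider):
#             for r in result:
#                 # pass requests through untouched; skip empty dict items
#                 if isinstance(r, dict) and not r:
#                     continue
#                 yield r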
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'GitHub.middlewares.MyCustomDownloaderMiddleware': 543,
#}
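# A minimal downloader-middleware sketch (hypothetical; it would live in
# GitHub/middlewares.py, and the class and list below are illustrative).
# process_request() may modify the request and return None to let normal
# handling continue:
#
#     import random
#
#     class RandomUserAgentMiddleware:
#         USER_AGENTS = [
#             'Mozilla/5.0 (Windows NT 10.0; WOW64) ...',  # truncated example
#         ]
#
#         def process_request(self, request, spider):
#             request.headers['User-Agent'] = random.choice(self.USER_AGENTS)
#             return None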
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'GitHub.pipelines.SomePipeline': 300,
#}
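# A minimal pipeline sketch (hypothetical; it would live in GitHub/pipelines.py
# and be registered in ITEM_PIPELINES above). process_item() must return the
# item (or raise DropItem) for the next pipeline stage:
#
#     import json
#
#     class JsonWriterPipeline:
#         def open_spider(self, spider):
#             self.file = open('items.jl', 'w', encoding='utf-8')
#
#         def close_spider(self, spider):
#             self.file.close()
#
#         def process_item(self, item, spider):
#             self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
#             return item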
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
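# A usage note: enabling the cache during development avoids re-downloading
# the same pages on every run; HTTPCACHE_EXPIRATION_SECS = 0 means cached
# responses never expire.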