See scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware.
import os
from scrapy.spiders import CrawlSpider

# Set the proxy username, password, host and port via the environment variable
os.environ["http_proxy"] = "http://user:password@proxy.internal.server.com:8080"

class YourCrawlSpider(CrawlSpider):
    """omitted"""
import base64

class ProxyMiddleware(object):
    # Override process_request
    def process_request(self, request, spider):
        # Set the proxy host and port
        request.meta['proxy'] = "http://proxy.internal.server.com:8080"
        # Set the proxy's username and password
        proxy_user_pass = "user:password"
        encoded_user_pass = base64.b64encode(proxy_user_pass.encode()).decode()
        # Attach the proxy credentials
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
DOWNLOADER_MIDDLEWARES = {
    'middlewares.ProxyMiddleware': 90,
}
Addendum
import base64
import random

class RandomProxy(object):
    def process_request(self, request, spider):
        proxy = random.choice(PROXIES)
        if not proxy['user_passwd']:
            # Free proxy that requires no credentials
            request.meta['proxy'] = "http://" + proxy['ip_port']
        else:
            request.meta['proxy'] = "http://" + proxy['ip_port']
            # Base64-encode the username:password pair
            base64_userpasswd = base64.b64encode(proxy['user_passwd'].encode()).decode()
            # Put it into the header format the proxy server expects
            request.headers['Proxy-Authorization'] = 'Basic ' + base64_userpasswd
PROXIES = [
    {'ip_port': '111.8.60.9:8123', 'user_passwd': ''},
    {'ip_port': '101.71.27.120:80', 'user_passwd': 'user2:pass2'},
    {'ip_port': '122.96.59.104:80', 'user_passwd': 'user3:pass3'},
    {'ip_port': '122.224.249.122:8088', 'user_passwd': 'user4:pass4'},
]

DOWNLOADER_MIDDLEWARES = {
    'douban.middlewares.RandomUserAgent': 100,
    'douban.middlewares.RandomProxy': 200,
}
Tested and working.
Reference: https://www.cnblogs.com/cnkai/p/7401526.html
class TaiyingshiSpider(scrapy.Spider):
    custom_settings = {
        "DOWNLOAD_DELAY": 0,
        "DOWNLOADER_MIDDLEWARES": {
            "imfcrawl.middlewares.ProxyMiddleware": 543,
        }
    }
    """rest omitted"""
PROXIES = [ "http://123.127.217.170:80", "http://223.202.204.195:80", "http://223.202.204.194:80", "http://140.143.105.246:80", ]
import random

class ProxyMiddleware(object):
    '''
    Sets the proxy.
    '''
    def __init__(self, ip):
        self.ip = ip

    @classmethod
    def from_crawler(cls, crawler):
        return cls(ip=crawler.settings.get('PROXIES'))

    def process_request(self, request, spider):
        try:
            proxy_ip = random.choice(self.ip)
        except (TypeError, IndexError):
            proxy_ip = None
        if proxy_ip:
            # print(proxy_ip)
            request.meta['proxy'] = proxy_ip
Link: https://www.cnblogs.com/lei0213/p/7904994.html
1. As we know, Scrapy's basic request flow is to first execute the start_requests method of the parent class (scrapy.Spider);
2. start_requests then takes the URLs from the start_urls variable we set;
3. finally, make_requests_from_url is called, with a single url argument.
So we can override the make_requests_from_url method and call scrapy.Request() directly. Let's take a quick look at its parameters:
1. url=url: simply the URL that start_requests() ends up with.
2. meta: here we set only one key, download_timeout: 10. When the request is first issued, Scrapy waits 10 seconds; if the request has not succeeded by then, the downloader-middleware method described below is executed.
3. callback: the callback run once this request completes. Note that it does not wait for every other operation to finish; as soon as the request for a URL succeeds, this method is called with the response.
4. dont_filter: this one matters. Scrapy deduplicates requests by default; dont_filter=True exempts a request from that deduplication, while dont_filter=False (the default) keeps it in the dupe filter. Some say you can omit it because it defaults to False, but in my tests I had to pass it explicitly: after fetching the first page and scraping what I wanted, requests for the second page were dropped until I added it.
import scrapy

class HttpbinTestSpider(scrapy.Spider):
    name = "httpbin_test"
    allowed_domains = ["httpbin.org"]
    start_urls = ['http://httpbin.org/get']

    def make_requests_from_url(self, url):
        self.logger.debug('Try first time')
        return scrapy.Request(url=url, meta={'download_timeout': 10},
                              callback=self.parse, dont_filter=False)

    def parse(self, response):
        print(response.text)
Next is what runs when the request above times out after 10 seconds: the process_exception method. Attentive readers will notice that in the spider file we log directly; that is because Scrapy has already defined a logger in the parent class and you can just use it, whereas in a middleware you have to define a class-level logger yourself before you can use it.
import logging
import requests

class HttpbinProxyMiddleware(object):
    logger = logging.getLogger(__name__)

    # def process_request(self, request, spider):
    #     # pro_addr = requests.get('http://127.0.0.1:5000/get').text
    #     # request.meta['proxy'] = 'http://' + pro_addr
    #     pass
    #
    # def process_response(self, request, response, spider):
    #     # You get the downloaded response here and can modify it
    #     # (e.g. change the text encoding) before it reaches the spider.
    #     pass

    def process_exception(self, request, exception, spider):
        self.logger.debug('Try Exception time')
        self.logger.debug('Try second time')
        proxy_addr = requests.get('http://127.0.0.1:5000/get').text
        self.logger.debug(proxy_addr)
        request.meta['proxy'] = 'http://{0}'.format(proxy_addr)
        # Returning the request reschedules it with the newly assigned proxy.
        return request
Here is the key part: we need the HttpbinProxyMiddleware class in middlewares to actually run, so it has to be enabled in the settings. Note that I disabled the built-in retry downloader middleware; Scrapy retries failed requests automatically, and to see the effect of this experiment the default retry middleware is switched off.
DOWNLOADER_MIDDLEWARES = {
    'httpbin.middlewares.HttpbinProxyMiddleware': 543,
    # Disable Scrapy's automatic retries
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
}
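The middleware above assumes a local proxy-pool service is already running on 127.0.0.1:5000 and that its /get endpoint returns a bare ip:port string; that service is not part of this post. As a hypothetical illustration only, a minimal Flask stand-in could look like this (the endpoint path, response format and PROXY_POOL entries are all assumptions):

# Hypothetical stand-in for the local proxy-pool service the middleware queries.
import random
from flask import Flask

app = Flask(__name__)
PROXY_POOL = ["123.127.217.170:80", "223.202.204.195:80"]  # made-up entries

@app.route('/get')
def get_proxy():
    # Return a bare "ip:port" string; the middleware prefixes it with "http://"
    return random.choice(PROXY_POOL)

if __name__ == '__main__':
    app.run(host='127.0.0.1', port=5000)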
The user-agent is an important parameter for imitating a browser, mainly to keep the crawler from being banned. In earlier chapters we saw that you can set the user-agent in settings.py, for example:
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'
But requests can still get banned this way. Adding a crawl delay also reduces the risk of a ban (see the short settings sketch below). These are of course fairly simple disguises, but they are enough for a crawler that is just getting started.
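For the delay just mentioned, Scrapy's built-in settings are enough on their own; a minimal sketch, with illustrative values:

# settings.py - built-in Scrapy throttling; the values here are only examples
DOWNLOAD_DELAY = 2               # wait about 2 seconds between requests to the same site
RANDOMIZE_DOWNLOAD_DELAY = True  # vary the delay between 0.5x and 1.5x of DOWNLOAD_DELAY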
So instead we configure a larger set of user-agents and pick one at random for each download, which makes the crawler even less likely to be banned.
import random
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

class RotateUserAgentMiddleware(UserAgentMiddleware):
    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        ua = random.choice(self.user_agent_list)
        if ua:
            # print(ua)
            request.headers.setdefault('User-Agent', ua)

    # The default user_agent_list covers Chrome, IE, Firefox, Mozilla, Opera and Netscape.
    # For more user-agent strings, see http://www.useragentstring.com/pages/useragentstring.php
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
DOWNLOADER_MIDDLEWARES = {
    # Scrapy's built-in middleware; set it to None (its code is shown below)
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'your_project_name.middlewares.RotateUserAgentMiddleware': 543,
}
from scrapy import signals

class UserAgentMiddleware(object):
    """This middleware allows spiders to override the user_agent"""

    def __init__(self, user_agent='Scrapy'):
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(crawler.settings['USER_AGENT'])
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider):
        # When the spider starts, use the user_agent set on the spider if there is one
        self.user_agent = getattr(spider, 'user_agent', self.user_agent)

    def process_request(self, request, spider):
        if self.user_agent:
            request.headers.setdefault(b'User-Agent', self.user_agent)
import logging
import random
import time

class RandomDelayMiddleware(object):
    def __init__(self, crawler):
        self.api_url = crawler.spider.settings.get("API_URL")     # omitted
        self.delay = crawler.spider.settings.get("RANDOM_DELAY")  # maximum delay, set in settings

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        # print(request.url)
        if request.url == self.api_url:
            delay = random.randint(0, self.delay)
            logging.debug("### random delay: %s s ###" % delay)
            time.sleep(delay)   # random delay
            # time.sleep(0.5)   # fixed delay
class TaiyingshiSpider(scrapy.Spider):
    name = 'taiyingshi'
    allowed_domains = ['taiyingshi.com', "127.0.0.1"]
    start_urls = ['http://www.taiyingshi.com/last/']
    # start_urls = ['http://www.taiyingshi.com/people/xj25.html']

    custom_settings = {
        "DOWNLOAD_DELAY": 0,  # Scrapy's built-in delay; the actual delay adds the random delay below
        "RANDOM_DELAY": 3,    # maximum random delay; the final delay is this plus the value above
        "DOWNLOADER_MIDDLEWARES": {
            # "imfcrawl.middlewares.ProxyMiddleware": 543,
            'imfcrawl.middlewares.RandomDelayMiddleware': 550,  # enable the middleware
        },
    }
Tested and working.
https://www.cnblogs.com/cnkai/p/7401526.html
https://www.cnblogs.com/lei0213/p/7904994.html
https://blog.csdn.net/yancey_blog/article/details/53896092