設置代理IP只需要自定義一個中間件,重寫process_request方法:
request.meta['proxy'] = "http://103.112.213.146:1080"  # 設置代理IP
中間件(注意:要將中間件註冊到配置文件裏去)
from adc.daili_ip.sh_yong_ip.sh_yong_ip import sui_ji_hq_ip
from fake_useragent import UserAgent  # browser User-Agent generator


class RequestsUserAgentmiddware(object):
    """Middleware that puts a browser User-Agent header on outgoing requests.

    The header value is read from a ``fake_useragent.UserAgent`` instance via
    the attribute named by the ``RANDOM_UA_TYPE`` setting (``'random'`` when
    the setting is missing).
    """

    def __init__(self, crawler):
        """Create the User-Agent source and read ``RANDOM_UA_TYPE``.

        Args:
            crawler: Object exposing ``settings.get(name, default)``.
        """
        super(RequestsUserAgentmiddware, self).__init__()
        self.ua = UserAgent()
        # Name of the ``UserAgent`` attribute to read for every request.
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')

    @classmethod
    def from_crawler(cls, crawler):
        """Alternate constructor: build the middleware from ``crawler``."""
        return cls(crawler)

    def process_request(self, request, spider):
        """Set the ``User-Agent`` header on ``request`` unless already present.

        The value is fetched exactly once so that the logged string is the
        same one offered to ``setdefault``; fetching it twice (as the earlier
        version did) can yield two different values when the attribute is a
        randomised one.

        Args:
            request: Outgoing request whose headers are updated in place.
            spider: Unused.
        """
        user_agent = getattr(self.ua, self.ua_type)
        print('啓用用戶代理瀏覽器信息:{0}'.format(user_agent))
        # setdefault keeps a User-Agent that was already set on the request.
        request.headers.setdefault('User-Agent', user_agent)


class MyproxiesSpiderMiddleware(object):
    """Middleware that assigns a proxy from the IP pool to every request."""

    def process_request(self, request, spider):
        """Pick a proxy with ``sui_ji_hq_ip`` and store it in ``request.meta``.

        Args:
            request: Outgoing request; its ``meta['proxy']`` entry is set.
            spider: Unused.
        """
        # Log the target URL through the public ``url`` attribute rather than
        # the private ``_get_url()`` accessor.
        print(request.url)
        # NOTE(review): the pool is always asked for an 'http' proxy, whatever
        # the scheme of the request URL is -- confirm this is intended.
        dai_ip = sui_ji_hq_ip('http')
        request.meta['proxy'] = "http://{0}".format(dai_ip)
        # A fixed proxy can be used instead, e.g.:
        # request.meta['proxy'] = "http://185.82.203.146:1080"
隨機從數據庫獲取IP
#!/usr/bin/env python
# -*- coding:utf8 -*-
import time

import requests
from sqlalchemy import text

from adc.daili_ip.mysq import shujuku as ORM

# Only proxies whose last test is at most this many seconds old are candidates.
_RECENT_TEST_WINDOW_SECONDS = 240
# Candidates must have a recorded response time (HH:MM:SS) at or below this.
_MAX_RESPONSE_TIME = '00:00:03'
# Give up on the probe request when the proxy does not answer in time.
_PROBE_TIMEOUT_SECONDS = 10
_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'


def _fetch_candidate(rst):
    """Return the rows (at most one) of a random proxy matching the criteria."""
    since = time.strftime(
        _DATETIME_FORMAT,
        time.localtime(time.time() - _RECENT_TEST_WINDOW_SECONDS),
    )
    # Bound parameters instead of string formatting keep values out of the SQL.
    statement = text(
        "SELECT ip,port,xtype,seshi_ri_qi,connectTimeMs FROM daili_ip "
        "WHERE xtype=:xtype AND ce_shi=:ce_shi AND seshi_ri_qi>=:since "
        "AND connectTimeMs<=:max_time ORDER BY RAND() LIMIT 1"
    )
    mysq = ORM.session()
    try:
        return mysq.query(
            ORM.daili_ip.ip,
            ORM.daili_ip.port,
            ORM.daili_ip.xtype,
            ORM.daili_ip.seshi_ri_qi,
            ORM.daili_ip.connectTimeMs,
        ).from_statement(statement).params(
            xtype=rst,
            ce_shi='1',
            since=since,
            max_time=_MAX_RESPONSE_TIME,
        ).all()
    finally:
        mysq.close()


def _delete_ip(ip):
    """Delete the row for ``ip``; return True when exactly one row was removed."""
    mysq = ORM.session()
    try:
        deleted = mysq.query(ORM.daili_ip).filter(ORM.daili_ip.ip == ip).delete()
        mysq.commit()
    finally:
        mysq.close()
    return deleted == 1


def _mark_ip_usable(ip, tested_at, elapsed):
    """Record a successful test for ``ip``; return True when one row changed."""
    mysq = ORM.session()
    try:
        updated = mysq.query(ORM.daili_ip).filter(ORM.daili_ip.ip == ip).update({
            "ce_shi": "1",
            "seshi_ri_qi": tested_at,
            "connectTimeMs": elapsed,
        })
        mysq.commit()
    finally:
        mysq.close()
    return updated == 1


def suiji_ip(rst):
    """Pick one random proxy from the database and verify it with a request.

    A candidate must match protocol ``rst``, be flagged as tested
    (``ce_shi='1'``), have been tested within the last 240 seconds and have a
    recorded response time of at most 3 seconds.  The candidate is probed by
    fetching ``<rst>://image.baidu.com/`` through it.

    Args:
        rst: Protocol name stored in the ``xtype`` column, e.g. ``'http'``.

    Returns:
        ``'ip:port'`` when the probe answers with a 2xx status (the row is
        re-stamped with the test time and response time).  ``False`` when the
        query fails, no candidate matches, or the probe fails / returns a
        non-2xx status (in the last two cases the row is deleted).
    """
    try:
        shuju = _fetch_candidate(rst)
    except Exception as e:
        # Must be falsy: the retry loop in sui_ji_hq_ip treats any truthy
        # value as a usable "ip:port" string.
        print('查詢代理IP數據出錯')
        print(e)
        return False

    if shuju:
        print('獲取到IP')
    else:
        # Nothing matched; indexing the empty result would raise IndexError.
        print('獲取IP失敗,請檢查獲取條件')
        return False

    ip = shuju[0][0]
    duan_kou = shuju[0][1]
    print('啓用代理IP,數據庫獲取到IP:{0}'.format(shuju))

    http_url = '{0}://image.baidu.com/'.format(rst)
    proxy_url = '{0}://{1}:{2}'.format(rst, ip, duan_kou)
    headers = {
        'Referer': http_url,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
    }
    print('啓用代理IP,測試網址:{0}'.format(http_url))
    print('啓用代理IP,測試頭:{0}'.format(proxy_url))

    # NOTE(review): the mapping is keyed by 'http' only, so a probe of an
    # https:// URL would not be routed through the proxy -- confirm intent.
    proxy_dict = {
        'http': proxy_url
    }
    try:
        response = requests.get(
            http_url,
            proxies=proxy_dict,
            headers=headers,
            timeout=_PROBE_TIMEOUT_SECONDS,  # without it a dead proxy can hang forever
        )
    except Exception as e:
        # Deliberately broad: any probe failure means the proxy is unusable.
        print('啓用代理IP,測速鏈接失敗{0}'.format(e))
        print('啓用代理IP,測速鏈接失敗,當前IP不可用,刪除當前ip!')
        if _delete_ip(ip):
            print("成功刪除當前IP")
        else:
            print('刪除當前IP失敗')
        return False

    code = response.status_code
    sudu = str(response.elapsed)  # response time as text, stored back in the row
    if 200 <= code < 300:
        dqatime = time.strftime(_DATETIME_FORMAT, time.localtime())
        print('啓用代理IP,測試代理ip--{0}{1}--狀態可用--狀態碼--{2}'.format(ip, duan_kou, code))
        print('啓用代理IP,當前IP能夠,正在向數據庫標記')
        if _mark_ip_usable(ip, dqatime, sudu):
            print('向數據庫成功標記可用IP!')
        else:
            print('向數據庫標記可用IP失敗!!!')
        print('向爬蟲返回IP:{0}:{1}'.format(ip, duan_kou))
        return '{0}:{1}'.format(ip, duan_kou)

    print('啓用代理IP,測試代理ip--{0}{1}--狀態不可用--狀態碼--{2}'.format(ip, duan_kou, code))
    print('返回狀態碼不能夠,正在向數據庫刪除當前IP')
    if _delete_ip(ip):
        print('刪除當前IP成功')
    else:
        print('刪除當前IP失敗')
    return False


def sui_ji_hq_ip(rst, max_attempts=None, retry_delay=0.5):
    """Keep calling ``suiji_ip`` until it yields a usable proxy.

    Candidates are limited by ``suiji_ip`` to proxies tested within the last
    240 seconds; unusable ones are deleted there before the next attempt.

    Args:
        rst: Protocol name passed through to ``suiji_ip``, e.g. ``'http'``.
        max_attempts: Optional cap on the number of attempts.  ``None`` (the
            default) retries indefinitely.
        retry_delay: Seconds to wait between failed attempts so an empty pool
            or an unreachable database is not polled in a tight loop.

    Returns:
        ``'ip:port'`` of a verified proxy, or ``None`` when ``max_attempts``
        was given and exhausted.
    """
    attempts = 0
    while max_attempts is None or attempts < max_attempts:
        youxiao_ip = suiji_ip(rst)
        if youxiao_ip:
            return youxiao_ip
        attempts += 1
        if max_attempts is None or attempts < max_attempts:
            time.sleep(retry_delay)
    return None

# Example: print(sui_ji_hq_ip('http'))
數據庫模塊文件
import sqlalchemy
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, ForeignKey, UniqueConstraint, Index, text, DATETIME, TIME
from sqlalchemy.orm import sessionmaker, relationship
from sqlalchemy import create_engine
import requests
import json
import time
import datetime

# Database engine configuration.
# NOTE(review): the credentials are hard-coded in the URL and echo=True logs
# every SQL statement -- consider moving both to configuration.
ENGINE = create_engine(
    "mysql+pymysql://root:279819@127.0.0.1:3306/cshi?charset=utf8",
    max_overflow=500,
    echo=True,
)

Base = declarative_base()  # declarative base class for the ORM models

# Built once at import time; session() only instantiates sessions from it.
_SESSION_FACTORY = sessionmaker(bind=ENGINE)


class daili_ip(Base):
    """ORM model for the ``daili_ip`` table, the proxy IP pool."""

    __tablename__ = 'daili_ip'

    id = Column(Integer, primary_key=True, autoincrement=True)
    ip = Column(String(300), unique=True)   # IP address
    port = Column(String(300))              # port
    city = Column(String(300))              # city
    isp = Column(String(300))               # network operator
    connectTimeMs = Column(TIME())          # speed
    anonymity = Column(String(300))         # anonymity mode
    country = Column(String(300))           # country
    xtype = Column(String(300))             # protocol
    zhuang_tai_ma = Column(String(300))     # status code
    ruku_riqi = Column(DATETIME())          # date the row was stored
    ce_shi = Column(String(300))            # test status
    seshi_ri_qi = Column(DATETIME())        # date of the test
    shi_xiao_riqi = Column(DATETIME())      # expiry date


def init_db():
    """Create every table registered on ``Base`` in the database."""
    Base.metadata.create_all(ENGINE)


def drop_db():
    """Drop every table registered on ``Base`` from the database."""
    Base.metadata.drop_all(ENGINE)


def session():
    """Return a new ORM session bound to ``ENGINE``."""
    return _SESSION_FACTORY()

# To drop the tables:   drop_db()
# To create the tables: init_db()