27、Python快速開發分佈式搜索引擎Scrapy精講—通過自定義中間件全局隨機更換代理IP

百度雲搜索,搜各類資料:http://www.lqkweb.com

搜網盤,搜各類資料:http://www.swpan.cn

設置代理IP只需要自定義一個中間件,重寫process_request方法:

request.meta['proxy'] = "http://103.112.213.146:1080"  # 設置代理IP

中間件(注意要將中間件註冊到配置文件裏去):

from adc.daili_ip.sh_yong_ip.sh_yong_ip import sui_ji_hq_ip

from fake_useragent import UserAgent    #導入瀏覽器用戶代理模塊

class RequestsUserAgentmiddware(object):
    """Downloader middleware that offers a User-Agent header for each request.

    The header value is read from a ``UserAgent`` instance. The attribute
    looked up on that instance is named by the ``RANDOM_UA_TYPE`` setting and
    defaults to ``'random'``.
    """

    def __init__(self, crawler):
        """Create the User-Agent source and read the configured browser type.

        Args:
            crawler: Object exposing ``settings.get(name, default)``.
        """
        super(RequestsUserAgentmiddware, self).__init__()
        self.ua = UserAgent()
        # Falls back to 'random' when RANDOM_UA_TYPE is not configured.
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')

    @classmethod
    def from_crawler(cls, crawler):
        """Alternate constructor: build the middleware from ``crawler``."""
        return cls(crawler)

    def process_request(self, request, spider):
        """Set a default ``User-Agent`` header on ``request``.

        The value is looked up exactly once, so the value printed is the
        same value offered to the request. A second lookup could yield a
        different string when the configured type is random.

        Args:
            request: Request whose ``headers`` mapping is updated in place.
            spider: Unused; part of the middleware method signature.

        Returns:
            None.
        """
        user_agent = getattr(self.ua, self.ua_type)
        print('啓用用戶代理瀏覽器信息:{0}'.format(user_agent))
        # setdefault keeps a User-Agent that is already present on the request.
        request.headers.setdefault('User-Agent', user_agent)


class MyproxiesSpiderMiddleware(object):
    """Downloader middleware that assigns a proxy to every outgoing request.

    The proxy address is obtained from ``sui_ji_hq_ip`` for each request.
    """

    def process_request(self, request, spider):
        """Attach a proxy address to ``request.meta['proxy']``.

        Args:
            request: Request being processed; its ``meta`` is updated in place.
            spider: Unused; part of the middleware method signature.

        Returns:
            None.
        """
        # Read the URL through the public ``url`` attribute. The previous
        # ``request._get_url()`` call relied on a private helper that is not
        # part of the documented Request API and may not exist in every
        # Scrapy version.
        xieyi = request.url
        print(xieyi)

        # The proxy is always requested for the 'http' protocol, regardless
        # of the scheme of the URL printed above.
        dai_ip = sui_ji_hq_ip('http')
        request.meta['proxy'] = "http://{0}".format(dai_ip)

        # A fixed proxy can be used instead of a database lookup, e.g.
        # "http://185.82.203.146:1080".

從數據庫隨機獲取IP:

#!/usr/bin/env python
# -*- coding:utf8 -*-
import time

import requests

from adc.daili_ip.mysq import shujuku as ORM


def _delete_proxy_row(session, ip):
    """Delete the ``daili_ip`` row for ``ip``; True if exactly one row was removed."""
    try:
        removed = session.query(ORM.daili_ip).filter(ORM.daili_ip.ip == ip).delete()
        session.commit()
    finally:
        # Always release the connection, even if the delete or commit fails.
        session.close()
    return removed == 1


def suiji_ip(rst, timeout=10):
    """Pick one random proxy from the database and verify it with a live request.

    A row is selected whose protocol equals ``rst``, whose test flag is '1',
    which was tested within the last 240 seconds, and whose recorded response
    time is at most 3 seconds. The proxy is then used to fetch
    ``<rst>://image.baidu.com/``. A proxy that fails the request or returns a
    non-2xx status is deleted from the database. A working proxy has its test
    flag, test date and response time refreshed.

    Args:
        rst: Protocol name stored in the ``xtype`` column, e.g. ``'http'``.
        timeout: Seconds to wait for the verification request. Without a
            timeout a dead proxy can block the caller indefinitely.

    Returns:
        ``"ip:port"`` when a proxy was found and verified, otherwise ``False``
        (query error, no matching row, or the proxy failed verification).
    """
    # Imported locally so this function does not depend on the module's
    # import block providing ``text``.
    from sqlalchemy import text

    time_format = '%Y-%m-%d %H:%M:%S'
    # Only proxies tested within the last 240 seconds are considered.
    dqatime = time.strftime(time_format, time.localtime(time.time() - 240))
    # Only proxies with a recorded response time <= 3 seconds (HH:MM:SS).
    sudu = '00:00:03'

    mysq = None
    try:
        mysq = ORM.session()
        # Bind parameters instead of string formatting: values are escaped by
        # the driver and the statement is an explicit text() construct.
        shuju = mysq.query(
            ORM.daili_ip.ip,
            ORM.daili_ip.port,
            ORM.daili_ip.xtype,
            ORM.daili_ip.seshi_ri_qi,
            ORM.daili_ip.connectTimeMs
        ).from_statement(text(
            "SELECT ip,port,xtype,seshi_ri_qi,connectTimeMs FROM daili_ip "
            "WHERE xtype=:xtype AND ce_shi=:ce_shi AND seshi_ri_qi>=:since "
            "AND connectTimeMs<=:speed ORDER BY RAND() LIMIT 1"
        )).params(xtype=rst, ce_shi='1', since=dqatime, speed=sudu).all()
    except Exception as e:
        print('查詢代理IP數據出錯')
        print(e)
        # False, not True: the caller treats any truthy value as an address.
        return False
    finally:
        if mysq is not None:
            mysq.close()

    if not shuju:
        # Nothing matched; indexing the empty result would raise IndexError.
        print('獲取IP失敗,請檢查獲取條件')
        return False
    print('獲取到IP')

    ip = shuju[0][0]
    duan_kou = shuju[0][1]
    print('啓用代理IP,數據庫獲取到IP:{0}'.format(shuju))

    http_url = '{0}://image.baidu.com/'.format(rst)
    proxy_url = '{0}://{1}:{2}'.format(rst, ip, duan_kou)
    headers = {
        'Referer': http_url,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
    }

    print('啓用代理IP,測試網址:{0}'.format(http_url))
    print('啓用代理IP,測試頭:{0}'.format(proxy_url))
    try:
        # Key the proxy on the scheme of the test URL; a dict keyed only on
        # 'http' would leave a non-http test URL un-proxied.
        response = requests.get(
            http_url,
            proxies={rst: proxy_url},
            headers=headers,
            timeout=timeout,
        )
    except Exception as e:
        # Deliberately broad: any failure means the proxy is unusable.
        print('啓用代理IP,測速鏈接失敗{0}'.format(e))
        print('啓用代理IP,測速鏈接失敗,當前IP不可用,刪除當前ip!')
        if _delete_proxy_row(mysq, ip):
            print("成功刪除當前IP")
        else:
            print('刪除當前IP失敗')
        return False

    code = response.status_code
    if not 200 <= code < 300:
        print('啓用代理IP,測試代理ip--{0}{1}--狀態不可用--狀態碼--{2}'.format(ip, duan_kou, code))
        print('返回狀態碼不能夠,正在向數據庫刪除當前IP')
        if _delete_proxy_row(mysq, ip):
            print('刪除當前IP成功')
        else:
            print('刪除當前IP失敗')
        return False

    # Proxy works: record the measured response time and the current time.
    sudu = str(response.elapsed)
    dqatime = time.strftime(time_format, time.localtime())

    print('啓用代理IP,測試代理ip--{0}{1}--狀態可用--狀態碼--{2}'.format(ip, duan_kou, code))
    print('啓用代理IP,當前IP能夠,正在向數據庫標記')
    try:
        fanhui = mysq.query(ORM.daili_ip).filter(ORM.daili_ip.ip == ip).update({
            "ce_shi": "1",
            "seshi_ri_qi": dqatime,
            "connectTimeMs": sudu
        })
        mysq.commit()
    finally:
        mysq.close()
    if fanhui == 1:
        print('向數據庫成功標記可用IP!')
    else:
        print('向數據庫標記可用IP失敗!!!')
    print('向爬蟲返回IP:{0}:{1}'.format(ip, duan_kou))
    return '{0}:{1}'.format(ip, duan_kou)


def sui_ji_hq_ip(rst, max_tries=None):
    """Keep asking ``suiji_ip`` for a proxy until one is returned.

    Args:
        rst: Protocol name passed through to ``suiji_ip``, e.g. ``'http'``.
        max_tries: Maximum number of ``suiji_ip`` calls. ``None`` (the
            default) retries without limit, as this function always did.

    Returns:
        The ``"ip:port"`` value from ``suiji_ip``, or ``None`` when
        ``max_tries`` attempts were used up without obtaining one.
    """
    attempts = 0
    while max_tries is None or attempts < max_tries:
        attempts += 1
        youxiao_ip = suiji_ip(rst)
        # Only a real address counts. ``suiji_ip`` signals failure with
        # False, and a bare True is a status flag rather than an address,
        # so it must not be handed to the caller as a proxy.
        if youxiao_ip and youxiao_ip is not True:
            return youxiao_ip
    return None

# print(sui_ji_hq_ip('http'))

數據庫模塊文件:

import datetime
import json
import os
import time

import requests
import sqlalchemy
from sqlalchemy import Column, Integer, String, ForeignKey, UniqueConstraint, Index,text,DATETIME,TIME
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, relationship


# Database engine configuration.
# The connection URL can be overridden with the DAILI_IP_DB_URL environment
# variable so credentials do not have to live in the source file. The literal
# URL is kept as the fallback, so behaviour is unchanged when the variable is
# not set.
ENGINE = create_engine(
    os.environ.get(
        "DAILI_IP_DB_URL",
        "mysql+pymysql://root:279819@127.0.0.1:3306/cshi?charset=utf8",
    ),
    max_overflow=500,
    echo=True,
)

Base = declarative_base()       # Declarative base class for the ORM models

class daili_ip(Base):            # Proxy IP pool table
    """ORM model for the ``daili_ip`` table, one row per proxy address."""
    __tablename__ = 'daili_ip'

    id = Column(Integer, primary_key=True, autoincrement=True)
    ip = Column(String(300), unique=True)       # IP address (unique)
    port = Column(String(300))                  # Port
    city = Column(String(300))                  # City
    isp = Column(String(300))                   # Network operator / ISP
    connectTimeMs = Column(TIME())              # Speed, stored as a TIME value
    anonymity = Column(String(300))             # Anonymity mode
    country = Column(String(300))               # Country
    xtype = Column(String(300))                 # Protocol
    zhuang_tai_ma = Column(String(300))         # Status code
    ruku_riqi = Column(DATETIME())             # Date the row was stored
    ce_shi = Column(String(300))                # Test status
    seshi_ri_qi = Column(DATETIME())           # Date of the test
    shi_xiao_riqi = Column(DATETIME())         # Expiry date


def init_db():
    """Create the tables registered on ``Base.metadata`` using ``ENGINE``."""
    schema = Base.metadata
    schema.create_all(ENGINE)

def drop_db():
    """Drop the tables registered on ``Base.metadata`` using ``ENGINE``."""
    schema = Base.metadata
    schema.drop_all(ENGINE)

# Session factory bound to ENGINE. Built once at import time rather than on
# every session() call; the factory itself is just configuration.
_SESSION_FACTORY = sessionmaker(bind=ENGINE)


def session():
    """Return a new ORM session bound to ``ENGINE``.

    Each call returns a fresh session object; the caller is responsible for
    closing it.
    """
    return _SESSION_FACTORY()


# drop_db()         #刪除表
# init_db()
相關文章
相關標籤/搜索