第一步: 通過 ProxyBroker 獲取代理
# Proxy-pool-gather.py
# Collects proxies through ProxyBroker and records the high-anonymity HTTP
# ones as Redis keys that expire after one day.
import asyncio
import datetime
import logging

import redis
from proxybroker import Broker

r = redis.Redis(host='localhost', encoding="UTF-8", decode_responses=True)
expire_time_s = 60 * 60 * 24  # entries expire after one day


async def save(proxies):
    """Drain *proxies* and store each high-anonymity HTTP proxy in Redis.

    Items are read from the queue until a ``None`` item arrives. A proxy is
    kept only when its ``types`` mapping has an ``"HTTP"`` entry equal to
    ``"High"``; it is then printed and written to Redis under the key
    ``http://<host>:<port>`` with value ``0`` and a TTL of
    ``expire_time_s`` seconds.
    """
    candidate = await proxies.get()
    while candidate is not None:
        kinds = candidate.types
        if "HTTP" in kinds and kinds["HTTP"] == "High":
            print(candidate)
            key = '%s://%s:%d' % ("http", candidate.host, candidate.port)
            r.set(key, 0, ex=expire_time_s)
        candidate = await proxies.get()


# Run one search after another, forever; each round gets a fresh queue
# and a fresh Broker.
while True:
    proxies = asyncio.Queue()
    broker = Broker(proxies, timeout=2, max_tries=2, grab_timeout=3600)
    asyncio.get_event_loop().run_until_complete(
        asyncio.gather(
            broker.find(types=["HTTP", "HTTPS"]),
            save(proxies),
        )
    )
第二步: 用 HTTP 服務器展示代理列表: http://0.0.0.0:8000/proxy.json
# Proxy-http-server.py
# Publishes the keys stored in Redis as JSON at /proxy.json on port 8000.
from flask import Flask
from flask_restful import Resource, Api
import redis

app = Flask(__name__)
api = Api(app)
r = redis.Redis(host="localhost", encoding="UTF-8", decode_responses=True)


class Proxy(Resource):
    """REST resource that lists the keys currently held in Redis."""

    def get(self):
        """Return every Redis key matching the pattern ``*``."""
        every_key = r.keys("*")
        return every_key


api.add_resource(Proxy, "/proxy.json")

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8000)
第三步: 結果測試
# Demo.py
# Reads the proxy list published at /proxy.json and issues a request with
# each listed proxy.
import time
import json

import requests


class test_proxy(object):
    """Client that downloads the proxy list and probes each entry."""

    def __init__(self):
        # URL of the JSON proxy list.
        self.pro_url = "http://0.0.0.0:8000/proxy.json"
        # Not used by any active method in this class.
        self.test_url = "http://httpbin.org/get"
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"}

    def get_proxy(self):
        """Yield a ``{"http": "<host>:<port>"}`` dict for each listed proxy.

        The list is fetched from ``self.pro_url`` and parsed as JSON. Each
        entry is expected to look like ``scheme://host:port``; entries
        without ``://`` are skipped instead of raising ``IndexError``.
        """
        # ``headers`` must be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params`` (the query string).
        res = requests.get(self.pro_url, headers=self.headers).content.decode()
        proxy_list = json.loads(res)
        for p in proxy_list:
            _scheme, separator, address = p.partition("://")
            if not separator:
                continue
            yield {"http": address}

    def delete_proxy(self, proxy):
        """Send a GET request to the delete endpoint on 127.0.0.1:5010.

        NOTE(review): no service on port 5010 is visible in this file and
        no method here calls this one -- confirm the endpoint exists.
        """
        requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))

    def get_html(self, proxies, timeout=10):
        """Request the test page with *proxies*, trying up to five times.

        Args:
            proxies: proxies mapping handed to ``requests.get``.
            timeout: seconds to wait per attempt, so an unresponsive proxy
                cannot block forever.

        Returns:
            The ``requests`` response of the first attempt that does not
            raise, or ``None`` when all five attempts fail. Nothing is
            removed from the proxy pool on failure.
        """
        # NOTE(review): the mappings produced by get_proxy only carry an
        # "http" entry while this URL is https; requests picks proxies by
        # URL scheme, so the proxy is presumably bypassed -- confirm intent.
        for _attempt in range(5):
            try:
                return requests.get(
                    'https://www.example.com',
                    proxies=proxies,
                    timeout=timeout,
                )
            except Exception:
                # Best-effort probe: any failure just consumes one attempt.
                continue
        return None

    def start(self):
        """Print each proxy and, on an HTTP 200 response, the page text."""
        for p in self.get_proxy():
            print(p)
            result = self.get_html(p)
            # get_html returns None once every attempt has failed.
            if result is not None and result.status_code == 200:
                print(result.text)


if __name__ == '__main__':
    test_proxy().start()
增加國內代理
class Kuaidaili(Provider):
    """Provider entry for the free proxy listings on kuaidaili.com."""

    domain = "kuaidaili.com"

    async def _pipe(self):
        """Pass pages 1-20 of the ``inha`` and ``intr`` listings to ``_find_on_pages``."""
        templates = (
            "http://www.kuaidaili.com/free/inha/%d",
            "http://www.kuaidaili.com/free/intr/%d",
        )
        urls = []
        for template in templates:
            urls.extend(template % page for page in range(1, 21))
        await self._find_on_pages(urls)


PROVIDERS = [
    Kuaidaili(),
]