urllib練習

時間 2019-11-29

標籤 urllib 練習简体版

原文：原文鏈接

# -*- coding: utf-8 -*-
"""
解析https://www.kuaidaili.com/free/網站裏的代理地址，
並測試是否可用
"""

import re
import time
import urllib.request

def downHtml(url, retry=3, delay=2):
    """
    Download the page at ``url`` and return its decoded source.

    The request is attempted once and then retried up to ``retry`` more
    times whenever ``urllib.error.URLError`` is raised, pausing ``delay``
    seconds between attempts.

    :param url: address to request
    :param retry: number of retries after the first failed attempt
    :param delay: seconds to wait before each retry
    :return: the page source as ``str`` (decoded with the default UTF-8
             codec), or ``None`` when every attempt failed
    """
    retries_left = retry
    while True:
        try:
            request = urllib.request.Request(url)
            # The context manager closes the response once it has been read.
            with urllib.request.urlopen(request) as response:
                return response.read().decode()
        except urllib.error.URLError as e:
            print('請求異常：', e.reason)
            if retries_left <= 0:
                return None
            retries_left -= 1
            # A loop (rather than an un-returned recursive call) makes sure
            # the result of a successful retry actually reaches the caller.
            time.sleep(delay)

def getProxy(html):
    """
    Extract every proxy listed in the page source with a regular expression.

    :param html: page source; ``None`` or an empty string (for example when
                 the download failed) yields an empty list
    :return: list of ``(ip, port)`` string tuples, in page order
    """
    if not html:
        # Nothing to parse; avoid handing None to re.findall (TypeError).
        return []

    # re.S lets ".*?" span the line breaks between the IP and PORT cells.
    pattern = r'<td data-title="IP">(.*?)</td>.*?<td data-title="PORT">(.*?)</td>'
    return re.findall(pattern, html, re.S)

def isAbleToUse(ips, timeout=10):
    """
    Check whether a proxy works by requesting an IP-echo page through it.

    The response body (on success) or the exception (on failure) is printed.

    :param ips: ``(ip, port)`` pair as matched by the proxy regex
    :param timeout: seconds to wait for the proxy before giving up
    :return: ``True`` if the request through the proxy succeeded,
             ``False`` otherwise
    """
    # Test site
    url = "http://httpbin.org/ip"
    # Build the proxy mapping
    address = '{}:{}'.format(ips[0], ips[1])
    proxy = {'http': address, 'https': address}

    # The opener is used directly and deliberately NOT installed globally:
    # install_opener() would route every later urlopen() call in the process
    # through the proxy under test.
    proxy_handler = urllib.request.ProxyHandler(proxy)
    opener = urllib.request.build_opener(proxy_handler, urllib.request.HTTPHandler)

    try:
        with opener.open(url, timeout=timeout) as response:
            data = response.read().decode()
        print(data)
    except Exception as e:
        # Best-effort probe: any failure simply means the proxy is unusable.
        print(e)
        return False
    else:
        print('{}：{}'.format(ips[0], ips[1]), '可用！')
        return True


if __name__ == '__main__':
    url = "https://www.kuaidaili.com/free/"

    # Fetch the page source
    html = downHtml(url)
    if html is None:
        # downHtml returns None once all retries are exhausted; stop here
        # instead of passing None on to the parser.
        raise SystemExit('下載失敗：{}'.format(url))

    # Parse the proxies out of the source
    proxies = getProxy(html)
    # Test each proxy
    for proxy in proxies:
        isAbleToUse(proxy)

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。