python爬蟲實戰：爬取西刺代理的代理ip（二）

時間 2019-12-13

原文鏈接

爬蟲實戰（二）：爬取西刺代理的代理ip

對於剛入門的同學來說，本次實戰稍微有點難度，但是簡單的爬取圖片、文本之類的又沒營養，所以這次我選擇了爬取西刺代理的ip地址，爬取的代理ip也能在之後的學習中用到。

本次實戰用到的知識點很多，其中包括：

requests.Session()自動保存cookie
利用抓包工具獲取cookie；
BeautifulSoup和xpath匹配html文檔中的標籤
subprocess測試ip並獲取運行時間及匹配的丟包數

代碼如下：

"""
案例名稱：學習使用ip代理池
需求：從網上找一個代理ip的網站，然後獲取網站上的
100個ip，組成代理ip池，然後隨機抽取其中一個ip，
並對該ip進行連通性測試，如果該ip可用，我們可以將
該ip作為代理ip來使用

思路：
    1，先獲取西刺代理網站上的ip(100)
    2, 隨機抽取其中一個ip，並檢測其連通性
    3，如果該ip可用，則可以作為代理ip使用
編碼：
測試：
"""

import requests
from bs4 import BeautifulSoup
from lxml import etree
import subprocess as sp
import random
import re


"""
函數說明:獲取代理ip網站的ip
"""
def get_proxys(page):
    """Fetch one listing page from xicidaili.com and extract its proxies.

    Args:
        page: Page number substituted into the ``/nn/<page>`` URL.

    Returns:
        A list of ``"protocol#ip#port"`` strings, one per data row of the
        element with ``id="ip_list"``; the protocol is lower-cased. The list
        is empty when the page contains no such element.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    target_url = 'http://www.xicidaili.com/nn/%d' % page
    target_headers = {
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Referer': 'http://www.xicidaili.com/nn/',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }

    # A Session stores cookies automatically; the ``with`` block releases its
    # connections once the page has been downloaded.
    with requests.Session() as session:
        # Without a timeout a stalled server would block this call forever.
        target_response = session.get(url=target_url,
                                      headers=target_headers,
                                      timeout=10)
        target_response.raise_for_status()
        target_response.encoding = 'utf-8'
        target_html = target_response.text

    # Locate the proxy table directly instead of re-parsing its string form.
    ip_table = BeautifulSoup(target_html, 'lxml').find(id='ip_list')
    if ip_table is None:
        return []

    proxys_list = []
    for row in ip_table.find_all('tr'):
        cells = row.find_all('td')
        # Rows with fewer than six <td> cells (e.g. a header row) carry no
        # proxy data: columns 2, 3 and 6 hold ip, port and protocol.
        if len(cells) < 6:
            continue
        ip = cells[1].get_text(strip=True)
        port = cells[2].get_text(strip=True)
        protocol = cells[5].get_text(strip=True).lower()
        if not (ip and port):
            continue
        proxys_list.append(protocol + "#" + ip + "#" + port)
    return proxys_list

"""
函數說明:檢測代理ip的連通性
參數:
    ip--代理的ip地址
    lose_time--匹配的丟包數
    waste_time--匹配平均時間
返回值:
    average_time--代理ip的平均耗時
"""
def check_ip(ip, lose_time, waste_time):
    """Ping *ip* three times and return its average round-trip time.

    Args:
        ip: Address of the proxy host to ping.
        lose_time: Compiled pattern whose first group captures the number of
            lost packets in ping's output.
        waste_time: Compiled pattern whose first group captures the average
            round-trip time (an integer) in ping's output.

    Returns:
        The average time as an int, or the sentinel 1000 when the address is
        unusable: empty or option-like address, ping could not be started,
        more than 2 packets lost, or no loss/average figure found in the
        output.
    """
    # The address originates from a scraped web page, so treat it as
    # untrusted: reject empty values and anything ping would read as a flag.
    if not ip or ip.startswith('-'):
        return 1000

    # An argument list (no shell) keeps a crafted address from injecting
    # extra shell commands.
    # NOTE(review): these look like Windows ping flags, where -w is a timeout
    # in milliseconds, i.e. 3 ms per reply -- confirm this is intended.
    cmd = ["ping", "-n", "3", "-w", "3", ip]
    try:
        # communicate() drains stdout and waits for exit; the context manager
        # closes the pipe afterwards.
        with sp.Popen(cmd, stdin=sp.DEVNULL,
                      stdout=sp.PIPE,
                      stderr=sp.DEVNULL) as p:
            raw_out, _ = p.communicate()
    except OSError:
        # ping is missing or could not be launched: count it as unreachable.
        return 1000

    # Undecodable bytes are replaced instead of aborting the whole check.
    out = raw_out.decode('GBK', errors='replace')

    lost = lose_time.findall(out)
    # No loss figure in the output is treated as all three packets lost.
    lose = int(lost[0]) if lost else 3
    if lose > 2:
        return 1000

    average = waste_time.findall(out)
    if not average:
        return 1000
    return int(average[0])


"""
函數說明:初始化正則表達式
返回值:
    lose_time--匹配丟包數
    waste_time--匹配平均時間
"""
def initpattern():
    """Compile the regular expressions used to read ping's summary lines.

    Returns:
        A ``(lose_time, waste_time)`` tuple of compiled patterns:
        ``lose_time`` captures the lost-packet count and ``waste_time``
        captures the average time in ms.
    """
    # Accept both the Traditional (丟) and Simplified (丢) spelling of "lost":
    # the caller decodes ping output as GBK, and Simplified-Chinese output
    # uses the latter form, which the Traditional-only pattern never matched.
    # Raw strings keep "\d" from being an invalid string escape.
    lose_time = re.compile(r"[丟丢]失 = (\d+)", re.IGNORECASE)
    # Average round-trip time, e.g. "平均 = 15ms"
    waste_time = re.compile(r"平均 = (\d+)ms", re.IGNORECASE)
    return lose_time, waste_time


if __name__ == '__main__':
    # Compile the ping-output patterns once and fetch the first listing page.
    lose_time, waste_time = initpattern()
    proxys_list = get_proxys(1)

    chosen = None
    # Draw random candidates until one answers within 200 ms. Looping on the
    # pool itself stops cleanly when it runs dry, instead of letting
    # random.choice() raise IndexError on an empty list.
    while proxys_list:
        proxy = random.choice(proxys_list)
        # The candidate leaves the pool whether it is rejected or selected.
        proxys_list.remove(proxy)
        split_proxy = proxy.split('#')
        # Entries are "protocol#ip#port"; the address is the middle field.
        ip = split_proxy[1]
        average_time = check_ip(ip, lose_time, waste_time)

        if average_time > 200:
            print("ip連接超時，從新獲取中...")
        else:
            chosen = split_proxy
            break

    if chosen is None:
        print("沒有可用的代理ip")
    else:
        # Mapping of protocol -> "ip:port"
        proxys_dict = {chosen[0]: chosen[1]
                       + ":" + chosen[2]}
        print("使用代理:", proxys_dict)

今天的代碼有點難以理解，但是只要按照代碼步驟及規範，理解起來並不難。小夥伴們加油，我自己也加油！

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。