# Scrape the free proxy listing on kuaidaili.com and build a list of
# single-entry dicts that can be passed to requests via `proxies=`.
import requests
from lxml import etree

url = 'http://www.kuaidaili.com/free/'

# Seconds to wait for the server; without a timeout requests.get can block
# indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 10

# Rows of the proxy table. `table//tr` is used instead of `table/tbody/tr`
# so the expression matches whether or not the served markup contains an
# explicit <tbody> (browser dev tools add one when copying an XPath).
ROW_XPATH = '//*[@id="list"]/table//tr'


def parse_proxies(html_text):
    """Extract the proxies listed in one page of HTML.

    Cells are read row by row (column 1 = IP, column 2 = port, column 4 =
    protocol) so that a row with an empty cell cannot shift the columns and
    pair an IP with another row's port.

    Args:
        html_text: HTML source of a proxy listing page.

    Returns:
        A list of single-entry dicts such as ``{'http': '1.2.3.4:8080'}``.
        The key is the protocol cell lower-cased, because requests selects a
        proxy by the lower-case URL scheme. Rows missing any of the three
        cells (e.g. header rows built from <th>) are skipped.
    """
    tree = etree.HTML(html_text)
    if tree is None:
        # NOTE(review): etree.HTML may hand back None when the document has
        # no parsable content; treat that as "no proxies found".
        return []

    proxies = []
    for row in tree.xpath(ROW_XPATH):
        ip = row.xpath('string(td[1])').strip()
        port = row.xpath('string(td[2])').strip()
        scheme = row.xpath('string(td[4])').strip().lower()
        if ip and port and scheme:
            # A fresh dict per row, so entries never share state.
            proxies.append({scheme: f'{ip}:{port}'})
    return proxies


def fetch_proxies(page_url, timeout=REQUEST_TIMEOUT):
    """Download one listing page and return the proxies found on it.

    Args:
        page_url: URL of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list produced by ``parse_proxies`` for the downloaded page.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status (via ``raise_for_status``).
    """
    response = requests.get(page_url, timeout=timeout)
    # Fail loudly on an error page instead of silently parsing it into [].
    response.raise_for_status()
    return parse_proxies(response.text)


proxy_list = fetch_proxies(url)
print(proxy_list)

# proxy_list is the result; use the random module to pick one entry at
# random for subsequent crawling.
#
# If one page is not enough, crawl ten. The page URLs follow this pattern:
#   page 1: https://www.kuaidaili.com/free/inha/1/
#   page 2: https://www.kuaidaili.com/free/inha/2/
#   ...and so on; call fetch_proxies() once per page URL.
# http://www.kuaidaili.com/free/
# This IP proxy site works quite well.