Table of Contents
1. Random User-Agent
2. Getting proxy IPs
3. Checking proxy IP availability
The fake_useragent library: spoofing the request's User-Agent header.
from fake_useragent import UserAgent

ua = UserAgent()
# Internet Explorer user agent
print(ua.ie)
# Opera
print(ua.opera)
# Chrome
print(ua.chrome)
# Firefox
print(ua.firefox)
# Safari
print(ua.safari)
# The most common usage:
# when writing a crawler it is most useful to be able to vary the headers at will,
# and randomness is essential. ua.random returns a randomly generated User-Agent.
print(ua.random)
print(ua.random)
print(ua.random)
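To actually put the random User-Agent to work, a minimal sketch like the one below (not part of the original post; httpbin.org is used here only as an echo service) attaches ua.random to a real request:

import requests
from fake_useragent import UserAgent

ua = UserAgent()
headers = {'User-Agent': ua.random}   # a fresh random User-Agent for this request
resp = requests.get('https://httpbin.org/headers', headers=headers, timeout=5)
print(resp.json())                    # the echoed headers show the User-Agent that was sent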
Scrape proxy IPs from free proxy websites. Collecting free proxies is simple: visit the page -> extract with regex/XPath -> save.
Proxy IP sites:
有代理: https://www.youdaili.net/Daili/guonei/
66代理: http://www.66ip.cn/6.html
西刺代理: https://www.xicidaili.com/
快代理: https://www.kuaidaili.com/free/
# Match the page content with a regular expression
# This approach works well for paginated sites
import re
import requests
import time

def get_ip():
    url = 'https://www.kuaidaili.com/free/inha/'
    url_list = [url + str(i + 1) for i in range(5)]  # build the URL list; 5 means only 5 pages are crawled
    print(url_list)
    ip_list = []
    for i in range(len(url_list)):
        url = url_list[i]
        html = requests.get(url=url).text
        regip = r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>'
        matcher = re.compile(regip, re.S)
        ipstr = re.findall(matcher, html)
        time.sleep(1)
        for j in ipstr:
            ip_list.append(j[0] + ':' + j[1])  # ip + port
    print(ip_list)
    print('Collected %d proxy IPs' % len(ip_list))
    return ip_list

if __name__ == '__main__':
    get_ip()
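The "save" step of the visit -> extract -> save pipeline is not shown above. A minimal sketch, assuming a plain text file (the name proxies.txt is arbitrary), could look like this:

def save_ip(ip_list, path='proxies.txt'):
    # write one ip:port entry per line
    with open(path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(ip_list))

def load_ip(path='proxies.txt'):
    # read the entries back, skipping blank lines
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]

if __name__ == '__main__':
    save_ip(['1.2.3.4:8080', '5.6.7.8:3128'])   # e.g. the list returned by get_ip()
    print(load_ip())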
# First grab the specific tags
# Then parse out the IP and port
import requests
from bs4 import BeautifulSoup

def get_ip_list(obj):
    ip_text = obj.findAll('tr', {'class': 'odd'})  # all table rows that contain an IP address
    ip_list = []
    for i in range(len(ip_text)):
        ip_tag = ip_text[i].findAll('td')
        ip_port = ip_tag[1].get_text() + ':' + ip_tag[2].get_text()  # extract the IP address and port
        ip_list.append(ip_port)
    print("Collected {} proxy IPs".format(len(ip_list)))
    print(ip_list)
    return ip_list

url = 'http://www.xicidaili.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'}
request = requests.get(url, headers=headers)
response = request.text
bsObj = BeautifulSoup(response, 'lxml')  # parse the fetched html
lists = get_ip_list(bsObj)
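Besides regex and BeautifulSoup, the XPath route mentioned earlier can be sketched with lxml. The expressions below are assumptions (one <tr> per entry, IP in the first column, port in the second) and have to be adapted to the actual table layout of the target site:

import requests
from lxml import etree

def get_ip_xpath(url='https://www.kuaidaili.com/free/inha/1'):
    html = requests.get(url, timeout=5).text
    tree = etree.HTML(html)
    rows = tree.xpath('//tbody/tr')          # assumed: one <tr> per proxy entry
    ip_list = []
    for row in rows:
        ip = row.xpath('./td[1]/text()')     # assumed: IP in column 1
        port = row.xpath('./td[2]/text()')   # assumed: port in column 2
        if ip and port:
            ip_list.append(ip[0].strip() + ':' + port[0].strip())
    print('Collected %d proxy IPs' % len(ip_list))
    return ip_list

if __name__ == '__main__':
    print(get_ip_xpath())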
Checking proxy IP availability

Method 1: judge by the status code of the response
import requests
import random
import re
import time

def get_ip():
    url = 'https://www.kuaidaili.com/free/inha/'
    url_list = [url + str(i + 1) for i in range(1)]
    print(url_list)
    ip_list = []
    for i in range(len(url_list)):
        url = url_list[i]
        html = requests.get(url=url).text
        regip = r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>'
        matcher = re.compile(regip, re.S)
        ipstr = re.findall(matcher, html)
        time.sleep(1)
        for j in ipstr:
            ip_list.append(j[0] + ':' + j[1])
    print('Collected %d proxy IPs' % len(ip_list))
    print(ip_list)
    return ip_list

def valVer(proxys):
    badNum = 0
    goodNum = 0
    good = []
    for proxy in proxys:
        try:
            proxy_host = proxy
            protocol = 'https' if 'https' in proxy_host else 'http'
            proxies = {protocol: proxy_host}
            print('Now testing IP:', proxies)
            response = requests.get('http://www.baidu.com', proxies=proxies, timeout=2)
            if response.status_code != 200:
                badNum += 1
                print(proxy_host, 'bad proxy')
            else:
                goodNum += 1
                good.append(proxies)
                print(proxy_host, 'success proxy')
        except Exception as e:
            print(e)
            badNum += 1
            continue
    print('success proxy num : ', goodNum)
    print('bad proxy num : ', badNum)
    print(good)

if __name__ == '__main__':
    ip_list = get_ip()
    valVer(ip_list)
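Validating proxies one at a time is slow. The same status-code check can be run concurrently; here is a minimal sketch (not in the original post) using concurrent.futures, with baidu.com as the test target as above and placeholder IPs for the example call:

import requests
from concurrent.futures import ThreadPoolExecutor

def check_one(proxy):
    # return the proxy if the test request comes back with HTTP 200, else None
    proxies = {'http': 'http://' + proxy}
    try:
        r = requests.get('http://www.baidu.com', proxies=proxies, timeout=2)
        return proxy if r.status_code == 200 else None
    except requests.RequestException:
        return None

def check_all(proxy_list, workers=10):
    with ThreadPoolExecutor(max_workers=workers) as pool:
        results = list(pool.map(check_one, proxy_list))
    good = [p for p in results if p]
    print('success proxy num :', len(good))
    print('bad proxy num :', len(proxy_list) - len(good))
    return good

if __name__ == '__main__':
    print(check_all(['1.2.3.4:8080', '5.6.7.8:3128']))   # placeholder IPs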
Method 2: validate with the requests package
import requests
import random
import re
import time

def get_ip():
    url = 'https://www.kuaidaili.com/free/inha/'
    url_list = [url + str(i + 1) for i in range(1)]
    print(url_list)
    ip_list = []
    for i in range(len(url_list)):
        url = url_list[i]
        html = requests.get(url=url).text
        regip = r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>'
        matcher = re.compile(regip, re.S)
        ipstr = re.findall(matcher, html)
        time.sleep(1)
        for j in ipstr:
            ip_list.append(j[0] + ':' + j[1])
    print(ip_list)
    print('Collected %d proxy IPs' % len(ip_list))
    return ip_list

def valVer(proxys):
    badNum = 0
    goodNum = 0
    good = []
    for proxy in proxys:
        print('Now checking IP', proxy)
        try:
            requests.get('http://wenshu.court.gov.cn/', proxies={"http": "http://" + str(proxy)}, timeout=2)
        except:
            badNum += 1
            print('connect failed')
        else:
            goodNum += 1
            good.append(proxy)
            print('success')
    print('success proxy num : ', goodNum)
    print('bad proxy num : ', badNum)
    print(good)

if __name__ == '__main__':
    ip_list = get_ip()
    valVer(ip_list)
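A successful response only proves the request went through; it does not show which address the target actually saw. A hedged sketch (not part of the original) that asks httpbin.org/ip for the address that reached it, so transparent proxies can be spotted:

import requests

def check_exit_ip(proxy):
    # returns the address httpbin saw through the proxy, or None if the proxy failed
    proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
    try:
        return requests.get('http://httpbin.org/ip', proxies=proxies, timeout=3).json()['origin']
    except requests.RequestException:
        return None

if __name__ == '__main__':
    print(check_exit_ip('1.2.3.4:8080'))   # placeholder IP; compare the result with your real IP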
Method 3: use telnet
import telnetlib

try:
    telnetlib.Telnet('127.0.0.1', port=80, timeout=20)
except:
    print('connect failed')
else:
    print('success')
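The telnet approach only tests whether a TCP connection to the proxy can be opened at all, and telnetlib was removed from the standard library in Python 3.13. The same check can be sketched with the socket module instead:

import socket

def tcp_check(host, port, timeout=20):
    # try to open a plain TCP connection to host:port
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False

if __name__ == '__main__':
    print('success' if tcp_check('127.0.0.1', 80) else 'connect failed')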