A Python crawler goes through a cycle: you crawl, the site blocks the crawler, and you work around the block. Of course it continues from there, with the site refining its anti-crawling measures and the crawler countering again, in an endless game of cat and mouse.
In the early stages of crawling, adding headers and an IP proxy solves a lot of problems.
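For context, this is all it takes at the request level (a minimal sketch; 1.2.3.4:8080 is a placeholder, not a real proxy):

import requests

# Spoof a normal browser user-agent so naive bot filters treat the request
# as coming from a browser.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36",
}

# Route the request through a proxy so the target site sees the proxy's IP
# instead of yours. 1.2.3.4:8080 is a placeholder address.
proxies = {"http": "http://1.2.3.4:8080"}

r = requests.get("http://ip.chinaz.com/getip.aspx",
                 headers=headers, proxies=proxies, timeout=3)
print(r.text)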
The code is pasted below; first, the approach:
1. Scrape candidate proxy IP addresses from http://www.xicidaili.com/nn/. There are plenty of them, but none are guaranteed to work, so save them to lists first.
2. Verify each candidate proxy in its own thread (the pattern is sketched right after this list), then write the working ones to the corresponding txt file.
3. Whenever proxy IPs are needed, import this module and run the main() function to get working proxies for whatever comes next.
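Step 2 boils down to the following threading pattern (a stripped-down sketch; check_proxy is a stand-in for the real validators defined in the full script below, and the addresses are placeholders):

import threading

def check_proxy(ip, port):
    # Stand-in for the real validation (cip/csip in the script below).
    pass

# Placeholder addresses; in the real script these come from get_ip().
candidates = [("1.2.3.4", "8080"), ("5.6.7.8", "3128")]

threads = [threading.Thread(target=check_proxy, args=(ip, port))
           for ip, port in candidates]
for t in threads:
    t.start()
for t in threads:
    t.join()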
Two methods are used to verify an IP: telnetlib and requests. If you know which page you will be crawling, it is better to verify by requesting that page directly with requests.
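To make the difference concrete, here is a minimal sketch of both checks (the proxy address is a placeholder): telnetlib only confirms that the port accepts a TCP connection, while requests confirms the proxy can actually fetch the kind of page you care about.

import telnetlib
import requests

ip, port = "1.2.3.4", 8080   # placeholder proxy address

# Shallow check: does ip:port accept a TCP connection at all?
try:
    telnetlib.Telnet(ip, port=port, timeout=5)
    print("port open")
except Exception:
    print("unreachable")

# Deeper check: can the proxy actually fetch the page you plan to crawl?
try:
    requests.get("http://ip.chinaz.com/getip.aspx",
                 proxies={"http": "http://%s:%s" % (ip, port)}, timeout=3)
    print("proxy works for the target page")
except Exception:
    print("proxy failed")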
#coding:utf-8
from bs4 import BeautifulSoup
import threading
import telnetlib
import requests

# Set a global socket timeout of 3s: if a request gets no response within
# 3 seconds, abort the connection and raise a timeout.
import socket
socket.setdefaulttimeout(3)

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36",
}

def get_ip():
    # Fetch candidate proxies from xicidaili and return two lists:
    # HTTP proxies and HTTPS proxies.
    httpResult = []
    httpsResult = []
    try:
        for page in range(1, 2):
            IPurl = 'http://www.xicidaili.com/nn/%s' % page
            rIP = requests.get(IPurl, headers=headers)
            IPContent = rIP.text
            print(IPContent)
            soupIP = BeautifulSoup(IPContent, 'lxml')
            trs = soupIP.find_all('tr')
            # Skip the header row; columns: td[1] = IP, td[2] = port,
            # td[5] = protocol.
            for tr in trs[1:]:
                tds = tr.find_all('td')
                ip = tds[1].text.strip()
                port = tds[2].text.strip()
                protocol = tds[5].text.strip()
                if protocol == 'HTTP':
                    httpResult.append('http://' + ip + ':' + port)
                elif protocol == 'HTTPS':
                    httpsResult.append('https://' + ip + ':' + port)
    except Exception:
        pass
    return httpResult, httpsResult

'''
# Validate a proxy with telnetlib (HTTP): only checks that ip:port accepts
# a TCP connection.
def cip(x, y):
    try:
        telnetlib.Telnet(x, port=y, timeout=5)
    except Exception:
        print('fail')
    else:
        print('---------------------------success')
        with open(r"E:\ip_http.txt", "a") as f:
            f.write(x + ':' + y + '\n')

# Validate a proxy with telnetlib (HTTPS).
def csip(x, y):
    try:
        telnetlib.Telnet(x, port=y, timeout=5)
    except Exception:
        print('fail')
    else:
        print('---------------------------success')
        with open(r"E:\ip_https.txt", "a") as f:
            f.write(x + ':' + y + '\n')
'''

# Validate a proxy with requests against a page similar to the one you plan
# to crawl (HTTP). Working proxies are appended to E:\ip_http.txt.
def cip(x, y):
    try:
        print(x + ':' + y)
        requests.get('http://ip.chinaz.com/getip.aspx',
                     proxies={'http': 'http://' + x + ':' + y}, timeout=3)
    except Exception:
        print('fail')
    else:
        print('---------------------------success')
        with open(r"E:\ip_http.txt", "a") as f:
            f.write(x + ':' + y + '\n')

# Same validation for HTTPS proxies, against an HTTPS page. Working proxies
# are appended to E:\ip_https.txt.
def csip(x, y):
    try:
        print(x + ':' + y)
        requests.get('https://www.lagou.com/',
                     proxies={'https': 'https://' + x + ':' + y}, timeout=3)
    except Exception:
        print('fail')
    else:
        print('---------------------------success')
        with open(r"E:\ip_https.txt", "a") as f:
            f.write(x + ':' + y + '\n')

def main():
    httpResult, httpsResult = get_ip()

    # Empty each result file once up front ("w" truncates on open), then let
    # the validation threads append to it. (The original code called
    # truncate() on a file opened in append mode inside every thread, which
    # could wipe results already written by other threads.)
    open(r"E:\ip_http.txt", "w").close()
    threads = []
    for i in httpResult:
        # i looks like 'http://1.2.3.4:8080' -> a = '1.2.3.4', b = '8080'
        a = i.split(":")[-2][2:].strip()
        b = i.split(":")[-1].strip()
        t = threading.Thread(target=cip, args=(a, b))
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    open(r"E:\ip_https.txt", "w").close()
    threads1 = []
    for i in httpsResult:
        a = i.split(":")[-2][2:].strip()
        b = i.split(":")[-1].strip()
        t = threading.Thread(target=csip, args=(a, b))
        threads1.append(t)
    for t in threads1:
        t.start()
    for t in threads1:
        t.join()

if __name__ == '__main__':
    main()
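Step 3 of the approach then reduces to reading the saved file and picking a proxy at random. A minimal sketch, assuming the script above is saved as get_proxy.py (an assumed name) and that ip_http.txt contains at least one working proxy:

import random
import requests

import get_proxy       # the script above, saved as get_proxy.py (assumed name)

get_proxy.main()       # refresh E:\ip_http.txt with proxies that currently work

with open(r"E:\ip_http.txt") as f:
    working = [line.strip() for line in f if line.strip()]

# Pick a random verified proxy for the actual crawl.
proxy = random.choice(working)
r = requests.get("http://ip.chinaz.com/getip.aspx",
                 proxies={"http": "http://" + proxy}, timeout=3)
print(r.text)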