How to crawl proxy server IP addresses? (Python)

A year ago I suddenly had an idea: build a powerful netdisk search engine. My undergraduate software-engineering program leaned toward embedded systems, though, so my web skills were weak: I didn't know JSP, didn't understand HTML, and hadn't touched SQL in a long time. Still, with a young person's stubbornness I forced myself to learn everything I had skipped, and now the web no longer feels like a big deal. Enough rambling; readers of this post can first take a look at what I built: 去轉盤網

ok搜搜: www.oksousou.com (this one is a magnet-link search engine, shared here for everyone to have a look at)

Back to the point. I wanted to crawl Baidu Netdisk, and Baidu, as you'd expect from a company that grew out of web crawling, has very strong anti-crawler defenses. Since I was crawling Baidu Netdisk from my own computer, after a few days Baidu flagged my machine and the crawler stopped getting anything back. After searching around online I found that proxies could get around this, so I went off to crawl proxies as well. The site I crawled was:

http://www.xicidaili.com/ Later it apparently started fighting back, so I turned my claws toward: http://www.kuaidaili.com
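Stripped down, the whole trick is to register a ProxyHandler with urllib2 so every request leaves through someone else's IP instead of mine. Here is a minimal sketch of just that step (my own illustration, not part of the original script; it reuses the seed proxy address from the code below purely as a placeholder, and that address is almost certainly dead by now):

import urllib2

proxy = {'http': '118.99.66.106:8080'}  # placeholder address; substitute a live proxy
opener = urllib2.build_opener(urllib2.ProxyHandler(proxy))
urllib2.install_opener(opener)  # from now on urllib2.urlopen() goes out through the proxy
page = urllib2.urlopen('http://www.kuaidaili.com/free/inha/1', timeout=5).read()
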
Most people reading this post are probably programmers, so let's start with the code (I'll add comments, don't worry; the crawler below targets http://www.kuaidaili.com/):

#coding:utf-8
import json
import sys
import urllib, urllib2
import datetime
import time

reload(sys)
sys.setdefaultencoding('utf-8')

from Queue import Queue
from bs4 import BeautifulSoup
import MySQLdb as mdb

DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

ID = 0
ST = 1000
uk = '3758096603'
classify = "inha"
proxy = {u'https': u'118.99.66.106:8080'}


class ProxyServer:

    def __init__(self):
        # database initialisation; nothing special here, I use MySQL
        self.dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'ebook', charset='utf8')
        self.dbconn.autocommit(False)
        self.next_proxy_set = set()
        self.chance = 0
        self.fail = 0
        self.count_errno = 0
        self.dbcurr = self.dbconn.cursor()
        self.dbcurr.execute('SET NAMES utf8')

    def get_prxy(self, num):
        # this function crawls the proxies
        global proxy, ID, uk, classify, ST
        while num > 0:
            count = 0
            for page in range(1, 718):  # total pages on the proxy site; I use 718
                if self.chance > 0:
                    # the wool comes from the sheep's back: if the site starts fighting back,
                    # I disguise myself behind a proxy crawled from it earlier;
                    # self.chance marks when to switch proxies
                    if ST % 100 == 0:
                        self.dbcurr.execute("select count(*) from proxy")
                        for r in self.dbcurr:
                            count = r[0]
                        if ST > count:
                            ST = 1000  # I restart from row 1000 of the database; you can change this, e.g. pick a random row
                    self.dbcurr.execute("select * from proxy where ID=%s", (ST,))
                    results = self.dbcurr.fetchall()
                    for r in results:
                        protocol = r[1]
                        ip = r[2]
                        port = r[3]
                        pro = (protocol, ip + ":" + port)
                        if pro not in self.next_proxy_set:
                            self.next_proxy_set.add(pro)
                    self.chance = 0
                    ST += 1
                proxy_support = urllib2.ProxyHandler(proxy)  # register the current proxy
                # opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler(debuglevel=1))
                opener = urllib2.build_opener(proxy_support)
                urllib2.install_opener(opener)
                # add headers to mimic a browser and avoid 403 Forbidden responses
                # i_headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
                i_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
                # url = 'http://www.kuaidaili.com/free/inha/' + str(page)
                url = 'http://www.kuaidaili.com/free/' + classify + '/' + str(page)
                html_doc = ""
                try:
                    req = urllib2.Request(url, headers=i_headers)
                    response = urllib2.urlopen(req, None, 5)
                    html_doc = response.read()  # this fetches the listing page to parse
                except Exception as ex:
                    # an exception probably means the site is fighting back, so I switch proxies
                    print "ex=", ex
                    self.chance += 1
                    if self.chance > 0:
                        if len(self.next_proxy_set) > 0:
                            protocol, socket = self.next_proxy_set.pop()
                            proxy = {protocol: socket}
                            print "proxy", proxy
                            print "change proxy success."
                    continue
                # html_doc = urllib2.urlopen('http://www.xici.net.co/nn/' + str(page)).read()
                if html_doc != "":
                    # parse the fetched page with BeautifulSoup
                    soup = BeautifulSoup(html_doc, from_encoding="utf8")
                    # print "soup", soup
                    # trs = soup.find('table', id='ip_list').find_all('tr')  # all table rows
                    trs = ""
                    try:
                        trs = soup.find('table').find_all('tr')
                    except:
                        print "error"
                        continue
                    for tr in trs[1:]:
                        tds = tr.find_all('td')
                        ip = tds[0].text.strip()        # ip
                        port = tds[1].text.strip()      # port
                        protocol = tds[3].text.strip()  # protocol
                        # tds = tr.find_all('td')
                        # ip = tds[2].text.strip()
                        # port = tds[3].text.strip()
                        # protocol = tds[6].text.strip()
                        get_time = tds[6].text.strip()
                        # get_time = "20" + get_time
                        check_time = datetime.datetime.strptime(get_time, '%Y-%m-%d %H:%M:%S')
                        temp = time.time()
                        x = time.localtime(float(temp))
                        time_now = time.strftime("%Y-%m-%d %H:%M:%S", x)  # current time, stored as acquisition time
                        http_ip = protocol + '://' + ip + ':' + port
                        if protocol == 'HTTP' or protocol == 'HTTPS':
                            # keep only HTTP/HTTPS proxies, drop everything else
                            content = ""
                            try:
                                # I don't quite trust the listing site, so after crawling I check whether the proxy really works
                                proxy_support = urllib2.ProxyHandler({protocol: http_ip})
                                # proxy_support = urllib2.ProxyHandler({'http': 'http://124.200.100.50:8080'})
                                opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
                                urllib2.install_opener(opener)
                                if self.count_errno > 50:
                                    # this comes from my own database: I pull out a value called uk.
                                    # You can ignore it and test the proxy against whatever URL you plan to crawl.
                                    self.dbcurr.execute("select UID from visited where ID=%s", (ID,))
                                    for uid in self.dbcurr:
                                        uk = str(uid[0])
                                    ID += 1
                                    if ID > 50000:
                                        ID = 0
                                    self.count_errno = 0
                                test_url = "http://yun.baidu.com/pcloud/friend/getfanslist?start=0&query_uk=" + uk + "&limit=24"  # the URL I use for the check
                                print "download:", http_ip + ">>" + uk
                                req1 = urllib2.Request(test_url, headers=i_headers)
                                response1 = urllib2.urlopen(req1, None, 5)
                                content = response1.read()
                            except Exception as ex:
                                # handle the exception: too many failures in a row means this page is useless, move on
                                # print "ex2=", ex
                                self.fail += 1
                                if self.fail > 10:
                                    self.fail = 0
                                    break
                                continue
                            if content != "":
                                json_body = json.loads(content)
                                errno = json_body['errno']
                                self.count_errno += 1
                                if errno != -55:
                                    # the proxy is usable: content != "" and Baidu did not return -55
                                    print "success."
                                    self.dbcurr.execute('select ID from proxy where IP=%s', (ip,))  # time to store it
                                    y = self.dbcurr.fetchone()
                                    if not y:
                                        print 'add', '%s://%s:%s' % (protocol, ip, port)
                                        self.dbcurr.execute('INSERT INTO proxy(PROTOCOL,IP,PORT,CHECK_TIME,ACQ_TIME) VALUES(%s,%s,%s,%s,%s)',
                                                            (protocol, ip, port, check_time, time_now))
                                        self.dbconn.commit()
            num -= 1
            # these are the tab names on the source site; I crawl them one tab at a time
            if num % 4 == 0:
                classify = "intr"
            if num % 4 == 1:
                classify = "outha"
            if num % 4 == 2:
                classify = "outtr"
            if num % 4 == 3:
                classify = "inha"


if __name__ == '__main__':
    proSer = ProxyServer()
    proSer.get_prxy(10000)  # crawl 10000 rounds; single-threaded, it can run for a week or two without trouble
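
One thing the code takes for granted is a MySQL database called ebook with a proxy table (where verified proxies end up) and a visited table (a pool of Baidu uk values used for the check). The post never shows the schema, so the sketch below is my own guess, reconstructed from the queries above; the column types are assumptions, not the original DDL.

import MySQLdb as mdb

# Guessed schema matching the queries in the crawler: `select * from proxy` is read as
# r[1]=PROTOCOL, r[2]=IP, r[3]=PORT, so ID must be the first column; all types are assumed.
conn = mdb.connect('127.0.0.1', 'root', 'root', 'ebook', charset='utf8')
cur = conn.cursor()
cur.execute("""CREATE TABLE IF NOT EXISTS proxy (
    ID INT AUTO_INCREMENT PRIMARY KEY,
    PROTOCOL VARCHAR(10),
    IP VARCHAR(64),
    PORT VARCHAR(10),
    CHECK_TIME DATETIME,
    ACQ_TIME DATETIME
)""")
cur.execute("""CREATE TABLE IF NOT EXISTS visited (
    ID INT AUTO_INCREMENT PRIMARY KEY,
    UID VARCHAR(32)
)""")
conn.commit()
conn.close()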

That's my proxy-crawler code. Thanks for reading, and feel free to repost. I've set up a QQ group where everyone is welcome to discuss technology together, group number: 512245829. If you're on Weibo, follow 轉盤娛樂.
