Note: this post is only meant as practice with regular expressions and web page structure; I do not endorse artificially inflating page views.
First, decide which pages to target. The initial version crawls cnblogs (博客园) pages. The pattern-matching rule below must be adapted to the URL structure of the site. Inspecting the pages shows that the current cnblogs structure looks like the figure below:
The matching pattern is therefore constructed as follows:

```python
pr = r'href="http://www.cnblogs.com/zpfbuaa/p/(\d+)'
rr = re.compile(pr)
```

The basic rules of regular expressions are briefly summarized in the figure below:
An online regular-expression testing tool is also linked here: quick jump to the testing tool.
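As a quick illustration of what the pattern captures, here is a minimal sketch run against a made-up anchor tag (the sample HTML is only an assumed example, nothing is fetched):

```python
# -*- coding: utf-8 -*-
import re

# Hypothetical anchor tag from a cnblogs post-list page (sample input only).
sample_html = '<a class="postTitle2" href="http://www.cnblogs.com/zpfbuaa/p/6995410.html">Some post</a>'

pr = r'href="http://www.cnblogs.com/zpfbuaa/p/(\d+)'
rr = re.compile(pr)

print(rr.findall(sample_html))  # ['6995410'] -- only the numeric post key is captured
```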
Next, obtain proxy IP addresses and port numbers. A proxy-IP site is recommended below, although it seems to have had problems recently. Still, these high-anonymity proxy providers offer plenty of free IPs; all that is needed is to collect the IPs and ports.
The method given below has to be adjusted to each provider's page structure. For example, some providers' pages are not table-based, or are more complex, so regex matching and inspecting the page structure (the magical F12 developer tools) are essential.
At the same time, to avoid making every request with the same client identity, a user_agent_list is initialized and used later to construct the request headers.
```python
user_agent_list = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
]
```
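As a small sketch of how this list is used later (the request URL below is only a placeholder for illustration), each request picks a random entry and attaches it as the User-Agent header:

```python
import random
import urllib2

req = urllib2.Request('http://www.cnblogs.com/zpfbuaa/')  # placeholder URL, not actually opened here
req.add_header("User-Agent", random.choice(user_agent_list))  # a different client identity per request
```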
Next, fetch the list of proxy IPs and count the total number of proxy addresses collected.
==Note that the regular expressions below must be adjusted whenever the structure of the scraped page differs==
```python
def Get_proxy_ip():
    global proxy_list
    global totalnum
    proxy_list = []
    headers = {
        'Host': 'www.xicidaili.com',
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Accept': r'application/json, text/javascript, */*; q=0.01',
        'Referer': r'http://www.xicidaili.com/',
    }
    # First listing page.
    req = urllib2.Request(r'http://www.xicidaili.com/nn/', headers=headers)
    response = urllib2.urlopen(req)
    html = response.read().decode('utf-8')
    ip_list = re.findall(r'\d+\.\d+\.\d+\.\d+', html)
    port_list = re.findall(r'<td>\d+</td>', html)
    for i in range(len(ip_list)):
        totalnum += 1
        ip = ip_list[i]
        port = re.sub(r'<td>|</td>', '', port_list[i])
        proxy = '%s:%s' % (ip, port)
        proxy_list.append(proxy)
    # Second listing page, same extraction logic.
    req = urllib2.Request(r'http://www.xicidaili.com/nn/5', headers=headers)
    response = urllib2.urlopen(req)
    html = response.read().decode('utf-8')
    ip_list = re.findall(r'\d+\.\d+\.\d+\.\d+', html)
    port_list = re.findall(r'<td>\d+</td>', html)
    for i in range(len(ip_list)):
        totalnum += 1
        ip = ip_list[i]
        port = re.sub(r'<td>|</td>', '', port_list[i])
        proxy = '%s:%s' % (ip, port)
        proxy_list.append(proxy)
    return proxy_list
```
There are different ways to build the list of cnblogs post URLs. One is to filter the URLs found on the current page, collect all posts that match the pattern, and store their URLs for later visits.
A simple example using my own blog:
First, determine the first-level page URLs:
```python
yeurl = ['http://www.cnblogs.com/zpfbuaa/p/?page=1',
         'http://www.cnblogs.com/zpfbuaa/p/?page=2',
         'http://www.cnblogs.com/zpfbuaa/p/?page=3']
```
yeurl here holds the first-level pages; from this list all post URLs are extracted (that is, the page structure described above is read and the pattern is applied). The code then loops over each page in yeurl, reading and filtering it, as shown in the block below:
```python
def main():
    list1 = []
    list2 = []
    global l
    global totalblog
    global ip_list
    pr = r'href="http://www.cnblogs.com/zpfbuaa/p/(\d+)'
    rr = re.compile(pr)
    yeurl = ['http://www.cnblogs.com/zpfbuaa/p/?page=1',
             'http://www.cnblogs.com/zpfbuaa/p/?page=2',
             'http://www.cnblogs.com/zpfbuaa/p/?page=3']
    for i in yeurl:
        req = urllib2.Request(i)
        # proxy_ip = random.choice(proxyIP.proxy_list)  # pick a random ip from proxy_list
        # print proxy_ip
        # proxy_support = urllib2.ProxyHandler(proxy_ip)
        # opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
        req.add_header("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)")
        cn = urllib2.urlopen(req)
        f = cn.read()
        list2 = re.findall(rr, f)
        list1 = list1 + list2
        cn.close()
    for o in list1:
        totalblog = totalblog + 1
        url = 'http://www.cnblogs.com/zpfbuaa/p/' + o + ".html"
        l.append(url)
    Get_proxy_ip()
    print totalnum
```
- ==list1 above holds the post numbers, e.g. /p/6995410 in the figure above; at this point list1 stores keys such as 6995410==
- ==req.add_header adds the header information; the urllib2 library is used to build and open the requests==
The proxy IP list and the user-agent list have already been built. The function below takes a post URL as its parameter; these post URLs were constructed earlier and stored in the list l.
It randomly picks a proxy IP and a user agent, and to avoid waiting too long on a page that will not open, the timeout is set to timeout=3 seconds. Along the way it counts the total number of successful visits and prints the URL currently being visited.
The code is as follows:
```python
def su(url):
    proxy_ip = random.choice(proxy_list)
    user_agent = random.choice(user_agent_list)
    print proxy_ip
    print user_agent
    proxy_support = urllib2.ProxyHandler({'http': proxy_ip})
    opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
    urllib2.install_opener(opener)
    req = urllib2.Request(url)
    req.add_header("User-Agent", user_agent)
    try:
        c = urllib2.urlopen(req, timeout=3)
    except Exception as e:
        print('****** open failed! ******')
    else:
        global count
        count += 1
        print('OK! %s successful visits in total!' % count)
        print "current url being visited:"
        print url
    sem.release()
```
The maximum number of threads is set to 5, and the number of posts in the blog list is printed. The program then iterates over the list l of post URLs, passes each URL (the variable i) to su, and starts a thread to visit the post.
```python
if __name__ == "__main__":
    main()
    print "start boosting page views!"
    print "total number of posts:"
    print totalblog
    maxThread = 5
    sem = threading.BoundedSemaphore(maxThread)
    while 1:
        for i in l:
            sem.acquire()
            T = threading.Thread(target=su, args=(i,))
            T.start()
```
The complete code of this first version:

```python
# coding=utf-8
import urllib2
import re
import sys
import time
import threading
import random

l = []
iparray = []
global totalnum
totalnum = 0
global proxy_list
proxy_list = []
global count
count = 0
totalblog = 0

user_agent_list = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
]


def Get_proxy_ip():
    global proxy_list
    global totalnum
    proxy_list = []
    headers = {
        'Host': 'www.xicidaili.com',
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Accept': r'application/json, text/javascript, */*; q=0.01',
        'Referer': r'http://www.xicidaili.com/',
    }
    # First listing page.
    req = urllib2.Request(r'http://www.xicidaili.com/nn/', headers=headers)
    response = urllib2.urlopen(req)
    html = response.read().decode('utf-8')
    ip_list = re.findall(r'\d+\.\d+\.\d+\.\d+', html)
    port_list = re.findall(r'<td>\d+</td>', html)
    for i in range(len(ip_list)):
        totalnum += 1
        ip = ip_list[i]
        port = re.sub(r'<td>|</td>', '', port_list[i])
        proxy = '%s:%s' % (ip, port)
        proxy_list.append(proxy)
    # Second listing page, same extraction logic.
    req = urllib2.Request(r'http://www.xicidaili.com/nn/5', headers=headers)
    response = urllib2.urlopen(req)
    html = response.read().decode('utf-8')
    ip_list = re.findall(r'\d+\.\d+\.\d+\.\d+', html)
    port_list = re.findall(r'<td>\d+</td>', html)
    for i in range(len(ip_list)):
        totalnum += 1
        ip = ip_list[i]
        port = re.sub(r'<td>|</td>', '', port_list[i])
        proxy = '%s:%s' % (ip, port)
        proxy_list.append(proxy)
    return proxy_list


def main():
    list1 = []
    list2 = []
    global l
    global totalblog
    global ip_list
    pr = r'href="http://www.cnblogs.com/zpfbuaa/p/(\d+)'
    rr = re.compile(pr)
    yeurl = ['http://www.cnblogs.com/zpfbuaa/p/?page=1',
             'http://www.cnblogs.com/zpfbuaa/p/?page=2',
             'http://www.cnblogs.com/zpfbuaa/p/?page=3']
    for i in yeurl:
        req = urllib2.Request(i)
        # proxy_ip = random.choice(proxyIP.proxy_list)  # pick a random ip from proxy_list
        # print proxy_ip
        # proxy_support = urllib2.ProxyHandler(proxy_ip)
        # opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
        req.add_header("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)")
        cn = urllib2.urlopen(req)
        f = cn.read()
        list2 = re.findall(rr, f)
        list1 = list1 + list2
        cn.close()
    for o in list1:
        totalblog = totalblog + 1
        url = 'http://www.cnblogs.com/zpfbuaa/p/' + o + ".html"
        l.append(url)
    Get_proxy_ip()
    print totalnum


def su(url):
    proxy_ip = random.choice(proxy_list)
    user_agent = random.choice(user_agent_list)
    print proxy_ip
    print user_agent
    proxy_support = urllib2.ProxyHandler({'http': proxy_ip})
    opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
    urllib2.install_opener(opener)
    req = urllib2.Request(url)
    req.add_header("User-Agent", user_agent)
    try:
        c = urllib2.urlopen(req, timeout=3)
    except Exception as e:
        print('****** open failed! ******')
    else:
        global count
        count += 1
        print('OK! %s successful visits in total!' % count)
        print "current url being visited:"
        print url
    sem.release()


if __name__ == "__main__":
    main()
    print "start boosting page views!"
    print "total number of posts:"
    print totalblog
    maxThread = 5
    sem = threading.BoundedSemaphore(maxThread)
    while 1:
        for i in l:
            sem.acquire()
            T = threading.Thread(target=su, args=(i,))
            T.start()
```
The earlier function get_proxy_ip() could only obtain proxy IPs from http://www.xicidaili.com/nn/, which is rather limiting. This version adds new proxy sources through the functions get_proxy_ip2() and get_proxy_ip3(). The goal is mainly to get more practice with regular expressions in Python, and to revise the script a little along the way. See the complete code at the end for the full functions.
Previously a timeout was set, and whenever a connection through a proxy IP timed out, the exception was only printed and not otherwise handled. This version changes that: unusable proxy IPs are now removed from the list. This may not be ideal, especially when there are not many proxy IPs available, since the pool can be exhausted quickly; it is a possible point for future improvement.
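Before the full listing, here is a minimal sketch of that change (visit_with_proxy is a hypothetical helper written just for illustration; it assumes urllib2 and the global proxy_list from above):

```python
def visit_with_proxy(url, proxy_ip):
    # Route the request through the given proxy.
    proxy_support = urllib2.ProxyHandler({'http': proxy_ip})
    opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
    urllib2.install_opener(opener)
    try:
        urllib2.urlopen(urllib2.Request(url), timeout=10)
    except Exception:
        # The proxy failed or timed out: drop it so it is never picked again.
        if proxy_ip in proxy_list:
            proxy_list.remove(proxy_ip)
        return False
    return True
```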
The only change here is an improved construction of the first-level pages: the URL structure plus a loop are used to build the list of first-level page URLs, and each page is then traversed to search for, filter, and save the post URLs. See the complete code at the end for the implementation.
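For example, instead of hard-coding three page URLs as in the first version, the first-level list can be generated in a loop (the upper bound of 25 mirrors the full code below):

```python
my_blog = []
for i in range(1, 25):
    # Listing pages ?page=1 .. ?page=24 of the blog.
    my_blog.append("http://www.cnblogs.com/zpfbuaa/p/?page=" + str(i))
```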
The complete code of this version:

```python
# coding=utf-8
import urllib2
import re
import threading
import random

proxy_list = []
total_proxy = 0
blog_list = []
total_blog = 0
total_visit = 0
total_remove = 0

user_agent_list = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
]

# request_list = ['http://www.xicidaili.com/nn/', 'http://www.xicidaili.com/nn/4', 'http://www.xicidaili.com/nn/5']


def get_proxy_ip():
    global proxy_list
    global total_proxy
    request_list = []
    headers = {
        'Host': 'www.xicidaili.com',
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Accept': r'application/json, text/javascript, */*; q=0.01',
        'Referer': r'http://www.xicidaili.com/',
    }
    # Listing pages nn/11 .. nn/20 of xicidaili.
    for i in range(11, 21):
        request_item = "http://www.xicidaili.com/nn/" + str(i)
        request_list.append(request_item)
    for req_id in request_list:
        req = urllib2.Request(req_id, headers=headers)
        response = urllib2.urlopen(req)
        html = response.read().decode('utf-8')
        ip_list = re.findall(r'\d+\.\d+\.\d+\.\d+', html)
        port_list = re.findall(r'<td>\d+</td>', html)
        for i in range(len(ip_list)):
            total_proxy += 1
            ip = ip_list[i]
            port = re.sub(r'<td>|</td>', '', port_list[i])
            proxy = '%s:%s' % (ip, port)
            proxy_list.append(proxy)
    return proxy_list


def get_proxy_ip2():
    global proxy_list
    global total_proxy
    request_list = []
    headers = {
        'Host': 'www.66ip.cn',
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Accept': r'application/json, text/javascript, */*; q=0.01',
        'Referer': r'http://www.66ip.cn/',
    }
    # Listing pages 1.html .. 49.html of 66ip.cn.
    for i in range(1, 50):
        request_item = "http://www.66ip.cn/" + str(i) + ".html"
        request_list.append(request_item)
    for req_id in request_list:
        req = urllib2.Request(req_id, headers=headers)
        response = urllib2.urlopen(req)
        html = response.read().decode('utf-8')
        ip_list = re.findall(r'\d+\.\d+\.\d+\.\d+', html)
        port_list = re.findall(r'<td>\d+</td>', html)
        for i in range(len(ip_list)):
            total_proxy += 1
            ip = ip_list[i]
            port = re.sub(r'<td>|</td>', '', port_list[i])
            proxy = '%s:%s' % (ip, port)
            proxy_list.append(proxy)
    return proxy_list


def get_proxy_ip3():
    global proxy_list
    global total_proxy
    request_list = []
    headers = {
        # 'Host': 'http://www.youdaili.net/',
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        # 'Accept': r'application/json, text/javascript, */*; q=0.01',
        # 'Referer': r'http://www.youdaili.net/',
    }
    # for i in range(2, 6):
    #     request_item = "http://www.youdaili.net/Daili/QQ/36811_" + str(i) + ".html"
    # Fetch the index page and collect the article ids of the proxy listings.
    first = "http://www.youdaili.net/Daili/guowai/"
    req = urllib2.Request(first, headers=headers)
    response = urllib2.urlopen(req)
    html = response.read().decode('utf-8')
    request_id = re.findall(r'href="http://www.youdaili.net/Daili/guowai/(\d+).html', html)
    print request_id
    for item in request_id:
        request_item = "http://www.youdaili.net/Daili/guowai/" + str(item) + ".html"
        request_list.append(request_item)
    print request_list
    # request_item = "http://www.youdaili.net/Daili/guowai/4821.html"
    # request_list.append(request_item)
    # for i in range(2, 6):
    #     request_item = "http://www.youdaili.net/Daili/QQ/36811_" + str(i) + ".html"
    #     request_list.append(request_item)
    for req_id in request_list:
        req = urllib2.Request(req_id, headers=headers)
        response = urllib2.urlopen(req)
        html = response.read().decode('utf-8')
        ip_list = re.findall(r'\d+\.\d+\.\d+\.\d+', html)
        port_list = re.findall(r':+\d*@HTTP#', html)
        for i in range(len(ip_list) - 20):
            total_proxy += 1
            ip = ip_list[i]
            print port_list[i]
            tmp = re.sub(r':', '', port_list[i])
            port = re.sub(r'@HTTP#', '', tmp)
            proxy = '%s:%s' % (ip, port)
            print port
            proxy_list.append(proxy)
    print proxy_list
    return proxy_list


def create_blog_list():
    list1 = []
    list2 = []
    global blog_list
    global total_blog
    pr = r'href="http://www.cnblogs.com/zpfbuaa/p/(\d+)'
    rr = re.compile(pr)
    my_blog = []
    # my_blog = ['http://www.cnblogs.com/zpfbuaa/p/?page=1', 'http://www.cnblogs.com/zpfbuaa/p/?page=2',
    #            'http://www.cnblogs.com/zpfbuaa/p/?page=3']
    for i in range(1, 25):
        blogitem = "http://www.cnblogs.com/zpfbuaa/p/?page=" + str(i)
        my_blog.append(blogitem)
    for item in my_blog:
        req = urllib2.Request(item)
        # proxy_ip = random.choice(proxyIP.proxy_list)  # pick a random ip from proxy_list
        # print proxy_ip
        # proxy_support = urllib2.ProxyHandler(proxy_ip)
        # opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
        req.add_header("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)")
        cn = urllib2.urlopen(req)
        f = cn.read()
        list2 = re.findall(rr, f)
        list1 = list1 + list2
        cn.close()
    for blog_key in list1:
        total_blog = total_blog + 1
        url = 'http://www.cnblogs.com/zpfbuaa/p/' + blog_key + ".html"
        blog_list.append(url)


def main():
    create_blog_list()
    # get_proxy_ip()
    # get_proxy_ip2()
    get_proxy_ip3()


def sorry_to_visit_you(url):
    global total_visit
    global proxy_list
    global total_proxy
    global total_remove
    proxy_ip = random.choice(proxy_list)
    user_agent = random.choice(user_agent_list)
    print proxy_ip
    print user_agent
    proxy_support = urllib2.ProxyHandler({'http': proxy_ip})
    opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
    urllib2.install_opener(opener)
    req = urllib2.Request(url)
    req.add_header("User-Agent", user_agent)
    try:
        c = urllib2.urlopen(req, timeout=10)
    except Exception as e:
        # Drop the proxy that failed so it is not picked again.
        proxy_list.remove(proxy_ip)
        total_proxy -= 1
        total_remove += 1
        print "removed proxy ip:"
        print proxy_ip
        print "proxies removed so far: %d" % total_remove
        print('****** open failed! ******')
    else:
        total_visit += 1
        print('OK! %d successful visits in total!' % total_visit)
        print "current url being visited: %s" % url
        print "proxies remaining: %d" % total_proxy
    sem.release()


if __name__ == "__main__":
    main()
    print "start boosting page views!"
    print "total proxies: %s" % total_proxy
    print "total posts: %s" % total_blog
    max_thread = 5
    sem = threading.BoundedSemaphore(max_thread)
    while 1:
        for blog_url in blog_list:
            sem.acquire()
            T = threading.Thread(target=sorry_to_visit_you, args=(blog_url,))
            T.start()
```