本文主要用到python3自帶的urllib模塊編寫輕量級的簡單爬蟲。至於怎麼定位一個網頁中具體元素的url可自行百度火狐瀏覽器的firebug插件或者谷歌瀏覽器的自帶方法。
一、訪問一個網址
re=urllib.request.urlopen('網址')
打開的也能夠是個urllib.request.Request對象,後邊也能夠跟數據參數,當有傳入數據時會自動變爲POST請求;
二、urllib.request.Request(url,data=None,headers={})對象屬性和方法
1 full_url 2 type 3 host 4 data 5 selector 6 method 7 get_method() 8 add_header(key,val) 9 add_unredirected_header(key,header) 10 has_header(header) 11 remove_header(header) 12 get_full_url() 13 set_proxy(host,type) 14 get_header(header_name,default=None)
15 header_items()
三、已鏈接對象的可用方法:
1 re.read() 讀取內容,想要將內容保存下來,需先新建一個相應格式的文件,再將讀取到的內容寫入到這個文件內便可; 2 re.geturl() 可取得已打開對象的url地址; 3 re.info() 可取得響應服務器的信息; 4 re.getcode() 可取得響應狀態碼; 5 urllib.parse.urlencode() 將一個存儲post數據的字典轉換成打開網頁所須要的數據格式;
可用json.loads()將文本轉換成鍵值對
可在傳地址時將header以一個字典數據的形式傳入,以隱藏本身的訪問方式;也可用re.add_header('') 的方式進行追加;
四、當知道一個文件的url時可用此方法直接下載保存到本地
urllib.request.urlretrieve('http://wx1.sinaimg.cn/mw600/9bbc284bgy1ffkuafn4xtj20dw0jgh08.jpg','bc.jpg')
五、登陸功能的實現(post)
(1)利用session保留登陸狀態
# Example: keep a logged-in state with a requests session (POST login).
# BUG FIX: the original listing never closed the login_data dict (missing '}'
# before the session line) and used Python 2 print syntax.
login_data = {
    '_xsrf': getXSRF(baseurl),   # anti-CSRF token scraped from the login page
    'password': password,
    'remember_me': 'true',
    'email': email,
}
session = requests.session()
content = session.post(url, headers=headers_base, data=login_data)
# verify=False disables TLS certificate verification (not recommended in production)
s = session.get("http://www.zhihu.com", verify=False)
print(s.text.encode('utf-8'))
(2)利用cookie進行登陸
# Example: log in with a cookie jar (legacy Python 2 urllib2/cookielib API).
post = {
    'ua': self.ua,
    'TPL_checkcode': '',
    'CtrlVersion': '1,0,0,7',
    'TPL_password': '',
}
# URL-encode the POST payload
postData = urllib.urlencode(post)
cookie = cookielib.LWPCookieJar()
cookieHandler = urllib2.HTTPCookieProcessor(cookie)
opener = urllib2.build_opener(cookieHandler, urllib2.HTTPHandler)
# First login attempt (to obtain the captcha): build the request
request = urllib2.Request(loginURL, postData, loginHeaders)
# BUG FIX: the original called self.opener.open(), but the opener is the local
# variable built above, not an instance attribute.
response = opener.open(request)
# Read the response body (the site serves GBK-encoded text)
content = response.read().decode('gbk')
網站經常使用的編碼方式有utf8,gbk,gb2312,gb18030等
六、代理的使用
同一個Ip設備在短期內訪問一個服務器次數過多會被服務器禁止訪問,因此不少時候咱們都須要用到代理來幫助咱們解決這個問題。方法以下:
# Route urllib requests through an HTTP proxy, e.g. {'http': '1.2.3.4:8080'}.
proxy_support = urllib.request.ProxyHandler({類型: 代理ip和端口號})
# TYPO FIXES: the original had 'opner' and 'proxy_suppoert'.
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)  # optional: make it the process-wide default
opener.open(url)  # or call the opener directly without installing it
注:如想實現更復雜的可以使用更全面的scrapy框架。
附:本身寫的一個驗證網上代理的有效性的爬蟲,此爬蟲先從網站上獲取代理的地址,而後使用這個代理來訪問百度,驗證是否能獲得百度的網頁,如能則將此代理地址保存。
# Proxy-validation crawler: xiciProxy scrapes proxy addresses from
# xicidaili.com list pages; ProxyCheck then fetches Baidu through each proxy
# and saves the ones that return the expected page.
import pickle
import re
import threading
import time
import urllib.request


class ProxyCheck(threading.Thread):
    """Thread that validates each proxy in *proxylist* against Baidu."""

    def __init__(self, proxylist):
        threading.Thread.__init__(self)
        self.proxylist = proxylist          # items: (scheme, ip, port, area)
        self.timeout = 5                    # per-request timeout, seconds
        self.test_url = 'http://www.baidu.com'
        self.test_str = '11000002000001'    # ICP licence number expected in Baidu's footer
        self.checkedProxyList = []          # working proxies, each with elapsed time appended

    def checkProxy(self):
        """Try every proxy against test_url; record working ones with their latency."""
        cookies = urllib.request.HTTPCookieProcessor()
        for proxy in self.proxylist:
            proxy_handler = urllib.request.ProxyHandler(
                {'http': r'%s://%s:%s' % (proxy[0], proxy[1], proxy[2])})
            opener = urllib.request.build_opener(cookies, proxy_handler)
            opener.addheaders = [('User-Agent',
                                  'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                                  '(KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36')]
            urllib.request.install_opener(opener)
            t1 = time.time()
            try:
                req = urllib.request.urlopen(self.test_url, timeout=self.timeout)
                result = req.read().decode('utf-8')
            except Exception:
                # narrowed from a bare "except:"; any network/decode failure
                # just means this proxy is unusable — skip it
                continue
            timeused = time.time() - t1
            if result.find(self.test_str) > 1:
                self.checkedProxyList.append(
                    (proxy[0], proxy[1], proxy[2], proxy[3], timeused))
                print((proxy[0], proxy[1], proxy[2], proxy[3], timeused))

    def sort(self):
        """Order validated proxies fastest-first.

        Python 3 replacement for the commented-out Python 2 cmp-based sort.
        """
        self.checkedProxyList.sort(key=lambda item: item[4])

    def save(self, filename):
        """Write results both as a TSV text file and as a pickle."""
        with open("%s.txt" % filename, 'w') as f:
            for proxy in self.checkedProxyList:
                f.write("{}\t{}:{}\t{}\t{}\n".format(*proxy))
        with open("%s.pickle" % filename, 'wb') as fb:
            pickle.dump(self.checkedProxyList, fb)

    def run(self):
        self.checkProxy()
        self.save("checked-50")


class xiciProxy:
    """Scrapes proxy listings from xicidaili.com list pages."""

    # One row of the listing table: IP, port, (skipped cell), area link, two
    # more skipped cells, then protocol. Dots in the IP are now escaped — the
    # original regex used bare '.', which matches any character.
    _ROW_REGEX = (r"<td>(\d+\.\d+\.\d+\.\d+)</td>\n.*?"
                  r"<td>(\d+)</td>\n.*?"
                  r"\n.*?"
                  r"<a href=.*?>(.*?)</a>\n.*?"
                  r"\n.*?"
                  r"\n.*?"
                  r"<td>(HTTPS?)</td>")

    def __init__(self):
        self.alllist = []

    def _parse(self, html):
        """Extract (protocol, ip, port, area) tuples from one listing page's HTML."""
        rows = re.findall(self._ROW_REGEX, html)
        return [(row[3], row[0], row[1], row[2]) for row in rows]

    def grep(self, url):
        """Fetch one listing page and return its proxies.

        BUG FIX: the original left the Request construction commented out and
        then passed the undefined name ``req`` to urlopen — a guaranteed
        NameError on every call.
        """
        req = urllib.request.Request(url)
        req.add_header('User-Agent',
                       'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36')
        response = urllib.request.urlopen(req)
        html = response.read().decode('utf-8')
        return self._parse(html)

    def save(self, filename):
        """Write the accumulated proxies as TSV text and as a pickle."""
        with open("%s.txt" % filename, 'w') as f:
            for proxy in self.alllist:
                f.write("{}\t{}:{}\t{}\n".format(*proxy))
        with open("%s.pickle" % filename, 'wb') as fb:
            pickle.dump(self.alllist, fb)

    def run(self):
        # Crawl list pages 51..1950, saving and resetting every 50 pages.
        for i in range(51, 1951):
            url = "http://www.xicidaili.com/nn/{}".format(i)
            print(url)
            self.alllist += self.grep(url)
            if i % 50 == 0:
                self.save("xiciproxy-{}".format(i))
                self.alllist = []


if __name__ == "__main__":
    # Guarded so importing this module no longer performs file and network I/O
    # as a side effect (the original ran this at module top level).
    with open("xiciproxy-50.pickle", "rb") as fb:
        proxylist = pickle.load(fb)
    ProxyCheck(proxylist).run()