1. The Urllib Library in Detail
I. What Is Urllib
Python's built-in HTTP request library. It consists of four modules:
urllib.request: the request module (simulates sending a request to a given URL)
urllib.error: the exception-handling module (if an error occurs, catch the exception, then retry or take other action so the program does not stop unexpectedly)
urllib.parse: the URL-parsing module (a utility module providing many URL-handling methods, e.g. splitting and joining)
urllib.robotparser: the robots.txt-parsing module (mainly used to parse a site's robots.txt file and determine which pages may be crawled and which may not)
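A minimal sketch touching all four modules at once (the URLs are placeholders, for illustration only):

import urllib.error
import urllib.parse
import urllib.request
import urllib.robotparser

# urllib.parse: build a query string onto a base URL
url = 'http://httpbin.org/get?' + urllib.parse.urlencode({'q': 'python'})

# urllib.robotparser: check robots.txt before crawling (assumes the site serves one)
rp = urllib.robotparser.RobotFileParser('http://httpbin.org/robots.txt')
rp.read()
print(rp.can_fetch('*', url))

# urllib.request + urllib.error: send the request and handle failures
try:
    response = urllib.request.urlopen(url, timeout=5)
    print(response.status)
except urllib.error.URLError as e:
    print('request failed:', e.reason)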
II. Changes from Python 2
Python 2:
import urllib2
response = urllib2.urlopen('http://www.baidu.com')
Python 3:
import urllib.request
response = urllib.request.urlopen('http://www.baidu.com')
III. Basic Usage
Urllib
urlopen
urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
Method 1
import urllib.request

response = urllib.request.urlopen('http://www.baidu.com')
print(response.read().decode('utf-8'))  # get the response body and display it with decode('utf-8')
Method 2
import urllib.request
import urllib.parse

data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf-8')
response = urllib.request.urlopen('http://httpbin.org/post', data=data)  # with data the request is sent as POST; without it, as GET
print(response.read())
Method 3
import urllib.request

response = urllib.request.urlopen('http://httpbin.org/get', timeout=1)
print(response.read())
Method 4
import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
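The remaining parameters in the urlopen signature above deal with HTTPS certificate verification. A minimal sketch of passing an SSL context (note that cafile, capath and cadefault are deprecated in recent Python versions in favour of context):

import ssl
import urllib.request

context = ssl.create_default_context()  # default certificate-verification settings
response = urllib.request.urlopen('https://www.python.org', context=context)
print(response.status)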
Responses
Response type
import urllib.request

response = urllib.request.urlopen('http://www.baidu.com')
print(type(response))  # <class 'http.client.HTTPResponse'>
Status code and response headers
import urllib.request

response = urllib.request.urlopen('http://www.python.org')
print(response.status)               # get the status code
print(response.getheaders())         # get all response headers
print(response.getheader('Server'))  # get one specific header, here 'Server' as an example
Request
Passing the URL to urlopen wrapped as a Request object
import urllib.request

request = urllib.request.Request('https://python.org')  # wrap the URL in a Request object
response = urllib.request.urlopen(request)              # passing the object to urlopen works just as well
print(response.read().decode('utf-8'))
Adding headers and form data when constructing a Request
from urllib import request, parse

url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org'
}
params = {
    'name': 'Germey'
}
data = bytes(parse.urlencode(params), encoding='utf-8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
The Request.add_header() method
from urllib import request, parse

url = 'http://httpbin.org/post'
params = {
    'name': 'Germey'
}
data = bytes(parse.urlencode(params), encoding='utf-8')
req = request.Request(url=url, data=data, method='POST')
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
Handler
Proxies
import urllib.request

# Build two proxy handlers, one with a proxy IP and one without
httpproxy_handler = urllib.request.ProxyHandler({"http": "127.0.0.1:9743"})
nullproxy_handler = urllib.request.ProxyHandler({})

# Define a proxy switch
proxySwitch = True

# Pass a proxy handler to urllib.request.build_opener() to create a custom opener object;
# choose the proxy mode according to the switch
if proxySwitch:
    opener = urllib.request.build_opener(httpproxy_handler)
else:
    opener = urllib.request.build_opener(nullproxy_handler)

request = urllib.request.Request("http://www.baidu.com/")

# Only requests sent with opener.open() use the custom proxy; urlopen() does not
response = opener.open(request)

# install_opener() applies the opener globally: afterwards every request,
# whether sent via opener.open() or urlopen(), uses the custom proxy
urllib.request.install_opener(opener)
# response = urllib.request.urlopen(request)

print(response.read())
Building a proxy handler with the chosen proxies
import urllib.request

# Build a proxy handler object with the chosen proxies
proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
})
opener = urllib.request.build_opener(proxy_handler)
request = urllib.request.Request("http://www.baidu.com")
response = opener.open(request)
print(response.read())
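If the proxy requires authentication, the credentials can be embedded in the proxy URL. A minimal sketch with placeholder credentials and address:

import urllib.request

# 'user', 'password' and the address are placeholders, for illustration only
proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://user:password@127.0.0.1:9743'
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://www.baidu.com')
print(response.status)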
Cookies: a mechanism for maintaining login state
Retrieving cookies
import http.cookiejar
import urllib.request

cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
for item in cookie:
    print(item.name + "=" + item.value)
Saving cookies to a text file
import http.cookiejar
import urllib.request

filename = "cookie.txt"
cookie = http.cookiejar.MozillaCookieJar(filename)  # MozillaCookieJar() is a subclass of CookieJar
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)  # MozillaCookieJar() provides a save() method that writes a txt file
Another cookie save format (method 2)
import http.cookiejar
import urllib.request

filename = "cookie.txt"
cookie = http.cookiejar.LWPCookieJar(filename)  # LWPCookieJar() is a subclass of CookieJar
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)  # LWPCookieJar() provides a save() method that writes a txt file
Reading back cookies saved with method 2 (LWPCookieJar())
import http.cookiejar
import urllib.request

cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
# Store the cookies in a text file, then load them back and attach them to the request;
# the page you get back is what you would see while logged in
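Loading a file saved in the Mozilla format (method 1) works the same way; a minimal sketch, assuming cookie.txt was written by MozillaCookieJar:

import http.cookiejar
import urllib.request

cookie = http.cookiejar.MozillaCookieJar()
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))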
URL Parsing
# urlparse: urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)
# splits a URL into its component parts
from urllib.parse import urlparse, urlunparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result), result)
# Output: <class 'urllib.parse.ParseResult'> ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')

# Specify a default scheme
result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)
# Output: ParseResult(scheme='https', netloc='', path='www.baidu.com/index.html', params='user', query='id=5', fragment='comment')

# If the URL already carries a scheme, that scheme is what gets split out
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)
# Output: ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')

# Anchor links: the allow_fragments parameter
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
print(result)
# The fragment is folded into the query: ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5#comment', fragment='')

# With no query present, the fragment is folded straight into the path
result = urlparse('http://www.baidu.com/index.html#comment', allow_fragments=False)
print(result)
# Output: ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html#comment', params='', query='', fragment='')

# -----------------------------------------------------------------------
# urlunparse: joins URL components back into a complete URL
data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))  # Output: http://www.baidu.com/index.html;user?a=6#comment

# -----------------------------------------------------------------------
# urljoin: fields present in the second URL override those in the first
from urllib.parse import urljoin
print(urljoin('http://www.baidu.com/about.html', 'https://cuiqincai.com/FAQ.html'))
# Output: https://cuiqincai.com/FAQ.html

# -----------------------------------------------------------------------
from urllib.parse import urlencode

params = {
    'name': 'germey',
    'age': 22
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)  # convert the dict into query parameters
print(url)  # Output: http://www.baidu.com?name=germey&age=22
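Not covered in the walkthrough above: urllib.parse also provides parse_qs, which goes the other way and turns a query string back into a dict. A short companion sketch:

from urllib.parse import parse_qs

query = 'name=germey&age=22'
print(parse_qs(query))  # {'name': ['germey'], 'age': ['22']}; values come back as lists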
Exception Handling
# from urllib import request, error  # needed for examples 1 and 2 below

# 1. Print the cause of the exception
# try:
#     response = request.urlopen('http://wyh.com/index.html')
# except error.URLError as e:
#     print(e.reason)  # print the cause of the exception, so the program keeps running normally

# 2. Which exceptions can be caught
# try:
#     response = request.urlopen('http://wyh.com/index.html')
# except error.HTTPError as e:  # HTTPError is the subclass exception
#     print(e.reason, e.code, e.headers, sep='\n')  # e.headers prints some response-header information
# except error.URLError as e:   # URLError is the parent-class exception
#     print(e.reason)
# else:
#     print('Request Successfully!')

# 3. Add a check on the cause
import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('http://www.baidu.com', timeout=0.01)
except urllib.error.URLError as e:
    print(type(e.reason))  # e.reason is an object, not just a string
    if isinstance(e.reason, socket.timeout):  # isinstance() checks whether the types match
        print('TIME OUT!')