urllib是python內置的HTTP請求庫,使用它發送Request。它主要包含如下幾個基本模塊:urllib.request(發送請求)、urllib.error(異常處理)、urllib.parse(URL解析)和urllib.robotparser(robots.txt解析)。
雖然urllib庫是python的內置庫,可是仍然須要導入。導入後能夠使用urllib.request.urlopen()函數直接向服務器發送Request。Request中含有data數據時是POST請求,不然爲GET請求。詳細代碼以下:
# Basic urlopen() usage: a GET request, a POST request and a request timeout.
#
# Signature, for reference (url, data and timeout are the arguments used most;
# cafile, capath and cadefault were removed in Python 3.13 - use context):
#   urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None,
#                          capath=None, cadefault=False, context=None)

import socket
import urllib.error
import urllib.parse
import urllib.request

# GET request: no data argument is passed.
with urllib.request.urlopen('http://www.baidu.com') as response:
    # If utf-8 cannot decode the page, check the charset attribute near the
    # top of the page's <head> for the encoding to use instead.
    print(response.read().decode('utf-8'))

# POST request: the same call plus a bytes payload in data.
data = urllib.parse.urlencode({'word': 'hello'}).encode('utf8')
with urllib.request.urlopen('http://httpbin.org/post', data=data) as response:
    print(response.read())

# Timeout: give up when the server has not answered within 0.1 s.
try:
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    # A timeout while connecting or sending arrives wrapped in URLError.
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
except socket.timeout:
    # A timeout while waiting for the response is raised unwrapped.
    print('TIME OUT')
else:
    response.close()
urlopen()可以發送Request,可是沒法直接進行更多的設置,如設置請求頭等。這時候能夠先聲明一個Request對象,而後傳入相應的信息,最後將Request對象傳入給urlopen()。
# Send a POST request with custom headers by building a Request object first.
from urllib import parse, request

url = 'http://httpbin.org/post'

# Request headers.
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org',
}

# Form fields; named form_data so the builtin dict is not shadowed.
form_data = {'name': 'Germey'}

# The request body must be bytes, so url-encode the form and encode it.
data = parse.urlencode(form_data).encode('utf8')

req = request.Request(url=url, data=data, headers=headers, method='POST')

# A header missing from the constructor call can be added afterwards.
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')

with request.urlopen(req) as response:
    print(response.read().decode('utf-8'))
對於服務器發送的響應體,咱們能夠獲取其類型、狀態碼和響應頭。
# Inspect the object returned by urlopen(): type, status, headers and body.
import urllib.request

# The with-block closes the connection once the response has been read.
with urllib.request.urlopen('https://www.python.org') as response:
    print(type(response))                   # type of the response object
    print(response.status)                  # HTTP status code
    print(response.getheaders())            # all response headers
    print(response.getheader('Server'))     # one header, looked up by name
    print(response.read().decode('utf-8'))  # response body
除了正常的Request內容以外,urllib提供不少附加功能,一般使用handler實現。
設置代理須要首先建立ProxyHandler,再將其構建爲一個opener,使用open()方法打開。上文中urlopen()內部一樣是構建一個opener,而後使用open()打開網頁。
# Route requests through a proxy by building an opener around ProxyHandler.
import urllib.request

# Map each URL scheme to the proxy that should carry it.
proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743',
})
opener = urllib.request.build_opener(proxy_handler)

# opener.open() takes the place of urlopen(); close the response when done.
with opener.open('http://httpbin.org/get') as response:
    print(response.read())
Cookie用來維持網頁登錄狀態,用於爬取須要登錄的網站。常見的Cookie設置格式以下:
# Three cookie examples: print cookies, save them to a file, load them back.
import http.cookiejar
import urllib.request

# 1) Collect cookies in memory and print them.
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)  # handler that fills the jar
opener = urllib.request.build_opener(handler)
opener.open('http://www.baidu.com').close()
for item in cookie:
    print(item.name + "=" + item.value)

# 2) Save cookies to disk in the Mozilla (Netscape cookies.txt) format.
filename = "cookie.txt"
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
opener.open('http://www.baidu.com').close()
cookie.save(ignore_discard=True, ignore_expires=True)

# 3) Load the saved cookies and send them with a new request.
# A cookie file must be read by the same jar class that wrote it:
# LWPCookieJar uses a different file format and raises LoadError on the
# Mozilla-format file written above. Use LWPCookieJar for both save() and
# load() when the LWP format is wanted.
cookie = http.cookiejar.MozillaCookieJar()
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
with opener.open('http://www.baidu.com') as response:
    print(response.read().decode('utf-8'))
urllib.error中主要定義了兩種錯誤類,URLError和HTTPError(HTTPError是URLError的子類),常用try-except捕獲並判斷錯誤類型。
# Three error-handling examples built on urllib.error.
import socket
import urllib.error
import urllib.request
from urllib import error, request

# 1) URLError is the base class of the errors urllib raises for a request.
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.URLError as e:
    print(e.reason)

# 2) HTTPError is a subclass of URLError, so it has to be caught first.
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')

# 3) Telling a timeout apart from other failures.
try:
    response = urllib.request.urlopen('https://www.baidu.com', timeout=0.01)
except urllib.error.URLError as e:
    # A timeout while connecting or sending arrives wrapped in URLError.
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
except socket.timeout:
    # A timeout while waiting for the response is raised unwrapped.
    print('TIME OUT')
urllib.parse就像一個工具包,裏面有好多處理URL的功能。
# urlparse() splits a URL into scheme, netloc, path, params, query, fragment.
from urllib.parse import urlparse

# Split a complete URL into its six components.
full_result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(full_result), full_result)

# scheme= is only a default: it is used when the URL itself carries no
# scheme and is ignored otherwise.
default_scheme_result = urlparse(
    'www.baidu.com/index.html;user?id=5#comment', scheme='https'
)
print(default_scheme_result)

# allow_fragments=False stops '#...' from being split off; the text stays
# attached to the preceding component and fragment is left empty.
no_fragment_result = urlparse(
    'http://www.baidu.com/index.html;user?id=5#comment', allow_fragments=False
)
print(no_fragment_result)
# urlunparse() is the inverse of urlparse(): it assembles a URL from six parts.
from urllib.parse import urlunparse

# Order: scheme, netloc, path, params, query, fragment.
data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
url = urlunparse(data)
print(url)
# urljoin() resolves a second URL against a base URL: components present in
# the second URL win, and missing ones are taken from the base.
from urllib.parse import urljoin

# (base, url) pairs to resolve.
examples = [
    ('http://www.baidu.com', 'FAQ.html'),
    ('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'),
    ('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html'),
    ('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html?question=2'),
    ('http://www.baidu.com?wd=abc', 'https://cuiqingcai.com/index.php'),
    ('http://www.baidu.com', '?category=2#comment'),
    ('www.baidu.com', '?category=2#comment'),
    ('www.baidu.com#comment', '?category=2'),
]

joined = [urljoin(base, link) for base, link in examples]
for joined_url in joined:
    print(joined_url)
# urlencode() turns a dict into a query string for use in a GET URL.
from urllib.parse import urlencode

params = {
    'name': 'germey',
    'age': 22,
}
base_url = 'http://www.baidu.com?'

# Append the encoded query string to the base URL.
url = base_url + urlencode(params)
print(url)