urllib3是一個功能強大、易於使用(sanity-friendly)的Python HTTP客戶端,Python生態系統中的許多項目已經在使用urllib3。
sudo pip3 install urllib3
通過urllib3訪問網頁,首先需要構造一個PoolManager實例對象,用於處理連接池以及線程安全的所有細節,然後通過request()方法來發送請求。
import urllib3 #建立PoolManager對象,用於處理與線程池的鏈接以及線程安全的全部細節 http = urllib3.PoolManager() #對須要爬取的網頁發送請求 resp = http.request(method,url,...)
method和url兩個參數必選,而且method需要大寫。
import urllib3 #建立PoolManager對象,用於處理與線程池的鏈接以及線程安全的全部細節 http = urllib3.PoolManager() #對須要爬取的網頁發送請求 resp = http.request('GET','https://www.baidu.com/') print(resp.data.decode())#響應數據 print(resp.headers)#響應頭信息 print(resp.status)#狀態碼 resp.release_conn()#釋放這個http鏈接
可以傳入headers參數(dict類型)來增加請求頭中的headers信息。可以利用fields參數傳遞查詢參數(dict類型),注意url後面的'?'一定不能帶上。
import urllib3 http = urllib3.PoolManager() kw = {"wd":"長城"} headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"} resp = http.request('GET','https://www.baidu.com/s',fields=kw,headers = headers) print(resp.data.decode())
import urllib3 http = urllib3.PoolManager() #須要提交的數據 data = {'word':'hello'} headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"} response = http.request('POST','http://httpbin.org/post',fields = data,headers = headers) print(response.data.decode())
import urllib3 import json http = urllib3.PoolManager() url = 'https://openapi.vmall.com/mcp/offlineshop/getShopList' data = { "portal":2,"lang":"zh-CN","country":"CN","brand":1,"province":"山西","city":"太原","pageNo":1,"pageSize":20 } # 將字典類型數據序列化成json字符串 json_data = json.dumps(data) #headers中設置Content-Type爲application/json headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36', 'Content-Type':'application/json' } resp = http.request('POST',url,body = json_data,headers = headers) print(resp.data.decode())
注意:body參數和fields參數不能同時使用。
import urllib3 http = urllib3.PoolManager() #打開文件test.txt with open('test.txt','r') as f: file_data = f.read() headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"} #三元元組的方式傳入 resp = http.request('POST','http://httpbin.org/post',fields={'filefield':('test.txt',file_data,'text/plain')},headers=headers) #二元元組的方式傳入 #resp = http.request('POST','http://httpbin.org/post',fields={'filefield':('test.txt',file_data)},headers=headers) print(resp.data.decode('unicode_escape'))
import urllib3 http = urllib3.PoolManager() with open('test.jpg','rb') as f: binary_data = f.read() headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",'Content-Type':'image/jpeg'} resp = http.request('POST','http://httpbin.org/post',body = binary_data,headers = headers) print(resp.data.decode())
resp = http.request('GET','http://httpbin.org/delay/3',timeout = 4.0)
#設置總超時時間爲3.0秒,鏈接超時不超過1.5秒,讀取超時不超過2.0秒 resp = http.request('GET','http://httpbin.org/delay/3',timeout = urllib3.Timeout(total=3.0,connect=1.5,read=2.0))
http = urllib3.PoolManager(timeout = urllib3.Timeout(total=3.0,connect=1.5,read=2.0)) #http = urllib3.PoolManager(timeout = 4.0)
#設置請求重試次數10次 resp = http.request('GET','http://httpbin.org/ip',retries = 10)
#同時關閉請求重試和重定向 resp = http.request('GET','http://httpbin.org/redirect/1',retries = False)
#僅關閉重定向 resp = http.request('GET','http://httpbin.org/redirect/1',redirect = False)
resp = http.request('GET','http://httpbin.org/redirect/3',retries = urllib3.Retry(3,redirect = 2))
http = urllib3.PoolManager(retries = urllib3.Retry(3,redirect=2)) #http = urllib3.PoolManager(retries = False)
import urllib3 #建立ProxyManager對象 proxy_http = urllib3.ProxyManager('https://175.42.122.96:9999') headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"} #利用代理對須要爬取的網頁發送請求 resp = proxy_http.request('GET','https://www.baidu.com/',headers=headers) print(resp.data.decode())#響應數據 print(resp.headers)#響應頭信息 print(resp.status)#狀態碼 resp.release_conn()#釋放這個http鏈接
sudo pip3 install certifi
import urllib3 import certifi #開啓ssl證書自動驗證 http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',ca_certs=certifi.where()) headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"} resp = http.request('GET','https://www.baidu.com/',headers=headers) print(resp.data.decode())
import urllib3 import certifi http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',ca_certs='/etc/ssl/certs/ca-certificates.crt')