Urllib is Python's built-in HTTP request library. It comprises four modules: urllib.request (opening and reading URLs), urllib.error (the exceptions raised by urllib.request), urllib.parse (URL parsing), and urllib.robotparser (robots.txt parsing).
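At a glance, a minimal orientation sketch of the four modules (the parse call at the end is just illustrative):

from urllib import request      # opening and reading URLs
from urllib import error        # exceptions raised by urllib.request
from urllib import parse        # splitting and building URLs
from urllib import robotparser  # parsing robots.txt files

print(parse.urlparse('http://httpbin.org/get'))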
1. urlopen() in urllib.request
import urllib.request

response = urllib.request.urlopen('http://www.baidu.com')
print(response.read().decode('utf8'))
response = urllib.request.urlopen('http://httpbin.org/get', timeout=1)  # set a timeout
print(response.read())
import urllib.request
import urllib.error
import socket

try:
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
2. The response
import urllib.request
response = urllib.request.urlopen('http://www.baidu.com')
print(type(response)) # <class 'http.client.HTTPResponse'>
import urllib.request

response = urllib.request.urlopen('http://www.python.org')
print(response.status)  # 200
print(response.getheaders())
print(response.getheader('Server'))  # nginx
3. urllib.request.Request()
from urllib import request, parse

url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Host': 'httpbin.org'
}
params = {
    'name': 'Germey'
}
data = bytes(parse.urlencode(params), encoding='utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')  # a Request object
response = request.urlopen(req)
print(response.read().decode('utf8'))
With request.Request() you can flexibly construct both the content and the type of the request.
Headers can also be added with the .add_header() method:
from urllib import request, parse

url = 'http://httpbin.org/post'
params = {
    'name': 'Germey'
}
data = bytes(parse.urlencode(params), encoding='utf8')
req = request.Request(url=url, data=data, method='POST')  # a Request object
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) ')
response = request.urlopen(req)
print(response.read().decode('utf8'))
4. Proxies with urllib.request.ProxyHandler()
import urllib.request

proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743',
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://httpbin.org/get')
print(response.read())
5. Cookies
import http.cookiejar
import urllib.request

cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)  # built via an opener, same pattern as the proxy handler
response = opener.open('http://www.baidu.com')
for item in cookie:
    print(item.name + '=' + item.value)
'''
BAIDUID=A980763F2538BCB3FDA9E5BC979758CB:FG=1
BIDUPSID=A980763F2538BCB3FDA9E5BC979758CB
H_PS_PSSID=1453_26909_21094_18559_26350
PSTM=1533972705
BDSVRTM=0
BD_HOME=0
delPer=0
'''
Saving cookies to a text file:
import http.cookiejar
import urllib.request

filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)
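A saved cookie file can later be read back with load() and attached to a new opener; a minimal sketch, assuming the cookie.txt produced above exists:

import http.cookiejar
import urllib.request

# load cookies previously saved in Mozilla format
cookie = http.cookiejar.MozillaCookieJar()
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.status)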
6. Exception handling
from urllib import request, error

try:
    response = request.urlopen('http://cuiqingcai/index.html')
except error.URLError as e:
    print(e.reason)  # [Errno 11004] getaddrinfo failed
from urllib import request, error

try:
    response = request.urlopen('http://cuiqingcai/index.html')
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)  # [Errno 11004] getaddrinfo failed
else:
    print('Request successful')
Catch HTTPError first, then URLError: HTTPError is a subclass of URLError, so the more specific exception must come first.
7. URL parsing
from urllib.parse import urlparse

result = urlparse('https://www.suning.com/?vip_frm=super_nav_vip')
print(type(result), result)
'''
<class 'urllib.parse.ParseResult'>
ParseResult(scheme='https', netloc='www.suning.com', path='/', params='', query='vip_frm=super_nav_vip', fragment='')
'''
# splits the URL into its standard components
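ParseResult is a named tuple, so each component can be read by attribute or by index; a short usage sketch:

from urllib.parse import urlparse

result = urlparse('https://www.suning.com/?vip_frm=super_nav_vip')
print(result.scheme)  # https
print(result.netloc)  # www.suning.com
print(result.query)   # vip_frm=super_nav_vip
print(result[0])      # https -- index access works too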
8. urlencode() converts a dict into GET request parameters
from urllib.parse import urlencode
params = {
'name':'germay',
'age':'12'
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url) #http://www.baidu.com?name=germay&age=12
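For the reverse direction, urllib.parse also provides parse_qs, which turns a query string back into a dict; a minimal sketch:

from urllib.parse import parse_qs

query = 'name=germay&age=12'
print(parse_qs(query))  # {'name': ['germay'], 'age': ['12']}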
Compared with urllib, the requests library is far more convenient. A quick first taste:
import requests

response = requests.get('http://www.baidu.com')
print(response)  # <Response [200]>
print(response.status_code)  # 200
print(type(response.text), response.text)  # <class 'str'> ...
print(response.cookies)  # <RequestsCookieJar[<Cookie BDORZ=27315 for .baidu.com/>]>
1. The various request methods in requests
import requests

print(requests.get('http://httpbin.org/get'))
print(requests.post('http://httpbin.org/post'))
print(requests.delete('http://httpbin.org/delete'))
print(requests.put('http://httpbin.org/put'))
print(requests.options('http://httpbin.org/get'))
print(requests.head('http://httpbin.org/get'))
2. GET requests
import requests

response = requests.get('http://httpbin.org/get')
print(response.text)
'''
{
  "args": {},
  "headers": {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "close",
    "Host": "httpbin.org",
    "User-Agent": "python-requests/2.18.4"
  },
  "origin": "113.59.106.145",
  "url": "http://httpbin.org/get"
}
'''
3. GET requests with parameters
import requests

response = requests.get('http://httpbin.org/get?name=germay&age=22')
print(response.text)
'''
{
  "args": {
    "age": "22",
    "name": "germay"
  },
  "headers": {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "close",
    "Host": "httpbin.org",
    "User-Agent": "python-requests/2.18.4"
  },
  "origin": "113.59.106.145",
  "url": "http://httpbin.org/get?name=germay&age=22"
}
'''
import requests

# the same query, passed as a dict via params instead of hand-building the URL
data = {
    'name': 'germay',
    'age': 22
}
response = requests.get('http://httpbin.org/get', params=data)
print(response.text)
4. JSON parsing
import requests
import json

response = requests.get('http://httpbin.org/get')
print(response.json())
print(json.loads(response.text))
'''
{'args': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Connection': 'close', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.18.4'}, 'origin': '113.59.106.145', 'url': 'http://httpbin.org/get'}
{'args': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Connection': 'close', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.18.4'}, 'origin': '113.59.106.145', 'url': 'http://httpbin.org/get'}
'''
response.json() and json.loads(response.text) produce the same result.
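One caveat worth a side example: if the body is not valid JSON, response.json() raises a ValueError; a minimal sketch:

import requests

response = requests.get('http://www.baidu.com')  # returns HTML, not JSON
try:
    print(response.json())
except ValueError:
    print('response body is not valid JSON')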
5. Fetching binary data
import requests

response = requests.get('https://github.com/favicon.ico')
print(type(response.text))     # <class 'str'>
print(type(response.content))  # <class 'bytes'>
.text is of type str; .content is of type bytes.
import requests

response = requests.get('https://github.com/favicon.ico')
print(type(response.text))     # <class 'str'>
print(type(response.content))  # <class 'bytes'>
with open('favicon.ico', 'wb') as f:
    f.write(response.content)
6. Adding headers
Result of scraping Zhihu without headers:
import requests

response = requests.get('https://www.zhihu.com/explore')
print(response.text)
'''
<html>
<head><title>400 Bad Request</title></head>
<body bgcolor="white">
<center><h1>400 Bad Request</h1></center>
<hr><center>openresty</center>
</body>
</html>
'''
With headers, scraping Zhihu returns the complete page:
import requests

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
response = requests.get('https://www.zhihu.com/explore', headers=headers)
print(response.text)
So when scraping web pages, adding headers is essential; without them you are very likely to be blocked.
7. The response
Response attributes:
import requests

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
response = requests.get('https://www.zhihu.com/explore', headers=headers)
print(response.status_code)  # 200
print(response.headers)
print(response.cookies)
print(response.url)  # https://www.zhihu.com/explore
print(response.history)
8. File upload
import requests

files = {'file': open('favicon.ico', 'rb')}
response = requests.post('http://httpbin.org/post', files=files)
print(response.text)
9. Getting cookies
import requests

response = requests.get('https://www.baidu.com')
print(response.cookies)
for k, v in response.cookies.items():
    print(k + '=' + v)
10. Session persistence
import requests

s = requests.Session()
s.get('http://httpbin.org/cookies/set/number/1122112')
response = s.get('http://httpbin.org/cookies')
print(response.text)
'''
{
  "cookies": {
    "number": "1122112"
  }
}
'''
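For contrast, two independent requests.get() calls do not share cookies, so the same probe without a Session comes back empty; a minimal sketch:

import requests

requests.get('http://httpbin.org/cookies/set/number/1122112')
response = requests.get('http://httpbin.org/cookies')
print(response.text)  # {"cookies": {}} -- the cookie was not kept across calls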
11. Certificate verification
import requests

response = requests.get('https://www.12306.cn')  # raises requests.exceptions.SSLError
print(response.status_code)
The code above raises an SSLError when accessing 12306.
Adding verify=False allows the page to be fetched normally:
import requests
from requests.packages import urllib3

urllib3.disable_warnings()  # suppress the InsecureRequestWarning
response = requests.get('https://www.12306.cn', verify=False)
print(response.status_code)  # 200
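Instead of disabling verification entirely, requests can also be pointed at a local certificate via the cert parameter; a sketch where the file paths are placeholders you would replace with real files:

import requests

# '/path/server.crt' and '/path/server.key' are placeholder paths
response = requests.get('https://www.12306.cn',
                        cert=('/path/server.crt', '/path/server.key'))
print(response.status_code)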
12. Proxies
import requests

proxy = {
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743',
}
response = requests.get('https://www.taobao.com', proxies=proxy)
print(response.status_code)
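If the proxy requires authentication, the credentials can be embedded in the proxy URL; a sketch where user and password are placeholders:

import requests

proxy = {
    'http': 'http://user:password@127.0.0.1:9743',  # placeholder credentials
}
response = requests.get('https://www.taobao.com', proxies=proxy)
print(response.status_code)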
13. Timeout settings
import requests

response = requests.get('https://www.taobao.com', timeout=1)
print(response.status_code)
14. Authentication
Sites that require a login need authentication handling:
import requests

response = requests.get('https://127.27.34.24:9001', auth=('user', '123'))
print(response.status_code)
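The (user, password) tuple is shorthand for HTTP Basic auth; the explicit equivalent uses requests.auth.HTTPBasicAuth:

import requests
from requests.auth import HTTPBasicAuth

response = requests.get('https://127.27.34.24:9001', auth=HTTPBasicAuth('user', '123'))
print(response.status_code)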
15. Exception handling
import requests
from requests.exceptions import ReadTimeout, HTTPError, RequestException

try:
    response = requests.get('https://www.taobao.com', timeout=1)
    print(response.status_code)
except ReadTimeout:
    print('Timeout')
except HTTPError:
    print('Http error')
except RequestException:
    print('error')
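As with urllib, order the handlers from specific to general: ReadTimeout and HTTPError are both subclasses of RequestException, so the catch-all must come last or it would swallow the specific cases.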