urllib

Study notes on the urllib library. urllib is a package that bundles several modules for working with URLs:

urllib.request: opens and reads URLs
urllib.error: contains the exceptions raised by urllib.request
urllib.parse: parses URLs
urllib.robotparser: parses robots.txt files (see the sketch just below)
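urllib.robotparser gets no example later in these notes; a minimal sketch of its use (the robots.txt URL is just an illustration):

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://www.baidu.com/robots.txt')
rp.read()  # fetch and parse robots.txt
# May a crawler with this user agent fetch this path?
print(rp.can_fetch('*', 'http://www.baidu.com/baidu'))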
urllib.request
urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
url: the URL to open
data: data submitted with a POST request
timeout: timeout, in seconds, for accessing the site

Basic usage: GET request
import urllib.request

response = urllib.request.urlopen('http://www.baidu.com')
# response.read() --> bytes
print(response.read().decode('utf-8'))
POST request

# 1. Encode the data
import urllib.parse
data = bytes(urllib.parse.urlencode({'hello': 'word'}), encoding='utf-8')
# encoded result: b'hello=word'

# 2. Send the request with urllib.request.urlopen()
response = urllib.request.urlopen('http://www.httpbin.org/post', data=data)
response._method  # request method: 'POST'
response.url      # request URL: 'http://www.httpbin.org/post'
Setting a timeout

import urllib.request

response = urllib.request.urlopen('http://httpbin.org/get', timeout=1)
print(response.read())

import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
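urllib.error, listed at the top, also distinguishes HTTP error responses from connection-level failures; a minimal sketch, assuming httpbin's /status/404 endpoint for the demonstration (HTTPError is a subclass of URLError, so it must be caught first):

import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('http://httpbin.org/status/404')
except urllib.error.HTTPError as e:  # the server answered with an error status
    print(e.code, e.reason)          # e.g. 404 NOT FOUND
except urllib.error.URLError as e:   # no connection, bad hostname, timeout, ...
    print(e.reason)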
Response
print(dir(response))
['__abstractmethods__', '__class__', '__del__', '__delattr__', '__dict__', '__dir__', '__doc__',
 '__enter__', '__eq__', '__exit__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__',
 '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__',
 '__next__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__',
 '__subclasshook__', '_abc_cache', '_abc_negative_cache', '_abc_negative_cache_version', '_abc_registry',
 '_checkClosed', '_checkReadable', '_checkSeekable', '_checkWritable', '_check_close', '_close_conn',
 '_get_chunk_left', '_method', '_peek_chunked', '_read1_chunked', '_read_and_discard_trailer',
 '_read_next_chunk_size', '_read_status', '_readall_chunked', '_readinto_chunked', '_safe_read',
 '_safe_readinto', 'begin', 'chunk_left', 'chunked', 'close', 'closed', 'code', 'debuglevel', 'detach',
 'fileno', 'flush', 'fp', 'getcode', 'getheader', 'getheaders', 'geturl', 'headers', 'info', 'isatty',
 'isclosed', 'length', 'msg', 'peek', 'read', 'read1', 'readable', 'readinto', 'readinto1', 'readline',
 'readlines', 'reason', 'seek', 'seekable', 'status', 'tell', 'truncate', 'url', 'version', 'will_close',
 'writable', 'write', 'writelines']
response.status       # 200
response.getcode()    # 200
response.code         # 200
response.url          # 'http://www.baidu.com'
response._method      # 'GET'

response.getheaders()
[('Access-Control-Allow-Credentials', 'true'),
 ('Access-Control-Allow-Origin', '*'),
 ('Content-Type', 'application/json'),
 ('Date', 'Fri, 14 Jun 2019 02:33:18 GMT'),
 ('Referrer-Policy', 'no-referrer-when-downgrade'),
 ('Server', 'nginx'),
 ('X-Content-Type-Options', 'nosniff'),
 ('X-Frame-Options', 'DENY'),
 ('X-XSS-Protection', '1; mode=block'),
 ('Content-Length', '226'),
 ('Connection', 'Close')]

response.getheader('Server')  # 'nginx'
Request
Source
class Request:
    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False,
                 method=None):
        ...

In [3]: url = 'https://www.baidu.com/'

In [4]: req = urllib.request.Request(url=url)

In [5]: dir(req)
Out[5]:
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__',
 '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__',
 '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__',
 '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_data', '_full_url', '_parse',
 '_tunnel_host', 'add_header', 'add_unredirected_header', 'data', 'fragment', 'full_url',
 'get_full_url', 'get_header', 'get_method', 'has_header', 'has_proxy', 'header_items', 'headers',
 'host', 'origin_req_host', 'remove_header', 'selector', 'set_proxy', 'type', 'unredirected_hdrs',
 'unverifiable']
data: the POST data to send
headers={}: the request headers to build

import urllib.request
import urllib.parse

url = 'http://httpbin.org/post'

# Request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
headers['Host'] = 'httpbin.org'

# data must be bytes
dict = {'name': 'Germey'}
data = urllib.parse.urlencode(dict).encode('utf-8')

# Instantiate the request object with the parameters
request = urllib.request.Request(url=url, data=data, headers=headers, method='POST')
print(request)
# <urllib.request.Request object at 0x000002404A9689E8>

##############################################################
# The code above only builds the request object; nothing has been sent yet
##############################################################

# Send the request object; urlopen returns the response object
response = urllib.request.urlopen(request)
print(response)
# <http.client.HTTPResponse object at 0x000002404AFBC358>
# Adding headers
from urllib import request, parse

url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Host': 'httpbin.org'
}
# Build the POST form
dict = {
    'name': 'Germey'
}
data = bytes(parse.urlencode(dict), encoding='utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
Adding headers with the add_header method

import urllib.request

req = urllib.request.Request('http://www.example.com/')
req.add_header('Referer', 'http://www.python.org/')
# Customize the default User-Agent header value:
req.add_header('User-Agent', 'urllib-example/0.1 (Contact: . . .)')
r = urllib.request.urlopen(req)
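Headers set this way can be read back through the Request API seen in dir(req) above; a short sketch:

import urllib.request

req = urllib.request.Request('http://www.example.com/')
req.add_header('Referer', 'http://www.python.org/')
print(req.get_header('Referer'))  # 'http://www.python.org/'
print(req.header_items())         # [('Referer', 'http://www.python.org/')]
print(req.get_method())           # 'GET' (it becomes 'POST' once data is given)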
Cookie

import http.cookiejar, urllib.request

cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
for item in cookie:
    print(item.name + "=" + item.value)

# Save cookies to a text file
import http.cookiejar, urllib.request

filename = "cookie.txt"
# Several on-disk formats are available
## Format 1
cookie = http.cookiejar.MozillaCookieJar(filename)
## Format 2
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
cookie.save(ignore_discard=True, ignore_expires=True)  # actually write the file

# Read the file back with the matching jar type
import http.cookiejar, urllib.request

cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
Proxy

from urllib import request

url = 'http://httpbin.org/ip'
proxy = {'http': '218.18.232.26:80', 'https': '218.18.232.26:80'}
# Create the proxy handler
proxies = request.ProxyHandler(proxy)
# Create the opener object
opener = request.build_opener(proxies)
resp = opener.open(url)
print(resp.read().decode())
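build_opener accepts several handlers at once, and install_opener makes the result the default for plain urlopen() calls; a sketch reusing the (illustrative) proxy address above:

import http.cookiejar
import urllib.request

cookie = http.cookiejar.CookieJar()
proxy = {'http': '218.18.232.26:80'}  # illustrative address, same as above

opener = urllib.request.build_opener(
    urllib.request.HTTPCookieProcessor(cookie),
    urllib.request.ProxyHandler(proxy),
)
urllib.request.install_opener(opener)  # urlopen() now routes through this opener
response = urllib.request.urlopen('http://httpbin.org/ip')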
urllib.parse
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]
urlparse
urlunparse
quote/quote_plus
unquote/unquote_plus
urljoin
urlencode
parse_qs/parse_qsl
URL parsing: urlparse
In [8]: from urllib.parse import urlparse

In [9]: o = urlparse('https://docs.python.org/3/library/urllib.parse.html')
'''
Splits the URL into six parts, returned as a 6-item named tuple:
    scheme   : protocol
    netloc   : network location
    path     : path
    params   : parameters
    query    : query string
    fragment : fragment
The output looks like this:
'''

In [10]: o
Out[10]: ParseResult(scheme='https', netloc='docs.python.org', path='/3/library/urllib.parse.html', params='', query='', fragment='')

In [11]: dir(o)
Out[11]:
['__add__', '__class__', '__contains__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__',
 '__ge__', '__getattribute__', '__getitem__', '__getnewargs__', '__gt__', '__hash__', '__init__',
 '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__mul__', '__ne__',
 '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmul__', '__setattr__', '__sizeof__',
 '__slots__', '__str__', '__subclasshook__', '_asdict', '_encoded_counterpart', '_fields', '_hostinfo',
 '_make', '_replace', '_source', '_userinfo', 'count', 'encode', 'fragment', 'geturl', 'hostname',
 'index', 'netloc', 'params', 'password', 'path', 'port', 'query', 'scheme', 'username']

In [12]: o.path
Out[12]: '/3/library/urllib.parse.html'

In [13]: o.scheme
Out[13]: 'https'

In [14]: o.geturl()
Out[14]: 'https://docs.python.org/3/library/urllib.parse.html'

from urllib import parse

url = "https://docs.python.org/3.5/library/urllib.parse.html?highlight=parse#module-urllib.parse"
result = parse.urlparse(url)
print(result.query)                    # get the query string of the result
print(parse.parse_qs(result.query))    # convert it to a dict
print(parse.parse_qsl(result.query))   # convert it to a list of pairs
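__all__ above also exports urlsplit, which works like urlparse but returns five parts, leaving params inside path; a quick comparison:

from urllib.parse import urlsplit

r = urlsplit('https://docs.python.org/3/library/urllib.parse.html?highlight=parse')
print(r)
# SplitResult(scheme='https', netloc='docs.python.org',
#             path='/3/library/urllib.parse.html',
#             query='highlight=parse', fragment='')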
URL reconstruction: urlunparse
In [15]: o
Out[15]: ParseResult(scheme='https', netloc='docs.python.org', path='/3/library/urllib.parse.html', params='', query='', fragment='')

In [16]: from urllib.parse import urlunparse

In [17]: urlunparse(o)
Out[17]: 'https://docs.python.org/3/library/urllib.parse.html'

# a plain list works too: list(o)
In [18]: urlunparse(list(o))
Out[18]: 'https://docs.python.org/3/library/urllib.parse.html'
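urljoin appears in __all__ above but is not demonstrated elsewhere in these notes; a minimal sketch:

from urllib.parse import urljoin

print(urljoin('https://docs.python.org/3/library/', 'urllib.parse.html'))
# https://docs.python.org/3/library/urllib.parse.html
print(urljoin('https://docs.python.org/3/library/urllib.html', '/3/glossary.html'))
# https://docs.python.org/3/glossary.html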
Query parsing: parse_qs/parse_qsl
In [52]: parse_qs('https://i.cnblogs.com/EditPosts.aspx?opt=1')
Out[52]: {'https://i.cnblogs.com/EditPosts.aspx?opt': ['1']}

In [53]: parse_qsl('https://i.cnblogs.com/EditPosts.aspx?opt=1')
Out[53]: [('https://i.cnblogs.com/EditPosts.aspx?opt', '1')]
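As the odd keys above show, both functions expect the query string alone, not a whole URL; combined with urlparse they give the intended result:

from urllib.parse import urlparse, parse_qs, parse_qsl

query = urlparse('https://i.cnblogs.com/EditPosts.aspx?opt=1').query
print(parse_qs(query))   # {'opt': ['1']}
print(parse_qsl(query))  # [('opt', '1')]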
URL quoting: quote/unquote
Help on function quote in module urllib.parse:

quote(string, safe='/', encoding=None, errors=None)
    quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted. RFC 2396
    Uniform Resource Identifiers (URI): Generic Syntax lists the following
    reserved characters.

    reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | ","

    Each of these characters is reserved in some component of a URL, but
    not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL. Thus, it will not encode '/'. This character is
    reserved, but in typical usage the quote function is being called on
    a path where the existing slash characters are used as reserved
    characters.

    string and safe may be either str or bytes objects. encoding and
    errors must not be specified if string is a bytes object.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method. By
    default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).

In [26]: search = '搜索內容'

In [27]: quote(search)
Out[27]: '%E6%90%9C%E7%B4%A2%E5%86%85%E5%AE%B9'
quote vs quote_plus (reversed by unquote/unquote_plus)
In [41]: from urllib import parse

In [42]: parse.quote('a&b/c')
Out[42]: 'a%26b/c'      # the slash is not encoded

In [43]: parse.quote_plus('a&b/c')
Out[43]: 'a%26b%2Fc'    # the slash is encoded
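The inverse functions differ in the same way: unquote leaves '+' untouched while unquote_plus turns it back into a space; a short sketch:

from urllib import parse

print(parse.unquote('a%26b%2Fc'))     # a&b/c
print(parse.unquote('a+b%20c'))       # a+b c  ('+' is kept as-is)
print(parse.unquote_plus('a+b%20c'))  # a b c  ('+' becomes a space)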
URL encoding: urlencode
In [44]: query = {
    ...:     'name': 'Lee',
    ...:     'age': 19,
    ...: }

In [45]: type(query)
Out[45]: dict

In [46]: parse.urlencode(query)
Out[46]: 'name=Lee&age=19'
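When a value is itself a sequence, urlencode needs doseq=True to expand it into repeated parameters; a short sketch:

from urllib import parse

print(parse.urlencode({'tag': ['python', 'urllib']}))
# tag=%5B%27python%27%2C+%27urllib%27%5D  (the list is quoted as one literal)
print(parse.urlencode({'tag': ['python', 'urllib']}, doseq=True))
# tag=python&tag=urllib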
GET request example

>>> import urllib.request
>>> import urllib.parse
>>> params = urllib.parse.urlencode({'spam': 1, 'eggs': 2, 'bacon': 0})
>>> url = "http://www.musi-cal.com/cgi-bin/query?%s" % params
>>> with urllib.request.urlopen(url) as f:
...     print(f.read().decode('utf-8'))
POST request example

>>> import urllib.request
>>> import urllib.parse
>>> data = urllib.parse.urlencode({'spam': 1, 'eggs': 2, 'bacon': 0})
>>> data = data.encode('ascii')
>>> with urllib.request.urlopen("http://requestb.in/xrbl82xr", data) as f:
...     print(f.read().decode('utf-8'))
References

Official docs: https://docs.python.org/3/library/urllib.request.html
CPython source: https://github.com/python/cpython/blob/3.7/Lib/urllib/request.py
Blogger Coder: https://www.cnblogs.com/zhaof/p/6910871.html
Blogger Hoptop: https://www.jianshu.com/u/9ea40b5f607a
Blogger 支付寶: http://www.pianshen.com/article/2667231307/
腳本之家 (jb51): https://www.jb51.net/article/128540.htm