#!/usr/bin/env python
# coding=utf-8
"""Notes on, and a small demonstration of, the Python 2 ``urllib2`` module.

urllib2 can be thought of as an enhanced version of urllib, but because
urllib provides functions that urllib2 lacks, urllib2 cannot completely
replace it.  The two are not interchangeable and have to be used together.

Differences between urllib and urllib2:

* urllib2 can modify request headers through the ``Request`` argument, so it
  can masquerade as a browser by changing the User-Agent.
* urllib provides ``urlencode`` for encoding parameters; when simulating a
  login that needs encoded parameters, only urllib can do it.
* urllib provides a family of functions such as ``urlretrieve`` and ``quote``
  that do not exist in urllib2.

urllib2 module reference: https://docs.python.org/2/library/urllib2.html

Module level:
    urllib2.urlopen(url[, data[, timeout[, cafile[, capath[, cadefault[, context]]]]])
    urllib2.install_opener(opener)
    urllib2.build_opener([handler, ...])
    exception urllib2.URLError
    exception urllib2.HTTPError

Request class:
    class urllib2.Request(url[, data][, headers][, origin_req_host][, unverifiable])
    Request.add_data(data)
    Request.get_method()
    Request.has_data()
    Request.get_data()
    Request.add_header(key, val)
    Request.add_unredirected_header(key, header)
    Request.has_header(header)
    Request.get_full_url()
    Request.get_type()
    Request.get_host()
    Request.get_selector()
    Request.get_header(header_name, default=None)
    Request.header_items()
    Request.set_proxy(host, type)
    Request.get_origin_req_host()
    Request.is_unverifiable()

OpenerDirector class:
    class urllib2.OpenerDirector
    OpenerDirector.add_handler(handler)
    OpenerDirector.open(url[, data][, timeout])
    OpenerDirector.error(proto[, arg[, ...]])

BaseHandler class:
    class urllib2.BaseHandler
    BaseHandler.add_parent(director)
    BaseHandler.close()
    BaseHandler.parent
    BaseHandler.default_open(req)
    BaseHandler.protocol_open(req)
    BaseHandler.unknown_open(req)
    BaseHandler.http_error_default(req, fp, code, msg, hdrs)
    BaseHandler.http_error_nnn(req, fp, code, msg, hdrs)
    BaseHandler.protocol_request(req)
    BaseHandler.protocol_response(req, response)

HTTPDefaultErrorHandler class:
    class urllib2.HTTPDefaultErrorHandler

HTTPRedirectHandler class:
    class urllib2.HTTPRedirectHandler
    HTTPRedirectHandler.redirect_request(req, fp, code, msg, hdrs, newurl)
    HTTPRedirectHandler.http_error_301(req, fp, code, msg, hdrs)
    HTTPRedirectHandler.http_error_302(req, fp, code, msg, hdrs)
    HTTPRedirectHandler.http_error_303(req, fp, code, msg, hdrs)
    HTTPRedirectHandler.http_error_307(req, fp, code, msg, hdrs)

HTTPCookieProcessor class:
    class urllib2.HTTPCookieProcessor([cookiejar])
    HTTPCookieProcessor.cookiejar

ProxyHandler class:
    class urllib2.ProxyHandler([proxies])
    ProxyHandler.protocol_open(request)

HTTPPasswordMgr class:
    class urllib2.HTTPPasswordMgr
    HTTPPasswordMgr.add_password(realm, uri, user, passwd)
    HTTPPasswordMgr.find_user_password(realm, authuri)

HTTPPasswordMgrWithDefaultRealm class:
    class urllib2.HTTPPasswordMgrWithDefaultRealm

AbstractBasicAuthHandler class:
    class urllib2.AbstractBasicAuthHandler([password_mgr])
    AbstractBasicAuthHandler.http_error_auth_reqed(authreq, host, req, headers)

HTTPBasicAuthHandler class:
    class urllib2.HTTPBasicAuthHandler([password_mgr])
    HTTPBasicAuthHandler.http_error_401(req, fp, code, msg, hdrs)

ProxyBasicAuthHandler class:
    class urllib2.ProxyBasicAuthHandler([password_mgr])
    ProxyBasicAuthHandler.http_error_407(req, fp, code, msg, hdrs)

AbstractDigestAuthHandler class:
    class urllib2.AbstractDigestAuthHandler([password_mgr])
    AbstractDigestAuthHandler.http_error_auth_reqed(authreq, host, req, headers)

HTTPDigestAuthHandler class:
    class urllib2.HTTPDigestAuthHandler([password_mgr])
    HTTPDigestAuthHandler.http_error_401(req, fp, code, msg, hdrs)

ProxyDigestAuthHandler class:
    class urllib2.ProxyDigestAuthHandler([password_mgr])
    ProxyDigestAuthHandler.http_error_407(req, fp, code, msg, hdrs)

HTTPHandler class:
    class urllib2.HTTPHandler
    HTTPHandler.http_open(req)

HTTPSHandler class:
    class urllib2.HTTPSHandler([debuglevel[, context]])
    HTTPSHandler.https_open(req)

FileHandler class:
    class urllib2.FileHandler
    FileHandler.file_open(req)

FTPHandler class:
    class urllib2.FTPHandler
    FTPHandler.ftp_open(req)

CacheFTPHandler class:
    class urllib2.CacheFTPHandler
    CacheFTPHandler.setTimeout(t)
    CacheFTPHandler.setMaxConns(m)

UnknownHandler class:
    class urllib2.UnknownHandler
    UnknownHandler.unknown_open()

HTTPErrorProcessor class:
    class urllib2.HTTPErrorProcessor
    HTTPErrorProcessor.http_response()
    HTTPErrorProcessor.https_response()
"""

import contextlib
import urllib2


def test_urllib2():
    """Fetch https://www.baidu.com twice and print what comes back.

    The first request is a plain ``urlopen`` of the URL and prints only the
    first 100 bytes of the body.  The second request goes through a
    ``urllib2.Request`` carrying a browser-like User-Agent header and prints
    the whole body followed by the response code, the URL reported by the
    response, and the response headers.

    Returns:
        None.  All results are written to standard output.

    Raises:
        urllib2.URLError: propagated unchanged from ``urlopen`` when the
            request fails (``urllib2.HTTPError`` is a subclass of it).
    """
    url = 'https://www.baidu.com'
    # Without an explicit timeout urlopen may block indefinitely on a
    # stalled connection.
    timeout = 10  # seconds

    # urllib2 responses are not context managers in Python 2, so
    # contextlib.closing is used to guarantee the socket is released.
    # Fetch the page and show the first 100 bytes.
    with contextlib.closing(urllib2.urlopen(url, timeout=timeout)) as page:
        print(page.read(100))

    # Set the request headers.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0'}
    # Build a Request object that carries the headers.
    req = urllib2.Request(url=url, headers=headers)
    # Pass the Request object to urlopen to receive the page.
    with contextlib.closing(urllib2.urlopen(req, timeout=timeout)) as resp:
        # Read the page text.
        html = resp.read()
        print('*' * 200)
        print(html)
        print('*' * 200)
        print(resp.getcode())  # response status code
        print(resp.geturl())  # URL
        print(resp.info())  # response headers


if __name__ == '__main__':
    test_urllib2()
# Source code is available on GitHub: https://github.com/gkimeeq/PythonLearning