urllib2模塊

#!/usr/bin/env python
# coding=utf-8

import urllib2

'''
urllib2可以簡單認爲是urllib的加強版,但由於urllib中提供了urllib2中沒有的函數,所以又不能完全替代urllib。
兩者不能相互替代,只能配合着使用。urllib和urllib2的區別:
urllib2通過Request參數來修改Header,也就是可以通過更改User Agent來偽裝瀏覽器。
urllib提供urlencode函數,支持編碼;在模擬登錄時,如果需要編碼之後的參數,就只能用urllib。
urllib提供了一系列如urlretrieve、quote等函數,而urllib2中並沒有。
'''

'''
urllib2模塊:https://docs.python.org/2/library/urllib2.html

urllib2.urlopen(url[, data[, timeout[, cafile[, capath[, cadefault[, context]]]]]])
urllib2.install_opener(opener)
urllib2.build_opener([handler, ...])
exception urllib2.URLError
exception urllib2.HTTPError

Request類:
class urllib2.Request(url[, data][, headers][, origin_req_host][, unverifiable])
Request.add_data(data)
Request.get_method()
Request.has_data()
Request.get_data()
Request.add_header(key, val)
Request.add_unredirected_header(key, header)
Request.has_header(header)
Request.get_full_url()
Request.get_type()
Request.get_host()
Request.get_selector()
Request.get_header(header_name, default=None)
Request.header_items()
Request.set_proxy(host, type)
Request.get_origin_req_host()
Request.is_unverifiable()

OpenerDirector類:
class urllib2.OpenerDirector
OpenerDirector.add_handler(handler)
OpenerDirector.open(url[, data][, timeout])
OpenerDirector.error(proto[, arg[, ...]])

BaseHandler類:
class urllib2.BaseHandler
BaseHandler.add_parent(director)
BaseHandler.close()
BaseHandler.parent
BaseHandler.default_open(req)
BaseHandler.protocol_open(req)
BaseHandler.unknown_open(req)
BaseHandler.http_error_default(req, fp, code, msg, hdrs)
BaseHandler.http_error_nnn(req, fp, code, msg, hdrs)
BaseHandler.protocol_request(req)
BaseHandler.protocol_response(req, response)

HTTPDefaultErrorHandler類:
class urllib2.HTTPDefaultErrorHandler

HTTPRedirectHandler類:
class urllib2.HTTPRedirectHandler
HTTPRedirectHandler.redirect_request(req, fp, code, msg, hdrs, newurl)
HTTPRedirectHandler.http_error_301(req, fp, code, msg, hdrs)
HTTPRedirectHandler.http_error_302(req, fp, code, msg, hdrs)
HTTPRedirectHandler.http_error_303(req, fp, code, msg, hdrs)
HTTPRedirectHandler.http_error_307(req, fp, code, msg, hdrs)

HTTPCookieProcessor類:
class urllib2.HTTPCookieProcessor([cookiejar])
HTTPCookieProcessor.cookiejar

ProxyHandler類:
class urllib2.ProxyHandler([proxies])
ProxyHandler.protocol_open(request)

HTTPPasswordMgr類:
class urllib2.HTTPPasswordMgr
HTTPPasswordMgr.add_password(realm, uri, user, passwd)
HTTPPasswordMgr.find_user_password(realm, authuri)

HTTPPasswordMgrWithDefaultRealm類:
class urllib2.HTTPPasswordMgrWithDefaultRealm

AbstractBasicAuthHandler類:
class urllib2.AbstractBasicAuthHandler([password_mgr])
AbstractBasicAuthHandler.http_error_auth_reqed(authreq, host, req, headers)

HTTPBasicAuthHandler類:
class urllib2.HTTPBasicAuthHandler([password_mgr])
HTTPBasicAuthHandler.http_error_401(req, fp, code, msg, hdrs)

ProxyBasicAuthHandler類:
class urllib2.ProxyBasicAuthHandler([password_mgr])
ProxyBasicAuthHandler.http_error_407(req, fp, code, msg, hdrs)

AbstractDigestAuthHandler類:
class urllib2.AbstractDigestAuthHandler([password_mgr])
AbstractDigestAuthHandler.http_error_auth_reqed(authreq, host, req, headers)

HTTPDigestAuthHandler類:
class urllib2.HTTPDigestAuthHandler([password_mgr])
HTTPDigestAuthHandler.http_error_401(req, fp, code, msg, hdrs)

ProxyDigestAuthHandler類:
class urllib2.ProxyDigestAuthHandler([password_mgr])
ProxyDigestAuthHandler.http_error_407(req, fp, code, msg, hdrs)

HTTPHandler類:
class urllib2.HTTPHandler
HTTPHandler.http_open(req)

HTTPSHandler類:
class urllib2.HTTPSHandler([debuglevel[, context]])
HTTPSHandler.https_open(req)

FileHandler類:
class urllib2.FileHandler
FileHandler.file_open(req)

FTPHandler類:
class urllib2.FTPHandler
FTPHandler.ftp_open(req)

CacheFTPHandler類:
class urllib2.CacheFTPHandler
CacheFTPHandler.setTimeout(t)
CacheFTPHandler.setMaxConns(m)

UnknownHandler類:
class urllib2.UnknownHandler
UnknownHandler.unknown_open()

HTTPErrorProcessor類:
class urllib2.HTTPErrorProcessor
HTTPErrorProcessor.http_response()
HTTPErrorProcessor.https_response()
'''


def test_urllib2():
    """Demonstrate basic urllib2 usage against https://www.baidu.com.

    Step 1 fetches the page with a plain ``urllib2.urlopen(url)`` call and
    prints the first 100 bytes of the body.

    Step 2 repeats the request through a ``urllib2.Request`` that carries a
    custom ``User-Agent`` header, then prints the full body (framed by two
    lines of 200 asterisks), the response code, the URL and the headers.

    Each response is wrapped in ``contextlib.closing`` so it is closed even
    if reading or printing raises.

    Performs network I/O and writes to stdout; returns None. Errors raised
    by ``urlopen`` (e.g. ``urllib2.URLError``) are not caught here.
    """
    # Function-scope import keeps this block self-contained.
    from contextlib import closing

    url = 'https://www.baidu.com'

    # Fetch the page and show the first 100 bytes.
    # Single-argument print(...) behaves the same as the Python 2 print
    # statement and is also valid Python 3 syntax.
    with closing(urllib2.urlopen(url)) as f:
        print(f.read(100))

    # Request headers: present a desktop Firefox User-Agent.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0'}
    # Build a Request object carrying those headers.
    req = urllib2.Request(url=url, headers=headers)
    # Open the Request instead of a bare URL so the headers are sent.
    with closing(urllib2.urlopen(req)) as resp:
        # Read the whole page body.
        html = resp.read()
        print('*' * 200)
        print(html)
        print('*' * 200)
        print(resp.getcode())  # response status code
        print(resp.geturl())  # URL of the retrieved resource
        print(resp.info())  # response headers

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    test_urllib2()

源碼可於GitHub下載:https://github.com/gkimeeq/PythonLearning

相關文章
相關標籤/搜索