import urllib.request
# Fetch the page; timeout=30 means a timeout exception is raised once
# 30 seconds pass without a response.
with urllib.request.urlopen("http://yum.iqianyue.com", timeout=30) as file:
    # In the collapsed original this statement sat inside the trailing
    # comment and therefore never executed.
    data = file.read()
import urllib.request

# Build a Baidu search URL by appending the (ASCII-only) keyword to the
# query prefix.
core_url = 'http://www.baidu.com/s?wd='
keywords = 'hello'
full_url = core_url + keywords

req = urllib.request.Request(full_url)
# The context manager closes the HTTP response once the body is read.
with urllib.request.urlopen(req) as response:
    data = response.read()

# Write the raw bytes unchanged; binary mode skips any decoding step.
with open('hello.html', 'wb') as f:
    f.write(data)
上述關鍵詞如果換成中文,會出現報錯:UnicodeEncodeError: 'ascii' codec can't encode characters in position 10-11: ordinal not in range(128)。原因:《python爬蟲之urllib庫(一)》提到過URL編碼——URL只接受ASCII碼中的一部分字符,漢字等特殊符號必須先經過編碼。對於單個參數,使用字符串配合parse模塊給URL傳參:urllib.parse.quote(str)(urllib.request.quote只是同一函數未寫入文檔的別名);對於多個參數,使用字典配合parse模塊給URL傳參:urllib.parse.urlencode(dict)。
import urllib.parse
import urllib.request

core_url = 'http://www.baidu.com/s?wd='
keywords = '您好'
# Percent-encode the non-ASCII keyword so it is legal inside a URL.
# urllib.parse.quote is the documented home of this function;
# urllib.request.quote is only an undocumented re-export of it.
keywords_encode = urllib.parse.quote(keywords)
full_url = core_url + keywords_encode

req = urllib.request.Request(full_url)
# The context manager closes the HTTP response once the body is read.
with urllib.request.urlopen(req) as response:
    data = response.read()

with open('hello.html', 'wb') as f:
    f.write(data)
import urllib.parse
import urllib.request

# The keyword field is dropped from the prefix; every parameter now
# comes from the dict below.
core_url = 'http://www.baidu.com/s?'
keywords = {  # multiple query parameters
    'wd': '您好',
    'rsv_spt': 1,
    'rsv_iqid': 0x8c77175600037633,
}
# urlencode percent-encodes each value and joins the pairs with '&'.
keywords_encode = urllib.parse.urlencode(keywords)
full_url = core_url + keywords_encode

req = urllib.request.Request(full_url)
# The context manager closes the HTTP response once the body is read.
with urllib.request.urlopen(req) as response:
    data = response.read()

with open('hello.html', 'wb') as f:
    f.write(data)
import urllib.parse
import urllib.request

# The URL must be the address of the login (or registration) page; this
# one is the login URL of the National Bureau of Statistics data site.
url = 'http://data.stats.gov.cn/login.htm'

# Form data: what a user types when logging in (e-mail and password).
# These are request-body fields, not URL query parameters.
# Each key must equal the `name` attribute of the matching <input>
# element on the page.
# Per the original author's note, the page also expects a captcha that
# is not sent here, so this login will not succeed.
# NOTE(review): credentials are hard-coded sample values.
form_data = {
    'username': '545859297@qq.com',
    'keyp': 'bushizhenmima',
}

# A POST body must be bytes: urlencode joins the dict into a query
# string, which is then encoded.
form_data_deal = urllib.parse.urlencode(form_data).encode('utf-8')

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36'
}

# Supplying `data` turns the request into a POST.
req = urllib.request.Request(url, data=form_data_deal, headers=headers)
# The context manager closes the HTTP response once the body is read.
with urllib.request.urlopen(req) as response:
    data = response.read()

with open('country-data.html', 'wb') as f:
    f.write(data)
import urllib.request


def handler_opener():
    """Fetch the Baidu home page through a custom opener.

    Returns:
        The response body decoded as UTF-8 text.
    """
    url = 'https://www.baidu.com'

    # Plain HTTP handler; build_opener() still adds its other default
    # handlers (including HTTPS when available) alongside it.
    handler = urllib.request.HTTPHandler()
    opener = urllib.request.build_opener(handler)

    # open() sends the request; the context manager closes the response.
    with opener.open(url) as response:
        response_str = response.read().decode('utf-8')
    return response_str


result = handler_opener()
with open('baidu.html', 'w', encoding='utf-8') as f:
    f.write(result)
import urllib.request


def free_proxy():
    """Fetch the Baidu home page through a (free) HTTP proxy.

    Returns:
        The raw response body as bytes (not decoded).
    """
    url = 'http://www.baidu.com'

    # Maps a URL scheme to the proxy used for it. There are http and
    # https variants; https is http with an SSL security layer added.
    # NOTE(review): 'http;//' looks like a redacted or mistyped proxy
    # address (';' instead of ':' and no host) -- replace it with a real
    # 'http://host:port' before running.
    proxy = {
        'http': 'http;//',
        # 'https': ''
    }

    # ProxyHandler routes matching requests through the proxy.
    proxy_handler = urllib.request.ProxyHandler(proxy)
    opener = urllib.request.build_opener(proxy_handler)

    # The context manager closes the response once the body is read.
    with opener.open(url) as response:
        response_str = response.read()
    return response_str


result = free_proxy()
# Binary mode, because free_proxy() returns bytes.
with open('baidu-free.html', 'wb') as f:
    f.write(result)
import urllib.request


def free_proxy():
    """Fetch the Baidu home page via an opener built with a proxy handler.

    Returns:
        The raw response body as bytes (not decoded).
    """
    url = 'https://www.baidu.com'

    # Maps a URL scheme to the proxy used for it. There are http and
    # https variants; https is http with an SSL security layer added.
    # NOTE(review): 'http;//' looks like a redacted or mistyped proxy
    # address -- replace it with a real 'http://host:port'. Also, only
    # the 'http' scheme is mapped while `url` is https; confirm whether
    # an 'https' entry is intended.
    proxy = {
        'http': 'http;//',
        # 'https': ''
    }

    proxy_handler = urllib.request.ProxyHandler(proxy)

    # HTTPHandler may be omitted here. Quoting the build_opener docstring:
    #   The opener will use several default handlers, including support
    #   for HTTP, FTP and when applicable HTTPS.
    #   If any of the handlers passed as arguments are subclasses of the
    #   default handlers, the default handlers will not be used.
    opener = urllib.request.build_opener(
        proxy_handler, urllib.request.HTTPHandler
    )

    # The context manager closes the response once the body is read.
    with opener.open(url) as response:
        response_str = response.read()
    return response_str


result = free_proxy()
# Binary mode, because free_proxy() returns bytes.
with open('baidu-free.html', 'wb') as f:
    f.write(result)
import urllib.request


def fee_proxy():
    """Fetch the Baidu home page with proxy basic-auth credentials set up.

    This is the original author's "second way" of using a paid proxy:
    the credentials are registered with a password manager.

    Returns:
        The response body decoded as UTF-8 text.
    """
    url = 'http://www.baidu.com'

    user_name = 'admin'
    password = '123456'
    # NOTE(review): proxy_ip is empty and no ProxyHandler is configured
    # in this function -- fill in the proxy's host:port (and confirm how
    # traffic is routed through it) before relying on this example.
    proxy_ip = ''

    # Password manager holding the credentials for the proxy.
    proxy_manager = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    proxy_manager.add_password(None, proxy_ip, user_name, password)

    # Handler that answers the proxy's authentication challenge.
    proxy_handler = urllib.request.ProxyBasicAuthHandler(proxy_manager)
    proxy_opener = urllib.request.build_opener(proxy_handler)

    # The context manager closes the response once the body is read.
    with proxy_opener.open(url) as response:
        response_str = response.read().decode('utf-8')
    return response_str


data = fee_proxy()
with open('baidu-fee.html', 'w', encoding='utf-8') as f:
    f.write(data)
import urllib.request

url = 'http://www.baidu.com'

# debuglevel=1 turns on the handlers' debug output for each request.
http_handler = urllib.request.HTTPHandler(debuglevel=1)
https_handler = urllib.request.HTTPSHandler(debuglevel=1)
opener = urllib.request.build_opener(http_handler, https_handler)

# Install the opener globally so that plain urlopen() uses it as well.
urllib.request.install_opener(opener)

response = urllib.request.urlopen(url)  # request style 1
# response = opener.open(url)           # request style 2
import urllib.error
import urllib.request

url = 'http://sad.blog.csdn.net'

try:
    rep = urllib.request.urlopen(url)
except urllib.error.URLError as e:
    # URLError is the base class of urllib's errors.
    print(e)
else:
    # Success path: print the response object, then close it.
    with rep:
        print(rep)
import urllib.error
import urllib.request

url = 'http://sad.blog.csdn.net'

try:
    rep = urllib.request.urlopen(url)
except urllib.error.HTTPError as e:
    # HTTPError carries the status code and reason phrase.
    # Only HTTPError is caught here: a plain URLError (its base class)
    # still propagates.
    print(e.code, e.reason)
else:
    # Success path: print the response object, then close it.
    with rep:
        print(rep)
狀態碼 | 原因短語(英文) | 原因短語(中文) |
200 | OK | 正常 |
301 | Moved Permanently | 重新定向到新的URL,永久性 |
302 | Found | 重新定向到新的URL,非永久性 |
304 | Not Modified | 請求資源未更新 |
400 | Bad Request | 非法請求 |
401 | Unauthorized | 請求未經授權 |
403 | Forbidden | 禁止訪問 |
404 | Not Found | 沒有找到頁面 |
500 | Internal Server Error | 服務器內部錯誤 |
501 | Not Implemented | 服務器不支持實現請求功能 |
"""Exception classes raised by urllib.

The base exception class is URLError, which inherits from OSError.  It
doesn't define any behavior of its own, but is the base class for all
exceptions defined in this package.

HTTPError is an exception class that is also a valid HTTP response
instance.  It behaves this way because HTTP protocol errors are valid
responses, with a status code, headers, and a body.  In some contexts,
an application may want to handle an exception like a regular
response.
"""

import urllib.response

__all__ = ['URLError', 'HTTPError', 'ContentTooShortError']


class URLError(OSError):
    """Base class for the exceptions in this module.

    Attributes:
        reason: The cause of the error, as passed to the constructor.
        filename: Set only when a filename is passed to the constructor.
    """

    # URLError is a sub-type of OSError, but it doesn't share any of
    # the implementation.  need to override __init__ and __str__.
    # It sets self.args for compatibility with other OSError
    # subclasses, but args doesn't have the typical format with errno in
    # slot 0 and strerror in slot 1.  This may be better than nothing.
    def __init__(self, reason, filename=None):
        self.args = reason,
        self.reason = reason
        if filename is not None:
            self.filename = filename

    def __str__(self):
        return '<urlopen error %s>' % self.reason


class HTTPError(URLError, urllib.response.addinfourl):
    """Raised when HTTP error occurs, but also acts like non-error return"""
    __super_init = urllib.response.addinfourl.__init__

    def __init__(self, url, code, msg, hdrs, fp):
        self.code = code
        self.msg = msg
        self.hdrs = hdrs
        self.fp = fp
        self.filename = url
        # The addinfourl classes depend on fp being a valid file
        # object.  In some cases, the HTTPError may not have a valid
        # file object.  If this happens, the simplest workaround is to
        # not initialize the base classes.
        if fp is not None:
            self.__super_init(fp, hdrs, url, code)

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)

    def __repr__(self):
        return '<HTTPError %s: %r>' % (self.code, self.msg)

    # since URLError specifies a .reason attribute, HTTPError should also
    # provide this attribute. See issue13211 for discussion.
    @property
    def reason(self):
        return self.msg

    @property
    def headers(self):
        return self.hdrs

    @headers.setter
    def headers(self, headers):
        self.hdrs = headers


# The pasted excerpt stopped before this class even though __all__ exports
# it, which made `from <module> import *` fail with AttributeError.  The
# definition follows the standard library's urllib.error module.
class ContentTooShortError(URLError):
    """Exception raised when downloaded size does not match content-length."""

    def __init__(self, message, content):
        URLError.__init__(self, message)
        # The downloaded (and supposedly truncated) data.
        self.content = content
import urllib.error
import urllib.request

try:
    # Only success or failure matters here; the context manager closes
    # the response instead of leaving it open.
    with urllib.request.urlopen("http://blog.csdn.net"):
        pass
except urllib.error.URLError as e:
    # Catching the base class covers HTTPError too. `code` exists only
    # on HTTPError, so probe for each attribute before printing it.
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)