What is Urllib?
Python's built-in HTTP request library.
urllib.request: request module
urllib.error: exception handling module
urllib.parse: URL parsing module
urllib.robotparser: robots.txt parsing module
GET requests
POST requests
Timeouts and exception handling
Response objects (status code, headers, ...)
Adding headers to a POST request
Using proxies
Saving and reading cookies
---------- In the parse package -----------
urlparse: parse a URL
urlunparse: build a URL from components
urlencode: turn a dict into GET query parameters (quite useful)
Python 3:
urlopen
# urlopen signature:
# urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
# url, POST data, timeout

#############################
import urllib.request

# GET request (no data argument)
response = urllib.request.urlopen('http://www.baidu.com')
print(response.read().decode('utf-8'))  # read() returns bytes, so decode to a string

#############################
import urllib.parse
import urllib.request

# POST request (with a data argument)
data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8')
response = urllib.request.urlopen('http://httpbin.org/post', data=data)  # http://httpbin.org/post is a site for HTTP testing
print(response.read())

#############################
import urllib.request

# Timeout setting
response = urllib.request.urlopen('http://httpbin.org/get', timeout=1)
print(response.read())
##############################
Responses
# Response type
import urllib.request

response = urllib.request.urlopen('https://www.python.org')
print(type(response))  # <class 'http.client.HTTPResponse'>

#############################
# Status code and response headers
import urllib.request

response = urllib.request.urlopen('https://www.python.org')
print(response.status)               # status code
print(response.getheaders())         # all response headers
print(response.getheader('Server'))  # the responding server

#############################
# Response body
import urllib.request

response = urllib.request.urlopen('https://www.python.org')
print(response.read().decode('utf-8'))  # read() returns bytes
Request
# Add headers and send a POST request
from urllib import request, parse

url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org'
}
params = {
    'name': 'Germey'
}
data = bytes(parse.urlencode(params), encoding='utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
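Headers do not have to be passed in the constructor; a Request object also accepts them one at a time via add_header. A minimal sketch of that alternative, reusing the same httpbin.org URL and User-Agent from above:

# Sketch: set headers with Request.add_header instead of a headers dict
from urllib import request, parse

url = 'http://httpbin.org/post'
data = bytes(parse.urlencode({'name': 'Germey'}), encoding='utf8')

req = request.Request(url=url, data=data, method='POST')
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')  # add one header at a time

response = request.urlopen(req)
print(response.read().decode('utf-8'))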
Handler
# Add a proxy
import urllib.request

proxy_handler = urllib.request.ProxyHandler({  # proxy settings
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9742'
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://httpbin.org/get')
print(response.read())
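If every later request should go through the proxy, the opener can also be installed globally so that plain urlopen() uses it. A minimal sketch, assuming the same local proxy addresses as above:

# Sketch: install the opener globally so urlopen() itself goes through the proxy
import urllib.request

proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:9743',   # assumed local proxy, as above
    'https': 'https://127.0.0.1:9742'
})
opener = urllib.request.build_opener(proxy_handler)
urllib.request.install_opener(opener)  # from now on urlopen() uses this opener

response = urllib.request.urlopen('http://httpbin.org/get')
print(response.read())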
Cookie
# Grab cookies
import http.cookiejar, urllib.request

cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
for item in cookie:
    print(item.name + "=" + item.value)

#############################
# Save cookies to a txt file
import http.cookiejar, urllib.request

filename = 'cookie.txt'
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)

#############################
# Read the cookie file
import http.cookiejar, urllib.request

# whichever format the cookies were saved in, load them with the same class
cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
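LWPCookieJar is only one of the on-disk formats; http.cookiejar also provides MozillaCookieJar, which writes the Netscape/Mozilla cookies.txt format. A minimal sketch of the same save flow with it (the filename here is just an example):

# Sketch: save cookies with MozillaCookieJar (Netscape/Mozilla cookies.txt format)
import http.cookiejar, urllib.request

filename = 'cookie_mozilla.txt'  # example filename
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)  # load it back later with MozillaCookieJar().load(...)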
Exception handling
# Exception handling 1
from urllib import request, error

try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.URLError as e:
    print(e.reason)

#############################
# Exception handling 2: catch HTTPError before the more general URLError
from urllib import request, error

try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')

#############################
# Exception handling 3: check whether the error was a timeout
import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('https://www.baidu.com', timeout=0.01)
except urllib.error.URLError as e:
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
URL parsing
# Just the URL argument
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result), result)

##########################
# Specify a scheme: it is used only when the URL has none; otherwise the URL's own scheme wins
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)

##########################
# allow_fragments=False is rarely used: the #fragment is folded into the query,
# and if there is no query it moves further forward into the path
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html#comment', allow_fragments=False)
print(result)
# Build a URL from its components
from urllib.parse import urlunparse

data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))  # http://www.baidu.com/index.html;user?a=6#comment
# Join URLs
from urllib.parse import urljoin

print(urljoin('http://www.baidu.com', 'Faq.html'))
print(urljoin('http://www.baidu.com', 'https://www.baidu.com/aaa'))  # the second argument takes precedence
print(urljoin('http://www.baidu.com', '?a=1'))
# Turn a dict into GET query parameters
from urllib.parse import urlencode

params = {
    'name': 'germey',
    'age': 22
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)  # http://www.baidu.com?name=germey&age=22
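urllib.robotparser, listed in the module overview at the top but not shown above, parses a site's robots.txt and answers whether a URL may be crawled. A minimal sketch, using Baidu's robots.txt as an example target:

# Sketch of urllib.robotparser: check whether a URL may be fetched
import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://www.baidu.com/robots.txt')  # example robots.txt location
rp.read()                                      # fetch and parse robots.txt
print(rp.can_fetch('*', 'http://www.baidu.com/index.html'))  # True/False for user agent '*'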