urllib:Python內置的HTTP請求庫
# Python 2 only: urllib2 was the built-in HTTP client (merged into
# urllib.request in Python 3; this snippet will not run on Python 3).
import urllib2

# Fix: the function is `urlopen` — the original `url open` was a syntax error.
response = urllib2.urlopen("http://www.baidu.com")
複製代碼
# Python 3 equivalent: urllib.request replaces Python 2's urllib2.
import urllib.request

target = "http://www.baidu.com"
response = urllib.request.urlopen(target)
複製代碼
# Signature (note: data defaults to None, not "NOne"):
# urllib.request.urlopen(url, data=None, [timeout,]*, cafile=None, capath=None, cadefault=False, context=None)
import urllib.request
response = urllib.request.urlopen('http://www.baidu.com')
print(response.read().decode('utf-8'))  # read() returns bytes; decode to get text
複製代碼
import urllib.parse
import urllib.request

# Passing a `data` payload (bytes) switches urlopen from GET to POST.
payload = urllib.parse.urlencode({'word': 'hello'}).encode('utf-8')
response = urllib.request.urlopen('http://httpbin.org/post', data=payload)
print(response.read())
複製代碼
import socket
import urllib.request
import urllib.error

# A 1-second timeout is normally plenty for httpbin to answer.
response = urllib.request.urlopen('http://httpbin.org/get', timeout=1)
print(response.read())

try:
    # 0.1 s is all but guaranteed to expire before the server responds.
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:  # fix: module is urllib.error (was "urllib.rttot")
    # urlopen wraps the low-level socket timeout in URLError; check e.reason.
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
複製代碼
httpbin.org 是一個方便用來作http請求測試的網站。
import urllib.request

# Inspect what urlopen actually returns (an http.client.HTTPResponse).
python_home = "http://www.python.org"
response = urllib.request.urlopen(python_home)
print(type(response))
複製代碼
import urllib.request

# Status code and headers are available on the response object.
response = urllib.request.urlopen('https://www.python.org')
print(response.status)               # numeric status, e.g. 200
print(response.getheaders())         # every header as (name, value) pairs
print(response.getheader('Server'))  # fetch one header by name
複製代碼
import urllib.request

# Wrap the URL in a Request object; urlopen accepts it like a plain URL.
req = urllib.request.Request("https://python.org")
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
複製代碼
from urllib import request, parse

# Build a POST request with explicit headers and a form-encoded body.
url = 'http://httpbin.org'
headers = {
    'User-Agent': 'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)',
    'Host': 'httpbin.org',
}
data = parse.urlencode({'name': 'Germey'}).encode('utf-8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
複製代碼
import urllib.request

# Route all traffic through a local proxy (e.g. a tunnel on port 9743).
proxy_handler = urllib.request.ProxyHandler(
    {
        'http': 'http://127.0.0.1:9743',
        'https': 'https://127.0.0.1:9743',
    }
)
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://www.baidu.com')
print(response.read())
複製代碼
import http.cookiejar
import urllib.request

# Collect cookies the server sets into an in-memory CookieJar.
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
# Fix: loop body was left unindented by the copy/paste; restore it.
for item in cookie:
    print('='.join([item.name, item.value]))
複製代碼
# Persist cookies to disk so they can be reused before they expire.
import http.cookiejar, urllib.request

filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)  # Firefox-style cookies.txt format
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
# Keep session-only and already-expired cookies as well.
cookie.save(ignore_discard=True, ignore_expires=True)
複製代碼
# Same idea, but saved in LWP (libwww-perl) format instead of Mozilla format.
import http.cookiejar, urllib.request

filename = 'cookie.txt'
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
# Keep session-only and already-expired cookies as well.
cookie.save(ignore_discard=True, ignore_expires=True)
複製代碼
# Load previously saved cookies from disk and attach them to new requests.
import http.cookiejar
import urllib.request

# Fix: class name is LWPCookieJar (capital J) — "LWPCookiejar" raised
# AttributeError. The jar class must match the format used when saving.
cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode("utf-8"))
複製代碼
from urllib import request, error

try:
    response = request.urlopen("http://ccccccc.com/index.html")
except error.URLError as e:
    # URLError covers DNS failures, refused connections, timeouts, etc.
    print(e.reason)
    # A caller could retry here, or fall back to another URL.
複製代碼
# Error objects: HTTPError carries reason/code/headers; URLError only reason.
from urllib import request, error

try:
    response = request.urlopen("http://ccccccc.com/index.html")
except error.HTTPError as e:
    # HTTPError is a subclass of URLError, so it must be caught first.
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print("Request Successfully!")
複製代碼
import socket
import urllib.request
import urllib.error

try:
    # 0.01 s timeout: all but guaranteed to expire before any response.
    response = urllib.request.urlopen('http://www.baidu.com', timeout=0.01)
except urllib.error.URLError as e:
    # e.reason is the underlying socket.timeout instance.
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print('TIME_OUT')
複製代碼
用來解析url信息鏈接。工具模塊,用來url解析的一類函數。
urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)
# fragment 錨點連接(URL 中 '#' 以後的部分)
# allow_fragments 是否解析fragment(False 時忽略,併入前面的部分)
複製代碼
from urllib.parse import urlparse

# Six components: scheme://netloc/path;params?query#fragment
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(result)

# `scheme=` is only a default, applied when the URL itself has no scheme.
result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)

# When the URL carries its own scheme, that wins over the default.
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)  # scheme: http

# Fix: the keyword is `allow_fragments` and takes a bool (the original
# `allow_fragment='https'` raised TypeError). With False, the fragment is
# not split out — it stays glued to the preceding component.
result = urlparse('www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
print(result)

result = urlparse('www.baidu.com/index.html#comment', allow_fragments=False)
print(result)
複製代碼
反向解析,把六個組成部分拼接回完整的url。
from urllib.parse import urlunparse

# Inverse of urlparse: rebuild a URL from its six components
# (scheme, netloc, path, params, query, fragment).
data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
url = urlunparse(data)
print(url)
複製代碼
用來拼接url。
from urllib.parse import urljoin

# urljoin resolves the second argument against the first; when the second
# argument is itself an absolute URL, its scheme/host win outright.
print(urljoin('http://www.baidu.com', 'FAQ.html'))
print(urljoin('http://www.baidu.com', 'http://cccccccc.com/FAQ.html'))
複製代碼
url編碼,能夠直接將字典轉爲url參數格式。
from urllib.parse import urlencode

# urlencode turns a dict straight into a query string ("k=v&k2=v2").
params = {
    'name': 'germey',
    'age': 22
}
# Fix: variable was misspelled `base_rul` while the next line read
# `base_url`, so the original raised NameError.
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)
複製代碼
解析robots.txt,判斷哪些url能夠訪問爬取,哪些部分不能夠爬取。