import requests
response = requests.get('http://www.baidu.com')
# In Python 3, decode() with no argument decodes the bytes as UTF-8.
print(response.content.decode())
# response.text is decoded with the encoding that requests guessed from the
# HTTP headers; the original author notes this print shows garbled text.
print(response.text)
# Show the guessed encoding (the original author observed ISO-8859-1).
print(response.encoding)
# Override the guessed encoding with UTF-8.
response.encoding = 'utf-8'
# With the override in place, the original author reports readable output
# that matches response.content.decode().
print(response.text)
複製代碼
response.text 與 response.content 的區別
response.text
類型:str
解碼類型: 根據HTTP 頭部對響應的編碼做出有根據的推測,推測的文本編碼
如何修改編碼方式:response.encoding = 'utf-8'
response.content
類型:bytes
解碼類型: 沒有指定
如何修改編碼方式:response.content.decode('utf8')
import requests
# Fetch the logo image; response.content holds the raw, undecoded bytes.
response = requests.get('https://www.baidu.com/img/bd_logo1.png?where=super')
# Binary mode ('wb') writes those bytes unchanged, with no text encoding step.
with open('img.png', 'wb') as f:
    f.write(response.content)
複製代碼
response.text
response.content
response.status_code
狀態碼
response.request.url
請求的URL地址
response.request.headers
請求頭
{
'User-Agent': 'python-requests/2.19.1',
'Accept-Encoding': 'gzip, deflate',
'Accept': '*/*',
'Connection': 'keep-alive'
}
複製代碼
response.headers
響應頭
{
'Cache-Control': 'private, no-cache, no-store, proxy-revalidate, no-transform',
'Connection': 'Keep-Alive',
'Content-Encoding': 'gzip',
'Content-Type': 'text/html',
'Date': 'Sun, 13 Jan 2019 02:15:14 GMT',
'Last-Modified': 'Mon, 23 Jan 2017 13:27:32 GMT',
'Pragma': 'no-cache',
'Server': 'bfe/1.0.8.18',
'Set-Cookie': 'BDORZ=27315; max-age=86400; domain=.baidu.com; path=/',
'Transfer-Encoding': 'chunked'
}
複製代碼
header
帶header是爲了模擬瀏覽器,欺騙服務器,獲取和瀏覽器一致的內容
header的形式:字典
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
}
複製代碼
用法: requests.get(url,headers=headers)
參數
參數的形式:字典
params = {
"wd": "python"
}
複製代碼
用法:requests.get(url, params=params)
代碼示例:
import requests
# Search endpoint; the query string is supplied separately through `params`.
url = 'http://www.baidu.com/s?'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
}
params = {
    "wd": "python"
}

# Variant 1: let requests build the query string from the params dict.
response = requests.get(url, headers=headers, params=params)
print(response.request.url)
print(response.status_code)

# Variant 2: format the query string into the URL by hand.
url_param = 'http://www.baidu.com/s?wd={}'.format('python')
response1 = requests.get(url_param, headers=headers)
# Report on the second response; printing `response` here would only repeat
# the output of variant 1.
print(response1.request.url)
print(response1.status_code)
複製代碼
import requests
class TiebaSpider(object):
    """Download the listing pages of one Baidu Tieba forum as local HTML files.

    Attributes:
        name: Forum name; used in the request URL and in saved file names.
        page_count: Number of listing pages to fetch.
        url_tmp: URL template with one ``{}`` placeholder for the ``pn`` value.
        headers: Request headers carrying a browser-like User-Agent.
    """

    def __init__(self, tieba_name, page_count=1000):
        """Store the forum name and prepare the URL template and headers.

        Args:
            tieba_name: Name of the forum to crawl.
            page_count: How many listing pages to fetch. Defaults to 1000,
                the value that used to be hard-coded.
        """
        self.name = tieba_name
        self.page_count = page_count
        self.url_tmp = 'https://tieba.baidu.com/f?kw=' + tieba_name + '&ie=utf-8&pn={}'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
        }

    def get_url_list(self):
        """Return one URL per page; the ``pn`` value advances by 50 per page."""
        return [self.url_tmp.format(i * 50) for i in range(self.page_count)]

    def parse_url(self, url):
        """Print ``url``, fetch it, and return the body decoded as UTF-8."""
        print(url)
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def save_html(self, html, page_num):
        """Write ``html`` to '<name>-第<page_num>頁.html' using UTF-8."""
        file_path = '{}-第{}頁.html'.format(self.name, page_num)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(html)

    def run(self):
        """Fetch every listing page in order and save each one to disk."""
        # enumerate supplies the 1-based page number directly, avoiding a
        # linear list.index() search on every iteration.
        for page_num, url in enumerate(self.get_url_list(), start=1):
            html = self.parse_url(url)
            self.save_html(html, page_num)
if __name__ == '__main__':
    # Crawl the 'lol' forum; another forum name (for example '李毅') can be
    # passed instead.
    TiebaSpider('lol').run()
複製代碼
須要用到 POST 的狀況:
用法:
response = requests.post("http://www.baidu.com/", data = data,headers=headers)
複製代碼
注意和 GET 的區別:GET 中爲 params=data,POST 中爲 data=data;data 爲字典形式
示例:
百度翻譯API
緣由:
代理工做流程:
正向代理與反向代理:
通常狀況下,客戶端不知道最終服務器地址的爲反向代理,客戶端知道最終服務器地址的爲正向代理。
用法:
requests.get("http://www.baidu.com", proxies = proxies)
複製代碼
proxies 爲字典形式
proxies = {
"http": "http://12.34.56.79:9527",
"https": "https://12.34.56.79:9527",
}
複製代碼
私密代理
若是代理須要使用HTTP Basic Auth,可使用下面這種格式:
proxies = {
"http": "http://user:password@10.1.10.1:1234"
}
複製代碼
使用代理 ip:
示例:
import requests
# Map the URL scheme to the proxy that requests should use for that scheme.
proxies = {
"http": "http://119.101.113.180:9999"
}
# Send the request through the configured proxy and report the status code.
response = requests.get("http://www.baidu.com", proxies=proxies)
print(response.status_code)
複製代碼
獲取登陸後的頁面的三種方式
實例化session,使用session發送post請求,再使用它獲取登錄後的頁面
headers中添加cookie鍵,值爲cookie字符串
在請求方法中添加cookies參數,接收字典形式的cookie。字典形式的cookie中的鍵是cookie的name對應的值,值是cookie的value對應的值
session 代碼示例:
import requests
# Login endpoint and form fields (credentials are masked in this example).
url = 'http://www.renren.com/PLogin.do'
data = {
    "email": "****",
    "password": "*****"
}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
}

# A Session keeps cookies between requests, so the cookies set by the login
# POST are sent again with the profile GET. The with-block closes the session
# once both requests are done.
with requests.Session() as session:
    session.post(url, headers=headers, data=data)
    response = session.get('http://www.renren.com/969487434/profile', headers=headers)

# Save the fetched profile page as UTF-8 text.
html = response.content.decode()
with open('renren1.html', 'w', encoding='utf-8') as f:
    f.write(html)
複製代碼
headers 添加 Cookie 示例:
import requests
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
"Cookie": "anonymid=jr2yoyv4-l71jfd; depovince=SD; _r01_=1; JSESSIONID=abc5vZNDY5GXOfh79uKHw; ick_login=8e8d2154-31f7-47d6-afea-cd7da7f60cd7; ick=47b5a827-ecaf-4b44-ab57-c433e8f73b67; first_login_flag=1; ln_uact=13654252805; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; jebe_key=cf5d55ad-7eb1-4b50-848a-25d4c8081154%7C07a531353a345fda40d3ab252602e2f6%7C1547871575690%7C1%7C1547871574048; wp_fold=0; jebecookies=9724d2c7-5e9c-4be9-92d0-bf9b6dffd455|||||; _de=D0539E08F82219B3A527C713E360D2ED; p=7f8736045559e52d93420c14f063d70e4; t=522278d3c40436e9d5e7b3dc2650e55a4; societyguester=522278d3c40436e9d5e7b3dc2650e55a4; id=969487434; ver=7.0; xnsid=9ba2d506; loginfrom=null"
}
# Request the profile page; the login state travels in the Cookie header
# that is part of `headers`.
profile_url = 'http://www.renren.com/969487434/profile'
response = requests.get(profile_url, headers=headers)

# Decode the body as UTF-8 and store it on disk.
html = response.content.decode()
with open('renren2.html', 'w', encoding='utf-8') as out_file:
    out_file.write(html)
複製代碼
在請求方法中添加cookies參數:
import requests
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
}
Cookie = 'anonymid=jr2yoyv4-l71jfd; depovince=SD; _r01_=1; JSESSIONID=abc5vZNDY5GXOfh79uKHw; ick_login=8e8d2154-31f7-47d6-afea-cd7da7f60cd7; ick=47b5a827-ecaf-4b44-ab57-c433e8f73b67; first_login_flag=1; ln_uact=13654252805; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; jebe_key=cf5d55ad-7eb1-4b50-848a-25d4c8081154%7C07a531353a345fda40d3ab252602e2f6%7C1547871575690%7C1%7C1547871574048; wp_fold=0; jebecookies=9724d2c7-5e9c-4be9-92d0-bf9b6dffd455|||||; _de=D0539E08F82219B3A527C713E360D2ED; p=7f8736045559e52d93420c14f063d70e4; t=522278d3c40436e9d5e7b3dc2650e55a4; societyguester=522278d3c40436e9d5e7b3dc2650e55a4; id=969487434; ver=7.0; xnsid=9ba2d506; loginfrom=null'
# Turn the raw "name=value; name=value" string into a dict. partition splits
# each pair on the first "=" only, so a value that itself contains "=" is kept
# whole, and a pair without "=" yields an empty value instead of raising.
cookies = {
    name: value
    for name, _, value in (pair.partition("=") for pair in Cookie.split("; "))
}
# Pass the cookies through the dedicated `cookies` argument rather than
# through a Cookie header.
response = requests.get('http://www.renren.com/969487434/profile', headers=headers, cookies=cookies)
html = response.content.decode()
with open('renren3.html', 'w', encoding='utf-8') as f:
    f.write(html)
複製代碼
查看 HTML 頁面,在form表單中尋找action對應的url地址
抓包,尋找登陸的url地址
requests.utils.dict_from_cookiejar
把cookie對象轉化爲字典
import requests
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
"Cookie": "anonymid=jr2yoyv4-l71jfd; depovince=SD; _r01_=1; JSESSIONID=abc5vZNDY5GXOfh79uKHw; ick_login=8e8d2154-31f7-47d6-afea-cd7da7f60cd7; ick=47b5a827-ecaf-4b44-ab57-c433e8f73b67; first_login_flag=1; ln_uact=13654252805; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; jebe_key=cf5d55ad-7eb1-4b50-848a-25d4c8081154%7C07a531353a345fda40d3ab252602e2f6%7C1547871575690%7C1%7C1547871574048; wp_fold=0; jebecookies=9724d2c7-5e9c-4be9-92d0-bf9b6dffd455|||||; _de=D0539E08F82219B3A527C713E360D2ED; p=7f8736045559e52d93420c14f063d70e4; t=522278d3c40436e9d5e7b3dc2650e55a4; societyguester=522278d3c40436e9d5e7b3dc2650e55a4; id=969487434; ver=7.0; xnsid=9ba2d506; loginfrom=null"
}
response = requests.get('http://www.renren.com/969487434/profile', headers=headers)
# response.cookies is the cookie object attached to the response.
print(response.cookies)
# dict_from_cookiejar converts that cookie object into a plain dict.
print(requests.utils.dict_from_cookiejar(response.cookies))
複製代碼
請求 SSL 證書驗證
# verify=False skips SSL certificate verification for this request.
# The URL literal must not carry a trailing space, or the space becomes part
# of the requested URL.
response = requests.get("https://www.12306.cn/mormhweb/", verify=False)
複製代碼
設置超時
# timeout=2 makes the request give up instead of waiting indefinitely.
# The URL literal must not carry a trailing space, or the space becomes part
# of the requested URL.
response = requests.get("https://www.baidu.com", timeout=2)
複製代碼
配合狀態碼判斷是否請求成功
assert response.status_code == 200
複製代碼
示例代碼(可作工具類):
import requests
from retrying import retry
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
}
@retry(stop_max_attempt_number=3)
def _request_rul(url, method, data):
    """Send one HTTP request and return the response body decoded as UTF-8.

    The ``retry`` decorator re-runs this function when it raises, stopping
    after three attempts.

    Args:
        url: Address to request.
        method: 'POST' sends ``data`` as the form body; any other value
            issues a GET with ``data`` as the query parameters.
        data: Dict of form fields or query parameters, or None.

    Returns:
        The response body as a str.

    Raises:
        requests.HTTPError: If the final status code is not 200.
    """
    if method == 'POST':
        # Same timeout as the GET branch so a stalled POST cannot hang forever.
        response = requests.post(url, headers=headers, data=data, timeout=3)
    else:
        response = requests.get(url, headers=headers, params=data, timeout=3)
    # An explicit check replaces `assert response.status_code`, which was
    # always true and would be stripped under `python -O` anyway.
    if response.status_code != 200:
        raise requests.HTTPError(
            'unexpected status code {}'.format(response.status_code),
            response=response,
        )
    return response.content.decode()
def request_url(url, method='GET', data=None):
    """Fetch ``url`` and return its body as text, or None when fetching fails.

    Args:
        url: Address to request.
        method: 'GET' (default) or 'POST'.
        data: Optional dict passed on as query parameters (GET) or form
            body (POST).

    Returns:
        The decoded response body, or None if the underlying request helper
        raised an error.
    """
    try:
        return _request_rul(url, method, data)
    except Exception:
        # Best-effort helper: failures are reported as None. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt
        # and SystemExit propagate.
        return None
if __name__ == '__main__':
    # Quick manual check: fetch the Baidu home page and print the result.
    print(request_url('http://www.baidu.com'))
複製代碼
若是是Web客戶端驗證,須要添加 auth = (帳戶名, 密碼)
import requests
# Credentials for web client authentication, as an (account name, password) tuple.
auth = ('test', '123456')
response = requests.get('http://192.168.199.107', auth=auth)
print(response.text)