發送HTTP請求的第三方庫,比起以前用到的urllib,requests模塊的api更加便捷(本質就是封裝了urllib3)
安裝:pip3 install requests
學習requests前,能夠先熟悉下HTTP協議
http://www.cnblogs.com/linhaifeng/p/6266327.html
import requests
from urllib import parse

# Pass query-string arguments through the `params` keyword instead of
# concatenating them into the URL by hand.
param = {'wd': '中國'}
response = requests.get('http://www.baidu.com/s?', params=param)

# requests percent-encodes the non-ASCII value in the final URL.
print(response.url)
# unquote() decodes the percent-encoding back to readable UTF-8 text.
print(parse.unquote(response.url))

# Output:
# http://www.baidu.com/s?wd=%E4%B8%AD%E5%9B%BD
# http://www.baidu.com/s?wd=中國
一般咱們在發送請求時都須要帶上請求頭,請求頭是將自身假裝成瀏覽器的關鍵
# Some sites inspect the request headers and refuse requests that do not
# look like a browser (e.g. https://www.zhihu.com/explore).
import requests

plain_response = requests.get('https://www.zhihu.com/explore')
print(plain_response.status_code)  # 500 — rejected without a browser User-Agent

# Craft our own headers so the request masquerades as Chrome.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36"
}
ua_response = requests.get('https://www.zhihu.com/explore', headers=headers)
print(ua_response.status_code)  # 200
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36",
}
loginUrl = 'https://github.com/login'
# BUG FIX: the original fragment read `response.cookies` without ever
# creating `response` (NameError). Fetch the page first.
response = requests.get(loginUrl, headers=headers)
# The server's Set-Cookie values are exposed as a RequestsCookieJar.
cookies = response.cookies
print('cookies=>', cookies)
import requests

# BUG FIX: `headers` was used below but never defined in this snippet,
# so running it standalone raised NameError. Define it here.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36",
}
# Route the request through an HTTP/HTTPS proxy (host:port).
proxies = {
    'http': '111.47.220.67:8080',
    'https': '111.47.220.67:8080',
}
# verify=False skips TLS certificate validation (a warning is emitted).
response = requests.get('https://www.zhihu.com/explore',
                        proxies=proxies, headers=headers, verify=False)
print(response.status_code)
import requests

# `timeout` bounds how long we wait for the server before raising
# requests.exceptions.Timeout (here: 1 second).
response = requests.get('https://www.baidu.com', timeout=1)
print(response.status_code)
import requests

response = requests.get('http://www.jianshu.com')

# Tour of the commonly used Response attributes.
print(response.text)                # body decoded to str using response.encoding
print(response.content)             # raw body as bytes, no decoding applied
print(response.status_code)         # e.g. 200
print(response.headers)             # response headers (case-insensitive dict)
print(response.cookies)             # cookie jar set by the server
print(response.cookies.get_dict())  # same cookies as a plain dict
print(response.cookies.items())     # same cookies as (name, value) pairs
print(response.url)                 # final URL after any redirects
print(response.history)             # redirect Response objects, oldest first
print(response.encoding)            # charset used to decode .text
# Encoding pitfalls: requests picks the charset from the Content-Type header.
import requests

response = requests.get('http://www.autohome.com/news')
print(response.headers['Content-Type'])  # "text/html" — no charset given
# When Content-Type carries no charset, requests falls back to ISO-8859-1,
# so .text would be decoded wrongly for a Chinese page.
print(response.encoding)                 # ISO-8859-1
# Autohome serves gb2312-encoded pages; override the decoder or the
# Chinese text comes out garbled.
response.encoding = 'GBK'
print(response.text)

response = requests.get('https://www.jianshu.com')
print(response.headers['Content-Type'])  # "text/html; charset=utf-8"
# A charset IS declared here, so requests already decodes correctly and
# no manual response.encoding assignment is needed.
print(response.encoding)                 # utf-8
print(response.text)
# Parsing a JSON response body.
import requests
import json

response = requests.get('http://httpbin.org/get')
# The long way: decode the text yourself.
res1 = json.loads(response.text)
# The short way: Response.json() does the same in one call.
res2 = response.json()
print(res2)
print(res1 == res2)  # True — both yield the same dict
import requests

# Small file: fine to hold the whole body in memory and write it once.
response = requests.get('http://pic-bucket.nosdn.127.net/photo/0005/2018-02-26/DBIGGI954TM10005NOS.jpg')
with open('a.jpg', 'wb') as f:
    f.write(response.content)

# Large file (e.g. a 100 GB video): stream=True defers the download so the
# body is fetched lazily instead of buffered entirely in memory.
response = requests.get(
    'https://gss3.baidu.com/6LZ0ej3k1Qd3ote6lo7D0j9wehsv/tieba-smallvideo-transcode/1767502_56ec685f9c7ec542eeaf6eac93a65dc7_6fe25cd1347c_3.mp4',
    stream=True)
with open('b.mp4', 'wb') as f:
    # BUG FIX: iter_content() with no chunk_size yields ONE byte per
    # iteration, which is pathologically slow; read 1 KB chunks instead.
    for chunk in response.iter_content(chunk_size=1024):
        f.write(chunk)
#GET請求 HTTP默認的請求方法就是GET * 沒有請求體 * 數據必須在1K以內! * GET請求數據會暴露在瀏覽器的地址欄中 GET請求經常使用的操做: 1. 在瀏覽器的地址欄中直接給出URL,那麼就必定是GET請求 2. 點擊頁面上的超連接也必定是GET請求 3. 提交表單時,表單默認使用GET請求,但能夠設置爲POST #POST請求 (1). 數據不會出如今地址欄中 (2). 數據的大小沒有上限 (3). 有請求體 (4). 請求體中若是存在中文,會使用URL編碼! #!!!requests.post()用法與requests.get()徹底一致,特殊的是requests.post()有一個data參數,用來存放請求體數據
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/2/27 20:42
# @Author  : hyang
# @File    : request_github.py
# @Software: PyCharm
"""Log in to GitHub with plain requests calls (no Session object).

Flow (discovered by capturing a failed login with Fiddler):
1. GET https://github.com/login to obtain the initial cookies and the
   hidden ``authenticity_token`` embedded in the login form.
2. POST https://github.com/session carrying the initial cookies plus a
   form body with the token, account and password.
3. The POST response holds the logged-in session cookies.

Note: if a site posts the password encrypted, enter a wrong account with
the right password and copy the encrypted value from the browser;
GitHub, however, posts the password in plain text.
"""
import re
import ssl

import requests
from requests.packages import urllib3

# Work around "[SSL: CERTIFICATE_VERIFY_FAILED]" in some environments.
ssl._create_default_https_context = ssl._create_unverified_context
urllib3.disable_warnings()  # silence InsecureRequestWarning from verify=False

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36",
}
loginUrl = 'https://github.com/login'
postUrl = 'https://github.com/session'

response = requests.get(loginUrl, headers=headers, verify=False)
# BUG FIX: re.findall returns a *list*; the original code posted that list
# as the token value. Extract the first (only) match instead.
token_matches = re.findall(
    r'<input name="authenticity_token" type="hidden" value="(.*?)" />',
    response.text)
authenticity_token = token_matches[0] if token_matches else ''
cookies = response.cookies
print('cookies=>', cookies)
print('authenticity_token=>', authenticity_token)

email = '908099665@qq.com'
password = 'yanghaoXXXX'
post_data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": authenticity_token,
    "login": email,
    "password": password,
}

response2 = requests.post(postUrl, data=post_data, headers=headers,
                          verify=False, cookies=cookies)
print(response2.status_code)
print(response2.history)  # status codes of any redirects that were followed
print(response2.text)
分析抓包
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/2/26 23:31
# @Author  : hyang
# @File    : request-github.py
# @Software: PyCharm
"""Log in to GitHub using requests.Session with persisted cookies.

The Session object carries cookies between calls automatically, and an
LWPCookieJar lets us save them to disk ('github_cookie') and reload them
on the next run, skipping the login POST entirely.
"""
import re
import ssl
import http.cookiejar as cookielib

import requests
import urllib3

# Work around "[SSL: CERTIFICATE_VERIFY_FAILED]" in some environments.
ssl._create_default_https_context = ssl._create_unverified_context
urllib3.disable_warnings()  # silence InsecureRequestWarning from verify=False

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36",
}
loginUrl = 'https://github.com/login'
postUrl = 'https://github.com/session'
profileUrl = 'https://github.com/settings/emails'

session = requests.session()  # carries cookies across requests
# Back the session's cookies with a file so they survive between runs.
session.cookies = cookielib.LWPCookieJar(filename='github_cookie')


def get_token():
    """Fetch the login page and return the hidden authenticity_token."""
    response = session.get(loginUrl, headers=headers, verify=False)
    # BUG FIX: re.findall returns a list; the original returned the whole
    # list and posted it as the token. Return the first match instead.
    matches = re.findall(
        r'<input name="authenticity_token" type="hidden" value="(.*?)" />',
        response.text)
    token = matches[0] if matches else ''
    print(token)
    return token


def post_account(email, password):
    """Submit the login form, then persist the session cookies to disk."""
    post_data = {
        'commit': 'Sign in',
        'utf8': '✓',
        'authenticity_token': get_token(),
        'login': email,
        'password': password
    }
    response = session.post(postUrl, data=post_data, headers=headers)
    print(response.status_code)
    session.cookies.save()


def load_cookie():
    """Reload previously saved cookies; narrow except instead of bare except."""
    try:
        session.cookies.load(ignore_discard=True)
        print('cookie 獲取成功')
    except (FileNotFoundError, cookielib.LoadError):
        print('cookie 獲取不成功')


def isLogin():
    """Return True when the settings page shows our e-mail, i.e. logged in."""
    load_cookie()
    response = session.get(profileUrl, headers=headers)
    return '908099665@qq.com' in response.text


if __name__ == "__main__":
    # Fill in your own email account and password.
    post_account(email='908099665@qq.com', password='yanghaoXXXX')
    # Verify the login actually succeeded.
    isLogin()
By default Requests will perform location redirection for all verbs except HEAD. We can use the history property of the Response object to track redirection. The Response.history list contains the Response objects that were created in order to complete the request. The list is sorted from the oldest to the most recent response. For example, GitHub redirects all HTTP requests to HTTPS: >>> r = requests.get('http://github.com') >>> r.url 'https://github.com/' >>> r.status_code 200 >>> r.history [<Response [301]>] If you're using GET, OPTIONS, POST, PUT, PATCH or DELETE, you can disable redirection handling with the allow_redirects parameter: >>> r = requests.get('http://github.com', allow_redirects=False) >>> r.status_code 301 >>> r.history [] If you're using HEAD, you can enable redirection as well: >>> r = requests.head('http://github.com', allow_redirects=True) >>> r.url 'https://github.com/' >>> r.history [<Response [301]>] 先看官網的解釋
# Certificate verification (most sites are https).
import requests

# An https request verifies the server certificate first and raises
# SSLError when it is invalid.
respone = requests.get('https://www.12306.cn')

# Variant 1: skip verification — no error, but a warning is printed.
import requests
respone = requests.get('https://www.12306.cn', verify=False)  # 200 + warning
print(respone.status_code)

# Variant 2: skip verification AND silence the warning.
import requests
from requests.packages import urllib3
urllib3.disable_warnings()  # suppress InsecureRequestWarning
respone = requests.get('https://www.12306.cn', verify=False)
print(respone.status_code)

# Variant 3: present a client certificate.
# Many https sites work without one (zhihu, baidu, ...); some restrict
# access to clients that hold an issued certificate, which must then be
# supplied explicitly.
import requests
respone = requests.get('https://www.12306.cn',
                       cert=('/path/server.crt', '/path/key'))
print(respone.status_code)
import requests

# Multipart file upload via the `files` keyword.
files = {'file': open('a.pptx', 'rb')}
respone = requests.post('http://httpbin.org/post', files=files)
print(respone.status_code)
# Exception handling — browse requests.exceptions for the full hierarchy.
import requests
from requests.exceptions import *

try:
    r = requests.get('http://www.baiduxxx.com', timeout=1)
except ReadTimeout:
    # Server accepted the connection but didn't answer in time.
    print('ReadTimeout')
except ConnectionError:
    # Network unreachable / DNS failure / connection refused.
    print('ConnectionError')
except RequestException:
    # Catch-all base class for every requests error.
    print('Error')