Requests:唯一的一個非轉基因的 Python HTTP 庫,人類可以安全享用。
警告:非專業使用其他 HTTP 庫會導致危險的副作用,包括:安全缺陷症、冗餘代碼症、重新發明輪子症、啃文檔症、抑鬱、頭疼、甚至死亡。
因為在使用 urllib 模塊的時候,會有諸多不便之處,總結如下:
使用 requests 模塊:
如何使用 requests 模塊
# The HTTP verbs exposed by requests; requests.get() and requests.post() are
# the ones used most often.  Written as plain statements (not ">>>" prompt
# lines) so the snippet can be run as a script.  Each call returns a Response
# object; `r` is simply rebound to the latest one.
import requests

r = requests.get('https://api.github.com/events')
r = requests.post('http://httpbin.org/post', data={'key': 'value'})
r = requests.put('http://httpbin.org/put', data={'key': 'value'})
r = requests.delete('http://httpbin.org/delete')
r = requests.head('http://httpbin.org/get')
r = requests.options('http://httpbin.org/get')
基本請求
import requests

# Simplest possible usage: issue one GET and print the decoded body.
page_url = 'http://dig.chouti.com/'
response = requests.get(page_url)
print(response.text)
帶參數的GET請求->params
# Building the GET query string by hand.
from urllib.parse import urlencode

import requests

# Present ourselves as a browser through the User-Agent request header;
# per the original note, Baidu does not return the normal page otherwise.
browser_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
}

# Plain-ASCII keyword: the query string can be typed directly into the URL.
response = requests.get(
    'https://www.baidu.com/s?wd=python&pn=1',
    headers=browser_headers,
)
print(response.text)

# A keyword containing Chinese or other special characters has to be
# URL-encoded first.  urlencode() yields "k=<encoded value>"; only the part
# after the "=" is needed.
wd = 'egon老師'
encode_res = urlencode({'k': wd}, encoding='utf-8')
keyword = encode_res.partition('=')[2]
print(keyword)

# Splice the encoded keyword into the URL and repeat the request.
url = 'https://www.baidu.com/s?wd=%s&pn=1' % keyword
response = requests.get(url, headers=browser_headers)
res1 = response.text
# Using the params= argument.
# The manual encoding shown in the previous snippet can be replaced by the
# params= argument of requests; per the original note it boils down to the
# same urlencode call.  This snippet continues the previous one: `requests`
# and `res1` come from there.
from urllib.parse import urlencode

wd = 'egon老師'
pn = 1
query = {
    'wd': wd,
    'pn': pn,
}
browser_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
}
response = requests.get(
    'https://www.baidu.com/s',
    params=query,
    headers=browser_headers,
)
res2 = response.text

# Check the result: a.html (hand-built URL) and b.html (params=) should show
# the same page when opened.
for filename, html in (('a.html', res1), ('b.html', res2)):
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)
帶參數的GET請求->headers
# 通常我們在發送請求時都需要帶上請求頭,請求頭是將自身偽裝成瀏覽器的關鍵,常見的有用的請求頭如下:
#   Host
#   Referer     # 大型網站通常都會根據該參數判斷請求的來源
#   User-Agent  # 客戶端
#   Cookie      # Cookie 信息雖然包含在請求頭裏,但 requests 模塊有單獨的參數(cookies=)來處理它,headers={} 內就不要放它了
# Sending custom request headers.  Sites inspect the request headers and may
# refuse a request that lacks browser-like ones, for example
# https://www.zhihu.com/explore.
import requests

# Without any custom headers: the original note records status 500 here.
# The value is printed explicitly because a bare expression statement shows
# nothing when the snippet runs as a script.
response = requests.get('https://www.zhihu.com/explore')
print(response.status_code)  # 500

# With our own headers: a browser User-Agent is enough for this page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36',
}
response = requests.get('https://www.zhihu.com/explore', headers=headers)
print(response.status_code)  # 200
帶參數的GET請求->cookies
# Log in to GitHub in a browser, copy the cookies from it, and reuse them:
# with the cookie attached, no user name / password has to be sent again.
# (Account used in the original article: user "egonlin".)
# NOTE(review): the account password that used to sit in this comment has
# been left out on purpose; real passwords and live session cookies should
# not be kept in source or documentation.
import requests

Cookies = {
    'user_session': 'wGMHFJKgDcmRIVvcA14_Wrt_3xaUyJNsBnPbYzEL6L0bHcfc',
}
settings_url = 'https://github.com/settings/emails'

# Per the original note, GitHub places no restriction on the request headers,
# so no custom User-Agent is needed here; other sites may still require one.
response = requests.get(settings_url, cookies=Cookies)

# The settings page contains the account's e-mail only when we are logged in.
logged_in = '378533872@qq.com' in response.text
print(logged_in)  # True
介紹
# GET請求
#   HTTP默認的請求方法就是GET
#     * 沒有請求體
#     * 數據必須在1K以內!(實際上限取決於瀏覽器和服務器對URL長度的限制)
#     * GET請求數據會暴露在瀏覽器的地址欄中
#   GET請求常用的操作:
#     1. 在瀏覽器的地址欄中直接給出URL,那麼就一定是GET請求
#     2. 點擊頁面上的超鏈接也一定是GET請求
#     3. 提交表單時,表單默認使用GET請求,但可以設置爲POST
#
# POST請求
#     (1). 數據不會出現在地址欄中
#     (2). 數據的大小沒有上限
#     (3). 有請求體
#     (4). 請求體中如果存在中文,會使用URL編碼!
#
# !!!requests.post()用法與requests.get()完全一致,特殊的是requests.post()有一個data參數,用來存放請求體數據
發送post請求,模擬瀏覽器的登陸行爲
# 對於登錄來說,應該先故意輸錯用戶名或密碼,然後再分析抓包流程;想一想就明白:如果輸對了,瀏覽器會立刻跳轉,這時再怎麼分析也很難找到登錄請求的包
# Log in to GitHub automatically, handling the cookies by hand.
#
# 1. Analysing the target site
#    Open https://github.com/login in a browser, submit a WRONG account or
#    password and capture the traffic.  The login turns out to be a POST to
#    https://github.com/session; the request carries a cookie, and its body
#    contains:
#        commit: Sign in
#        utf8: ✓
#        authenticity_token: lbI8IJCwGslZS8qJPnof5e7ZkCoSoMn6jmDTsL1r/m06NLyIbw7vCrpwrFAPzHMep3Tmf/TSJVoXWrvDZaVwxQ==
#        login: egonlin
#        password: 123
#
# 2. Flow
#    - GET  https://github.com/login to obtain the initial cookie and the
#      authenticity_token.
#    - POST https://github.com/session with the initial cookie and the
#      request body (authenticity_token, user name, password, ...).
#    - The response to that POST yields the logged-in cookie.
#
# PS: when a site submits the password in encrypted form, enter a wrong
# account with the right password and copy the encrypted password from the
# browser.  GitHub submits the password in clear text.
#
# NOTE(review): the account and password below are hard-coded exactly as in
# the original article; do not keep real credentials in source.
import re

import requests

# Request 1: load the login page to get the initial (not yet authorised)
# cookie and the CSRF token embedded in the page.
r1 = requests.get('https://github.com/login')
r1_cookie = r1.cookies.get_dict()
token_matches = re.findall(r'name="authenticity_token".*?value="(.*?)"', r1.text)
authenticity_token = token_matches[0]

# Request 2: POST the credentials together with the initial cookie and the
# token to the session endpoint.
data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': authenticity_token,
    'login': '317828332@qq.com',
    'password': 'alex3714',
}
r2 = requests.post(
    'https://github.com/session',
    data=data,
    cookies=r1_cookie,
)
login_cookie = r2.cookies.get_dict()

# Request 3: from here on login_cookie is all that is needed, for example to
# open a personal settings page.
r3 = requests.get('https://github.com/settings/emails', cookies=login_cookie)
print('317828332@qq.com' in r3.text)  # True
# A requests session object stores the cookies for us, so they no longer
# have to be passed from one call to the next by hand.
#
# NOTE(review): the account and password below are hard-coded exactly as in
# the original article; do not keep real credentials in source.
import re

import requests

session = requests.session()

# Request 1: load the login page and pull the CSRF token out of the HTML.
r1 = session.get('https://github.com/login')
token_matches = re.findall(r'name="authenticity_token".*?value="(.*?)"', r1.text)
authenticity_token = token_matches[0]

# Request 2: submit the login form through the same session.
data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': authenticity_token,
    'login': '767124330@qq.com',
    'password': 'jiumo123',
}
r2 = session.post('https://github.com/session', data=data)

# Request 3: open a page that requires being logged in.
r3 = session.get('https://github.com/settings/emails')
# NOTE(review): the address checked here differs from the 'login' value
# posted above; possibly a leftover from the manual-cookie example.  Confirm
# which address the settings page is expected to contain.
print('317828332@qq.com' in r3.text)  # True
補充
# data= with no explicit Content-Type: the body is form-encoded and the
# default header is application/x-www-form-urlencoded.
form_fields = {'xxx': 'yyy'}
requests.post(url='xxxxxxxx', data=form_fields)

# Declaring application/json ourselves while still passing the values through
# data=: per the original note the server side then cannot read the values.
json_headers = {
    'content-type': 'application/json',
}
requests.post(url='', data={'': 1}, headers=json_headers)

# json=: the default Content-Type header becomes application/json.
requests.post(url='', json={'': 1})
response屬性
from contextlib import closing

import requests

respone = requests.get('http://www.jianshu.com')

# Attributes of the response object.
# Body: decoded text, then raw bytes.
print(respone.text)
print(respone.content)

# Status line and headers.
print(respone.status_code)
print(respone.headers)

# Cookies: the jar itself, as a plain dict, and as (name, value) pairs.
print(respone.cookies)
print(respone.cookies.get_dict())
print(respone.cookies.items())

# URL, redirect history and the encoding used for .text.
print(respone.url)
print(respone.history)
print(respone.encoding)

# Closing a response: call response.close(), or wrap the request in
# contextlib.closing so it is closed when the block exits.
with closing(requests.get('xxx', stream=True)) as response:
    for chunk in response.iter_content():
        pass
編碼問題
# Encoding issues.
import requests

response = requests.get('http://www.autohome.com/news')

# Per the original note this site serves gb2312-encoded pages, while requests
# falls back to ISO-8859-1 for them, so the Chinese text comes out garbled
# unless the encoding is overridden before .text is read.  gbk is a superset
# of gb2312.  (This override was commented out, which left the printed text
# garbled.)
response.encoding = 'gbk'
print(response.text)
獲取二進制數據
import requests

# Binary payloads (images, archives, ...) are read from response.content,
# which holds the raw bytes, and written to a file opened in binary mode.
image_url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1509868306530&di=712e4ef3ab258b36e9f4b48e85a81c9d&imgtype=0&src=http%3A%2F%2Fc.hiphotos.baidu.com%2Fimage%2Fpic%2Fitem%2F11385343fbf2b211e1fb58a1c08065380dd78e0c.jpg'
response = requests.get(image_url)

image_bytes = response.content
with open('a.jpg', 'wb') as f:
    f.write(image_bytes)
# The stream argument: fetch the body piece by piece.  For a large download
# (say a 100 GB video), reading response.content and writing it out in one go
# would pull the whole file into memory first.
import requests

video_url = 'https://gss3.baidu.com/6LZ0ej3k1Qd3ote6lo7D0j9wehsv/tieba-smallvideo-transcode/1767502_56ec685f9c7ec542eeaf6eac93a65dc7_6fe25cd1347c_3.mp4'

# Using the response as a context manager makes sure the streamed connection
# is closed even if writing fails part-way through.
with requests.get(video_url, stream=True) as response:
    with open('b.mp4', 'wb') as f:
        # iter_content() defaults to chunk_size=1, i.e. one write per byte;
        # ask for 64 KiB chunks instead.
        for chunk in response.iter_content(chunk_size=64 * 1024):
            f.write(chunk)
解析json
# Parsing a JSON response.
import json

import requests

response = requests.get('http://httpbin.org/get')

# The long way round: decode the body text with the json module ...
body_text = response.text
res1 = json.loads(body_text)

# ... versus letting the response object parse it directly.
res2 = response.json()

same_result = res1 == res2
print(same_result)  # True
import requests
import os

# Search keyword, entered by the user.
word = input('enter a word you want to search:')

# Custom request headers (browser User-Agent).
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}

# Target URL.
url = 'https://www.sogou.com/web'

# Query-string parameters for the GET request.
params = {
    'query': word,
    'ie': 'utf-8',
}

# Send the request.  The parameter dict used to be defined as `prams` but
# passed as `param`, which raised NameError, and the headers built above were
# never sent; both are passed here.
response = requests.get(url=url, params=params, headers=headers)

# Save the response body to disk.
page_text = response.text
with open('./sougou.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
import requests
import os

url = 'https://accounts.douban.com/login'

# Form fields posted to the login endpoint.
# NOTE(review): the account and password are hard-coded exactly as in the
# original article; do not keep real credentials in source.
# NOTE(review): the "login" value is reproduced verbatim from the source;
# confirm it matches the value the site's login form actually submits.
data = {
    "source": "movie",
    "redir": "https://movie.douban.com/",
    "form_email": "15027900535",
    "form_password": "bobo@15027900535",
    "login": "登陸",
}

# Custom request headers (browser User-Agent).
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}

# The headers used to be built but never passed, so the request went out with
# the default python-requests User-Agent; they are sent now.
response = requests.post(url=url, data=data, headers=headers)

# Save the returned page to disk.
page_text = response.text
with open('./douban111.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
# -*- coding:utf-8 -*-
import requests
import urllib.request

if __name__ == "__main__":
    # URL of the AJAX GET request (found by capturing the browser traffic).
    url = 'https://movie.douban.com/j/chart/top_list?'

    # Request headers have to be supplied as a dict.  Only User-Agent is
    # customised here; other header fields could be set the same way.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    }

    # Query parameters carried by the GET request (copied from the capture
    # tool).
    param = dict(
        type='5',
        interval_id='100:90',
        action='',
        start='0',
        limit='20',
    )

    # Send the GET request; the response body is a JSON string.
    response = requests.get(url=url, headers=headers, params=param)
    body = response.text
    print(body)
# -*- coding:utf-8 -*-
import requests
import urllib.request

if __name__ == "__main__":
    # URL of the AJAX POST request (found by capturing the browser traffic).
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'

    # Request headers have to be supplied as a dict.  Only User-Agent is
    # customised here; other header fields could be set the same way.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    }

    # Form fields carried by the POST request (copied from the capture tool).
    data = {
        'cname': '',
        'pid': '',
        'keyword': '北京',
        'pageIndex': '1',
        'pageSize': '10',
    }

    # Send the request as a POST.  This is the POST example and the fields
    # above are a form body, but the call used to be requests.get(...).
    response = requests.post(url=url, headers=headers, data=data)

    # The response body is a JSON string.
    print(response.text)
import requests
from fake_useragent import UserAgent

# Pick one random User-Agent string and reuse it for every request below.
ua = UserAgent(use_cache_server=False, verify_ssl=False).random
headers = {
    'User-Agent': ua,
}

# First-level (listing) endpoint: its JSON response has a 'list' of records,
# each carrying an 'ID'.
url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
# Second-level (detail) endpoint, an AJAX POST that takes a single record id.
post_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'

pageNum = 3  # first listing page to fetch
# NOTE(review): indentation was lost in the source; the detail requests are
# assumed to run once per listing page - confirm against the original.
for page in range(pageNum, 5):
    data = {
        'on': 'true',
        'page': str(page),
        'pageSize': '15',
        'productName': '',
        'conditionType': '1',
        'applyname': '',
        'applysn': '',
    }
    listing = requests.post(url=url, data=data, headers=headers).json()

    # Only the ID is needed here; it drives the detail requests.  Per the
    # original note the remaining fields (EPS_NAME, PRODUCT_SN,
    # QF_MANAGER_NAME, XC_DATE, XK_DATE) can be read from the detail page.
    all_id_list = [record['ID'] for record in listing['list']]

    for record_id in all_id_list:
        post_data = {
            'id': record_id,
        }
        response = requests.post(url=post_url, data=post_data, headers=headers)

        # Per the original note the endpoint answers either with text or with
        # JSON, so the Content-Type decides whether to parse.  .get() avoids
        # a KeyError when the header is missing, and the prefix test still
        # accepts 'application/json;charset=UTF-8'.
        content_type = response.headers.get('Content-Type', '')
        if content_type.startswith('application/json'):
            detail = response.json()
            print(detail['businessPerson'])