1. Basic GET request
import requests

response = requests.get('http://httpbin.org/get')
print(response.text)            # print the page source as text
# print(response.status_code)   # print the status code
# print(response.url)           # print the request URL
# print(response.headers)       # print the response headers
# print(response.cookies)       # print the cookie info
# print(response.content)       # print the page source as bytes
Result:
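A typical httpbin.org/get response is JSON of roughly this shape (the exact header and origin values vary per client; trimmed here):

{
  "args": {},
  "headers": {
    "Host": "httpbin.org",
    "User-Agent": "python-requests/2.x"
  },
  "origin": "x.x.x.x",
  "url": "http://httpbin.org/get"
}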
2. GET request with parameters (placed directly in the URL):
import requests

response = requests.get('http://httpbin.org/get?name=gemey&age=22')
print(response.text)
Result:
3. GET request with parameters (put the parameters in a dict first, then pass it as the params argument):
import requests

data = {
    'name': 'tom',
    'age': 20
}
response = requests.get('http://httpbin.org/get', params=data)
print(response.text)
4. Handling the result as JSON
import requests

response = requests.get('http://httpbin.org/get')
print(response.text)
print(response.json())        # response.json() is equivalent to json.loads(response.text)
print(type(response.json()))
Result:
5. Saving the result as binary:
import requests

response = requests.get('http://img.ivsky.com/img/tupian/pre/201708/30/kekeersitao-002.jpg')
b = response.content
with open('F://fengjing.jpg', 'wb') as f:
    f.write(b)
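For large files it may be better not to hold the whole body in memory. A minimal sketch of a streamed download of the same image (the chunk size is an arbitrary choice):

import requests

url = 'http://img.ivsky.com/img/tupian/pre/201708/30/kekeersitao-002.jpg'
response = requests.get(url, stream=True)   # stream=True avoids loading the whole body at once
with open('F://fengjing.jpg', 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):   # iterate over the body in chunks
        f.write(chunk)
response.close()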
6. Adding headers to a request:
import requests

heads = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) '
                  'AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
}
response = requests.get('http://www.baidu.com', headers=heads)
7. Using a proxy
The proxies argument must also be a dict. The example below uses requests to scrape the IP, port, and type from a free IP-proxy site. Since the proxies are free, the addresses expire quickly.
import requests
import re

def get_html(url):
    proxy = {
        'http': '120.25.253.234:812',
        'https': '163.125.222.244:8123'
    }
    heads = {}
    heads['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'
    req = requests.get(url, headers=heads, proxies=proxy)
    html = req.text
    return html

def get_ipport(html):
    iplist = re.findall(r'<td data-title="IP">(.+)</td>', html)
    portlist = re.findall(r'<td data-title="PORT">(.+)</td>', html)
    # "類型" is the type column header in the site's HTML, so the regex must keep it
    typelist = re.findall(r'<td data-title="類型">(.+)</td>', html)
    sumray = []
    # pair each IP with its own port and type row by row
    for i, p, t in zip(iplist, portlist, typelist):
        sumray.append(t + ',' + i + ':' + p)
    print('High-anonymity proxies')
    print(sumray)

if __name__ == '__main__':
    url = 'http://www.kuaidaili.com/free/'
    get_ipport(get_html(url))
Result:
8. Getting cookies
# get cookies
import requests

response = requests.get('http://www.baidu.com')
print(response.cookies)
for k, v in response.cookies.items():
    print(k + ':' + v)
Result:
9. Certificate verification settings
import requests
from requests.packages import urllib3

urllib3.disable_warnings()   # suppress the InsecureRequestWarning from urllib3
response = requests.get('https://www.12306.cn', verify=False)   # disable certificate verification
print(response.status_code)
Output: 200
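Setting verify=False only silences the check. requests can also verify against a custom CA bundle, or present a client-side certificate; a sketch with placeholder paths:

import requests

# verify may be a path to a CA bundle file instead of True/False (placeholder path)
response = requests.get('https://www.12306.cn', verify='/path/to/ca_bundle.pem')
print(response.status_code)

# cert supplies a client certificate and key (placeholder paths)
response = requests.get('https://example.com', cert=('/path/client.crt', '/path/client.key'))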
10. Sending cookies to the server
To send your cookies to the server, use the cookies argument:
import requests

url = 'http://httpbin.org/cookies'
# Note: spaces, square brackets, parentheses, equals signs, commas, double quotes,
# slashes, question marks, @, colons, semicolons and other special characters
# cannot appear in cookie content.
cookies = {'testCookies_1': 'Hello_Python3', 'testCookies_2': 'Hello_Requests'}
r = requests.get(url, cookies=cookies)
print(r.json())
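For finer control over which cookies go to which domain and path, requests also accepts a RequestsCookieJar; a small sketch:

import requests

jar = requests.cookies.RequestsCookieJar()
jar.set('tasty_cookie', 'yum', domain='httpbin.org', path='/cookies')
jar.set('gross_cookie', 'blech', domain='httpbin.org', path='/elsewhere')
r = requests.get('http://httpbin.org/cookies', cookies=jar)
print(r.text)   # only tasty_cookie matches the /cookies path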
11. Catching timeout exceptions
import requests
from requests.exceptions import ReadTimeout

try:
    res = requests.get('http://httpbin.org', timeout=0.1)
    print(res.status_code)
except ReadTimeout:
    print('timeout')
12. Exception handling
When you are not sure what error might occur, use try...except to catch the exception:
import requests
from requests.exceptions import ReadTimeout, HTTPError, RequestException

try:
    response = requests.get('http://www.baidu.com', timeout=0.5)
    print(response.status_code)
except ReadTimeout:
    print('timeout')
except HTTPError:
    print('httperror')
except RequestException:
    print('reqerror')
All requests exceptions:

exception requests.RequestException(*args, **kwargs)
    There was an ambiguous exception that occurred while handling your request.
exception requests.ConnectionError(*args, **kwargs)
    A connection error occurred.
exception requests.HTTPError(*args, **kwargs)
    An HTTP error occurred.
exception requests.URLRequired(*args, **kwargs)
    A valid URL is required to make a request.
exception requests.TooManyRedirects(*args, **kwargs)
    Too many redirects.
exception requests.ConnectTimeout(*args, **kwargs)
    The request timed out while trying to connect to the remote server. Requests that produced this error are safe to retry.
exception requests.ReadTimeout(*args, **kwargs)
    The server did not send any data in the allotted amount of time.
exception requests.Timeout(*args, **kwargs)
    The request timed out. Catching this error will catch both ConnectTimeout and ReadTimeout errors.
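As the last entry notes, catching requests.Timeout covers both the connect and the read timeout. A small sketch using httpbin's delay endpoint and the (connect, read) tuple form of timeout:

import requests
from requests.exceptions import Timeout

try:
    # timeout=(connect, read): fail if connecting takes over 3.05s or the read over 0.5s
    r = requests.get('http://httpbin.org/delay/3', timeout=(3.05, 0.5))
    print(r.status_code)
except Timeout as e:   # catches ConnectTimeout and ReadTimeout alike
    print('timed out:', e)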
3. Usage of the requests library: POST
1. Basic POST example
import requests

payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.post("http://httpbin.org/post", data=payload)
print(ret.text)
2. Sending headers and data
import requests
import json

url = 'https://api.github.com/some/endpoint'
payload = {'some': 'data'}
headers = {'content-type': 'application/json'}
ret = requests.post(url, data=json.dumps(payload), headers=headers)
print(ret.text)
print(ret.cookies)
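Newer versions of requests can do this encoding for you: pass the dict as the json parameter and the Content-Type header is set to application/json automatically, so the example above reduces to:

import requests

payload = {'some': 'data'}
ret = requests.post('https://api.github.com/some/endpoint', json=payload)   # serializes payload and sets the JSON Content-Type
print(ret.text)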
三、json格式請求:
import requests
import json

class url_request():
    def __init__(self):
        """ init """

if __name__ == '__main__':
    headers = {'Content-Type': 'application/json'}
    payload = {'CountryName': '中國',
               'ProvinceName': '四川省',
               'L1CityName': 'chengdu',
               'L2CityName': 'yibing',
               'TownName': '',
               'Longitude': '107.33393',
               'Latitude': '33.157131',
               'Language': 'CN'}
    r = requests.post("http://www.xxxxxx.com/CityLocation/json/LBSLocateCity",
                      headers=headers, data=json.dumps(payload))
    data_r = r.json()
    if r.status_code != 200:
        print('LBSLocateCity API Error ' + str(r.status_code))
    print(data_r['CityEntities'][0]['CityID'])   # print the value of one key in the returned JSON
    print(data_r['ResponseStatus']['Ack'])
    # pretty-print the JSON tree; ensure_ascii must be False or Chinese shows as unicode escapes
    print(json.dumps(data_r, indent=4, sort_keys=True, ensure_ascii=False))
4. XML request:
import requests

class url_request():
    def __init__(self):
        """init"""

if __name__ == '__main__':
    headers = {'Content-type': 'text/xml'}
    XML = '<?xml version="1.0" encoding="utf-8"?><soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/"><soap:Body><Request xmlns="http://tempuri.org/"><jme><JobClassFullName>WeChatJSTicket.JobWS.Job.JobRefreshTicket,WeChatJSTicket.JobWS</JobClassFullName><Action>RUN</Action><Param>1</Param><HostIP>127.0.0.1</HostIP><JobInfo>1</JobInfo><NeedParallel>false</NeedParallel></jme></Request></soap:Body></soap:Envelope>'
    url = 'http://jobws.push.mobile.xxxxxxxx.com/RefreshWeiXInTokenJob/RefreshService.asmx'
    r = requests.post(url=url, headers=headers, data=XML)
    data_r = r.text
    print(data_r)
5. Uploading files
The requests module can also upload files; the file's content type is handled automatically:
import requests

url = 'http://127.0.0.1:8080/upload'
files = {'file': open('/home/rxf/test.jpg', 'rb')}
# files = {'file': ('report.jpg', open('/home/lyb/sjzl.mpg', 'rb'))}  # set the file name explicitly
r = requests.post(url, files=files)
print(r.text)
Even more conveniently, requests can upload a string as if it were a file:
import requests

url = 'http://127.0.0.1:8080/upload'
files = {'file': ('test.txt', b'Hello Requests.')}   # the file name must be set explicitly
r = requests.post(url, files=files)
print(r.text)
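The tuple can also carry an explicit content type and extra headers for the uploaded file (the values here are illustrative):

import requests

url = 'http://127.0.0.1:8080/upload'
# (filename, file object or bytes, content type, extra headers)
files = {'file': ('report.xls', open('report.xls', 'rb'), 'application/vnd.ms-excel', {'Expires': '0'})}
r = requests.post(url, files=files)
print(r.text)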
6. Custom headers and cookie information
header = {'user-agent': 'my-app/0.0.1'}
cookie = {'key': 'value'}
r = requests.post('your url', headers=header, cookies=cookie)
import requests
import json

data = {'some': 'data'}
headers = {'content-type': 'application/json',
           'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'}
r = requests.post('https://api.github.com/some/endpoint', data=json.dumps(data), headers=headers)
print(r.text)
4. Putting it all together
1. Method summary
# HTTP request types
# GET
r = requests.get('https://github.com/timeline.json')
# POST
r = requests.post("http://m.ctrip.com/post")
# PUT
r = requests.put("http://m.ctrip.com/put")
# DELETE
r = requests.delete("http://m.ctrip.com/delete")
# HEAD
r = requests.head("http://m.ctrip.com/head")
# OPTIONS
r = requests.options("http://m.ctrip.com/get")

# Get the response content
print(r.content)   # displayed as bytes; Chinese appears as encoded characters
print(r.text)      # displayed as text

# Passing URL parameters
payload = {'keyword': '香港', 'salecityid': '2'}
r = requests.get("http://m.ctrip.com/webapp/tourvisa/visa_list", params=payload)
print(r.url)   # e.g. http://m.ctrip.com/webapp/tourvisa/visa_list?salecityid=2&keyword=香港

# Get/modify the page encoding
r = requests.get('https://github.com/timeline.json')
print(r.encoding)

# JSON handling
r = requests.get('https://github.com/timeline.json')
print(r.json())

# Custom request headers
url = 'http://m.ctrip.com'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 4 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19'}
r = requests.post(url, headers=headers)
print(r.request.headers)

# Complex POST request
url = 'http://m.ctrip.com'
payload = {'some': 'data'}
r = requests.post(url, data=json.dumps(payload))   # to send JSON rather than form data, serialize the dict with json.dumps first (requires import json)

# POST a multipart-encoded file
url = 'http://m.ctrip.com'
files = {'file': open('report.xls', 'rb')}
r = requests.post(url, files=files)

# Response status code
r = requests.get('http://m.ctrip.com')
print(r.status_code)

# Response headers
r = requests.get('http://m.ctrip.com')
print(r.headers)
print(r.headers['Content-Type'])
print(r.headers.get('content-type'))   # two ways to access part of the response headers

# Cookies
url = 'http://example.com/some/cookie/setting/url'
r = requests.get(url)
r.cookies['example_cookie_name']       # read cookies

url = 'http://m.ctrip.com/cookies'
cookies = dict(cookies_are='working')
r = requests.get(url, cookies=cookies) # send cookies

# Set a timeout
r = requests.get('http://m.ctrip.com', timeout=0.001)

# Set a proxy
proxies = {
    "http": "http://10.10.1.10:3128",
    "https": "http://10.10.1.100:4444",
}
r = requests.get('http://m.ctrip.com', proxies=proxies)
# If the proxy needs a username and password:
proxies = {
    "http": "http://user:pass@10.10.1.10:3128/",
}
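The login examples below carry cookies from one request to the next by hand. A requests.Session does that bookkeeping automatically, persisting cookies across requests; a minimal sketch against httpbin:

import requests

s = requests.Session()
s.get('http://httpbin.org/cookies/set/sessioncookie/123456789')   # the server sets a cookie
r = s.get('http://httpbin.org/cookies')                           # the session sends it back automatically
print(r.text)   # {"cookies": {"sessioncookie": "123456789"}}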
2. Automatic login examples:
Simulating a GitHub login with requests
import requests
from bs4 import BeautifulSoup

def login_github():
    """
    Simulate a browser login to GitHub with the requests module
    :return:
    """
    # Get the csrf_token
    r1 = requests.get('https://github.com/login')   # response object of the GET request
    s1 = BeautifulSoup(r1.text, 'html.parser')      # parse the HTML with bs4
    token = s1.find('input', attrs={'name': 'authenticity_token'}).get('value')   # the login authorization code, i.e. the csrf_token
    get_cookies = r1.cookies.get_dict()             # cookies from the GET request; the POST must carry them

    # Send the POST login request
    '''
    POST login parameters:
    commit              Sign+in
    utf8                ✓
    authenticity_token  E961jQMIyC9NPwL54YPj70gv2hbXWJ…fTUd+e4lT5RAizKbfzQo4eRHsfg==
    login               JackUpDown (username)
    password            ********** (password)
    '''
    r2 = requests.post(
        'https://github.com/session',
        data={
            'commit': 'Sign+in',
            'utf8': '✓',
            'authenticity_token': token,
            'login': 'JackUpDown',
            'password': '**********'
        },
        cookies=get_cookies   # carry the cookies from the GET request
    )
    login_cookies = r2.cookies.get_dict()   # cookies of the successful login; with them you can visit any GitHub page

    # Carry the post-login cookies to any page
    r3 = requests.get('https://github.com/settings/emails', cookies=login_cookies)
    print(r3.text)
Logging in to Zhihu
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import time
import requests
from bs4 import BeautifulSoup

session = requests.Session()

i1 = session.get(
    url='https://www.zhihu.com/#signin',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)
soup1 = BeautifulSoup(i1.text, 'lxml')
xsrf_tag = soup1.find(name='input', attrs={'name': '_xsrf'})
xsrf = xsrf_tag.get('value')

current_time = time.time()
i2 = session.get(
    url='https://www.zhihu.com/captcha.gif',
    params={'r': current_time, 'type': 'login'},
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    })
with open('zhihu.gif', 'wb') as f:
    f.write(i2.content)

captcha = input('Open zhihu.gif, then enter the captcha shown: ')
form_data = {
    '_xsrf': xsrf,
    'password': 'xxooxxoo',
    'captcha': captcha,   # use the captcha the user typed in
    'email': '424662508@qq.com'
}
i3 = session.post(
    url='https://www.zhihu.com/login/email',
    data=form_data,
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)

i4 = session.get(
    url='https://www.zhihu.com/settings/profile',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)
soup4 = BeautifulSoup(i4.text, 'lxml')
tag = soup4.find(id='rename-section')
nick_name = tag.find('span', class_='name').string
print(nick_name)