python之requests模塊快速上手

時間 2019-11-07

原文鏈接

安裝

pip3 install requests

使用

發送請求

import requests

# Send a plain GET request; the returned Response object is bound to `r`.
r = requests.get('http://www.baidu.com')

還可以用以下方式發送不同類型的請求：

# One helper per HTTP verb: POST, PUT, DELETE, HEAD and OPTIONS.
# `data` passes the dict as the request body for POST and PUT.
r = requests.post('http://httpbin.org/post', data = {'key':'value'}) 
r = requests.put('http://httpbin.org/put', data = {'key':'value'})
r = requests.delete('http://httpbin.org/delete')
r = requests.head('http://httpbin.org/get')
r = requests.options('http://httpbin.org/get')

傳遞URL參數

import requests

# Pass the query-string parameters as a dict via `params`
payload = {'key1': 'value1', 'key2': 'value2'}
r = requests.get("http://httpbin.org/get", params=payload)
print(r.url)  # http://httpbin.org/get?key1=value1&key2=value2

# A list value inside the dict is expanded into one repeated key per element
payload = {'key1': 'value1', 'key2': ['value2', 'value3']}
r = requests.get('http://httpbin.org/get', params=payload)
print(r.url)  # http://httpbin.org/get?key1=value1&key2=value2&key2=value3

響應文本內容

import requests

r = requests.get('https://www.baidu.com')
print(r.encoding)  # ISO-8859-1 (inspect the encoding currently used for r.text)
r.encoding = 'utf8'  # override the encoding before reading r.text
print(r.text)

二進制響應內容

import requests

r = requests.get('https://assets.readthedocs.org/sustainability/jetbrains/pycharm3-fs8.png')
# Use the last path segment of the response URL as the local file name.
file_name = r.url.rsplit('/', maxsplit=1)[1]
# r.content is written to a file opened in binary mode ('wb').
with open(file_name, 'wb') as img_file:
    img_file.write(r.content)

JSON響應內容

import requests

# When the response body is JSON, r.json() decodes it straight into Python data
# (a dict or a list, depending on the payload)
r = requests.get('https://api.github.com/events')
print(r.json())

定製請求頭

import requests

url = 'https://api.github.com/some/endpoint'
# Extra HTTP request headers are supplied as a dict through the `headers` argument.
headers = {'user-agent': 'my-app/0.0.1'}

r = requests.get(url, headers=headers)

複雜POST請求

import requests

# Pass a dict as the request body
payload = {'key1': 'value1', 'key2': 'value2'}
r = requests.post("http://httpbin.org/post", data=payload)

# The bare string literals below are illustrative only; they have no runtime effect.
'''
  {
    "key2": "value2",
    "key1": "value1"
  }
'''
# Pass a tuple of pairs (here the same key appears twice with different values)
payload = (('key1', 'value1'), ('key1', 'value2'))
r = requests.post('http://httpbin.org/post', data=payload)
'''
  {
    "key1": [
      "value1", 
      "value2"
    ]
  }
'''

# Pass JSON
import json

url = 'http://127.0.0.1:5000/'
payload = {'some': 'data'}
r = requests.post(url, data=json.dumps(payload))
# Besides encoding the dict yourself, you can hand it to the `json` parameter and it will be encoded automatically
r = requests.post(url, json=payload)

POST一個多部分編碼(Multipart-Encoded)的文件

import requests

url = 'http://httpbin.org/post'

# Upload a file. The handle is opened in a `with` block so it is closed
# even when the request raises; the original left it open.
with open('report.xls', 'rb') as report:
    files = {'file': report}
    r = requests.post(url, files=files)

# Explicitly set the file name, content type and extra headers.
# Tuple layout: (filename, file object, content type, headers).
with open('report.xls', 'rb') as report:
    files = {'file': ('report.xls', report, 'application/vnd.ms-excel', {'Expires': '0'})}
    r = requests.post(url, files=files)

# Send a string that the server receives as a file (no file handle involved)
files = {'file': ('report.csv', 'some,data,to,send\nanother,row,to,send\n')}
r = requests.post(url, files=files)

響應狀態碼

import requests

r = requests.get('http://www.baidu.com')
# For convenience, Requests ships a built-in status-code lookup object:
print(r.status_code == requests.codes.ok)

# If the request failed (a 4XX client error or a 5XX server error response), Response.raise_for_status() raises an exception
bad_r = requests.get('http://httpbin.org/status/500')
print(bad_r.status_code)  # 500
bad_r.raise_for_status()  # requests.exceptions.HTTPError: 500 Server Error

響應頭

import requests

r = requests.get('http://www.baidu.com')
# Show the server's response headers, presented in Python-dict form:
print(r.headers)
# The bare string below is sample output only; it has no runtime effect.
'''
{
    'Server': 'bfe/1.0.8.18',
    'Date': 'Tue, 25 Dec 2018 06:41:43 GMT',
    'Content-Type': 'text/html',
    'Last-Modified': 'Mon, 23 Jan 2017 13:28:11 GMT',
    'Transfer-Encoding': 'chunked',
    'Connection': 'Keep-Alive',
    'Cache-Control': 'private, no-cache, no-store, proxy-revalidate, no-transform',
    'Pragma': 'no-cache',
    'Set-Cookie': 'BDORZ=27315; max-age=86400; domain=.baidu.com; path=/',
    'Content-Encoding': 'gzip'
}
'''
# HTTP header names are case-insensitive, so any capitalisation can be used to look up a field
print(r.headers['Content-Type'])  # text/html
print(r.headers.get('content-type'))  # text/html

Cookie

import requests

# If a response carries cookies, they can be read directly from r.cookies
url = 'http://example.com/some/cookie/setting/url'
r = requests.get(url)
print(r.cookies['example_cookie_name'])

# To send your own cookies to the server, use the `cookies` parameter
url = 'http://httpbin.org/cookies'
cookies = dict(cookies_are='working')
r = requests.get(url, cookies=cookies)

# Cookies come back in a RequestsCookieJar, which behaves like a dict but has a fuller
# interface suited to use across domains and paths. A cookie jar can also be passed in to a request.
jar = requests.cookies.RequestsCookieJar()
jar.set('tasty_cookie', 'yum', domain='httpbin.org', path='/cookies')
jar.set('gross_cookie', 'blech', domain='httpbin.org', path='/elsewhere')
url = 'http://httpbin.org/cookies'
r = requests.get(url, cookies=jar)

重定向與請求歷史

import requests

# By default Requests follows every redirect automatically, except for HEAD requests.
# The `history` attribute of the response object can be used to trace redirects.
# Response.history is a list of the Response objects that were created to complete the request,
# ordered from the oldest to the most recent.
# For example, GitHub redirects all HTTP requests to HTTPS:
r = requests.get('http://github.com')
print(r.url)  # https://github.com/
print(r.history[0].url)  # http://github.com/

# With GET, OPTIONS, POST, PUT, PATCH or DELETE, redirect handling can be disabled through allow_redirects:
r = requests.get('http://github.com', allow_redirects=False)
print(r.url)  # http://github.com/
print(r.history)  # []

# With HEAD, redirects can be switched on the same way:
r = requests.head('http://github.com', allow_redirects=True)

超時

import requests

# `timeout` tells requests to stop waiting for a response once the given number of seconds has passed:
requests.get('http://github.com', timeout=0.001)
# The bare string below is the sample traceback; it has no runtime effect.
'''
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
requests.exceptions.Timeout: HTTPConnectionPool(host='github.com', port=80): Request timed out. (timeout=0.001)
'''

錯誤與異常

遇到網絡問題（如：DNS 查詢失敗、拒絕連接等）時，Requests 會拋出一個 ConnectionError 異常。

如果 HTTP 請求返回了不成功的狀態碼，Response.raise_for_status() 會拋出一個 HTTPError 異常。

若請求超時，則拋出一個 Timeout 異常。

若請求超過了設定的最大重定向次數，則會拋出一個 TooManyRedirects 異常。

所有 Requests 顯式拋出的異常都繼承自 requests.exceptions.RequestException。

參考：http://docs.python-requests.org/zh_CN/latest/index.html

示例

requests和bs4爬取汽車之家文章和圖片

from gevent import monkey

# NOTE(review): patch_all() is deliberately called before `requests` is imported;
# presumably so the blocking standard-library calls it relies on become
# gevent-cooperative -- confirm against the gevent.monkey documentation.
monkey.patch_all()
import gevent
import requests
import bs4


def save_detail(title, url_txt):
    """Download one article page and store its images and body text on disk.

    Images found inside centre-aligned ``<p>`` elements of the
    ``div#articleContent`` container are written to ``images/``. The text of
    the container's direct ``<p>`` children that carry no ``align`` attribute
    is joined with CRLF and written to ``auto_home_articles/<title>.txt``.

    Args:
        title: Article title; used as the text file name, with '/' replaced
            by a space.
        url_txt: URL of the article detail page.

    Returns:
        None. A page without the ``articleContent`` container is skipped.
    """
    # Local imports keep this block self-contained.
    import os
    from urllib.parse import urljoin

    article_detail_text = requests.get(url_txt).text
    detail_soup = bs4.BeautifulSoup(article_detail_text, features="html.parser")
    div_content = detail_soup.find(name='div', id='articleContent')
    if div_content is None:
        # Check before the first dereference: the container used to be searched
        # for images before its None check, which raised AttributeError.
        return

    # open() does not create missing parent directories.
    os.makedirs('images', exist_ok=True)
    os.makedirs('auto_home_articles', exist_ok=True)

    for p_img in div_content.find_all(name='p', attrs={'align': 'center'}):
        img = p_img.find(name='img')  # looked up once instead of three times
        src = img.attrs.get('src') if img is not None else None
        if not src:
            continue  # no <img>, or an <img> without a usable src
        if src.startswith('//'):
            # Protocol-relative URL: force https, as before.
            src = 'https:' + src
        else:
            # Absolute URLs pass through unchanged; relative ones are
            # resolved against the article page.
            src = urljoin(url_txt, src)
        filename = src.rsplit('/', maxsplit=1)[1]
        if not filename:
            continue  # a URL ending in '/' gives no file name to write to
        img_resp = requests.get(src)
        with open('images/%s' % filename, 'wb') as f:
            f.write(img_resp.content)

    print(div_content.text)
    # Only direct <p> children without an `align` attribute count as body text.
    line_arr = [
        line.text
        for line in div_content.find_all(name='p', align=False, recursive=False)
    ]
    content = '\r\n'.join(line_arr)
    # The context manager closes the file even if the write fails.
    with open('auto_home_articles/%s.txt' % title.replace('/', ' '), 'w', encoding="utf-8") as f:
        f.write(content)


def get_list_url():
    """Return the URLs of all news list pages, from page 1 to the last page.

    The first list page is fetched and decoded as GBK; the highest page
    number is read from its pager, and one URL per page number is built.
    """
    first_page = requests.get('https://www.autohome.com.cn/news/')
    first_page.encoding = 'gbk'
    soup = bs4.BeautifulSoup(first_page.text, features="html.parser")

    # The highest page number is the text of the <a> that precedes the
    # element with class "page-item-next" inside the pager.
    pager = soup.find(name='div', id='channelPage', class_='page')
    next_control = pager.find(class_='page-item-next')
    last_page_link = next_control.find_previous(name='a')
    max_page_num = int(last_page_link.text)

    page_urls = []
    for page_no in range(1, max_page_num + 1):
        page_urls.append('https://www.autohome.com.cn/news/%s/#liststart' % page_no)
    return page_urls


def start_save(list_url):
    """Fetch one news list page and save every article it links to.

    The page is decoded as GBK. Every ``<a>`` inside a ``<ul class="article">``
    of the ``div#auto-channel-lazyload-article`` container that has both an
    ``<h3>`` title and an ``href`` is collected. One gevent greenlet per
    article then runs ``save_detail``, and the call blocks until all finish.

    Args:
        list_url: URL of a news list page.

    Returns:
        None. A page without the article container is skipped.
    """
    list_page_resp = requests.get(list_url)
    list_page_resp.encoding = 'gbk'
    list_page_soup = bs4.BeautifulSoup(list_page_resp.text, features="html.parser")
    div = list_page_soup.find(name='div', id='auto-channel-lazyload-article')
    if div is None:
        # Nothing to crawl on this page; avoids AttributeError on None.
        return

    detail_url_list = []
    for article_list in div.find_all(name='ul', attrs={'class': 'article'}):
        for article_item in article_list.find_all(name='a'):
            heading = article_item.find(name='h3')
            href = article_item.attrs.get('href')
            if heading is None or not href:
                continue  # not an article link (no title or no target)
            if href.startswith('//'):
                # Only a leading protocol-relative '//' gets a scheme.
                # str.replace('//', 'http://') rewrote every '//' and so
                # corrupted absolute URLs and any '//' inside a path.
                href = 'http:' + href
            detail_url_list.append((heading.text, href))

    gevent.joinall(
        [gevent.spawn(save_detail, title, url) for title, url in detail_url_list]
    )


# Crawl the list pages one after another. start_save is called only for its
# side effects, so a plain loop is used rather than building a throwaway list;
# get_list_url() already returns an iterable, so no iter() wrapper is needed.
for list_url in get_list_url():
    start_save(list_url)