（第一篇）爬蟲技術專欄之requests模塊與BeautifulSoup模塊

時間 2019-11-07

標籤一篇爬蟲技術專欄 requests 模塊 beautifulsoup 欄目網絡爬蟲简体版

原文原文鏈接

自從大一開始就想在博客專門創建一個分類，深度總結一下爬蟲技術相關的東西，可是一直拖到現在。大二馬上要結束了，準備從暑假開始認真總結，每一篇文章都爭取帶一個小案例，給學弟們做參考用~

0x01 requests模塊

requests是一個爲Python構建的優雅而簡單的HTTP庫。目前它使用Apache2 Licensed許可證，並在Python的一些基本庫之上進行了高度封裝。中文文檔：http://docs.python-requests.org/zh_CN/latest/

0x001 安裝

pip install requests

0x002 常用方法

requests.get(url, params=None, **kwargs)

# 發送一個get請求到服務器端
# url接收一個URL地址
# params接收一個字典對象，做爲URL的查詢字符串
# 返回一個響應（Response）對象

requests.options(url, **kwargs)

# 發送一個options請求到服務器端
# url接收一個URL地址

requests.head(url, **kwargs)

# 發送一個head請求到服務器端
# url接收一個URL地址

requests.post(url, data=None, json=None, **kwargs)

# 發送一個post請求到服務器端
# url接收一個URL地址
# data接收一個字典、字節或者是一個文件對象
# json接收一個json數據

requests.put(url, data=None, **kwargs)

# 發送一個put請求到服務器端
# url接收一個URL地址
# data接收一個字典、字節或者是一個文件對象

requests.patch(url, data=None, **kwargs)

# 發送一個patch請求到服務器端
# url接收一個URL地址
# data接收一個字典、字節或者是文件對象

requests.delete(url, **kwargs)

# 發送一個delete請求到服務器端
# url接收一個URL地址

requests.request(method, url, **kwargs)

# 發送一個請求
# method指定請求的方法
# url接收一個URL地址
# params接收一個字典、元組列表或者是字節，做爲URL的查詢字符串
# data接收一個使用元組構成的列表[(key, value)]或者是字典、字節或者是文件對象
# json接收一個json數據
# headers接收一個字典，用於構成請求頭
# cookies接收一個cookie對象
# files接收一個文件對象
# auth接收一個元組，用來身份認證
# timeout接收一個浮點數或者是元組
# allow_redirects接收一個布爾值，默認是True，是否開啓重定向
# proxies 接收一個字典，將協議映射到代理的url
# verify 是否驗證服務器的TLS證書（布爾值），也可以是CA證書包的路徑
# stream 是否使用數據流的方式傳輸文件
# cert 使用證書文件，如果是pem文件，則'xxx.pem'，如果是crt文件和key文件，則('xxx.crt', 'xxx.key')

0x003 requests.api源碼

# -*- coding: utf-8 -*-

"""
requests.api

This module implements the Requests API.

:copyright: (c) 2012 by Kenneth Reitz.
:license: Apache2, see LICENSE for more details.
"""

from . import sessions

def request(method, url, **kwargs):
    """Constructs and sends a :class:`Request <Request>`.

    :param method: method for the new :class:`Request` object.
    :param url: URL for the new :class:`Request` object.
    :param params: (optional) Dictionary, list of tuples or bytes to send
        in the query string for the :class:`Request`.
    :param data: (optional) Dictionary, list of tuples, bytes, or file-like
        object to send in the body of the :class:`Request`.
    :param json: (optional) A JSON serializable Python object to send in the body of the :class:`Request`.
    :param headers: (optional) Dictionary of HTTP Headers to send with the :class:`Request`.
    :param cookies: (optional) Dict or CookieJar object to send with the :class:`Request`.
    :param files: (optional) Dictionary of ``'name': file-like-objects`` (or ``{'name': file-tuple}``) for multipart encoding upload.
        ``file-tuple`` can be a 2-tuple ``('filename', fileobj)``, 3-tuple ``('filename', fileobj, 'content_type')``
        or a 4-tuple ``('filename', fileobj, 'content_type', custom_headers)``, where ``'content_type'`` is a string
        defining the content type of the given file and ``custom_headers`` a dict-like object containing additional headers
        to add for the file.
    :param auth: (optional) Auth tuple to enable Basic/Digest/Custom HTTP Auth.
    :param timeout: (optional) How many seconds to wait for the server to send data
        before giving up, as a float, or a :ref:`(connect timeout, read
        timeout) <timeouts>` tuple.
    :type timeout: float or tuple
    :param allow_redirects: (optional) Boolean. Enable/disable GET/OPTIONS/POST/PUT/PATCH/DELETE/HEAD redirection. Defaults to ``True``.
    :type allow_redirects: bool
    :param proxies: (optional) Dictionary mapping protocol to the URL of the proxy.
    :param verify: (optional) Either a boolean, in which case it controls whether we verify
            the server's TLS certificate, or a string, in which case it must be a path
            to a CA bundle to use. Defaults to ``True``.
    :param stream: (optional) if ``False``, the response content will be immediately downloaded.
    :param cert: (optional) if String, path to ssl client cert file (.pem). If Tuple, ('cert', 'key') pair.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response

    Usage::

      >>> import requests
      >>> req = requests.request('GET', 'https://httpbin.org/get')
      >>> req
      <Response [200]>
    """

    # By using the 'with' statement we are sure the session is closed, thus we
    # avoid leaving sockets open which can trigger a ResourceWarning in some
    # cases, and look like a memory leak in others.
    with sessions.Session() as session:
        return session.request(method=method, url=url, **kwargs)

def get(url, params=None, **kwargs):
    r"""Send a GET request through :func:`request`.

    :param url: URL for the new :class:`Request` object.
    :param params: (optional) Dictionary, list of tuples or bytes to send
        in the query string for the :class:`Request`.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    # Follow redirects unless the caller passed an explicit choice.
    if 'allow_redirects' not in kwargs:
        kwargs['allow_redirects'] = True
    return request('get', url, params=params, **kwargs)

def options(url, **kwargs):
    r"""Send an OPTIONS request through :func:`request`.

    :param url: URL for the new :class:`Request` object.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    # Follow redirects unless the caller passed an explicit choice.
    if 'allow_redirects' not in kwargs:
        kwargs['allow_redirects'] = True
    return request('options', url, **kwargs)

def head(url, **kwargs):
    r"""Send a HEAD request through :func:`request`.

    :param url: URL for the new :class:`Request` object.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    # Unlike get() and options(), redirection is disabled here unless the
    # caller asks for it explicitly.
    if 'allow_redirects' not in kwargs:
        kwargs['allow_redirects'] = False
    return request('head', url, **kwargs)

def post(url, data=None, json=None, **kwargs):
    r"""Send a POST request through :func:`request`.

    :param url: URL for the new :class:`Request` object.
    :param data: (optional) Dictionary, list of tuples, bytes, or file-like
        object to send in the body of the :class:`Request`.
    :param json: (optional) json data to send in the body of the :class:`Request`.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    return request(method='post', url=url, data=data, json=json, **kwargs)

def put(url, data=None, **kwargs):
    r"""Send a PUT request through :func:`request`.

    :param url: URL for the new :class:`Request` object.
    :param data: (optional) Dictionary, list of tuples, bytes, or file-like
        object to send in the body of the :class:`Request`.
    :param json: (optional) json data to send in the body of the
        :class:`Request`; not a named parameter here, it is forwarded
        through ``**kwargs``.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    return request(method='put', url=url, data=data, **kwargs)

def patch(url, data=None, **kwargs):
    r"""Send a PATCH request through :func:`request`.

    :param url: URL for the new :class:`Request` object.
    :param data: (optional) Dictionary, list of tuples, bytes, or file-like
        object to send in the body of the :class:`Request`.
    :param json: (optional) json data to send in the body of the
        :class:`Request`; not a named parameter here, it is forwarded
        through ``**kwargs``.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    return request(method='patch', url=url, data=data, **kwargs)

def delete(url, **kwargs):
    r"""Send a DELETE request through :func:`request`.

    :param url: URL for the new :class:`Request` object.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    return request(method='delete', url=url, **kwargs)

0x02 BeautifulSoup模塊

Beautiful Soup是一個用於從HTML和XML文件中提取數據的Python庫。它可以通過你喜歡的轉換器實現慣用的文檔導航、查找、修改文檔的方式。Beautiful Soup會幫你節省數小時甚至數天的工作時間。中文文檔：https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/

0x001安裝

Debian或Ubuntu

apt-get install python-bs4

easy_install和pip安裝，兼容Python2.x和Python3.x

easy_install beautifulsoup4
pip install beautifulsoup4

源碼安裝

下載地址：https://www.crummy.com/software/BeautifulSoup/bs4/download/4.0/
首先解壓下載的源碼壓縮包，進入源碼目錄，執行：python setup.py install

安裝解析器lxml和html5lib

apt-get install python-lxml
easy_install lxml
pip install lxml
apt-get install python-html5lib
easy_install html5lib
pip install html5lib

解析器	使用方法	優勢	缺點
Python標準庫	BeautifulSoup(markup, "html.parser")	Python的內置標準庫，執行速度適中，文檔容錯能力強	Python 2.7.3 或 3.2.2 以前的版本中文檔容錯能力差
lxml HTML 解析器	BeautifulSoup(markup, "lxml")	速度快，文檔容錯能力強	須要安裝C語言庫
lxml XML 解析器	BeautifulSoup(markup, ["lxml", "xml"]) BeautifulSoup(markup, "xml")	速度快，惟一支持XML的解析器	須要安裝C語言庫
html5lib	BeautifulSoup(markup, "html5lib")	最好的容錯性，以瀏覽器的方式解析文檔，生成HTML5格式的文檔	速度慢，不依賴外部擴展

從上表可知，推薦使用效率更高的lxml解析器；但是xml或html文檔的格式不正確的話，返回的結果可能不正確。

from bs4 import BeautifulSoup
# Parse a local HTML file. The with-block closes the file handle as soon as
# parsing is done, and the parser is named explicitly so the result does not
# depend on which optional parsers happen to be installed.
with open("index.html") as fp:
    soup = BeautifulSoup(fp, "html.parser")
# Parse HTML passed in directly as a string.
soup = BeautifulSoup("<html>data</html>", "html.parser")

0x002 常用對象介紹

Beautiful Soup將HTML或XML文件轉換爲樹形結構，每一個節點都是Python對象。總共可以分爲四種：

標籤對象
1. Tag對象與原生的HTML或XML對象相同。tag = soup.b
2. Name是Tag的名字。tag.name
3. Attrs，Tag的屬性是個字典，可以使用tag['class']的方式操作屬性，也可以使用tag.attrs來操作屬性。
可遍歷的字符串NavigableString對象
1. 因爲字符串包含在了Tag內，所以Beautiful Soup用 NavigableString 類來包裝tag中的字符串。
2. tag.string：它的類型是BS的字符串，在Python 2中可以通過unicode()方法將其轉換爲Unicode字符串：unicode_string = unicode(tag.string)；在Python 3中改用str(tag.string)。
3. BS的Tag中包含的字符串不可以被編輯，但是可以通過replace_with()方法被替換成爲其他的字符串。
BeautifulSoup對象
1. 該對象表示的是一個文檔的全部內容。其soup.name屬性的值是：u'[document]'。
註釋及特殊字符串Comment對象
1. Comment 對象是一個特殊類型的 NavigableString 對象
```
# The <b> tag must contain an HTML comment for soup.b.string to be a Comment.
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup, "html.parser")
comment = soup.b.string
type(comment)
# <class 'bs4.element.Comment'>
print(soup.b.prettify())   # the comment is printed in its special <!-- --> form
# <b>
#  <!--Hey, buddy. Want to buy a used parser?-->
# </b>
```
2. Beautiful Soup定義的其他類型可能會出現在XML文檔中：CData、ProcessingInstruction、Declaration、Doctype，這些類型與Comment相似，都是NavigableString的子類，只是添加了一些特殊的方法。

0x003 常用屬性和方法

屬性及方法名稱	釋義
soup.head	獲取`<head></head>`
soup.title	獲取`<title></title>`
soup.TagName	獲取`<TagName></TagName>`
soup.find_all('TagName')	獲取全部TagName的標籤
tag.contents	將tag子節點以列表的方式輸出
tag.children	返回一個tag子節點的可迭代生成器對象
tag.descendants	屬性能夠對全部tag的子孫節點進行遞歸循環
tag.string	獲取tag中的字符串內容
tag.strings	循環獲取tag中的字符串內容
tag.stripped_strings	功能相似於`tag.strings`，可是具備除去多餘空白字符串的功能
tag.parent	獲取父節點對象
tag.parents	獲取父節點對象可迭代生成器對象
tag.next_sibling	獲取下一個兄弟節點對象
tag.previous_sibling	獲取上一個兄弟節點對象
tag.next_siblings	獲取向下的全部兄弟節點的可迭代生成器對象
tag.previous_siblings	獲取向上的全部兄弟節點的可迭代生成器對象
tag.next_element	指向解析過程中下一個被解析的對象
tag.previous_element	指向解析過程中上一個被解析的對象
tag.next_elements	指向解析過程中下面全部被解析對象的集合
tag.previous_elements	指向解析過程中上面全部被解析對象的集合
tag.find_all('TagName')	查找全部與TagName匹配的節點
tag.find_all(['TagName1', 'TagName2'])	查找全部與列表中`TagName`相匹配的節點
tag.find_all(True)	返回全部能夠匹配的值
tag.find_all(FuncName)	接收一個方法名稱，若是這個方法返回True表示當前的元素匹配而且找到

0x004 官方示例

def has_class_but_no_id(tag):
    """Return True when *tag* has a ``class`` attribute but no ``id`` attribute.

    Written as a filter function: it receives one tag and answers whether
    that tag matches.
    """
    # No ``class`` attribute means the tag can never match.
    if not tag.has_attr('class'):
        return False
    return not tag.has_attr('id')

soup.find_all(has_class_but_no_id)
tag.find_all(Key='Value')                              # 搜索全部Key的值是Value的標籤
soup.find_all(Key1=re.compile("RegExp"), Key2='Value') # 結合正則表達式使用，多個條件之間是與（AND）的邏輯關係
tag.find_all(text='xxx')                               # 使用text參數可以搜索文檔中的字符串內容
tag.find_all(text=['xxx', 'xxx', ])                    # text參數可以接受字符串、正則、列表和布爾值
tag.find_all('TagName', limit=Number)                  # 返回Number個符合的標籤
tag.find_all('TagName', recursive=True/False)          # recursive=False時只匹配直接子節點，默認True搜索全部子孫節點
tag.find( name , attrs , recursive , text , **kwargs ) # 直接返回一個結果，匹配不到時返回None，而find_all()返回空列表[]
# 相似的方法還有：
tag.find_parents()
tag.find_parent()
tag.find_next_siblings()
tag.find_next_sibling()
tag.find_previous_siblings()
tag.find_previous_sibling()
tag.find_all_next()
tag.find_next()
tag.find_all_previous()
tag.find_previous()
# Beautiful Soup支持大部分的CSS選擇器，即tag.select():
tag.append("Content")                                 # 向標籤中添加內容
soup.new_string()                                     # 建立新的字符串對象（BeautifulSoup對象的方法）
soup.new_tag()                                        # 建立新的標籤對象（BeautifulSoup對象的方法）
tag.insert()                                          # 插入標籤對象
tag.insert_before()                                   # 在tag標籤以前插入新的標籤對象
tag.insert_after()                                    # 在tag標籤以後插入新的標籤對象
tag.clear()                                           # 清除當前tag的內容
tag.extract()                                         # 將當前的tag從文檔樹中刪除，而且返回該tag對象
tag.decompose()                                       # 從當前的文檔樹中移除，而且徹底銷燬該tag對象
tag.replace_with()                                    # 替換該tag對象
tag.wrap()                                            # 用傳入的tag對象包裝指定的tag對象
tag.unwrap()                                          # 取消使用上層tag對象的包裝，並返回被移除的上層tag對象
tag.prettify()                                        # 將文檔樹格式化後使用Unicode編碼輸出
tag.get_text()                                        # 獲取tag對象中的內容

0x005 自動登陸GitHub

# -*- coding:utf8 -*-

import requests
from bs4 import BeautifulSoup

# Account credentials (placeholders -- replace before running; avoid
# committing real credentials to source control).
username = 'xxx'
password = 'xxx'

# Browser-like request headers sent with every request.
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    # 'br' is deliberately not advertised: requests can only decode brotli
    # when an optional brotli package is installed, otherwise response.text
    # would be undecodable and the token lookup below would fail.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Host': 'github.com',
    'Referer': "https://github.com/xvGe/xvGe.github.io",
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
}

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script forever.
timeout = 10

# A Session carries cookies from the login page over to the form submission
# and, used as a context manager, closes its connections on exit.
with requests.Session() as session:
    session.headers.update(header)

    # Fetch the login page.
    response = session.get('https://github.com/login', timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, features='lxml')

    # Extract the login token from the hidden form field; fail with a clear
    # message instead of a TypeError when the field is missing.
    token_input = soup.find(name='input', attrs={'name': "authenticity_token"})
    if token_input is None:
        raise RuntimeError('authenticity_token input not found on the login page')
    token = token_input['value']

    # Cookies set by the login page. The session re-sends them automatically;
    # the dict is kept only so it can be inspected.
    cookie = response.cookies.get_dict()

    # Form fields submitted to log in.
    formData = {
        'commit': 'Sign in',
        'utf8': '✓',
        'authenticity_token': token,
        'login': username,
        'password': password,
    }

    # Submit the login form.
    response = session.post('https://github.com/session', data=formData, timeout=timeout)
    response.raise_for_status()