自從大一開始就想在博客專門創建一個分類,深度總結一下爬蟲技術相關的東西,可是一直拖到現在。大二馬上要結束了,準備從暑假開始認真總結,每一篇文章都爭取帶一個小案例,給學弟們做參考用~
requests是一個爲Python構建的優雅而簡單的HTTP庫。目前它使用Apache2許可證,requests在Python的一些基本庫上進行了高度封裝。中文文檔:http://docs.python-requests.org/zh_CN/latest/
pip install requests
requests.get(url, params=None, **kwargs)
# 發送一個get請求到服務器端 # url接收一個URL地址 # params接收一個字典對象,作爲URL的查詢字符串參數 # 返回一個響應(Response)對象
requests.options(url, **kwargs)
# 發送一個options請求到服務器端 # url接收一個URL地址
requests.head(url, **kwargs)
# 發送一個head請求到服務器端 # url接收一個URL地址
requests.post(url, data=None, json=None, **kwargs)
# 發送一個post請求到服務器端 # url接收一個URL地址 # data接收一個字典、字節或者是一個文件對象 # json接收一個json數據
requests.put(url, data=None, **kwargs)
# 發送一個put請求到服務器端 # url接收一個URL地址 # data接收一個字典、字節或者是一個文件對象
requests.patch(url, data=None, **kwargs)
# 發送一個patch請求到服務器端 # url接收一個URL地址 # data接收一個字典、字節或者是文件對象
requests.delete(url, **kwargs)
# 發送一個delete請求到服務器端 # url接收一個URL地址
requests.request(method, url, **kwargs)
# 發送一個請求 # method指定請求的方法 # url接收一個URL地址 # params接收一個字典、元組構成的列表或者是字節,作爲URL的查詢字符串 # data接收一個使用元組構成的列表[(key, value)]或者是字典、字節或者是文件對象 # json接收一個json數據 # headers接收一個字典,用於構成請求頭 # cookies接收一個字典或CookieJar對象 # files接收一個{'name': 文件對象}形式的字典 # auth接收一個元組,用來身份認證 # timeout接收一個浮點數或者是(連接超時, 讀取超時)元組 # allow_redirects接收一個布爾值,默認是True,是否開啓重定向 # proxies接收一個字典,將協議映射到代理的url # verify接收一個布爾值(是否校驗服務器的TLS證書)或者是CA證書包的路徑,默認是True # stream是否使用數據流的方式傳輸;爲False時響應內容會被立即下載 # cert使用客戶端證書文件,如果是pem文件,則'xxx.pem',如果是crt文件和key文件,則('xxx.crt', 'xxx.key')
# -*- coding: utf-8 -*-

"""
requests.api

This module implements the Requests API.

:copyright: (c) 2012 by Kenneth Reitz.
:license: Apache2, see LICENSE for more details.
"""

from . import sessions


def request(method, url, **kwargs):
    """Constructs and sends a :class:`Request <Request>`.

    :param method: method for the new :class:`Request` object.
    :param url: URL for the new :class:`Request` object.
    :param params: (optional) Dictionary, list of tuples or bytes to send
        in the query string for the :class:`Request`.
    :param data: (optional) Dictionary, list of tuples, bytes, or file-like
        object to send in the body of the :class:`Request`.
    :param json: (optional) A JSON serializable Python object to send in the
        body of the :class:`Request`.
    :param headers: (optional) Dictionary of HTTP Headers to send with the
        :class:`Request`.
    :param cookies: (optional) Dict or CookieJar object to send with the
        :class:`Request`.
    :param files: (optional) Dictionary of ``'name': file-like-objects``
        (or ``{'name': file-tuple}``) for multipart encoding upload.
        ``file-tuple`` can be a 2-tuple ``('filename', fileobj)``, 3-tuple
        ``('filename', fileobj, 'content_type')`` or a 4-tuple
        ``('filename', fileobj, 'content_type', custom_headers)``, where
        ``'content-type'`` is a string defining the content type of the
        given file and ``custom_headers`` a dict-like object containing
        additional headers to add for the file.
    :param auth: (optional) Auth tuple to enable Basic/Digest/Custom HTTP
        Auth.
    :param timeout: (optional) How many seconds to wait for the server to
        send data before giving up, as a float, or a
        :ref:`(connect timeout, read timeout) <timeouts>` tuple.
    :type timeout: float or tuple
    :param allow_redirects: (optional) Boolean. Enable/disable
        GET/OPTIONS/POST/PUT/PATCH/DELETE/HEAD redirection. Defaults to
        ``True``.
    :type allow_redirects: bool
    :param proxies: (optional) Dictionary mapping protocol to the URL of the
        proxy.
    :param verify: (optional) Either a boolean, in which case it controls
        whether we verify the server's TLS certificate, or a string, in
        which case it must be a path to a CA bundle to use. Defaults to
        ``True``.
    :param stream: (optional) if ``False``, the response content will be
        immediately downloaded.
    :param cert: (optional) if String, path to ssl client cert file (.pem).
        If Tuple, ('cert', 'key') pair.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response

    Usage::

      >>> import requests
      >>> req = requests.request('GET', 'https://httpbin.org/get')
      >>> req
      <Response [200]>
    """

    # By using the 'with' statement we are sure the session is closed, thus we
    # avoid leaving sockets open which can trigger a ResourceWarning in some
    # cases, and look like a memory leak in others.
    with sessions.Session() as session:
        return session.request(method=method, url=url, **kwargs)


def get(url, params=None, **kwargs):
    r"""Sends a GET request.

    :param url: URL for the new :class:`Request` object.
    :param params: (optional) Dictionary, list of tuples or bytes to send
        in the query string for the :class:`Request`.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    # Redirects are followed unless the caller explicitly opts out.
    kwargs.setdefault('allow_redirects', True)
    return request('get', url, params=params, **kwargs)


def options(url, **kwargs):
    r"""Sends an OPTIONS request.

    :param url: URL for the new :class:`Request` object.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    # Redirects are followed unless the caller explicitly opts out.
    kwargs.setdefault('allow_redirects', True)
    return request('options', url, **kwargs)


def head(url, **kwargs):
    r"""Sends a HEAD request.

    :param url: URL for the new :class:`Request` object.
    :param \*\*kwargs: Optional arguments that ``request`` takes. If
        ``allow_redirects`` is not provided, it is set to ``False`` here
        (``get`` and ``options`` default it to ``True``).
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    kwargs.setdefault('allow_redirects', False)
    return request('head', url, **kwargs)


def post(url, data=None, json=None, **kwargs):
    r"""Sends a POST request.

    :param url: URL for the new :class:`Request` object.
    :param data: (optional) Dictionary, list of tuples, bytes, or file-like
        object to send in the body of the :class:`Request`.
    :param json: (optional) json data to send in the body of the
        :class:`Request`.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    return request('post', url, data=data, json=json, **kwargs)


def put(url, data=None, **kwargs):
    r"""Sends a PUT request.

    :param url: URL for the new :class:`Request` object.
    :param data: (optional) Dictionary, list of tuples, bytes, or file-like
        object to send in the body of the :class:`Request`.
    :param json: (optional) json data to send in the body of the
        :class:`Request`. Not a named parameter of this function; it is
        forwarded to ``request`` through ``**kwargs``.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    return request('put', url, data=data, **kwargs)


def patch(url, data=None, **kwargs):
    r"""Sends a PATCH request.

    :param url: URL for the new :class:`Request` object.
    :param data: (optional) Dictionary, list of tuples, bytes, or file-like
        object to send in the body of the :class:`Request`.
    :param json: (optional) json data to send in the body of the
        :class:`Request`. Not a named parameter of this function; it is
        forwarded to ``request`` through ``**kwargs``.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    return request('patch', url, data=data, **kwargs)


def delete(url, **kwargs):
    r"""Sends a DELETE request.

    :param url: URL for the new :class:`Request` object.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    return request('delete', url, **kwargs)
Beautiful Soup是一個用於從HTML和XML文件中提取數據的Python庫。它可以通過你喜歡的轉換器實現慣用的文檔導航、查找、修改文檔的方式。Beautiful Soup會幫你節省數小時甚至數天的工作時間。中文文檔:https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
apt-get install python3-bs4
easy_install beautifulsoup4
pip install beautifulsoup4
下載地址:https://www.crummy.com/software/BeautifulSoup/bs4/download/4.0/ 首先解壓下載的源碼壓縮包,進入源碼目錄,執行:python setup.py install
apt-get install python3-lxml
easy_install lxml
pip install lxml
apt-get install python3-html5lib
easy_install html5lib
pip install html5lib
解析器 | 使用方法 | 優勢 | 缺點 |
---|---|---|---|
Python標準庫 | BeautifulSoup(markup, "html.parser") | Python的內置標準庫,執行速度適中,文檔容錯能力強 | Python 2.7.3 或 3.2.2 以前的版本中,文檔容錯能力差 |
lxml HTML 解析器 | BeautifulSoup(markup, "lxml") | 速度快,文檔容錯能力強 | 須要安裝C語言庫 |
lxml XML 解析器 | BeautifulSoup(markup, ["lxml", "xml"]) BeautifulSoup(markup, "xml") | 速度快,惟一支持XML的解析器 | 須要安裝C語言庫 |
html5lib | BeautifulSoup(markup, "html5lib") | 最好的容錯性,以瀏覽器的方式解析文檔,生成HTML5格式的文檔 | 速度慢,需要依賴外部Python庫 |
從上表可知,推薦使用lxml解析器,效率更高;但是xml或html文檔的格式不正確的話,返回的結果可能不正確。
from bs4 import BeautifulSoup

# Parse a local HTML file.
# - The ``with`` block closes the file handle once parsing is finished
#   (the bare ``open(...)`` call would leave it open).
# - The file is opened in binary mode so that Beautiful Soup can detect the
#   document's encoding itself instead of relying on the platform default.
# - The parser is named explicitly; otherwise bs4 picks whichever parser
#   happens to be installed and emits a warning. "lxml" can be substituted
#   here when it is installed.
with open("index.html", "rb") as html_file:
    soup = BeautifulSoup(html_file, "html.parser")

# Parse markup passed in directly as a string.
soup = BeautifulSoup("<html>data</html>", "html.parser")
Beautiful Soup將HTML或XML文件轉換爲樹形結構,每個節點都是Python對象。總共可以分爲四種:
標籤(Tag)對象
tag = soup.b
tag.name
tag.attrs
來操作屬性。可遍歷的字符串NavigableString對象
unicode()
方法將其轉換爲Unicode字符串(unicode()只存在於Python 2,Python 3中請改用str())。unicode_string = unicode(tag.string)  # Python 3: str(tag.string)
replace_with()
方法被替換成爲其他的字符串。BeautifulSoup對象
soup.name
屬性的值是:u'[document]'
。註釋及特殊字符串Comment對象
Comment 對象是一個特殊類型的 NavigableString 對象
# A markup fragment whose <b> tag contains nothing but an HTML comment.
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"

# Name the parser explicitly so the result does not depend on which optional
# parsers are installed (and so bs4 does not emit its "no parser specified"
# warning).
soup = BeautifulSoup(markup, "html.parser")

# .string on a tag whose only child is a comment yields a Comment object.
comment = soup.b.string
print(type(comment))
# <class 'bs4.element.Comment'>

# When pretty-printed, the comment is rendered with its <!-- --> delimiters.
print(soup.b.prettify())
# <b>
#  <!--Hey, buddy. Want to buy a used parser?-->
# </b>
屬性及方法名稱 | 釋義 |
---|---|
soup.head | 獲取<head></head> |
soup.title | 獲取<title></title> |
soup.TagName | 獲取< TagName></ TagName> |
soup.find_all('TagName') | 獲取所有TagName的標籤 |
tag.contents | 將tag子節點以列表的方式輸出 |
tag.children | 返回一個tag子節點的可迭代生成器對象 |
tag.descendants | 屬性能夠對全部tag的子孫節點進行遞歸循環 |
tag.string | 獲取tag中的字符串內容 |
tag.strings | 循環獲取tag中的字符串內容 |
tag.stripped_strings | 功能相似於tag.strings ,可是具備除去多餘空白字符串的功能 |
tag.parent | 獲取父節點對象 |
tag.parents | 獲取所有祖先節點的可迭代生成器對象 |
tag.next_sibling | 獲取下一個兄弟節點對象 |
tag.previous_sibling | 獲取上一個兄弟節點對象 |
tag.next_siblings | 獲取向下的全部兄弟節點的可迭代生成器對象 |
tag.previous_siblings | 獲取向上的全部兄弟節點的可迭代生成器對象 |
tag.next_element | 指向解析過程當中下一個被解析的對象 |
tag.previous_element | 指向解析過程當中上一個被解析的對象 |
tag.next_elements | 指向解析過程中在當前對象之後被解析的所有對象的集合 |
tag.previous_elements | 指向解析過程中在當前對象之前被解析的所有對象的集合 |
tag.find_all('TagName') | 查找所有與TagName匹配的節點 |
tag.find_all(['TagName1', 'TagName2']) | 查找所有與列表中任一TagName相匹配的節點 |
tag.find_all(True) | 匹配任何標籤,返回所有tag,但不會返回字符串節點 |
tag.find_all(FuncName) | 接收一個方法名稱,若是這個方法返回True表示當前的元素匹配而且找到 |
import re


def has_class_but_no_id(tag):
    """Return True for a tag that has a ``class`` attribute but no ``id``.

    Intended to be passed to ``find_all`` as a filter function.
    """
    return tag.has_attr('class') and not tag.has_attr('id')


# NOTE: everything below is an API cheat sheet rather than a runnable script.
# ``soup`` and ``tag`` are the objects built in the earlier examples, and
# TagName / Key / Value / Number / name / attrs are placeholders. Most of the
# tree-modification calls need arguments that are omitted here.

# --- Searching the tree ----------------------------------------------------
soup.find_all(has_class_but_no_id)  # keep the tags for which the function returns True
tag.find_all(Key='Value')  # tags whose attribute Key has the value 'Value'
# Several keyword filters can be given at once (each keyword may appear only
# once); a tag has to satisfy ALL of them, and a value may be a compiled regex.
soup.find_all(Key1=re.compile("RegExp"), Key2='Value')
# ``text`` searches the document's strings instead of its tags; it accepts a
# string, a regex, a list or a boolean. Newer bs4 releases name this
# argument ``string`` and keep ``text`` as the older spelling.
tag.find_all(text='xxx')
tag.find_all(text=['xxx', 'xxx'])
tag.find_all('TagName', limit=Number)  # stop after Number matching tags
tag.find_all('TagName', recursive=False)  # False: direct children only; True (default): all descendants
# find() returns the first match directly, or None when nothing matches,
# whereas find_all() returns an empty list [].
tag.find(name, attrs, recursive, text, **kwargs)

# Related search methods:
tag.find_parents()
tag.find_parent()
tag.find_next_siblings()
tag.find_next_sibling()
tag.find_previous_siblings()
tag.find_previous_sibling()
tag.find_all_next()
tag.find_next()
tag.find_all_previous()
tag.find_previous()

# Beautiful Soup also supports most CSS selectors through tag.select().

# --- Modifying the tree ----------------------------------------------------
tag.append("Content")  # add content to the end of the tag
# new_string()/new_tag() are factory methods of the BeautifulSoup object,
# not of an individual Tag.
soup.new_string()  # create a new string object
soup.new_tag()  # create a new tag object
tag.insert()  # insert an object into the tag
tag.insert_before()  # insert an object immediately before this tag
tag.insert_after()  # insert an object immediately after this tag
tag.clear()  # remove the contents of this tag
tag.extract()  # remove this tag from the tree and return it
tag.decompose()  # remove this tag from the tree and destroy it completely
tag.replace_with()  # replace this tag with another object
tag.wrap()  # wrap this tag in the tag object passed in
tag.unwrap()  # strip the wrapping tag, keeping its contents, and return the removed tag
tag.prettify()  # format the tree and return it as a Unicode string
tag.get_text()  # get the text content of this tag
# -*- coding:utf8 -*-
import requests
from bs4 import BeautifulSoup

# Account credentials (placeholders).
username = 'xxx'
password = 'xxx'

# Browser-like request headers.
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    # 'br' is deliberately not advertised: requests only decodes Brotli when
    # an optional Brotli package is installed, so offering it can produce a
    # response body that cannot be parsed.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Host': 'github.com',
    'Referer': "https://github.com/xvGe/xvGe.github.io",
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
}

# Seconds to wait for the server; without a timeout a stalled connection
# would block the script forever.
timeout = 10

# A Session carries the cookies set by the login page over to the form
# submission automatically, and the ``with`` block releases its connections
# when the script is done.
with requests.Session() as session:
    session.headers.update(header)

    # Fetch the login page.
    response = session.get('https://github.com/login', timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, features='lxml')

    # Extract the anti-CSRF token embedded in the login form.
    token_input = soup.find(name='input', attrs={'name': "authenticity_token"})
    if token_input is None:
        # Fail with a clear message instead of "'NoneType' object is not
        # subscriptable" when the page layout is not what we expect.
        raise RuntimeError('authenticity_token input not found on the login page')
    token = token_input['value']

    # Form fields submitted with the login request.
    formData = {
        'commit': 'Sign in',
        'utf8': '✓',
        'authenticity_token': token,
        'login': username,
        'password': password,
    }

    # Submit the login form.
    response = session.post('https://github.com/session', data=formData, timeout=timeout)