pip install requests-html
from requests_html import HTMLSession
session = HTMLSession()
url = 'https://www.baidu.com/'
# get:
r = session.get(url=url)
# post:
r = session.post(url=url)
# request (method 爲 'get' 或 'post'):
r = session.request(method='get', url=url)
from requests_html import HTMLSession session = HTMLSession() url = 'https://www.zhihu.com/' response = session.get(url=url)
print(response) print(type(response)) # <Response [200]> # <class 'requests_html.HTMLResponse'> print(dir(response)) #['__attrs__', '__bool__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__enter__', '__eq__', '__exit__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__nonzero__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_content', '_content_consumed', '_from_response', '_html', '_next', 'apparent_encoding', 'close', 'connection', 'content', 'cookies', 'elapsed', 'encoding', 'headers', 'history', 'html', 'is_permanent_redirect', 'is_redirect', 'iter_content', 'iter_lines', 'json', 'links', 'next', 'ok', 'raise_for_status', 'raw', 'reason', 'request', 'session', 'status_code', 'text', 'url']
response.url: 當前路徑
response.text: 文本
response.encoding = 'gbk': 編碼
response.content: 二進制的響應內容
response.json(): 等價於 json.loads(response.text)
response.status_code: 返回狀態碼
response.headers: 返回響應頭
response.cookies: 返回cookies
response.history: 返回響應歷史
print(response.html) # <HTML url='https://www.zhihu.com/signin?next=%2F'> In [18]: type(res.html) Out[18]: requests_html.HTML ''' reponse 和 response_html模塊本身實現的類 ''' print(dir(response.html)) ['__aiter__', '__anext__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__next__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_async_render', '_encoding', '_html', '_lxml', '_make_absolute', '_pq', 'absolute_links', 'add_next_symbol', 'arender', 'base_url', 'default_encoding', 'element', 'encoding', 'find', 'full_text', 'html', 'links', 'lxml', 'next', 'next_symbol', 'page', 'pq', 'raw_html', 'render', 'search', 'search_all', 'session', 'skip_anchors', 'text', 'url', 'xpath']
response.html.absolute_links: 絕對鏈接
response.html.links: 頁面中原樣的鏈接(可能是相對鏈接)
response.html.base_url: 基本路徑
response.html.html: 網頁源碼
response.html.text: 網頁文本
response.html.encoding = 'gbk': 編碼
response.html.raw_html: 頁面的二進制流
參數
:param selector: css 選擇器
:param clean: 是否去除頁面中的<script>和<style>標籤,默認False
:param containing: 若是指定有值,只返回包含所給文本的Element對象,默認None
:param first: 是否返回第一個對象,默認False
:param _encoding: 字符編碼
返回結果
[Element,Element……] 當first爲True的時候,只返回第一個Element
response.html.search('XXXX{}YYYY')[0]  # 只搜索一次,按位置取結果
response.html.search('XXXX{name}YYY{pwd}ZZZ')['name']  # 只搜索一次,按名稱取結果
查找全部符合template的對象,返回的是Result對象組成的list
Element對象:
'absolute_links', 'attrs', 'base_url', 'default_encoding', 'element', 'encoding', 'find', 'full_text', 'html', 'lineno', 'links', 'lxml', 'pq', 'raw_html', 'search', 'search_all', 'session', 'skip_anchors', 'tag', 'text', 'url', 'xpath'
去掉\r\n以後的文本
沒有去掉\r\n的文本值
以字典形式返回Element對象的屬性名和屬性值