requests模塊和beautifulsoup模塊

2.requests模塊方法

requests是基於Python開發的HTTP庫,使用Requests能夠輕而易舉地完成瀏覽器可有的任何操作。

  • request.get()
  • request.post()
  • request.put()

  • 以上方法均是在此方法的基礎上構建requests.request(method, url, **kwargs)
    • method 包括 post、get、put等等
    • **kwargs 包括經常使用參數
      • url = ‘’,
      • params = {'k1':'v1','k2':'v2'}, # get方法僅限的參數傳遞方式
      • cookies = {},
      • headers = {}, # 請求頭
      • data = {}, # post等請求參數傳遞
      • json = {}, # json數據參數

2.1 requests.get

requests.get(
    url='xxx',
    params={'k1':'v1','nid':888},
    cookies={},
    headers={},
)

# http://www.baidu.com?k1=v1&nid=888

2.2 requests.post

requests.post(
    url='xxx',
    params={'k1':'v1','nid':888},
    cookies={},
    
    # data
    headers={'content-type': 'application/x-www-form-urlencoded'},
    data={},
    
    # json
    # headers={'content-type': 'application/json'},
    # json={}
)

其餘參數

auth 身份驗證

def param_auth():
    """Demonstrate HTTP authentication with requests.

    Sends a GET request protected by HTTP Basic auth; the commented
    examples show router-style basic auth and HTTP Digest auth.
    """
    from requests.auth import HTTPBasicAuth, HTTPDigestAuth

    # Basic auth: the credentials are encoded into the Authorization header.
    response = requests.get(
        'https://api.github.com/user',
        auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'),
    )
    print(response.text)

    # Basic auth against a device admin page, decoding the body as GBK:
    # response = requests.get('http://192.168.1.1',
    #                         auth=HTTPBasicAuth('admin', 'admin'))
    # response.encoding = 'gbk'
    # print(response.text)

    # Digest auth variant:
    # response = requests.get('http://httpbin.org/digest-auth/auth/user/pass',
    #                         auth=HTTPDigestAuth('user', 'pass'))
    # print(response)

allow_redirects 重定向(控制是否url跳轉)

def param_allow_redirects():
    """Fetch a URL without following redirects.

    With allow_redirects=False the 3xx response itself is returned
    instead of the redirect target.
    """
    response = requests.get('http://127.0.0.1:8000/test/',
                            allow_redirects=False)
    print(response.text)

stream 流 (True時響應內容按流式下載)

def param_stream():
    """Download a response body in streaming mode (stream=True).

    stream=True defers reading the body until it is accessed; the
    connection must then be released explicitly. The original called
    ret.close() unconditionally, which leaked the connection if an
    exception was raised before it ran — contextlib.closing guarantees
    the close even on error.
    """
    from contextlib import closing

    with closing(requests.get('http://127.0.0.1:8000/test/', stream=True)) as ret:
        print(ret.content)

    # Chunk-by-chunk consumption of a streamed body:
    # with closing(requests.get('http://httpbin.org/get', stream=True)) as r:
    #     for i in r.iter_content():
    #         print(i)

cert 是否攜帶證書(證書名)

requests.get('http://httpbin.org/get',cert="xxxx.pem")

session

此處的session並不是以前所學session,
此處的session,是個容器,攜帶全部請求頭、體等等,
因此,咱們每次requests請求時,都須要cookies等手動添加請求中,
利用session能夠自動攜帶cookies、session等信息發送請求

session = requests.Session()
session.post(url,data={}) # 省略cookies=cookie

# response_ = requests.post(url,data={},cookies=cookie)

3.BeautifulSoup模塊方法

BeautifulSoup是一個模塊,該模塊用於接收一個HTML或XML字符串,而後將其進行格式化,以後便可使用他提供的方法進行快速查找指定元素,從而使得在HTML或XML中查找指定元素變得簡單。

from bs4.element import Tag

1,name 標籤名

# tag = soup.find('a')
# name = tag.name # 獲取
# print(name)
# tag.name = 'span' # 設置
# print(soup)

2,attr 屬性標籤

# tag = soup.find('a')
# attrs = tag.attrs    # 獲取
# print(attrs)
# tag.attrs = {'ik':123} # 設置
# tag.attrs['id'] = 'iiiii' # 設置
# print(soup)

3,children,全部子標籤

# body = soup.find('body')
# v = body.children

4, descendants,全部後代標籤

# body = soup.find('body')
# v = body.descendants

5, clear,將標籤的全部子標籤所有清空(保留標籤名)

# tag = soup.find('body')
# tag.clear()
# print(soup)

10, find,獲取匹配的第一個標籤

# tag = soup.find('a')
# print(tag)
# tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tag)

11, find_all,獲取匹配的全部標籤

# tags = soup.find_all('a')
# print(tags)
 
# tags = soup.find_all('a',limit=1)
# print(tags)
 
# tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# # tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tags)
 
 
# ####### 列表 #######
# v = soup.find_all(name=['a','div'])
# print(v)
 
# v = soup.find_all(class_=['sister0', 'sister'])
# print(v)
 
# v = soup.find_all(text=['Tillie'])
# print(v, type(v[0]))
 
 
# v = soup.find_all(id=['link1','link2'])
# print(v)
 
# v = soup.find_all(href=['link1','link2'])
# print(v)
 
# ####### 正則 #######
import re
# rep = re.compile('p')
# rep = re.compile('^p')
# v = soup.find_all(name=rep)
# print(v)
 
# rep = re.compile('sister.*')
# v = soup.find_all(class_=rep)
# print(v)
 
# rep = re.compile('http://www.oldboy.com/static/.*')
# v = soup.find_all(href=rep)
# print(v)
 
# ####### 方法篩選 #######
# def func(tag):
# return tag.has_attr('class') and tag.has_attr('id')
# v = soup.find_all(name=func)
# print(v)
 
 
# ## get,獲取標籤屬性
# tag = soup.find('a')
# v = tag.get('id')
# print(v)

12, has_attr,檢查標籤是否具備該屬性

# tag = soup.find('a')
# v = tag.has_attr('id')
# print(v)

13, get_text,獲取標籤內部文本內容

# tag = soup.find('a')
# v = tag.get_text('id')
# print(v)

16, 當前的關聯標籤

# soup.next
# soup.next_element
# soup.next_elements
# soup.next_sibling
# soup.next_siblings
 
#
# tag.previous
# tag.previous_element
# tag.previous_elements
# tag.previous_sibling
# tag.previous_siblings
 
#
# tag.parent
# tag.parents

17, 查找某標籤的關聯標籤

# tag.find_next(...)
# tag.find_all_next(...)
# tag.find_next_sibling(...)
# tag.find_next_siblings(...)
 
# tag.find_previous(...)
# tag.find_all_previous(...)
# tag.find_previous_sibling(...)
# tag.find_previous_siblings(...)
 
# tag.find_parent(...)
# tag.find_parents(...)
 
# 參數同find_all

20, append在當前標籤內部追加一個標籤

# tag = soup.find('body')
# tag.append(soup.find('a'))
# print(soup)
#
# from bs4.element import Tag
# obj = Tag(name='i',attrs={'id': 'it'})
# obj.string = '我是一個新來的'
# tag = soup.find('body')
# tag.append(obj)
# print(soup)

21, insert在當前標籤內部指定位置插入一個標籤

# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一個新來的'
# tag = soup.find('body')
# tag.insert(2, obj)
# print(soup)

22, insert_after,insert_before 在當前標籤後面或前面插入

# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一個新來的'
# tag = soup.find('body')
# # tag.insert_before(obj)
# tag.insert_after(obj)
# print(soup)

23, replace_with 在當前標籤替換爲指定標籤

# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一個新來的'
# tag = soup.find('div')
# tag.replace_with(obj)
# print(soup)

24, 建立標籤之間的關係

# tag = soup.find('div')
# a = soup.find('a')
# tag.setup(previous_sibling=a)
# print(tag.previous_sibling)
相關文章
相關標籤/搜索