Web scraping basics, requests, and BeautifulSoup

1. Web scraping basics

For example, a public-opinion monitoring system:
  fetch Autohome news into your own database, build your own app, publish the content with the source credited, and turn it into your own venture.

Fetching the content at a specified URL:
    - Send an HTTP request: http://www.autohome.com.cn/news/
    - Extract the content with regular expressions (a rough sketch follows)
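A rough regex sketch, assuming the news titles sit inside <h3> tags (the BeautifulSoup approach below is far more robust):

import re
import requests

response = requests.get('http://www.autohome.com.cn/news/')
response.encoding = 'gbk'
titles = re.findall(r'<h3>(.*?)</h3>', response.text)   # hypothetical pattern; the real markup may differ
print(titles)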

Python implementation:

import requests
from bs4 import BeautifulSoup

response = requests.get('http://www.autohome.com.cn/news/')
response.text

obj = BeautifulSoup(response.text, 'html.parser')
tag = obj.find('a')                 # the first matching tag
tag.find(...)

[tag, tag, ] = obj.find_all('a')    # all matching tags

 

Example 1: scraping Autohome news

requests
	
	obj = requests.get("url")
	obj.content
	obj.encoding = "gbk"
	obj.text
	
	
	soup = BeautifulSoup(obj.text, 'html.parser')
	tag = soup.find(name='xx')
	[tag, tag, ] = soup.find_all(...)
	
	
	tag.text
	tag.attrs
	tag.get(...)
	
import requests
from bs4 import BeautifulSoup

response = requests.get('http://www.autohome.com.cn/news/')     # the socket sends bytes
# # print(response.text)    # str; garbled if the encoding is set wrong
# print(response.content)     # response.content is the raw bytes
response.encoding = 'gbk'
# print(response.text)        # response.text is the decoded text

# Python has a built-in parser, html.parser; it parses the page's <html lang='en'...></html> markup into objects
soup = BeautifulSoup(response.text,'html.parser')
tag = soup.find(id='auto-channel-lazyload-article')
# h3 = tag.find(name='h3',class_='c1')     # name is the tag name; class='c1' raises an error (class is a keyword), so write class_='c1' or attrs={'class':'c1'}
# h3 = tag.find(name='h3',attrs={'class':'c1'})
h3 = tag.find(name='h3')
print(h3)
Exercise 1: fetch a single news item
response = requests.get('http://www.autohome.com.cn/news/')
response.encoding = 'gbk'
soup = BeautifulSoup(response.text,'html.parser')
li_list = soup.find(id='auto-channel-lazyload-article').find_all('li')    # find_all('li') is shorthand for find_all(name='li')
for li in li_list:
    # print(li.find('h3'))        # li.find('h3') is sometimes None
    title = li.find('h3')
    if not title:
        continue
    # print(title,type(title))    # <h3>將於第四季度上市 雲度π1正式下線</h3> <class 'bs4.element.Tag'>
    summary = li.find('p').text
    # url = li.find('a').attrs['href']    # li.find('a').attrs returns all of the tag's attributes as a dict; get works as well
    url = li.find('a').get('href')
    img = li.find('img').get('src')

    # # download the image
    # res= requests.get(img)
    # file_name = '%s.jpg'%(title,)       # the title is not a valid name for the downloaded image file; this needs fixing
    # with open(file_name,'wb') as f:
    #     f.write(res.content)

    print(title.text, summary,url,img)  # title: title.text, summary: summary
    print('=============')
Exercise 2: find all the news items, including title, summary, url, and image (a solution sketch follows)
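A possible solution sketch for Exercise 2, reusing the loop from Exercise 1. It assumes the same page structure; the downloaded file is named after the image URL rather than the title, since the title isn't a safe file name:

import os
import requests
from bs4 import BeautifulSoup

response = requests.get('http://www.autohome.com.cn/news/')
response.encoding = 'gbk'
soup = BeautifulSoup(response.text, 'html.parser')
for li in soup.find(id='auto-channel-lazyload-article').find_all('li'):
    title = li.find('h3')
    if not title:
        continue
    summary = li.find('p').text
    url = li.find('a').get('href')
    img = li.find('img').get('src')
    print(title.text, summary, url, img)
    # download the image; prepend a scheme if src is protocol-relative (an assumption about this page)
    img_url = 'http:' + img if img.startswith('//') else img
    res = requests.get(img_url)
    file_name = os.path.basename(img_url)   # e.g. 'xxx.jpg', taken from the URL
    with open(file_name, 'wb') as f:
        f.write(res.content)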

 

Example 2: logging in to GitHub from Python

1. Send a GET request to the login page and grab the csrf token
2. Send a POST request:
  carry the username, password, and csrf token in the POST body
  the response sets a cookie; once you have it, you don't need to log in again
requests
	
	obj = requests.get("url")
	obj.content
	obj.encoding = "gbk"
	obj.text
	obj.cookies.get_dict()
	
	
	requests.get("url",cookies={'k1':"v1"})
	
	
	soup = BeautifulSoup(obj.text, 'html.parser')
	tag = soup.find(name='xx')
	[tag, ] = soup.find_all(...)
	
	
	tag.text
	tag.attrs
	tag.get(...)
import requests
from bs4 import BeautifulSoup

# get the token
r1 = requests.get('https://github.com/login')
s1 = BeautifulSoup(r1.text,'html.parser')
token = s1.find(name='input',attrs={'name':'authenticity_token'}).get('value')  # the credential on GitHub's login page is called authenticity_token, not csrf_token
print(token)    # 4WLM4c+ilLUmmhsM8TEFiYXMX5evoTQaIxmhTc5FmUYetTseKP6Upx5jJkGOzjm3kCAg9sMv3ShMnz0UGzuGvA==
r1_token_dict = r1.cookies.get_dict()

# POST the username, password, and token to the server
# to see what the POST must contain, inspect the request in the browser's Network panel
"""
utf8:?
authenticity_token:ollV+avLm6Fh3ZevegPO7gOH7xUzEBL0NWdA1aOQ1IO3YQspjOHbfnaXJOtVLQ95BtW9GZlaCIYd5M6v7FGUKg==
login:asdf
password:asdf
commit:Sign in
"""

r2 = requests.post(
    'https://github.com/session',    # the POST url is taken from the request shown in the browser's Network panel
    data={
        'utf8':'?',
        'authenticity_token':token,
        # 'login':'username',
        'login':'317828332@qq.com',
        'password':'alex3714',
        # 'password':'password',
        'commit':'Sign in'
    },
    cookies = r1_token_dict
)
)
# print(r2.text)
r2_cookie_dict = r2.cookies.get_dict()
print(r1_token_dict)        # some pages set cookies on a GET request, some don't
#---> {'logged_in': 'no', '_gh_sess': 'eyJzZXNzaW9uX2lkIjoiZmMwOTdlNGNlY2U2MmZlNGU4MzBkZmQ2NmYwMjQxNDQiLCJsYXN0X3JlYWRfZnJvbV9yZXBsaWNhcyI6MTUwNDAwNjM5MjE5MSwiX2NzcmZfdG9rZW4iOiJLYXJ5MXhpNnQ5SWdJQ3FKeDluamtjYnZ2NXNsWXQyQjBhSy9aQ3I2U1FBPSIsImZsYXNoIjp7ImRpc2NhcmQiOltdLCJmbGFzaGVzIjp7ImFuYWx5dGljc19sb2NhdGlvbl9xdWVyeV9zdHJpcCI6InRydWUifX19--a5df8578d625ae99c39b34c4163f684a1d8ad568'}
print(r2_cookie_dict)          # cookies set by the POST request
#---> {'_gh_sess': 'eyJzZXNzaW9uX2lkIjoiZmMwOTdlNGNlY2U2MmZlNGU4MzBkZmQ2NmYwMjQxNDQiLCJsYXN0X3JlYWRfZnJvbV9yZXBsaWNhcyI6MTUwNDAwNjQwNzQwNywiX2NzcmZfdG9rZW4iOiJLYXJ5MXhpNnQ5SWdJQ3FKeDluamtjYnZ2NXNsWXQyQjBhSy9aQ3I2U1FBPSIsImZsYXNoIjp7ImRpc2NhcmQiOltdLCJmbGFzaGVzIjp7ImFuYWx5dGljc19sb2NhdGlvbl9xdWVyeV9zdHJpcCI6InRydWUifX19--db506f001c00ee91aefb55fad7c6cf9965ce3132'}

# merge the two cookie dicts
cookie_dict = {}
cookie_dict.update(r1_token_dict)
cookie_dict.update(r2_cookie_dict)

# on subsequent requests
r3 = requests.get(
    # url='xxxxxx',           # any GitHub page that requires login works here
    url='https://github.com/settings/emails',
    cookies=cookie_dict
)
print(r3.text)
Full code
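The same login can be written with requests.Session, which stores cookies and sends them back automatically, so the manual merging of r1_token_dict and r2_cookie_dict disappears. A sketch with placeholder credentials:

import requests
from bs4 import BeautifulSoup

session = requests.Session()     # the Session carries cookies across requests automatically
r1 = session.get('https://github.com/login')
token = BeautifulSoup(r1.text, 'html.parser').find(
    name='input', attrs={'name': 'authenticity_token'}).get('value')
session.post(
    'https://github.com/session',
    data={
        'utf8': '?',
        'authenticity_token': token,
        'login': 'user@example.com',   # placeholder username
        'password': 'xxxx',            # placeholder password
        'commit': 'Sign in',
    },
)
r3 = session.get('https://github.com/settings/emails')
print(r3.text)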

 

Example 3: upvoting a Chouti news post

# 1. Log in and get the cookie
# 2. Find the target url: watch the upvote request the Chouti page sends and see which url it goes to.
#    The login is a POST to http://dig.chouti.com/login; the response is not a browser redirect but a dict

import requests
from bs4 import BeautifulSoup
# 1. get the cookie
r0 = requests.get('http://dig.chouti.com/')
r0_cookie_dict = r0.cookies.get_dict()

# 2. send the username, password, and cookie
r1 = requests.post(
    'http://dig.chouti.com/login',
    data={
        'phone':'8615131255089',
        'password':'woshiniba',
        'oneMonth':1    # stay logged in for a month
    },
    cookies=r0_cookie_dict
)
r1_cookie_dict = r1.cookies.get_dict()
print(r1.text)
#---> {"result":{"code":"8887", "message":"手機號格式不對", "data":""}}    #這是手機不對的狀況下打印的內容
print(r1.cookies.get_dict())
#---> {'gpsd': 'd3c9d0b3dfff883f4e86f0094cbfd9bc', 'route': '967b7c98a00b517a995a5a62d3abc65e'}

cookie_dict = {}
cookie_dict.update(r0_cookie_dict)
cookie_dict.update(r1_cookie_dict)

# cookie_dict={'gpsd':r0_cookie_dict['gpsd']}      # equivalent to the cookie_dict above, but not recommended

# upvote
r2 = requests.post('http://dig.chouti.com/link/vote?linksId=13911006',cookies=cookie_dict)    # the upvote is a POST request; linksId=13911006 is the article id
print(r2.text)

2. The requests module

Methods provided by the requests module

# requests.get()
# requests.post()
# requests.put()
# requests.request('post')

# requests.get(url, params=None, **kwargs)
# requests.post(url, data=None, json=None, **kwargs)
# requests.put(url, data=None, **kwargs)
# requests.head(url, **kwargs)
# requests.delete(url, **kwargs)
# requests.patch(url, data=None, **kwargs)
# requests.options(url, **kwargs)
#
# # All of the methods above are built on top of this one
# requests.request(method, url, **kwargs)
How the methods relate
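For example, these two calls are equivalent; the verb-named helpers just fill in the method for you:

import requests

r1 = requests.get('http://httpbin.org/get', params={'k1': 'v1'})
r2 = requests.request('get', 'http://httpbin.org/get', params={'k1': 'v1'})
print(r1.url == r2.url)   # ---> True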
# url='xxx',
# params={'k1':'v1','nid':888},     # GET query parameters
# cookies={},
# headers={},
# data = {},        # form-encoded request body
# json = {}         # JSON request body


# requests.get(
#     url='xxx',
#     params={'k1':'v1','nid':888},
#     cookies={},
#     headers={}
# )
# http://www.baidu.com?k1=v1&nid=888

requests.post(
    url='xxx',
    params={'k1':'v1','nid':888},
    cookies={},
    headers={},
    json={}
)

# Note: when posting to a backend, pay attention to the request headers

# requests.post(url='',data={})   # default request header: content-type: application/x-www-form-urlencoded

requests.post(url='',data={},headers={'content-type':'application/json'})   # written this way, Django can't read the values via request.POST; you have to parse request.body yourself

requests.post(url='',json={})       # default request header: content-type: application/json
Common parameters
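A quick way to see the data/json difference is to post to httpbin.org, which echoes the request back; form-encoded data comes back under 'form', a JSON body under 'json':

import requests

r1 = requests.post('http://httpbin.org/post', data={'k1': 'v1'})
print(r1.json()['form'])     # ---> {'k1': 'v1'}

r2 = requests.post('http://httpbin.org/post', json={'k1': 'v1'})
print(r2.json()['json'])     # ---> {'k1': 'v1'}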
# auth
def param_auth():
    from requests.auth import HTTPBasicAuth, HTTPDigestAuth     # most routers authenticate with HTTPBasicAuth
    # simple, commonly used basic authentication schemes
    ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))     # HTTPBasicAuth scheme
    ret = requests.get('https://api.github.com/user', auth=HTTPDigestAuth('wupeiqi', 'sdfasdfasdf'))     # HTTPDigestAuth scheme
    # real anti-scraping defenses are rarely this simple; few sites verify credentials with just these two schemes
    print(ret.text)

    # ret = requests.get('http://192.168.1.1',)
    # auth=HTTPBasicAuth('admin', 'admin'))
    # ret.encoding = 'gbk'
    # print(ret.text)

    # ret = requests.get('http://httpbin.org/digest-auth/auth/user/pass', auth=HTTPDigestAuth('user', 'pass'))
    # print(ret)


# timeout   limits how long a request may take (sketch below)
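# A minimal sketch: httpbin.org/delay/5 answers after five seconds, so a two-second timeout raises
import requests
try:
    requests.get('http://httpbin.org/delay/5', timeout=2)    # a (connect, read) tuple also works
except requests.exceptions.Timeout:
    print('request timed out')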


# allow_redirects   whether to follow redirects
# suppose http://www.abc.com redirects to http://www.baidu.com
response = requests.get('http://www.abc.com',allow_redirects=False)
print(response.text)        # redirects are not followed, so you get http://www.abc.com's own response

response = requests.get('http://www.abc.com',allow_redirects=True)
print(response.text)       # you get http://www.baidu.com's content
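# With a real redirecting endpoint such as httpbin.org/redirect/1 you can inspect both behaviours
r = requests.get('http://httpbin.org/redirect/1', allow_redirects=False)
print(r.status_code, r.headers['Location'])    # ---> 302 /get
r = requests.get('http://httpbin.org/redirect/1', allow_redirects=True)
print(r.url, r.history)    # the final url plus the list of intermediate responses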


# proxies   route requests through a proxy so your IP doesn't get banned while scraping; you can buy proxies or run your own proxy servers (sketch below)
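# A sketch with hypothetical proxy addresses; the dict maps the target scheme to the proxy url
proxies = {
    'http': 'http://user:pass@10.10.1.10:3128',    # hypothetical proxy with auth
    'https': 'http://10.10.1.10:1080',             # hypothetical proxy
}
r = requests.get('http://httpbin.org/ip', proxies=proxies)
print(r.json())    # the ip the target server saw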

# stream    fetch the response body lazily instead of all at once (sketch below)

# verify / cert    TLS certificate options, e.g. for 12306's self-signed certificate; on sites like Zhihu the certificate check is optional
requests.get('https://httpbin.org/get', cert='xxxx.pem', verify=True)  # cert supplies a client-side certificate; verify controls whether the server's certificate is checked
Other parameters
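A sketch of stream in action: download a large body in chunks instead of loading it into memory at once.

import requests

with requests.get('http://httpbin.org/bytes/102400', stream=True) as r:
    with open('download.bin', 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):   # the body is fetched lazily, 8 KB at a time
            f.write(chunk)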

 

 

3. BeautifulSoup

BeautifulSoup parses HTML into an object tree, so you can pull elements out of the HTML by walking objects

#html_doc = 
#"""
# <html><head><title>The Dormouse's story</title></head>
# <body>
# asdf
#     <div class="title">
#         <b>The Dormouse's story總共</b>
#         <h1>f</h1>
#     </div>
# <div class="story">Once upon a time there were three little sisters; and their names were
#     <a  class="sister0" id="link1">Els<span>f</span>ie</a>,
#     <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
#     <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</div>
# ad<br/>sf
# <p class="story">...</p>
# </body>
# </html>
# """
#from bs4 import BeautifulSoup
#soup = BeautifulSoup(html_doc, features="lxml")		# 與BeautifulSoup(html_doc,'html.parser')不一樣的是使用的解析器不一樣,lxml性能更好,不過要安裝lxml模塊,推薦使用

#tag = soup.find(class_='story')
# print(tag)
# print(tag.name)
# #---> div
# # tag.name = 'span' # set
The name attribute
# print(tag.attrs)
# #---> {'class': ['story']}
# tag.attrs['kkk'] = 'vvv'
# print(tag.attrs)
# #---> {'class': ['story'], 'kkk': 'vvv'}
# del tag.attrs['kkk']
# print(tag.attrs)
# #---> {'class': ['story']}
The attrs attribute
# print(tag.children)
# #---> <list_iterator object at 0x0000000002EA32B0>
# print(list(tag.children))
# #---> ['Once upon a time there were three little sisters; and their names were\n    ', <a class="sister0" id="link1">Els<span>f</span>ie</a>, ',\n    ', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, ' and\n    ', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, ';\nand they lived at the bottom of a well.']
# for item in tag.children:
#     print(type(item),item)
# # ---> <class 'bs4.element.NavigableString'> Once upon a time there were three little sisters; and their names were
#
#     # <class 'bs4.element.Tag'> <a class="sister0" id="link1">Els<span>f</span>ie</a>
#     # <class 'bs4.element.NavigableString'> ,
#     #
#     # <class 'bs4.element.Tag'> <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
#     # <class 'bs4.element.NavigableString'>  and
#     #
#     # <class 'bs4.element.Tag'> <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
#     # <class 'bs4.element.NavigableString'> ;
#     # and they lived at the bottom of a well.
The children attribute
# print(tag)
# # ---> <div class="story">Once upon a time there were three little sisters; and their names were
#     #     <a class="sister0" id="link1">Els<span>f</span>ie</a>,
#     #     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
#     #     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
#     # and they lived at the bottom of a well.</div>
# tag.clear()
# print(tag)
# ---> <div class="story"></div>
clear: empties the tag's contents but keeps the tag itself
# tag.decompose()
# print(tag)
# #---> <None></None>
decompose: recursively removes the tag and everything inside it
# taga = tag.find(name='a')
# taga.extract()
# print(tag)
extract: recursively removes the tag and returns the removed tag
# print(tag.decode())
# #---> <div class="story">Once upon a time there were three little sisters; and their names were
#     #     <a class="sister0" id="link1">Els<span>f</span>ie</a>,
#     #     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
#     #     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
#     # and they lived at the bottom of a well.</div>
# print(type(tag.decode()))
# # ---> <class 'str'>
# print(tag.decode_contents(),type(tag.decode_contents()))
# #---> Once upon a time there were three little sisters; and their names were
# #     <a class="sister0" id="link1">Els<span>f</span>ie</a>,
# #     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# #     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# # and they lived at the bottom of a well. <class 'str'>
decode: converts the tag object to a str (decode_contents excludes the tag itself)
# print(type(tag.encode()))
# # ---> <class 'bytes'>
# print(tag.encode())
# #---> b'<div class="story">Once upon a time there were three little sisters; and their names were\n    <a class="sister0" id="link1">Els<span>f</span>ie</a>,\n    <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and\n    <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;\nand they lived at the bottom of a well.</div>'
# print(tag.encode_contents(),type(tag.encode_contents()))
encode: converts to bytes (including the tag itself); encode_contents excludes the tag itself
# tag = soup.find('a')
# print(tag)
# tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')    # recursive: search recursively; text: match the text content (rarely used)
# tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tag)
find: returns the first matching tag
# tags = soup.find_all('a')
# print(tags)

# tags = soup.find_all('a',limit=1)     # limit=1 stops after the first match
# print(tags)

# tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# # tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tags)
find_all: returns all matching tags
# v = soup.find_all(name=['a','div'])       # name=['a','div'] matches both 'a' and 'div' tags

# print(v)

# v = soup.find_all(class_=['sister0', 'sister'])   # matches class='sister0' or class='sister'
# print(v)

# v = soup.find_all(text=['Tillie'])
# print(v, type(v[0]))


# v = soup.find_all(id=['link1','link2'])
# print(v)

# v = soup.find_all(href=['link1','link2'])
# print(v)
List arguments
#import re
# rep = re.compile('p')
# rep = re.compile('^p')
# v = soup.find_all(name=rep)
# print(v)

# rep = re.compile('sister.*')
# v = soup.find_all(class_=rep)
# print(v)

# rep = re.compile('http://www.oldboy.com/static/.*')
# v = soup.find_all(href=rep)
# print(v)
Regular-expression arguments
# def func(tag):
#     return tag.has_attr('class') and tag.has_attr('id')       # tags for which this returns True end up in v = soup.find_all()
# v = soup.find_all(name=func)      # name=func walks every tag and calls the function once for each tag
# print(v)
Filtering with a function (rarely used)
# tag = soup.find('a')
# v = tag.get('id')
# print(v)
get: reads a tag attribute
# tag = soup.find('a')
# v = tag.has_attr('id')
# print(v)
has_attr: checks whether the tag has the given attribute
# tag = soup.find('a')
# v = tag.get_text()
# print(v)
get_text: returns the text inside the tag
# tag = soup.find('body')
# v = tag.index(tag.find('div'))
# print(v)
# tag = soup.find('body')
# for i,v in enumerate(tag):
#     print(i,v)
index: the index of a tag inside another tag
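A tiny check using the <br/> from the sample document:
# tag = soup.find('br')
# print(tag.is_empty_element)
# #---> True
# print(soup.find('div').is_empty_element)
# #---> False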
is_empty_element: whether the tag is an empty or self-closing element
# soup.next             # the next parsed node, tag or text (alias of next_element)
# soup.next_element     # the next parsed node in document order
# soup.next_elements
# soup.next_sibling     # the next sibling
# soup.next_siblings

# tag.previous
# tag.previous_element
# tag.previous_elements
# tag.previous_sibling
# tag.previous_siblings

# tag.parent
# tag.parents
Navigating from the current tag
# tag.find_next(...)
# tag.find_all_next(...)
# tag.find_next_sibling(...)
# tag.find_next_siblings(...)

# tag.find_previous(...)
# tag.find_all_previous(...)
# tag.find_previous_sibling(...)
# tag.find_previous_siblings(...)

# tag.find_parent(...)
# tag.find_parents(...)
# parameters are the same as find_all
Searching a tag's related tags
# soup.select("title")
#
# soup.select("p nth-of-type(3)")
#
# soup.select("body a")
#
# soup.select("html head title")
#
# tag = soup.select("span,a")
#
# soup.select("head > title")
#
# soup.select("p > a")
#
# soup.select("p > a:nth-of-type(2)")
#
# soup.select("p > #link1")
#
# soup.select("body > a")
#
# soup.select("#link1 ~ .sister")
#
# soup.select("#link1 + .sister")
#
# soup.select(".sister")
#
# soup.select("[class~=sister]")
#
# soup.select("#link1")
#
# soup.select("a#link2")
#
# soup.select('a[href]')
#
# soup.select('a[href="http://example.com/elsie"]')
#
# soup.select('a[href^="http://example.com/"]')
#
# soup.select('a[href$="tillie"]')
#
# soup.select('a[href*=".com/el"]')
#
# from bs4.element import Tag
#
#
# def default_candidate_generator(tag):
#     for child in tag.descendants:
#         if not isinstance(child, Tag):
#             continue
#         if not child.has_attr('href'):
#             continue
#         yield child
#
#
# tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator)
# print(type(tags), tags)
#
# from bs4.element import Tag
#
#
# def default_candidate_generator(tag):
#     for child in tag.descendants:
#         if not isinstance(child, Tag):
#             continue
#         if not child.has_attr('href'):
#             continue
#         yield child
#
#
# tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator, limit=1)
# print(type(tags), tags)
select / select_one: CSS selectors; select returns all matches, select_one returns one, and they take slightly different parameters
# tag = soup.find('span')
# print(tag.string)          # get
# tag.string = 'new content' # set
# print(soup)

# tag = soup.find('body')
# print(tag.string)
# tag.string = 'xxx'            # tag.text can't be assigned; use tag.string to change the content
# print(soup)

# tag = soup.find('body')
# v = tag.stripped_strings  # recursively yields the text of all inner tags
# print(v)
Tag content
# tag = soup.find('body')
# tag.append(soup.find('a'))
# print(soup)
# to append a genuinely new tag instead of moving an existing one, build it like this
# from bs4.element import Tag
# obj = Tag(name='i',attrs={'id': 'it'})
# obj.string = '我是一個新來的'
# tag = soup.find('body')
# tag.append(obj)
# print(soup)
append: appends a tag inside the current tag; if the appended tag already exists in the document, it is simply moved to the end
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一個新來的'
# tag = soup.find('body')
# tag.insert(2, obj)
# print(soup)
insert: inserts a tag at the given position inside the current tag
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一個新來的'
# tag = soup.find('body')
# # tag.insert_before(obj)
# tag.insert_after(obj)
# print(soup)
insert_after / insert_before: insert after or before the current tag
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一個新來的'
# tag = soup.find('div')
# tag.replace_with(obj)
# print(soup)
replace_with: replaces the current tag with the given tag
# tag = soup.find('div')
# a = soup.find('a')
# tag.setup(previous_sibling=a)
# print(tag.previous_sibling)
setup: creates relationships between tags; of little practical use, since it doesn't change the tags' actual positions
# from bs4.element import Tag
# obj1 = Tag(name='div', attrs={'id': 'it'})
# obj1.string = '我是一個新來的'
#
# tag = soup.find('a')
# v = tag.wrap(obj1)
# print(soup)

# tag = soup.find('a')
# v = tag.wrap(soup.find('p'))
# print(soup)
wrap: wraps the current tag in the given tag
# tag = soup.find('a')
# v = tag.unwrap()
# print(soup)
unwrap: removes the current tag but keeps what it wrapped