BeautifulSoup

 

#pip3 install beautifulsoup4

from bs4 import BeautifulSoup
# Sample HTML fragment used as input for the filtering demo.
# It mixes plain paragraphs with a <script> tag, <span>/<br> tags and an
# <img> carrying an extra "alt" attribute, so the whitelist filter has
# something to remove.
# This name must be assigned before any BeautifulSoup(comment, ...) call;
# parsing it earlier raises NameError because `comment` does not exist yet.
comment = """
    <p id="i1">
        我是中國人
    </p>
    <p >
       <script>alert(123)</script>
    </p>
    <p id="i2">
        <span>我是中國人</span>
    </p>
    <p>
        <br />
    </p>
    <p id="i3">
        <span>我是中國人</span><img src="/static/images/1.jpg" alt="" />
    </p>

"""

#pip3 install beautifulsoup4

from bs4 import BeautifulSoup
# Parse the sample markup into a tree of objects.
# "html.parser" is the built-in parser that BeautifulSoup is told to use here.
soup = BeautifulSoup(comment, features="html.parser")

# tag = soup.find(name="span")    #找第一個標籤
# print(tag)

# obj = soup.find(attrs={"id":"i2"})  #查找屬性 查找第一個
# print(obj)

# obj = soup.find(name="p",attrs={"id":"i2"})  #而且
# print(obj)


# obj = soup.find_all(name="p") #查找屬性 查找全部
# print(obj)



#查找全部內容,匹配到的清空內容,不刪除標籤clear()
# valid_tag = ["p","img","div"]
#
# tags = soup.find_all()
# for tag in tags:
#     if tag.name not in valid_tag:
#         tag.clear()
# print(soup)


#查找全部內容,匹配到的刪除標籤
# valid_tag = ["p","img","div"]
#
# tags = soup.find_all()
# for tag in tags:
#     if tag.name not in valid_tag:
#         tag.decompose()
# print(soup)

#取到的soup是對象,轉換成字符串
# print(soup.decode())


# Whitelist filter: keep only the tags named in `valid_tag`, and on each kept
# tag keep only the attributes listed for it. Any other tag is removed from
# the tree; any other attribute is deleted from its tag.
valid_tag = {
    "p": ["class", "id"],
    "img": ["src"],
    "div": ["class"],
}

tags = soup.find_all()
for tag in tags:
    allowed_attrs = valid_tag.get(tag.name)
    if allowed_attrs is None:
        # Tag is not whitelisted: remove it from the tree.
        tag.decompose()
        # Skip the attribute pass for a removed tag. Without this, the loop
        # would go on to look up a tag name that is not a key of `valid_tag`.
        continue
    # Iterate over a copy of the attribute names because entries are deleted
    # from tag.attrs inside the loop.
    for attr_name in list(tag.attrs):
        if attr_name not in allowed_attrs:
            del tag.attrs[attr_name]

# The soup is an object; decode() turns it back into a string.
content_str = soup.decode()
print(content_str)
過濾演示.py
相關文章
相關標籤/搜索