# Whitelist-based HTML sanitising demo built on BeautifulSoup.
#
# Setup: pip3 install beautifulsoup4
#
# The script parses the HTML fragment in `comment`, removes every tag that is
# not whitelisted in `valid_tag`, strips every attribute that is not
# whitelisted for its tag, and prints the resulting markup.
from bs4 import BeautifulSoup

# Sample user-submitted HTML. It deliberately contains a <script> tag, <span>
# wrappers and an `alt` attribute so the filtering below has something to do.
comment = (
    ' <p id="i1"> 我是中國人 </p>'
    ' <p > <script>alert(123)</script> </p>'
    ' <p id="i2"> <span>我是中國人</span> </p>'
    ' <p> <br /> </p>'
    ' <p id="i3"> <span>我是中國人</span><img src="/static/images/1.jpg" alt="" /> </p> '
)

# "html.parser" is the parser bundled with Python; BeautifulSoup uses it to
# turn the markup into a tree of objects.
soup = BeautifulSoup(comment, "html.parser")

# Quick reference for the lookups and removals explored while writing this:
#   soup.find(name="span")                     -> first tag with that name
#   soup.find(attrs={"id": "i2"})              -> first tag with that attribute
#   soup.find(name="p", attrs={"id": "i2"})    -> both conditions must match
#   soup.find_all(name="p")                    -> every matching tag
#   tag.clear()                                -> empty the tag's contents, keep the tag
#   tag.decompose()                            -> remove the tag together with its contents
#   soup.decode()                              -> serialise the soup object to a string

# Allowed tags mapped to the attributes each one may keep. Any tag missing
# from this mapping is removed; any attribute missing from a tag's list is
# deleted from that tag.
valid_tag = {
    "p": ["class", "id"],
    "img": ["src"],
    "div": ["class"],
}

# find_all() returns a snapshot list, so the tree can be mutated while
# iterating over it.
tags = soup.find_all()
for tag in tags:
    # The snapshot still holds descendants of tags destroyed earlier in this
    # loop; they are already gone from the tree, so skip them.
    # (`Tag.decomposed` requires beautifulsoup4 >= 4.9.0.)
    if tag.decomposed:
        continue

    if tag.name not in valid_tag:
        tag.decompose()
        # A destroyed tag must not reach the attribute filter below:
        # valid_tag has no entry for its name.
        continue

    allowed_attrs = valid_tag[tag.name]
    # Copy the keys first because attributes are deleted during iteration.
    for attr_name in list(tag.attrs):
        if attr_name not in allowed_attrs:
            del tag.attrs[attr_name]

# The soup is an object tree; decode() converts it back to an HTML string.
content_str = soup.decode()
print(content_str)