1、BeautifulSoup安裝
pip install beautifulsoup4
2、使用示例
from bs4 import BeautifulSoup

# Sample document shared by every example below. It is assembled from
# adjacent string literals so the markup (including its single-space
# separators) stays exactly as published while remaining readable.
html_doc = (
    " <html><head><title>The Dormouse's story</title></head> "
    "<body> asdf "
    '<div class="title"> '
    "<b>The Dormouse's story總共</b> "
    "<h1>f</h1> "
    "</div> "
    '<div class="story">Once upon a time there were three little sisters; '
    "and their names were "
    '<a class="sister0" id="link1">Els<span>f</span>ie</a>, '
    '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and '
    '<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; '
    "and they lived at the bottom of a well.</div> "
    "ad<br/>sf "
    '<p class="story">...</p> '
    "</body> "
    "</html> "
)

# Parse the markup into a navigable tree.
# NOTE: features="lxml" selects the lxml parser, which requires the
# third-party `lxml` package to be installed alongside beautifulsoup4.
soup = BeautifulSoup(html_doc, features="lxml")
1.name,標籤名稱
# Look up the first <a> element and show it.
tag1 = soup.find('a')
print(tag1)

# Read the tag's name.
name = tag1.name
print(name)

# Assigning to .name renames the element in place (here to <span>);
# printing the whole document shows the change.
tag1.name = 'span'
print(soup)
2.attrs,標籤屬性
# Read the attributes of the first <a> tag.
tag2 = soup.find('a')
attrs = tag2.attrs
print(attrs)

# Find every <a> tag whose id attribute equals "link1".
link1 = soup.find_all('a', attrs={'id': 'link1'})
print(link1)

# Assigning to .attrs replaces all of the tag's attributes at once.
tag2.attrs = {'ik': 123}
print(tag2.attrs)

# Item assignment on .attrs adds a single attribute ...
tag2.attrs['id'] = 'xxxx'
print(tag2.attrs)

# ... or overwrites it when the key already exists.
tag2.attrs['id'] = 'qqq'
print(tag2.attrs)
3.find與find_all查找區別
# find returns only the first matching tag.
tag3 = soup.find('a')
print(tag3)

# find_all returns every matching tag.
tag4 = soup.find_all('a')
print(tag4)
4.clear,將標籤的所有子標籤全部清空(保留標籤名)
# Empty <body>: clear() removes the tag's contents but leaves the
# (now empty) tag itself in the document.
tag5 = soup.find('body')
tag5.clear()
print(soup)
5.has_attr,檢查標籤是否具備該屬性
# Check whether the first <a> tag carries an "id" attribute.
# NOTE(review): earlier examples mutate `soup` (the clear() example empties
# <body>), so this presumably expects a freshly parsed document.
tag6 = soup.find('a')
v = tag6.has_attr('id')
print(v)
6.get_text,獲取標籤內部文本內容
# Collect all text inside the first <a> tag, including nested tags.
# get_text()'s first positional argument is the separator placed between
# text fragments, so the 'id' previously passed here was being inserted
# into the output; call it without arguments to get the plain text.
# NOTE(review): earlier examples mutate `soup`, so this presumably expects
# a freshly parsed document.
tag7 = soup.find('a')
v = tag7.get_text()
print(v)
7.decompose,遞歸地刪除所有的標籤
# Remove <body> and everything inside it from the tree, then show what
# is left of the document.
body = soup.find('body')
body.decompose()
print(soup)
8.extract,遞歸地刪除所有的標籤,並獲取刪除的標籤
# Detach <body> from the tree. Unlike decompose(), extract() returns the
# removed element, so keep and print it to show what was taken out.
# NOTE(review): the decompose() example already removes <body>, so this
# presumably expects a freshly parsed document.
body = soup.find('body')
removed = body.extract()
print(removed)
print(soup)
9.decode,轉換爲字符串(含當前標籤);decode_contents(不含當前標籤)
# Render <body> as a str.
# decode_contents() renders only what is inside the tag; use
# body.decode() instead to include the <body> tag itself.
# NOTE(review): earlier examples remove <body>, so this presumably expects
# a freshly parsed document.
body = soup.find('body')
v = body.decode_contents()
print(v)
10.encode,轉換爲字節(含當前標籤);encode_contents(不含當前標籤)
# Render <body> as bytes.
# encode_contents() renders only what is inside the tag; use
# body.encode() instead to include the <body> tag itself.
# NOTE(review): earlier examples remove <body>, so this presumably expects
# a freshly parsed document.
body = soup.find('body')
v = body.encode_contents()
print(v)
11.標籤的內容
# Read the text content of the first <span>.
# NOTE(review): earlier examples mutate `soup`, so this presumably expects
# a freshly parsed document.
tag8 = soup.find('span')
print(tag8.string)
print(tag8)

# Assigning to .string replaces the tag's content.
tag8.string = 'new content'
print(tag8)

# stripped_strings walks the whole subtree lazily and yields each text
# fragment with surrounding whitespace removed, so printing `v` shows a
# generator object and next(v) pulls the first fragment.
tag9 = soup.find('body')
v = tag9.stripped_strings
print(v)
print(next(v))
12.children,全部子標籤
# .children iterates over the direct children of <body> only.
# NOTE(review): earlier examples remove <body>, so this presumably expects
# a freshly parsed document.
body = soup.find('body')
v = body.children
13.descendants,全部子子孫孫標籤
# .descendants iterates over every node nested under <body>, at any depth.
# NOTE(review): earlier examples remove <body>, so this presumably expects
# a freshly parsed document.
body = soup.find('body')
v = body.descendants
更多用法可以查閱官方文檔:https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html