BeautifulSoup模塊詳解

時間 2019-11-12

標籤 beautifulsoup 模塊詳解简体版

原文鏈接

BeautifulSoup是一個模塊，該模塊用於接收一個HTML或XML字符串，然後將其進行格式化，之後便可使用它提供的方法快速查找指定元素，從而使得在HTML或XML中查找指定元素變得簡單。

官方文檔： http://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/

1.安裝導入

 
   pip install bs4 
  
   pip install lxml    # 解析模塊

 
   from bs4 import BeautifulSoup 
  

 
   soup = BeautifulSoup(html, features='lxml') 
  

2.基本使用

 
   # 找到第一個div標籤 
  
   soup.find(name='div') 
  
   # 找到全部的div標籤 
  
   soup.find_all(name='div') 
  
   # 找到id爲importdiv的標籤 
  
   soup.select('#importdiv')

3.具體使用

 
   ###### 1.name 標籤名稱 ###### 
  
   html = '<html><body><a>我是一個兵</a></body></html>' 
  
   soup = BeautifulSoup(html, 'lxml') 
  
   tag = soup.find('a') 
  
   print(tag)            # <a>我是一個兵</a> 
  
   print(tag.name)        # a 
  
   tag.name = 'div'     # 設置標籤 
  
   print(tag)            # <div>我是一個兵</div> 
  
   print(tag.name)        # div 
  
   ###### 2.attr 標籤屬性 ###### 
  
   html = '<html><body><a href="xxx" class="xxx">我是一個兵</a></body></html>' 
  
   soup = BeautifulSoup(html, 'lxml') 
  
   tag = soup.find('a') 
  
   # 獲取屬性 
  
   print(tag.attrs)    # {'href': 'xxx', 'class': ['xxx']} 
  
   # 設置屬性 
  
   tag.attrs = {'href': 'sss'} 
  
   print(tag.attrs)    # {'href': 'sss'} 
  
   ###### 3.children 全部子標籤 ###### 
  
   tag = soup.find('div')    # 找第一個div標籤 
  
   tag.children    # 找div下的全部子標籤 
  
   ###### 4.descendants 全部子子孫孫標籤 ###### 
  
   tag = soup.find('div')    # 找第一個div標籤 
  
   tag.descendants    # 找div下的全部的子子孫孫標籤 
  
   ###### 5.clear 清空子標籤 ###### 
  
   html = '<html><body><a href="xxx" class="xxx">我是一個兵</a></body></html>' 
  
   soup = BeautifulSoup(html, 'lxml') 
  
   tag = soup.find('body') 
  
   tag.clear()   # 將body下的全部內容及標籤清空 
  
   print(tag)    # <body></body> 
  
   ###### 6.decompose 遞歸刪除全部標籤 ###### 
  
   tag = soup.find('body') 
  
   tag.decompose() 
  
   print(tag)    # <None></None> 
  
   ###### 7.extract 遞歸刪除全部標籤，並獲取 ###### 
  
   html = '<html><body><a href="xxx" class="xxx">我是一個兵</a></body></html>' 
  
   soup = BeautifulSoup(html, 'lxml') 
  
   tag = soup.find('body')    # 遞歸刪除全部的標籤，並獲取刪除的標籤 
  
   delect_tags = tag.extract() 
  
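   print(tag)    # <body><a class="xxx" href="xxx">我是一個兵</a></body>（extract() 返回的就是被移除的標籤自己，tag 與 delect_tags 是同一個對象；此時 print(soup) 爲 <html></html>） 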
  
   print(delect_tags)    # <body><a class="xxx" href="xxx">我是一個兵</a></body> 
  
   ###### 8.decode和decode_contents ###### 
  
   # decode(): 轉換爲字符串（包含當前標籤） 
  
   # decode_contents(): 轉換爲字符串（不包含當前標籤） 
  
   print(tag.decode())    # <body><a class="xxx" href="xxx">我是一個兵</a></body> 
  
   print(tag.decode_contents())    # <a class="xxx" href="xxx">我是一個兵</a> 
  
   ###### 9.encode和encode_contents ###### 
  
   # encode(): 轉換爲字節（包含當前標籤） 
  
   # encode_contents(): 轉換爲字節（不包含當前標籤） 
  
   print(tag.encode())    # b'<body><a class="xxx" href="xxx">\xe6\x88\x91\xe6\x98\xaf\xe4\xb8\x80\xe5\x80\x8b\xe5\x85\xb5</a></body>' 
  
   print(tag.encode_contents())    # b'<a class="xxx" href="xxx">\xe6\x88\x91\xe6\x98\xaf\xe4\xb8\x80\xe5\x80\x8b\xe5\x85\xb5</a>' 
  
   ###### 10.find 獲取匹配的第一個標籤 ###### 
  
   html = '<html><body><a class="xxx">我是一個兵</a></body></html>' 
  
   soup = BeautifulSoup(html, 'lxml') 
  
   tag = soup.find(name='a', attrs={'class': 'xxx'}, text='我是一個兵', recursive=True) 
  
   ###### 11.find_all 獲取匹配的全部標籤 ###### 
  
   # 普通查找 
  
   soup.find_all('a') 
  
   # 限制查找條數 
  
   soup.find_all('a', limit=1) 
  
   # 使用列表來查找 
  
   soup.find_all(name=['a', 'img'])    # 查找標籤名爲a或img的，[<a class="xxx">我是一個兵</a>, <img/>, <a></a>] 
  
   soup.find_all(attrs={'class': ['xxx', 'yyy']})    # 查找class屬性爲xxx或yyy的，[<a class="xxx">我是一個兵</a>, <img class="yyy"/>] 
  
   # 也可適用於text，id，href等等 
  
   # 使用正則來查找 
  
   pattern = re.compile('^x')    # 匹配class以x開頭的標籤（需先 import re） 
  
   tag = soup.find_all(attrs={'class': pattern}) 
  
   # 正則能夠用於name；id；href等等 
  
   # 方法篩選 
  
   def foo(tag): 
  
       return tag.has_attr('class') and tag.has_attr('id') 
  
   soup.find_all(name=foo) 
  
   # get方法獲取標籤的屬性值 
  
   tag = soup.find('a') 
  
   print(tag.get('class'))    # ['xxx'] 
  
   ###### 12.has_attr 檢查標籤是否具備該屬性 ###### 
  
   tag = soup.find('a') 
  
   print(tag.has_attr('id'))    # False 
  
   ###### 13.get_text 獲取標籤文本內容 ###### 
  
   tag = soup.find('a') 
  
   print(tag.get_text())    # 我是一個兵 
  
   ###### 14.index 獲取標籤在某標籤中的索引位置 ###### 
  
   tag = soup.find('body') 
  
   print(tag.index(tag.find('a')))    # 0 
  
   tag = soup.find('body') 
  
   for k, v in enumerate(tag): 
  
       print(k, v)     # k爲索引；v爲標籤和內容 
  
   ###### 15.is_empty_element 是不是空標籤或者自閉合標籤 ###### 
  
   tag = soup.find('img') 
  
   print(tag.is_empty_element)   # True 
  
   ###### 16.當前標籤的關聯標籤 ###### 
  
   soup.next 
  
   soup.next_element 
  
   soup.next_elements 
  
   soup.next_sibling 
  
   soup.next_siblings 
  
   tag.previous 
  
   tag.previous_element 
  
   tag.previous_elements 
  
   tag.previous_sibling 
  
   tag.previous_siblings 
  
   tag.parent 
  
   tag.parents 
  
   ###### 17.查找某標籤的關聯標籤 ###### 
  
   tag.find_next(...) 
  
   tag.find_all_next(...) 
  
   tag.find_next_sibling(...) 
  
   tag.find_next_siblings(...) 
  
   tag.find_previous(...) 
  
   tag.find_all_previous(...) 
  
   tag.find_previous_sibling(...) 
  
   tag.find_previous_siblings(...) 
  
   tag.find_parent(...) 
  
   tag.find_parents(...) 
  
   # 參數同find_all 
  
   ###### 18.select, select_one, CSS選擇器 ###### 
  
   tag = soup.select('div > a.xxx')    # 選擇div的直接子標籤中class爲xxx的a標籤 
  
   tag = soup.select('div, a')    # 選擇div或a標籤 
  
   soup.select('a[href*=".com/el"]')    # 屬性值子串匹配：href中包含".com/el"的a標籤（並非正則匹配） 
  
   from bs4.element import Tag 
  
   def default_candidate_generator(tag): 
  
       for child in tag.descendants: 
  
           if not isinstance(child, Tag): 
  
               continue 
  
           if not child.has_attr('href'): 
  
               continue 
  
           yield child 
  
   tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator, limit=1) 
  
   print(type(tags), tags) 
  
   ###### 19.更改標籤的內容 ###### 
  
   html = '<html><body><a class="xxx">我是一個兵</a></body></html>' 
  
   soup = BeautifulSoup(html, 'lxml') 
  
   tag = soup.find('a') 
  
   print(tag.string)    # 獲取文本內容:我是一個兵 
  
   tag.string = '我不是一個兵'    # 設置值 
  
   print(soup)    # <html><body><a class="xxx">我不是一個兵</a></body></html> 
  
   tag = soup.find('body') 
  
   # 遞歸內部獲取全部標籤的文本 
  
   print(tag.stripped_strings  )    # <generator object stripped_strings at 0x0137FC60> 
  
   ###### 20.append 在當前標籤內部追加一個標籤 ###### 
  
   tag = soup.find('a') 
  
   tag.append(soup.find('img')) 
  
   print(soup) # <html><body><a class="xxx">我是一個兵<img/></a></body></html> 
  
   tag = soup.find('a') 
  
   from bs4.element import Tag 
  
   obj = Tag(name='span', attrs={'id': 'id_1'}) 
  
   obj.string = 'span標籤' 
  
   tag.append(obj) 
  
   print(soup)    # <html><body><a class="xxx">我是一個兵<span id="id_1">span標籤</span></a></body></html> 
  
   ###### 21.insert 在當前標籤內部指定位置插入一個標籤 ###### 
  
   obj = soup.find('img') 
  
   tag = soup.find('a') 
  
   tag.insert(0, obj)  # 插入最前邊 
  
   print(soup)    # <html><body><a class="xxx"><img/>我是一個兵</a></body></html> 
  
   ###### 22.insert_after, insert_before 在當前標籤後面或前面插入 ###### 
  
   obj = soup.find('img') 
  
   tag = soup.find('a') 
  
   tag.insert_after(obj) 
  
   tag.insert_before(obj) 
  
   ###### 23.replace_with 當前標籤替換爲指定標籤 ###### 
  
   obj = soup.find('img') 
  
   tag = soup.find('a') 
  
   tag.replace_with(obj) 
  
   ###### 24.setup 建立標籤之間的關係 ###### 
  
   obj = soup.find('img') 
  
   tag = soup.find('a') 
  
   tag.setup(next_element=obj) 
  
   print(tag.next) 
  
   ###### 25.wrap 用指定標籤把當前標籤包裹起來 ###### 
  
   obj = soup.find('img') 
  
   tag = soup.find('a') 
  
   tag.wrap(obj) 
  
   tag = soup.find('a') 
  
   tag.wrap(soup.find('img')) 
  
   ###### 26.unwrap 去掉當前標籤，將保留其包裹的標籤 ###### 
  
   tag = soup.find('a') 
  
   tag.unwrap()

1. BeautifulSoup模塊函數詳解
2. BeautifulSoup解析模塊
3. 03 解析庫之Beautifulsoup模塊 Beautifulsoup模塊
4. 解析庫-beautifulsoup模塊
5. BeautifulSoup模塊
6. Beautifulsoup模塊
7. python3 BeautifulSoup模塊
8. python模塊--Beautifulsoup
9. requsets模塊和beautifulsoup模塊
10. 【一塊兒學爬蟲】BeautifulSoup庫詳解
更多相關文章...
• Lua 模塊與包 - Lua 教程
• DTD - XML 構建模塊 - DTD 教程
• 委託模式
• Flink 數據傳輸及反壓詳解

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。