介紹:
最近在學Python爬蟲,在這裏對數據解析模塊bs4作個學習筆記。
用途:學習
bs4(Beautiful Soup 4)用於解析HTML和XML文檔並從中提取數據;HTML與XML同屬標記語言,但HTML並不是XML的一種。
bs4 官方文檔地址:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
學習筆記:
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc,'html.parser') #建立一個BeautifulSoup對象並指定解析器;若省略解析器參數,bs4會自動選用當前環境中已安裝的最佳解析器並發出警告,不一樣的環境可能得到不一樣的解析結果
print(soup.prettify()) #美化輸出
print(soup.get_text()) #將html_doc變量中保存的所有內容輸出(Linux系統會以\n隔開)
print('')
print(type(soup.title))
print(dir(soup.title))
print(soup.title) #獲取html標題
<title>The Dormouse's story</title>
print(soup.title.text) #獲取html標題內容
"The Dormouse's story"
print(soup.a) #獲取a標籤(第一個)
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print(soup.a.attrs) #獲取第一個a標籤的全部屬性,組成一個字典
{'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'}
print(soup.a.attrs['href']) #獲取第一個a標籤的href屬性
'http://example.com/elsie'
print(soup.a.has_attr('class')) #判斷class屬性是否存在
True
print(soup.p) #獲取p標籤(第一個)
<p class="title"><b>The Dormouse's story</b></p>
print(soup.p.children) #獲取第一個p標籤下的全部子節點
<list_iterator object at 0x7fe8185261d0>
print(list(soup.p.children))
[<b>The Dormouse's story</b>]
print(list(soup.p.children)[0])
<b>The Dormouse's story</b>
print(list(soup.p.children)[0].text)
"The Dormouse's story"
print(soup.find_all('a')) #獲取全部的a標籤
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
for a in soup.find_all('a'): #遍歷全部的a標籤
    print(a.attrs['href'])
print(soup.find(id='link3')) #獲取id=link3的標籤
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
print('#'*150)
#支持CSS選擇器
#查找類名爲story的節點
print(soup.select('.story'))
print('')
print(soup.select('.story a'))
print('')
#查找id=link1的節點
print(soup.select('#link1'))