Python 爬蟲之數據解析模塊bs4基礎

介紹:

最近在學Python爬蟲,在這裏對數據解析模塊bs4作個學習筆記。


用途:學習

bs4用於解析xml文檔,而html只是xml的一種


bs4 官方文檔地址:

https://www.crummy.com/software/BeautifulSoup/bs4/doc/


學習筆記:


from bs4 import BeautifulSoup


html_doc = """

<html><head><title>The Dormouse's story</title></head>

<body>

<p class="title"><b>The Dormouse's story</b></p>


<p class="story">Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>


<p class="story">...</p>

"""


soup = BeautifulSoup(html_doc,'html.parser')    #建立一個BeautifulSoup對象,並明確指定html解析器;若不指定,bs4會自動選用當前環境中可用的最佳解析器並發出警告,不同環境下解析結果可能不同

print(soup.prettify())    #美化輸出

print(soup.get_text())    #將html_doc中去除所有標籤後的文本內容輸出(Linux系統會以\n隔開)

print('')


print(type(soup.title))

print(dir(soup.title))


print(soup.title)    #獲取html標題

    <title>The Dormouse's story</title>

print(soup.title.text)    #獲取html標題內容

    "The Dormouse's story"


print(soup.a)       #獲取a標籤(第一個)

    <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

print(soup.a.attrs)   #獲取第一個a標籤的全部屬性,組成一個字典

    {'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'}

print(soup.a.attrs['href'])    #獲取第一個a標籤的href屬性

    'http://example.com/elsie'

print(soup.a.has_attr('class'))     #判斷class屬性是否存在

    True


print(soup.p)    #獲取p標籤(第一個)

    <p class="title"><b>The Dormouse's story</b></p>

print(soup.p.children)    #獲取第一個p標籤下的全部子節點

    <list_iterator object at 0x7fe8185261d0>

print(list(soup.p.children))

    [<b>The Dormouse's story</b>]

print(list(soup.p.children)[0])

    <b>The Dormouse's story</b>

print(list(soup.p.children)[0].text)

    "The Dormouse's story"


print(soup.find_all('a'))    #獲取全部的a標籤

    [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

for a in soup.find_all('a'):   #遍歷全部的a標籤

    print(a.attrs['href'])


print(soup.find(id='link3'))    #獲取id=link3的標籤

    <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

print('#'*150)


#支持CSS選擇器

#查找類名爲story的節點

print(soup.select('.story'))

print('')

print(soup.select('.story a'))

print('')

#查找id=link1的節點

print(soup.select('#link1'))

相關文章
相關標籤/搜索