Python -- BeautifulSoup的學習使用

時間 2019-11-08

標籤 python beautifulsoup 學習使用欄目 Python 简体版

原文原文鏈接

BeautifulSoup4.3 的使用

下載和安裝

# 下載
http://www.crummy.com/software/BeautifulSoup/bs4/download/  # 解壓後 使用root執行 # python setup.py install # 最後 在python中測試是否成功 >>> import bs4

簡單使用:

供練習的 Html Document

html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <p class="title"><b>The Dormouse's story</b></p> <p class="story">Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> """

>>> from bs4 import BeautifulSoup >>> soup = BeautifulSoup(html_doc) # 知識點1： 打印漂亮的html soup.prettify() >>> print(soup.prettify()) <html> <head> <title> The Dormouse's story </title> </p> </body> </html> # 知識點2 解析獲取html標籤 >>> soup.title <title>The Dormouse's story</title> >>> soup.title.name 'title' >>> soup.title.string u"The Dormouse's story" >>> soup.title.parent.name 'head' >>> soup.p <p class="title"><b>The Dormouse's story</b></p> >>> soup.p['class'] ['title'] >>> soup.a <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> >>> soup.find_all('a') [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] >>> soup.find(id='link3') <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a> 總結: >>> soup.title # 獲取第一個title標籤 >>> soup.title.name # 獲取標籤名 --> title >>> soup.title.string # 獲取標籤內容 >>> soup.title.parent.name # 獲取title標籤的父標籤 >>> soup.p['class'] # 獲取第一個p標籤的class屬性的值 >>> soup.find_all('a') #獲取全部a標籤 >>> soup.find(id='link3') #獲取第一個id的值爲link3的標籤 # 知識點3 獲取全部超連接 >>> for link in soup.find_all('a'): ... print(link.get('href')) ... http://example.com/elsie http://example.com/lacie http://example.com/tillie # 知識點4 獲取全部文本 >>> print(soup.get_text()) The Dormouse's story Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

BeautifulSoup的四大對象 Tag, NavigableString, BeautifulSoup, and Comment.

Tag

>>> soup = BeautifulSoup('<b class="boldest">Extremely bold</b>') >>> tag = soup.b >>> type(tag) <class 'bs4.element.Tag'> >>> tag.name 'b' >>> tag.name = 'blockquote' >>> tag <blockquote class="boldest">Extremely bold</blockquote> >>> tag['class'] ['boldest'] >>> tag.attrs {'class': ['boldest']} >>> tag['class'] = 'verybold' >>> tag['id'] = 1 >>> tag <blockquote class="verybold" id="1">Extremely bold</blockquote> >>> del tag['class'] >>> del tag['id'] >>> tag <blockquote>Extremely bold</blockquote> >>> print(tag.get('class')) None >>> css_soup = BeautifulSoup('<p class="body strikeout"></p>') >>> css_soup.p['class'] ['body', 'strikeout'] >>> css_soup = BeautifulSoup('<p class="body"></p>') >>> css_soup.p['class'] ['body'] >>> id_soup = BeautifulSoup('<p id="my id"></p>') >>> id_soup.p['id'] 'my id' >>> rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a>') >>> rel_soup.a['rel'] ['index'] >>> rel_soup.a['rel'] = ['index', 'contents'] >>> print(rel_soup.p) <p>Back to the <a rel="index contents">homepage</a></p> 總結: tag['class'] # 獲取tag標籤class屬性的值 tag.attrs # 獲取tag標籤全部屬性 del tag['class'] # 刪除tag標籤的class屬性  關於多值屬性 >>> css_soup = BeautifulSoup('<p class="body strikeout"></p>') >>> css_soup.p['class'] ['body', 'strikeout'] >>> id_soup = BeautifulSoup('<p id="my id"></p>') >>> id_soup.p['id'] 'my id' 總結: BeautifulSoup對於容許多值的屬性 返回list， 對於不是多值的屬性， 就只返回str

NavigableString -- 和String差不多

>>> tag.string
u'Extremely bold' >>> type(tag.string) <class 'bs4.element.NavigableString'> >>> unicode_string = unicode(tag.string) >>> unicode_string u'Extremely bold' >>> type(unicode_string) <type 'unicode'> >>> tag.string.replace_with('No loger bold') u'Extremely bold' >>> tag <blockquote>No loger bold</blockquote> 總結: 1. NavigableString能夠轉換爲unicode 2. 若是想替換NavigableString的值， 使用 replace_with()方法

BeautifulSoup對象 -- 整個Html Document對象

>>> soup.name
u'[document]' >>> soup <html><body><blockquote>No loger bold</blockquote></body></html> >>> type(soup) <class 'bs4.BeautifulSoup'>

Comments and other special strings

markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>" soup = BeautifulSoup(markup) comment = soup.b.string type(comment) # <class 'bs4.element.Comment'> print(soup.b.prettify()) # <b> # <!--Hey, buddy. Want to buy a used parser?--> # </b>

解析HTML函數

準備測試

html_doc = """ <html><head><title>The Dormouse's story</title></head> <p class="title"><b>The Dormouse's story</b></p> <p class="story">Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> """ from bs4 import BeautifulSoup soup = BeautifulSoup(html_doc)

最簡單 -- 使用標籤名

>>> soup.head
<head><title>The Dormouse's story</title></head> >>> soup.body.b <b>The Dormouse's story</b> >>> soup.find_all('a') [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] 總結: 1. soup.head # 獲取第一個head標籤 2. soup.body.b # 獲取第一個body下第一個b標籤 3. soup.find_all('a') # 獲取全部a標籤

.contents and .children

html_doc = ''' <html> <body> <a> href1 </a> <a> href2 </a> </body> </html> ''' # 獲取子標籤的方法1 -- 使用 .contents 用contents[0], contents[1]訪問 >>> soup2 = BeautifulSoup(html_doc) >>> contents = soup2.body.contents >>> contents[0] <a>href1</a> >>> contents[1] <a>href2</a> # 方法2 -- 使用 .children 用於遍歷 >>> for child in soup2.body.children: ... print(child) ... <a>href1</a> <a>href2</a>

.descendants

.children 和 .contents只能獲取直接後代  而 .descendants能夠得到全部後代
>>> head_tag.contents [<title>The Dormouse's story</title>] >>> for child in head_tag.descendants: ... print(child) ... <title>The Dormouse's story</title> The Dormouse's story >>> head_tag <head><title>The Dormouse's story</title></head> >>> >>> len(list(soup.children)) 1 >>> len(list(soup.descendants)) 25 >>>

.string

>>> title_tag
<title>The Dormouse's story</title> >>> title_tag.string u"The Dormouse's story" >>> print(soup.html.string) None 總結： 1. 若是一個標籤下面沒有其餘標籤， 那麼.string就是他的值 2. 若是一個標籤下面還有子標籤， 那麼.string爲 None

.strings and .stripped_strings

>>> for string in soup.strings: ... print(repr(string)) ... u"The Dormouse's story" u'\n' u"The Dormouse's story" u'\n' u'Once upon a time there were three little sisters; and their names were\n' u'Elsie' u',\n' u'Lacie' u' and\n' u'Tillie' u';\nand they lived at the bottom of a well.' u'\n' u'...' u'\n' >>> for string in soup.stripped_strings: ... print(repr(string)) ... u"The Dormouse's story" u"The Dormouse's story" u'Once upon a time there were three little sisters; and their names were' u'Elsie' u',' u'Lacie' u'and' u'Tillie' u';\nand they lived at the bottom of a well.' u'...' 總結: 1. .strings 獲取一個標籤下面全部的string 2. .stripped_strings: 忽略只含空白（例如 '\n'）的string， 並去掉每個string首尾的空白 3. 關於 repr --> 將object轉換成 string

.parent

# 例子1 >>> title_tag = soup.title >>> title_tag <title>The Dormouse's story</title> >>> title_tag.parent <head><title>The Dormouse's story</title></head> # 例子2 >>> title_tag.string.parent <title>The Dormouse's story</title> # 例子3 >>> html_tag = soup.html >>> type(html_tag.parent) <class 'bs4.BeautifulSoup'> # 例子4 >>> print(soup.parent) None 總結: 1. html標籤的父標籤是 BeautifulSoup對象 2. BeautifulSoup沒有父標籤 （根節點）

.parents

>>> link = soup.a
>>> link <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> >>> for parent in link.parents: ... if parent is None: ... print(parent) ... else: ... print(parent.name) ... p body html

兄弟節點

預備解析HTML

>>> sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></b></a>") >>> print(sibling_soup.prettify()) <html> <body> <a> <b> text1 </b> <c> text2 </c> </a> </body> </html>

.next_sibling and .previous_sibling

# 例子1 對照上面的prettify()輸出 >>> sibling_soup.b.next_sibling <c>text2</c> >>> sibling_soup.c.previous_sibling <b>text1</b> # 例子2 看prettify()的輸出， 能夠看到 b標籤上面沒有兄弟標籤 c標籤下面也沒有兄弟標籤 所以輸出是None >>> print(sibling_soup.b.previous_sibling) None >>> print(sibling_soup.c.next_sibling) None # 例子3 注意點: text1沒有兄弟節點 由於它和text2不是同一個父親！ >>> sibling_soup.b.string u'text1' >>> print(sibling_soup.b.string.next_sibling) None # 例子4 注意點： 第一個a標籤的下一個兄弟節點是 ',\n', 而不是 下一個<a>標籤！（若是沒有排版就不會） 先看全部的a標籤 <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a> <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a> >>> link = soup.a >>> link <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> >>> link.next_sibling u',\n' >>> link.next_sibling.next_sibling <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> # 例子5： 驗證上面的說法 -- 沒有排版的話， a標籤的下一個兄弟節點就不是 '\n' >>> html_doc = '<a href="link1"></a><a href="link2"></a>' >>> soup2 = BeautifulSoup(html_doc) >>> link = soup2.a >>> link <a href="link1"></a> >>> link.next_sibling <a href="link2"></a>

.next_siblings and .previous_siblings

# 例子1 >>> for sibling in soup.a.next_siblings: ... print(repr(sibling)) ... u',\n' <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> u' and\n' <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a> u';\nand they lived at the bottom of a well.' # 例子2 >>> for sibling in soup.find(id='link3').previous_siblings: ... print(repr(sibling)) ... u' and\n' <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> u',\n' <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> u'Once upon a time there were three little sisters; and their names were\n'

.next_element and .previous_element

預備知識:
<html><head><title>The Dormouse's story</title></head></html> HTML解析器如何解析這段？ 打開html標籤， 打開head標籤， 打開title標籤， 保存 'The Dormouse's story'這個string. 關閉 title標籤， 關閉 head標籤， 關閉html標籤 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a> # 例子 >>> second_a_tag = soup.find('a', id='link2') >>> second_a_tag <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> >>> second_a_tag.next_sibling u' and\n' >>> second_a_tag.next_element u'Lacie' 總結: HTML解析器讀取到<a id='link2'>處， 因此下一個元素是 Lacie, 再下一個元素是 u' and\n' (注： 結束標籤不算在這裏面)

.next_elements and .previous_elements

>>> last_a_tag = soup.find('a', id='link3') >>> for element in last_a_tag.next_elements: ... print(repr(element)) ... u'Tillie' u';\nand they lived at the bottom of a well.' u'\n' <p class="story">...</p> u'...' u'\n'

find() and find_all()

find_all() 的簡單使用 Signature: find_all(name, attrs, recursive, text, limit, **kwargs)

# 例子1. 搜索全部title標籤 >>> soup.find_all('title') [<title>The Dormouse's story</title>] # 例子2. 搜索全部class爲title 的 p標籤 >>> soup.find_all('p', 'title') [<p class="title"><b>The Dormouse's story</b></p>] # 例子3. 搜索全部a標籤 >>> soup.find_all('a') [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] # 例子4. 搜索全部 id爲link2的標籤 >>> soup.find_all(id='link2') [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>] # 例子5. 搜索text中帶有 sisters的標籤 >>> import re >>> soup.find(text=re.compile('sisters')) u'Once upon a time there were three little sisters; and their names were\n'

使用函數做爲參數

>>> def has_class_but_no_id(tag): ... return tag.has_attr('class') and not tag.has_attr('id') ... >>> soup.find_all(has_class_but_no_id) [<p class="title"><b>The Dormouse's story</b></p>, <p class="story">Once upon a time there were three little sisters; and their names were <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>; and they lived at the bottom of a well.</p>, <p class="story">...</p>]

find_all()的進階使用

# 例子1. 搜索全部有id屬性的標籤 >>> soup.find_all(id=True) [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] # 例子2. 搜索href中帶有 elsie 而且 id的值爲link1的標籤 >>> soup.find_all(href=re.compile('elsie'), id='link1') [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>] # 例子3. 對於特殊的屬性名 >>> data_soup = BeautifulSoup('<div data-foo="value">foo!</div>') >>> data_soup.find_all(data-foo='value') File "<stdin>", line 1 SyntaxError: keyword can't be an expression  這樣是不行的 使用 attrs={} >>> data_soup.find_all(attrs={'data-foo': 'value'}) [<div data-foo="value">foo!</div>]

Searching by CSS class

注意： class是Python的保留字， 因此使用的時候， 用 class_替代 （class的最後多一個下劃線）
# 例子1 搜索全部class爲sister的a標籤 >>> soup.find_all('a', class_='sister') [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] # 例子2 搜索class中帶 itl的標籤 >>> soup.find_all(class_=re.compile('itl')) [<p class="title"><b>The Dormouse's story</b></p>] # 例子3 使用函數做爲參數 若是返回結果爲True, 則matches這個標籤 >>> def has_six_characters(css_class): ... return css_class is not None and len(css_class) == 6 ... >>> soup.find_all(class_=has_six_characters) [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] # 例子4 一個標籤能夠有多個值的屬性 好比 class >>> css_soup = BeautifulSoup('<p class="body strikeout"></p>') >>> css_soup.find_all('p', class_='strikeout') [<p class="body strikeout"></p>] >>> css_soup.find_all('p', class_='body') [<p class="body strikeout"></p>] 注： 對於有多個值的屬性， 咱們能夠經過其中的一個值搜索到它們 # 例子5 不過 若是一塊兒搜索 順序不能顛倒 >>> css_soup.find_all('p', class_='strikeout body') [] # 例子6 咱們能夠經過CSS selector選擇咱們要的標籤 >>> css_soup.select('p.strikeout.body') [<p class="body strikeout"></p>] # 例子7 對於不支持 class_的早期版本， 使用 attrs={} >>> soup.find_all('a', attrs={'class': 'sister'}) [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

The text argument

With text you can search for strings instead of tags. As with name and the keyword arguments, you can pass in a string, a regular expression, a list, a function, or the value True. Here are some examples: # 例子1. 使用string做爲參數 >>> soup.find_all(text='Elsie') [u'Elsie'] # 例子2. 使用List做爲參數 >>> soup.find_all(text=['Tillie', 'Elsie', 'Lacie']) [u'Elsie', u'Lacie', u'Tillie'] # 例子3. 使用正則表達式做爲參數 >>> soup.find_all(text=re.compile('Dormouse')) [u"The Dormouse's story", u"The Dormouse's story"] # 例子4. 使用函數做爲參數 >>> def is_the_only_string_within_a_tag(s): ... return (s == s.parent.string) ... >>> soup.find_all(text=is_the_only_string_within_a_tag) [u"The Dormouse's story", u"The Dormouse's story", u'Elsie', u'Lacie', u'Tillie', u'...'] # 例子5. 聯合其餘參數一塊兒搜索 >>> soup.find_all('a', text='Elsie') [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

The limit argument

# 若是HTML文件不少， 解析速度就慢 這個時候 能夠指定BeautifulSoup搜索的個數 # 例子: 只搜索符合條件的前兩個結果 >>> soup.find_all('a', limit=2) [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

The recursive argument

解析的HTML
<html>
 <head>
  <title> The Dormouse's story </title> </head> # 例子：title是head下面的， 而不是html下面的（直接） 若是關閉遞歸， 就找不到title了。 也就是說， 開啓遞歸， 不只搜索兒子， 還搜索孫子。 若是關閉遞歸， 就只搜索兒子 >>> soup.html.find_all('title') [<title>The Dormouse's story</title>] >>> soup.html.find_all('title', recursive=False) []

Calling a tag is like calling find_all()

tag標籤也可使用find_all(), 像BeautifulSoup對象同樣

# 下面這兩個是相等的 soup.title.find_all(text=True) soup.title(text=True)

find()

Signature: find(name, attrs, recursive, text, **kwargs)

find()的簡單使用

# 例子1  這兩個幾乎等價： find_all(limit=1) 返回的是只含一個結果的 list， 而 find 直接返回該結果本身 （不帶 limit 的 find_all 會搜索整個文檔， 速度較慢）
>>> soup.find_all('title', limit=1)
[<title>The Dormouse's story</title>]
>>> soup.find('title')
<title>The Dormouse's story</title>


# 例子2 若是搜索不到相關的標籤， find返回的是None， 而find_all返回的是空 list
>>> print(soup.find('nosuchtag'))
None
>>> print(soup.find_all('nosuchtag'))
[]

# 例子3  這兩個是相等的
>>> soup.head.title
<title>The Dormouse's story</title>
>>> soup.find('head').find('title')
<title>The Dormouse's story</title>

find_parents() and find_parent()

Signature: find_parents(name, attrs, text, limit, **kwargs)

Signature: find_parent(name, attrs, text, **kwargs)

>>> a_string = soup.find(text='Lacie')
>>> a_string
u'Lacie'
>>> a_string.find_parents('a')
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
>>> a_string.find_parent('p')
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
>>> a_string.find_parents('p', class_='title')
[]

find_next_siblings() and find_next_sibling()

>>> first_link = soup.a
>>> first_link
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
>>> first_link.find_next_siblings('a')
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
>>> first_stroy_paragraph = soup.find('p', 'story')
>>> first_stroy_paragraph.find_next_sibling('p')
<p class="story">...</p>

find_previous_siblings() and find_previous_sibling()

Signature: find_previous_siblings(name, attrs, text, limit, **kwargs)

Signature: find_previous_sibling(name, attrs, text, **kwargs)

>>> last_link = soup.find('a', id='link3')
>>> last_link
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
>>> last_link.find_previous_siblings('a')
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
>>> first_story_paragraph = soup.find('p', 'story')
>>> first_story_paragraph.find_previous_sibling('p')
<p class="title"><b>The Dormouse's story</b></p>

find_all_next() and find_next()

Signature: find_all_next(name, attrs, text, limit, **kwargs)

Signature: find_next(name, attrs, text, **kwargs)

>>> first_link = soup.a
>>> first_link
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
>>> first_link.find_all_next(text=True)
[u'Elsie', u',\n', u'Lacie', u' and\n', u'Tillie', u';\nand they lived at the bottom of a well.', u'\n', u'...', u'\n']
>>> first_link.find_next('p')
<p class="story">...</p>

find_all_previous() and find_previous()

>>> first_link = soup.a
>>> first_link
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
>>> first_link.find_all_previous('p')
[<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, <p class="title"><b>The Dormouse's story</b></p>]
>>> first_link.find_previous('title')
<title>The Dormouse's story</title>