學習自http://rsj217.diandian.com/post/2012-11-01/40041235132html
#/usr/bin/env python # -*- coding:utf-8 -*- import urllib2 import sys import re from bs4 import BeautifulSoup # HTML from bs4 import BeautifulStoneSoup # XML import bs4 # ALL doc = ['<html><head><title>Page title</title></head>','<body><p id="firstpara" align="center">This is paragraph <b>one</b>.','<p id="secondpara" align="blah">This is paragraph <b>two</b>.','</html>'] # BeautifulSoup 接受一個字符串參數 soup = BeautifulSoup(''.join(doc)) print type(soup) print type(soup.html) print type(soup.title.string) #BeautifulSoup文檔樹有三種基本對象 print #BeautifulSoup對象 html = soup.html print type(html) print html print #BeautifulSoup.Tag title = soup.title print type(title) print title print #BeautifulSoup.NavigableString contents = soup.contents print type(contents) print contents print #使用contents方法查看文檔樹層級結構 print len(soup.contents[0].contents) print soup.contents[0].contents[0] print soup.contents[0].contents[1] print len(soup.contents[0].contents[0]) print soup.contents[0].contents[0].contents[0] print soup.contents[0].contents[0].contents[0].contents[0] #獲取樹的子代元素,相似深度遍歷 print head = html.next print type(head) print head print title = head.next print type(title) print title print title_content = title.next print type(title_content) print title_content print body = title_content.next print type(body) print body #使用replacewith方法替換對象 print print head print head.parent head.replaceWith('head was replace') print head.parent #輸出空,由於原數據保留並被剪除 print head #沒有改變正常輸出 print soup.head #輸出空,head對象已不存在 print soup #文檔對象已經被修改 print #使用find,findAll方法進行搜索 print soup.findAll('p') print print soup.findAll('p',id='firstpara') print #傳一個屬性或多個屬性對 print soup.findAll('p',{'align':'blah'}) #使用正則表達式 print soup.findAll(id=re.compile("para$")) #讀取和修改屬性 print p1 = soup.p print p1 print p1['id'] p1['id'] = 'changeid' print p1 #已被修改 print soup #文檔對象已經被修改