# HTML document <==> tag tree <==> BeautifulSoup class
from bs4 import BeautifulSoup

# The second argument, "html.parser", names the parser Beautiful Soup should use.
soup = BeautifulSoup("<html>data</html>", "html.parser")

# Parse markup from a file. The context manager closes the file handle as soon
# as the tree has been built instead of leaving it open for the process lifetime.
with open("D://demo.html") as demo_file:
    soup2 = BeautifulSoup(demo_file, "html.parser")
<html><head><title>This is a python demo page</title></head> <body> <p class="title"><b>The demo python introduces several python courses.</b></p> <p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses: <a href="http://www.icourse163.org/course/BIT-268001" class="py1" id="link1">Basic Python</a> and <a href="http://www.icourse163.org/course/BIT-1001870001" class="py2" id="link2">Advanced Python</a>.</p> </body></html>
import requests
from bs4 import BeautifulSoup

# Download the demo page and build a tag tree from its markup.
response = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(response.text, "html.parser")

# Attribute access by tag name returns the first matching Tag object.
title_tag = soup.title
print(title_tag)        # <title>This is a python demo page</title>
print(type(title_tag))  # <class 'bs4.element.Tag'>
import requests
from bs4 import BeautifulSoup

# Download the demo page and build a tag tree from its markup.
response = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(response.text, "html.parser")

# A Tag's .name is the tag's name as a plain string.
tag_name = soup.title.name
print(tag_name)        # title
print(type(tag_name))  # <class 'str'>
import requests
from bs4 import BeautifulSoup

# Download the demo page and build a tag tree from its markup.
response = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(response.text, "html.parser")

first_link = soup.a
print(first_link)
# <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>

# .attrs exposes the tag's attributes as a dictionary.
link_attrs = first_link.attrs
print(link_attrs)
# {'href': 'http://www.icourse163.org/course/BIT-268001', 'class': ['py1'], 'id': 'link1'}
print(type(link_attrs))    # <class 'dict'>
print(link_attrs['href'])  # http://www.icourse163.org/course/BIT-268001
import requests
from bs4 import BeautifulSoup

# Download the demo page and build a tag tree from its markup.
response = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(response.text, "html.parser")

first_paragraph = soup.p
print(first_paragraph)
# <p class="title"><b>The demo python introduces several python courses.</b></p>

# .string reaches through the nested <b> tag to the text it contains.
paragraph_text = first_paragraph.string
print(paragraph_text)        # The demo python introduces several python courses.
print(type(paragraph_text))  # <class 'bs4.element.NavigableString'>
# A <b> tag holding only an HTML comment, followed by a <p> tag holding plain text.
demo = (
    "<b><!--The demo python introduces several python courses. --></b>"
    "<p>The demo python introduces several python courses.</p>"
)
soup = BeautifulSoup(demo, "html.parser")

bold_tag = soup.b
print(bold_tag)               # <b><!--The demo python introduces several python courses. --></b>
print(bold_tag.string)        # The demo python introduces several python courses.
print(type(bold_tag.string))  # <class 'bs4.element.Comment'>

paragraph_tag = soup.p
print(paragraph_tag)               # <p>The demo python introduces several python courses.</p>
print(paragraph_tag.string)        # The demo python introduces several python courses.
print(type(paragraph_tag.string))  # <class 'bs4.element.NavigableString'>

# .string prints the same text for a comment and for an ordinary string.
# As the <b> and <p> examples show, the type of the returned object is what
# distinguishes a comment from an ordinary string.
import requests
from bs4 import BeautifulSoup

# Download the demo page and build a tag tree from its markup.
response = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(response.text, "html.parser")

head_tag = soup.head
print(head_tag)           # <head><title>This is a python demo page</title></head>
# .contents is the list of the tag's direct children.
print(head_tag.contents)  # [<title>This is a python demo page</title>]
import requests
from bs4 import BeautifulSoup

# Download the demo page and build a tag tree from its markup.
response = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(response.text, "html.parser")

# The children include newline characters and strings, not only tags.
for node in soup.body.children:
    print(node)

# Output (blank lines produced by the newline children are omitted here):
# <p class="title"><b>The demo python introduces several python courses.</b></p>
# <p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
# <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>
import requests
from bs4 import BeautifulSoup

# Download the demo page and build a tag tree from its markup.
response = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(response.text, "html.parser")

# .descendants walks every node below <body>: each tag is followed by the
# tags and strings nested inside it.
for node in soup.body.descendants:
    print(node)

# Output (blank lines omitted; exact line breaks may differ):
# <p class="title"><b>The demo python introduces several python courses.</b></p>
# <b>The demo python introduces several python courses.</b>
# The demo python introduces several python courses.
# <p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
# <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>
# Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
# <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>
# Basic Python
#  and
# <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>
# Advanced Python
# .
import requests
from bs4 import BeautifulSoup

# Download the demo page and build a tag tree from its markup.
response = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(response.text, "html.parser")

# The first <a> tag sits inside the "course" paragraph, so that is its parent.
print(soup.a.parent)
# <p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
# <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>

# The BeautifulSoup object is the top of the tree and has no parent.
print(soup.parent)  # None
import requests
from bs4 import BeautifulSoup

# Download the demo page and build a tag tree from its markup.
response = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(response.text, "html.parser")

# soup itself has no parent, yet the loop below needs no None check:
# iterating .parents ends by itself once the top of the tree is reached.
print(soup.parent)
for ancestor in soup.a.parents:
    print(ancestor.name)

# Output:
# None
# p
# body
# html
# [document]
import requests
from bs4 import BeautifulSoup

# Download the demo page and build a tag tree from its markup.
response = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(response.text, "html.parser")

first_link = soup.a

# The node just before the first <a> is the paragraph's leading text.
print(first_link.previous_sibling)
# Python is a wonderful general-purpose programming language.
# You can learn Python from novice to professional by tracking the following courses:

# The node just after it is the text between the two links.
print(first_link.next_sibling)
#  and
import requests
from bs4 import BeautifulSoup

# Download the demo page and build a tag tree from its markup.
response = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(response.text, "html.parser")

first_link = soup.a

# Text nodes report None as their name, while tags report the tag name.
for node in first_link.previous_siblings:
    print(node.name)
print("===================")
for node in first_link.next_siblings:
    print(node.name)

# Output:
# None
# ===================
# None
# a
# None
import requests
from bs4 import BeautifulSoup

# Download the demo page and build a tag tree from its markup.
response = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(response.text, "html.parser")

# prettify() returns the markup with each tag and string on its own line.
pretty_markup = soup.prettify()
print(pretty_markup)

# Output (indentation shown is approximate):
# <html>
#  <head>
#   <title>
#    This is a python demo page
#   </title>
#  </head>
#  <body>
#   <p class="title">
#    <b>
#     The demo python introduces several python courses.
#    </b>
#   </p>
#   <p class="course">
#    Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
#    <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">
#     Basic Python
#    </a>
#    and
#    <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">
#     Advanced Python
#    </a>
#    .
#   </p>
#  </body>
# </html>
bs4庫將任何HTML輸入都變成utf-8編碼,Python 3.x默認支持編碼是utf-8,解析無障礙。
(一) XML
(二) JSON
(三) YAML
(四) 三種信息標記形式的比較
方法一:完整解析信息的標記形式,再提取關鍵信息。這種方法須要標記解析器,例如:bs4庫 的標籤樹遍歷。
import requests
from bs4 import BeautifulSoup

# Download the demo page and build a tag tree from its markup.
response = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(response.text, "html.parser")

# Print the target URL of every <a> tag in the document.
for anchor in soup.find_all('a'):
    print(anchor.get('href'))

# Output:
# http://www.icourse163.org/course/BIT-268001
# http://www.icourse163.org/course/BIT-1001870001
<>.find_all(name, attrs, recursive, string, **kwargs),返回一個列表類型,存儲查找結果
注:<tag>(..) 《==》 <tag>.find_all(..)
import requests
from bs4 import BeautifulSoup

# Download the demo page and build a tag tree from its markup.
response = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(response.text, "html.parser")

# The name argument may be a single tag name...
anchors = soup.find_all("a")
print(anchors)
# [<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>,
#  <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>]

# ...or a list of tag names, matching any of them.
anchors_and_bold = soup.find_all(["a", "b"])
print(anchors_and_bold)
# [<b>The demo python introduces several python courses.</b>,
#  <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>,
#  <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>]
import requests
from bs4 import BeautifulSoup

# Download the demo page and build a tag tree from its markup.
response = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(response.text, "html.parser")

# A plain string passed as attrs matches the <p> tag whose class is "course".
course_paragraphs = soup.find_all('p', attrs='course')
print(course_paragraphs)
print("====================")
# A dict passed as attrs matches on the named attribute, here id="link1".
link1_tags = soup.find_all(attrs={'id': 'link1'})
print(link1_tags)

# Output:
# [<p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
# <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>]
# ====================
# [<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>]
import requests
from bs4 import BeautifulSoup

# Download the demo page and build a tag tree from its markup.
response = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(response.text, "html.parser")

# With recursive=False only direct children are searched; the <a> tags are
# nested deeper in the tree, so nothing is found.
direct_anchors = soup.find_all('a', recursive=False)
print(direct_anchors)  # []
import requests
from bs4 import BeautifulSoup

# Download the demo page and build a tag tree from its markup.
response = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(response.text, "html.parser")

# The string argument searches text nodes instead of tags.
matching_strings = soup.find_all(string="Basic Python")
print(matching_strings)  # ['Basic Python']
<tbody class="hidden_zhpm" style="text-align: center;"> <tr class="alt"> <td>1</td> <td> <div align="left">清華大學</div> </td> <td>北京</td> <td>95.3</td> <td class="hidden-xs need-hidden indicator5">100.0</td> <td class="hidden-xs need-hidden indicator6" style="display: none;">97.50%</td> <td class="hidden-xs need-hidden indicator7" style="display: none;">1182145</td> <td class="hidden-xs need-hidden indicator8" style="display: none;">44730</td> <td class="hidden-xs need-hidden indicator9" style="display: none;">1.447</td> <td class="hidden-xs need-hidden indicator10" style="display: none;">1556</td> <td class="hidden-xs need-hidden indicator11" style="display: none;">121</td> <td class="hidden-xs need-hidden indicator12" style="display: none;">1586283</td> <td class="hidden-xs need-hidden indicator13" style="display: none;">500525</td> <td class="hidden-xs need-hidden indicator14" style="display: none;">6.90%</td> </tr> <tr class="alt"> <td>2</td> <td> <div align="left">北京大學</div> </td> <td>北京</td> <td>78.6</td> <td class="hidden-xs need-hidden indicator5">96.4</td> <td class="hidden-xs need-hidden indicator6" style="display: none;">97.39%</td> <td class="hidden-xs need-hidden indicator7" style="display: none;">665616</td> <td class="hidden-xs need-hidden indicator8" style="display: none;">43731</td> <td class="hidden-xs need-hidden indicator9" style="display: none;">1.374</td> <td class="hidden-xs need-hidden indicator10" style="display: none;">1278</td> <td class="hidden-xs need-hidden indicator11" style="display: none;">94</td> <td class="hidden-xs need-hidden indicator12" style="display: none;">480918</td> <td class="hidden-xs need-hidden indicator13" style="display: none;">4110</td> <td class="hidden-xs need-hidden indicator14" style="display: none;">6.01%</td> </tr> <tr class="alt"> <td>3</td> <td> <div align="left">浙江大學</div> </td> <td>浙江</td> <td>73.9</td> <td class="hidden-xs need-hidden indicator5">86.3</td> <td class="hidden-xs need-hidden indicator6" 
style="display: none;">96.56%</td> <td class="hidden-xs need-hidden indicator7" style="display: none;">452414</td> <td class="hidden-xs need-hidden indicator8" style="display: none;">47915</td> <td class="hidden-xs need-hidden indicator9" style="display: none;">1.131</td> <td class="hidden-xs need-hidden indicator10" style="display: none;">939</td> <td class="hidden-xs need-hidden indicator11" style="display: none;">91</td> <td class="hidden-xs need-hidden indicator12" style="display: none;">1266561</td> <td class="hidden-xs need-hidden indicator13" style="display: none;">27720</td> <td class="hidden-xs need-hidden indicator14" style="display: none;">5.18%</td> </tr> <tr class="alt"> <td>4</td> <td> <div align="left">上海交通大學</div> </td> <td>上海</td> <td>73.1</td> <td class="hidden-xs need-hidden indicator5">90.5</td> <td class="hidden-xs need-hidden indicator6" style="display: none;">98.65%</td> <td class="hidden-xs need-hidden indicator7" style="display: none;">226279</td> <td class="hidden-xs need-hidden indicator8" style="display: none;">49749</td> <td class="hidden-xs need-hidden indicator9" style="display: none;">1.176</td> <td class="hidden-xs need-hidden indicator10" style="display: none;">960</td> <td class="hidden-xs need-hidden indicator11" style="display: none;">79</td> <td class="hidden-xs need-hidden indicator12" style="display: none;">742538</td> <td class="hidden-xs need-hidden indicator13" style="display: none;">15264</td> <td class="hidden-xs need-hidden indicator14" style="display: none;">7.33%</td> </tr> <!-- 後面的內容格式徹底相同-->
# Scrape the university ranking table from the "Best Chinese Universities
# Ranking" website and print the top entries.
import bs4
import requests
from bs4 import BeautifulSoup


def getHTMLText(url, timeout=30):
    """Download a page and return its text.

    :param url: address of the page to fetch
    :param timeout: seconds to wait for the server before giving up
    :return: the page text, or an empty string when the request fails
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # Best effort: an empty string tells the caller there is nothing to parse.
        return ""
    # Decode with the encoding detected from the body itself.
    response.encoding = response.apparent_encoding
    return response.text


def fillUnivList(uList, demo):
    """Parse the ranking table and append one row per university to ``uList``.

    Each appended row is ``[rank, name, province, score]``, the text of the
    first four cells of a table row.

    :param uList: list that receives the rows (extended in place)
    :param demo: HTML text of the ranking page
    :return: None
    """
    soup = BeautifulSoup(demo, "html.parser")
    # soup.tbody would work as well.
    tbody = soup.find('tbody', attrs={'class': "hidden_zhpm"})
    if tbody is None:
        # Empty or unexpected page (for example a failed download): nothing to add.
        return
    for tr in tbody.children:
        # The children also contain whitespace strings; only tags are table rows.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr.find_all('td')
        if len(tds) < 4:
            # Not a complete data row.
            continue
        # get_text() always yields a str, even when a cell wraps its text in
        # another tag (the name cell does) or carries surrounding whitespace.
        uList.append([td.get_text(strip=True) for td in tds[:4]])


def printUnivList(uList, num):
    """Print a header line followed by at most ``num`` rows of ``uList``.

    :param uList: rows of ``[rank, name, province, score]`` strings
    :param num: number of rows to print; fewer are printed when the list is shorter
    :return: None
    """
    tplt = "{0:^10s}\t\t{1:{4}^10s}\t\t{2:{5}^10s}\t\t{3:^10s}"
    # chr(12288) is the full-width ideographic space (U+3000); it is used as the
    # fill character for the two Chinese-text columns.
    pad = chr(12288)
    print(tplt.format("排名", "學校名稱", "省份", "總分", pad, pad))
    # Slicing clamps the count to the rows actually available.
    for u in uList[:max(num, 0)]:
        print(tplt.format(u[0], u[1], u[2], u[3], pad, pad))


def inputCount():
    """Ask the user how many ranking rows to show.

    :return: the requested count as an int (0 to 600 inclusive), or None when
        the input is empty, not a number, or greater than 600
    """
    countUiv = input("輸入您但願獲得的排名數量[max:600]:")
    if not countUiv:
        print("輸入不能爲空!")
        return None
    # isdecimal() accepts exactly the digit strings that int() can convert.
    if not countUiv.isdecimal():
        print("請輸入數字!")
        return None
    count = int(countUiv)
    # The prompt advertises 600 as the maximum, so 600 itself is valid.
    if count > 600:
        print("數量超過600無效!")
        return None
    return count


def main():
    """Prompt for a count, download the ranking page and print that many rows."""
    url = "http://www.zuihaodaxue.com/zuihaodaxuepaiming2018.html"
    countUniv = inputCount()
    if countUniv is None:
        # Invalid input was already reported; skip the download.
        return
    uList = []  # holds the university ranking rows
    demo = getHTMLText(url)
    fillUnivList(uList, demo)
    printUnivList(uList, countUniv)


if __name__ == '__main__':
    main()
排名 學校名稱 省份 總分
1 清華大學 北京 95.3
2 北京大學 北京 78.6
3 浙江大學 浙江 73.9
4 上海交通大學 上海 73.1
5 復旦大學 上海 66.0
6 中國科學技術大學 安徽 61.9
7 南京大學 江蘇 59.8
8 華中科技大學 湖北 59.1
9 中山大學 廣東 58.6
10 哈爾濱工業大學 黑龍江 57.4
11 同濟大學 上海 56.4
12 武漢大學 湖北 55.5
13 東南大學 江蘇 55.3
14 西安交通大學 陝西 54.2
15 北京航空航天大學 北京 54.0
16 南開大學 天津 53.9
17 四川大學 四川 53.3
18 天津大學 天津 52.4
19 華南理工大學 廣東 51.8
20 北京師範大學 北京 51.7
21 北京理工大學 北京 51.1
22 廈門大學 福建 50.9
23 吉林大學 吉林 50.2
24 山東大學 山東 50.0
25 大連理工大學 遼寧 49.7
26 中南大學 湖南 49.5
27 蘇州大學 江蘇 48.8
28 對外經濟貿易大學 北京 47.7
29 西北工業大學 陝西 47.6
30 中國人民大學 北京 47.5
31 湖南大學 湖南 47.4
32 華東師範大學 上海 46.5
33 電子科技大學 四川 46.4
34 華東理工大學 上海 45.5
35 重慶大學 重慶 45.2
35 南京航空航天大學 江蘇 45.2
37 北京科技大學 北京 44.5
37 南京理工大學 江蘇 44.5
39 上海財經大學 上海 44.3
40 中國農業大學 北京 43.7
41 上海大學 上海 43.6
42 東北大學 遼寧 43.5
43 華中師範大學 湖北 43.3
43 南方科技大學 廣東 43.3
45 北京交通大學 北京 43.0
46 首都醫科大學 北京 42.9
47 武漢理工大學 湖北 42.8
48 北京化工大學 北京 42.4
48 北京郵電大學 北京 42.4
48 東華大學 上海 42.4
51 北京外國語大學 北京 42.1
52 天津醫科大學 天津 42.0
52 中央財經大學 北京 42.0
54 西安電子科技大學 陝西 41.9
55 南京醫科大學 江蘇 41.7
56 暨南大學 廣東 41.6
57 蘭州大學 甘肅 41.4
58 江南大學 江蘇 40.8
59 華北電力大學 北京 40.5
60 中國海洋大學 山東 40.3
61 哈爾濱工程大學 黑龍江 40.2
61 中國地質大學(武漢) 湖北 40.2
63 華中農業大學 湖北 40.1
63 南京師範大學 江蘇 40.1
65 東北師範大學 吉林 40.0
66 西南財經大學 四川 39.9
67 福州大學 福建 39.8
67 中國藥科大學 江蘇 39.8
69 中國地質大學(北京) 北京 39.7
70 上海外國語大學 上海 39.6
71 南京農業大學 江蘇 39.5
72 北京工業大學 北京 39.2
72 河海大學 江蘇 39.2
74 西南交通大學 四川 39.1
74 中國醫科大學 遼寧 39.1
76 西南大學 重慶 39.0
77 南方醫科大學 廣東 38.8
77 中南財經政法大學 湖北 38.8
79 南京信息工程大學 江蘇 38.4
80 江蘇大學 江蘇 38.3
80 中國石油大學(華東) 山東 38.3
82 合肥工業大學 安徽 38.2
83 上海中醫藥大學 上海 38.1
83 中國礦業大學 江蘇 38.1
85 浙江工業大學 浙江 38.0
86 北京中醫藥大學 北京 37.9
86 華僑大學 福建 37.9
86 西北農林科技大學 陝西 37.9
89 北京林業大學 北京 37.8
89 東北財經大學 遼寧 37.8
91 南京郵電大學 江蘇 37.7
91 深圳大學 廣東 37.7
91 中央民族大學 北京 37.7
94 南京工業大學 江蘇 37.6
94 中國政法大學 北京 37.6
96 大連醫科大學 遼寧 37.5
97 中國石油大學(北京) 北京 37.3
98 西北大學 陝西 37.2
98 中國傳媒大學 北京 37.2
100 寧波大學 浙江 36.8