1. 安裝方法
pip install pyquery
2. 引用方法
from pyquery import PyQuery as pq
3. 簡介
pyquery 是一個類似 jQuery、專供 Python 使用的 HTML 解析庫,使用方法與 bs4(BeautifulSoup)相似。
4. 使用方法
4.1 初始化方法:
from pyquery import PyQuery as pq

# Three ways to build a PyQuery document.
# A bare positional string is always parsed as markup, so a URL or a file
# path must be passed through the ``url=`` / ``filename=`` keyword instead.
html = '<div id="wrap"><p>hello</p></div>'
doc = pq(html)  # parse an HTML string
doc = pq(url="http://news.baidu.com/")  # fetch and parse a web page
doc = pq(filename="./a.html")  # parse a local HTML file
4.2 基本 CSS 選擇器
from pyquery import PyQuery as pq

# Sample markup: a wrapper div holding a list of <link> elements.
html = '''
<div id="wrap">
    <ul class="s_from">
        asdasd
        <link href="http://asda.com">asdadasdad12312</link>
        <link href="http://asda1.com">asdadasdad12312</link>
        <link href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
doc = pq(html)
# "#wrap" matches by id, ".s_from" by class, "link" by tag name; the spaces
# between them select descendants. print() is called as a function so the
# snippet runs on both Python 2 and Python 3.
print(doc("#wrap .s_from link"))
運行結果:
<link href="http://asda.com">asdadasdad12312</link>
<link href="http://asda1.com">asdadasdad12312</link>
<link href="http://asda2.com">asdadasdad12312</link>
「#」按 id 查找標籤,「.」按 class 查找標籤,「link」按標籤名查找 link 標籤;選擇器之間的空格表示查找裏層(後代)元素。
4.3 查找子元素
from pyquery import PyQuery as pq

# Sample markup: a wrapper div holding a list of <link> elements.
html = '''
<div id="wrap">
    <ul class="s_from">
        asdasd
        <link href="http://asda.com">asdadasdad12312</link>
        <link href="http://asda1.com">asdadasdad12312</link>
        <link href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
# Find child elements
doc = pq(html)
items = doc("#wrap")
print(items)
# A selection is itself a PyQuery object, so it can be queried again.
print("類型爲:%s" % type(items))
# find() takes a selector and looks inside the current selection.
link = items.find('.s_from')
print(link)
# children() returns the elements directly below the current selection.
link = items.children()
print(link)
運行結果:
<div id="wrap">
    <ul class="s_from">
        asdasd
        <link href="http://asda.com">asdadasdad12312</link>
        <link href="http://asda1.com">asdadasdad12312</link>
        <link href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
類型爲:<class 'pyquery.pyquery.PyQuery'>
<ul class="s_from">
        asdasd
        <link href="http://asda.com">asdadasdad12312</link>
        <link href="http://asda1.com">asdadasdad12312</link>
        <link href="http://asda2.com">asdadasdad12312</link>
    </ul>
<ul class="s_from">
        asdasd
        <link href="http://asda.com">asdadasdad12312</link>
        <link href="http://asda1.com">asdadasdad12312</link>
        <link href="http://asda2.com">asdadasdad12312</link>
    </ul>
根據運行結果可以發現,返回結果的類型爲 PyQuery,而且 find 方法和 children 方法都可以獲取裏層標籤。
4.4查找父元素
from pyquery import PyQuery as pq

# Sample markup: a div wrapping some text and a list of <link> elements.
html = '''
<div href="wrap">
    hello nihao
    <ul class="s_from">
        asdasd
        <link href="http://asda.com">asdadasdad12312</link>
        <link href="http://asda1.com">asdadasdad12312</link>
        <link href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
doc = pq(html)
items = doc(".s_from")
print(items)
# Find the parent element: parent() returns the element that directly
# encloses the current selection.
parent_href = items.parent()
print(parent_href)
運行結果:
<ul class="s_from">
        asdasd
        <link href="http://asda.com">asdadasdad12312</link>
        <link href="http://asda1.com">asdadasdad12312</link>
        <link href="http://asda2.com">asdadasdad12312</link>
    </ul>
<div href="wrap">
    hello nihao
    <ul class="s_from">
        asdasd
        <link href="http://asda.com">asdadasdad12312</link>
        <link href="http://asda1.com">asdadasdad12312</link>
        <link href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
parent 可以查找出直接外層(父)標籤及其包含的內容;與之相似的還有 parents,可以獲取全部外層(祖先)節點。
4.5 查找兄弟元素
from pyquery import PyQuery as pq

# Sample markup: three <link> elements, each with different classes.
html = '''
<div href="wrap">
    hello nihao
    <ul class="s_from">
        asdasd
        <link class='active1 a123' href="http://asda.com">asdadasdad12312</link>
        <link class='active2' href="http://asda1.com">asdadasdad12312</link>
        <link class='movie1' href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
doc = pq(html)
# Chained class selectors with no space: a <link> carrying both classes.
items = doc("link.active1.a123")
print(items)
# Find sibling elements: the other elements at the same level.
siblings_href = items.siblings()
print(siblings_href)
運行結果:
<link class="active1 a123" href="http://asda.com">asdadasdad12312</link>
<link class="active2" href="http://asda1.com">asdadasdad12312</link>
<link class="movie1" href="http://asda2.com">asdadasdad12312</link>
根據運行結果能夠看出,siblings 返回了同級的其餘標籤
結論:子元素查找、父元素查找、兄弟元素查找,這些方法返回的結果都是 PyQuery 類型,可以針對結果再次進行選擇。
4.6 遍歷查找結果
from pyquery import PyQuery as pq

# Sample markup: three <link> elements, each with different classes.
html = '''
<div href="wrap">
    hello nihao
    <ul class="s_from">
        asdasd
        <link class='active1 a123' href="http://asda.com">asdadasdad12312</link>
        <link class='active2' href="http://asda1.com">asdadasdad12312</link>
        <link class='movie1' href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
doc = pq(html)
# items() yields each matched element as its own PyQuery object.
its = doc("link").items()
for it in its:
    print(it)
運行結果:
<link class="active1 a123" href="http://asda.com">asdadasdad12312</link>
<link class="active2" href="http://asda1.com">asdadasdad12312</link>
<link class="movie1" href="http://asda2.com">asdadasdad12312</link>
4.7獲取屬性信息
from pyquery import PyQuery as pq

# Sample markup: three <link> elements, each with its own href.
html = '''
<div href="wrap">
    hello nihao
    <ul class="s_from">
        asdasd
        <link class='active1 a123' href="http://asda.com">asdadasdad12312</link>
        <link class='active2' href="http://asda1.com">asdadasdad12312</link>
        <link class='movie1' href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
doc = pq(html)
its = doc("link").items()
for it in its:
    # Two equivalent ways to read an attribute: call form and dotted form.
    print(it.attr('href'))
    print(it.attr.href)
運行結果:
http://asda.com
http://asda.com
http://asda1.com
http://asda1.com
http://asda2.com
http://asda2.com
4.8 獲取文本
from pyquery import PyQuery as pq

# Sample markup: three <link> elements with plain text content.
html = '''
<div href="wrap">
    hello nihao
    <ul class="s_from">
        asdasd
        <link class='active1 a123' href="http://asda.com">asdadasdad12312</link>
        <link class='active2' href="http://asda1.com">asdadasdad12312</link>
        <link class='movie1' href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
doc = pq(html)
its = doc("link").items()
for it in its:
    # text() returns the text content of the element.
    print(it.text())
運行結果
asdadasdad12312
asdadasdad12312
asdadasdad12312
4.9 獲取 HTML信息
from pyquery import PyQuery as pq

# Sample markup: the first <link> wraps its text in a nested <a> element.
html = '''
<div href="wrap">
    hello nihao
    <ul class="s_from">
        asdasd
        <link class='active1 a123' href="http://asda.com"><a>asdadasdad12312</a></link>
        <link class='active2' href="http://asda1.com">asdadasdad12312</link>
        <link class='movie1' href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
doc = pq(html)
its = doc("link").items()
for it in its:
    # html() returns the inner markup, so the nested <a> tag is kept.
    print(it.html())
運行結果:
<a>asdadasdad12312</a>
asdadasdad12312
asdadasdad12312
5. 常用 DOM 操作
5.1 addClass removeClass
添加、移除標籤的 class
from pyquery import PyQuery as pq

# Sample markup: only the first <link> already carries the class "active1".
html = '''
<div href="wrap">
    hello nihao
    <ul class="s_from">
        asdasd
        <link class='active1 a123' href="http://asda.com"><a>asdadasdad12312</a></link>
        <link class='active2' href="http://asda1.com">asdadasdad12312</link>
        <link class='movie1' href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
doc = pq(html)
its = doc("link").items()
for it in its:
    # addClass() does not duplicate a class the element already has.
    print("添加:%s" % it.addClass('active1'))
    print("移除:%s" % it.removeClass('active1'))
運行結果
添加:<link class="active1 a123" href="http://asda.com"><a>asdadasdad12312</a></link>
移除:<link class="a123" href="http://asda.com"><a>asdadasdad12312</a></link>
添加:<link class="active2 active1" href="http://asda1.com">asdadasdad12312</link>
移除:<link class="active2" href="http://asda1.com">asdadasdad12312</link>
添加:<link class="movie1 active1" href="http://asda2.com">asdadasdad12312</link>
移除:<link class="movie1" href="http://asda2.com">asdadasdad12312</link>
需要注意的是,已經存在的 class 不會重複添加。
5.2 attr css
attr 用於獲取/修改屬性;css 用於添加 style 屬性。
from pyquery import PyQuery as pq

# Sample markup: three <link> elements, each with different classes.
html = '''
<div href="wrap">
    hello nihao
    <ul class="s_from">
        asdasd
        <link class='active1 a123' href="http://asda.com"><a>asdadasdad12312</a></link>
        <link class='active2' href="http://asda1.com">asdadasdad12312</link>
        <link class='movie1' href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
doc = pq(html)
its = doc("link").items()
for it in its:
    # attr(name, value) overwrites the attribute on the element itself.
    print("修改:%s" % it.attr('class', 'active'))
    # css(prop, value) writes the property into the element's style attribute.
    print("添加:%s" % it.css('font-size', '14px'))
運行結果
C:\Python27\python.exe D:/test_his/test_re_1.py
修改:<link class="active" href="http://asda.com"><a>asdadasdad12312</a></link>
添加:<link class="active" href="http://asda.com" style="font-size: 14px"><a>asdadasdad12312</a></link>
修改:<link class="active" href="http://asda1.com">asdadasdad12312</link>
添加:<link class="active" href="http://asda1.com" style="font-size: 14px">asdadasdad12312</link>
修改:<link class="active" href="http://asda2.com">asdadasdad12312</link>
添加:<link class="active" href="http://asda2.com" style="font-size: 14px">asdadasdad12312</link>
attr、css 操作會直接修改對象本身。
5.3 remove
remove 移除標籤
from pyquery import PyQuery as pq

# Sample markup: a div holding its own text plus a <ul> of links.
html = '''
<div href="wrap">
    hello nihao
    <ul class="s_from">
        asdasd
        <link class='active1 a123' href="http://asda.com"><a>asdadasdad12312</a></link>
        <link class='active2' href="http://asda1.com">asdadasdad12312</link>
        <link class='movie1' href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
doc = pq(html)
its = doc("div")
print('移除前獲取文本結果:\n%s' % its.text())
# remove('ul') drops the matching <ul>, leaving only the div's own text.
it = its.remove('ul')
print('移除後獲取文本結果:\n%s' % it.text())
運行結果
移除前獲取文本結果:
hello nihao
asdasd
asdadasdad12312
asdadasdad12312
asdadasdad12312
移除後獲取文本結果:
hello nihao
其餘DOM方法參考:
http://pyquery.readthedocs.io/en/latest/api.html
6.僞類選擇器
from pyquery import PyQuery as pq

# Sample markup: the first <link> contains the text "hello" in a nested <a>.
html = '''
<div href="wrap">
    hello nihao
    <ul class="s_from">
        asdasd
        <link class='active1 a123' href="http://asda.com"><a>helloasdadasdad12312</a></link>
        <link class='active2' href="http://asda1.com">asdadasdad12312</link>
        <link class='movie1' href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
doc = pq(html)

# First child.
its = doc("link:first-child")
print('第一個標籤:%s' % its)

# Last child.
its = doc("link:last-child")
print('最後一個標籤:%s' % its)

# Second child (nth-child counts from 1).
its = doc("link:nth-child(2)")
print('第二個標籤:%s' % its)

# Elements whose index is greater than 0 (the index is zero-based).
its = doc("link:gt(0)")
print("獲取0之後的標籤:%s" % its)

# Odd-numbered children.
its = doc("link:nth-child(2n-1)")
print("獲取奇數標籤:%s" % its)

# Elements whose text contains "hello".
its = doc("link:contains('hello')")
print("獲取文本包含hello的標籤:%s" % its)
運行結果
第一個標籤:<link class="active1 a123" href="http://asda.com"><a>helloasdadasdad12312</a></link>
最後一個標籤:<link class="movie1" href="http://asda2.com">asdadasdad12312</link>
第二個標籤:<link class="active2" href="http://asda1.com">asdadasdad12312</link>
獲取0之後的標籤:<link class="active2" href="http://asda1.com">asdadasdad12312</link>
<link class="movie1" href="http://asda2.com">asdadasdad12312</link>
獲取奇數標籤:<link class="active1 a123" href="http://asda.com"><a>helloasdadasdad12312</a></link>
<link class="movie1" href="http://asda2.com">asdadasdad12312</link>
獲取文本包含hello的標籤:<link class="active1 a123" href="http://asda.com"><a>helloasdadasdad12312</a></link>
更多 CSS 選擇器可以查看: