python pyquery 基本用法

1.安裝方法

pip install pyquery

2.引用方法

from pyquery import PyQuery as pq

3.簡介

 pyquery 是類似 jquery 的一個專供 python 使用的 html 解析庫,使用方法類似 bs4。

4.使用方法

  4.1 初始化方法:

from pyquery import PyQuery as pq
doc =pq(html) #解析html字符串
doc =pq("http://news.baidu.com/") #解析網頁
doc =pq(filename="./a.html") #解析html 文件(必須使用 filename 關鍵字參數,否則字符串會被當作html內容解析)

      4.2 基本CSS選擇器

from pyquery import PyQuery as pq
html = '''
    <div id="wrap">
        <ul class="s_from">
            asdasd
            <link href="http://asda.com">asdadasdad12312</link>
            <link href="http://asda1.com">asdadasdad12312</link>
            <link href="http://asda2.com">asdadasdad12312</link>
        </ul>
    </div>
'''
doc = pq(html)
print doc("#wrap .s_from link")

  運行結果:

<link href="http://asda.com">asdadasdad12312</link>
            <link href="http://asda1.com">asdadasdad12312</link>
            <link href="http://asda2.com">asdadasdad12312</link>

  #是查找id的標籤  .是查找class 的標籤  link 是查找link 標籤 中間的空格表示裏層對象

  4.3 查找子元素

from pyquery import PyQuery as pq
html = '''
    <div id="wrap">
        <ul class="s_from">
            asdasd
            <link href="http://asda.com">asdadasdad12312</link>
            <link href="http://asda1.com">asdadasdad12312</link>
            <link href="http://asda2.com">asdadasdad12312</link>
        </ul>
    </div>
'''
#查找子元素
doc = pq(html)
items=doc("#wrap")
print(items)
print("類型爲:%s"%type(items))
link = items.find('.s_from')
print(link)
link = items.children()
print(link)

  運行結果:

<div id="wrap">
        <ul class="s_from">
            asdasd
            <link href="http://asda.com">asdadasdad12312</link>
            <link href="http://asda1.com">asdadasdad12312</link>
            <link href="http://asda2.com">asdadasdad12312</link>
        </ul>
    </div>
類型爲:<class 'pyquery.pyquery.PyQuery'>
<ul class="s_from">
            asdasd
            <link href="http://asda.com">asdadasdad12312</link>
            <link href="http://asda1.com">asdadasdad12312</link>
            <link href="http://asda2.com">asdadasdad12312</link>
        </ul>
    
<ul class="s_from">
            asdasd
            <link href="http://asda.com">asdadasdad12312</link>
            <link href="http://asda1.com">asdadasdad12312</link>
            <link href="http://asda2.com">asdadasdad12312</link>
        </ul>

  根據運行結果可以發現返回結果類型爲pyquery,而且find方法和children方法都可以獲取裏層標籤(find 會查找所有後代節點,children 只查找直接子節點)

  4.4查找父元素

from pyquery import PyQuery as pq
html = '''
    <div href="wrap">
        hello nihao
        <ul class="s_from">
            asdasd
            <link href="http://asda.com">asdadasdad12312</link>
            <link href="http://asda1.com">asdadasdad12312</link>
            <link href="http://asda2.com">asdadasdad12312</link>
        </ul>
    </div>
'''

doc = pq(html)
items=doc(".s_from")
print(items)
#查找父元素
parent_href=items.parent()
print(parent_href)

  運行結果:

<ul class="s_from">
            asdasd
            <link href="http://asda.com">asdadasdad12312</link>
            <link href="http://asda1.com">asdadasdad12312</link>
            <link href="http://asda2.com">asdadasdad12312</link>
        </ul>
    
<div href="wrap">
        hello nihao
        <ul class="s_from">
            asdasd
            <link href="http://asda.com">asdadasdad12312</link>
            <link href="http://asda1.com">asdadasdad12312</link>
            <link href="http://asda2.com">asdadasdad12312</link>
        </ul>
    </div>

  parent可以查找出外層標籤包括的內容,與之類似的還有parents,可以獲取所有外層節點

  4.5 查找兄弟元素

from pyquery import PyQuery as pq
html = '''
    <div href="wrap">
        hello nihao
        <ul class="s_from">
            asdasd
            <link class='active1 a123' href="http://asda.com">asdadasdad12312</link>
            <link class='active2' href="http://asda1.com">asdadasdad12312</link>
            <link class='movie1' href="http://asda2.com">asdadasdad12312</link>
        </ul>
    </div>
'''

doc = pq(html)
items=doc("link.active1.a123")
print(items)
#查找兄弟元素
siblings_href=items.siblings()
print(siblings_href)

  運行結果:

<link class="active1 a123" href="http://asda.com">asdadasdad12312</link>
            
<link class="active2" href="http://asda1.com">asdadasdad12312</link>
            <link class="movie1" href="http://asda2.com">asdadasdad12312</link>

  根據運行結果可以看出,siblings 返回了同級的其他標籤

  結論:子元素查找,父元素查找,兄弟元素查找,這些方法返回的結果類型都是pyquery類型,可以針對結果再次進行選擇

  4.6 遍歷查找結果

from pyquery import PyQuery as pq
html = '''
    <div href="wrap">
        hello nihao
        <ul class="s_from">
            asdasd
            <link class='active1 a123' href="http://asda.com">asdadasdad12312</link>
            <link class='active2' href="http://asda1.com">asdadasdad12312</link>
            <link class='movie1' href="http://asda2.com">asdadasdad12312</link>
        </ul>
    </div>
'''

doc = pq(html)
its=doc("link").items()
for it in its:
    print(it)

  運行結果:

<link class="active1 a123" href="http://asda.com">asdadasdad12312</link>
            
<link class="active2" href="http://asda1.com">asdadasdad12312</link>
            
<link class="movie1" href="http://asda2.com">asdadasdad12312</link>

  4.7獲取屬性信息

from pyquery import PyQuery as pq
html = '''
    <div href="wrap">
        hello nihao
        <ul class="s_from">
            asdasd
            <link class='active1 a123' href="http://asda.com">asdadasdad12312</link>
            <link class='active2' href="http://asda1.com">asdadasdad12312</link>
            <link class='movie1' href="http://asda2.com">asdadasdad12312</link>
        </ul>
    </div>
'''

doc = pq(html)
its=doc("link").items()
for it in its:
    print(it.attr('href'))
    print(it.attr.href)

  運行結果:

http://asda.com
http://asda.com
http://asda1.com
http://asda1.com
http://asda2.com
http://asda2.com

  4.8 獲取文本

from pyquery import PyQuery as pq
html = '''
    <div href="wrap">
        hello nihao
        <ul class="s_from">
            asdasd
            <link class='active1 a123' href="http://asda.com">asdadasdad12312</link>
            <link class='active2' href="http://asda1.com">asdadasdad12312</link>
            <link class='movie1' href="http://asda2.com">asdadasdad12312</link>
        </ul>
    </div>
'''

doc = pq(html)
its=doc("link").items()
for it in its:
    print(it.text())

  運行結果

asdadasdad12312
asdadasdad12312
asdadasdad12312

  4.9 獲取 HTML信息

from pyquery import PyQuery as pq
html = '''
    <div href="wrap">
        hello nihao
        <ul class="s_from">
            asdasd
            <link class='active1 a123' href="http://asda.com"><a>asdadasdad12312</a></link>
            <link class='active2' href="http://asda1.com">asdadasdad12312</link>
            <link class='movie1' href="http://asda2.com">asdadasdad12312</link>
        </ul>
    </div>
'''

doc = pq(html)
its=doc("link").items()
for it in its:
    print(it.html())

  運行結果:

<a>asdadasdad12312</a>
asdadasdad12312
asdadasdad12312

 

5.常用DOM操作

  5.1 addClass removeClass

  添加,移除class標籤

from pyquery import PyQuery as pq
html = '''
    <div href="wrap">
        hello nihao
        <ul class="s_from">
            asdasd
            <link class='active1 a123' href="http://asda.com"><a>asdadasdad12312</a></link>
            <link class='active2' href="http://asda1.com">asdadasdad12312</link>
            <link class='movie1' href="http://asda2.com">asdadasdad12312</link>
        </ul>
    </div>
'''

doc = pq(html)
its=doc("link").items()
for it in its:
    print("添加:%s"%it.addClass('active1'))
    print("移除:%s"%it.removeClass('active1'))

  運行結果

添加:<link class="active1 a123" href="http://asda.com"><a>asdadasdad12312</a></link>
            
移除:<link class="a123" href="http://asda.com"><a>asdadasdad12312</a></link>
            
添加:<link class="active2 active1" href="http://asda1.com">asdadasdad12312</link>
            
移除:<link class="active2" href="http://asda1.com">asdadasdad12312</link>
            
添加:<link class="movie1 active1" href="http://asda2.com">asdadasdad12312</link>
        
移除:<link class="movie1" href="http://asda2.com">asdadasdad12312</link>

  需要注意的是已經存在的class不會重複添加

  5.2 attr css

  attr 爲獲取/修改屬性 css 添加style屬性

from pyquery import PyQuery as pq
html = '''
    <div href="wrap">
        hello nihao
        <ul class="s_from">
            asdasd
            <link class='active1 a123' href="http://asda.com"><a>asdadasdad12312</a></link>
            <link class='active2' href="http://asda1.com">asdadasdad12312</link>
            <link class='movie1' href="http://asda2.com">asdadasdad12312</link>
        </ul>
    </div>
'''

doc = pq(html)
its=doc("link").items()
for it in its:
    print("修改:%s"%it.attr('class','active'))
    print("添加:%s"%it.css('font-size','14px'))

  運行結果

C:\Python27\python.exe D:/test_his/test_re_1.py
修改:<link class="active" href="http://asda.com"><a>asdadasdad12312</a></link>
            
添加:<link class="active" href="http://asda.com" style="font-size: 14px"><a>asdadasdad12312</a></link>
            
修改:<link class="active" href="http://asda1.com">asdadasdad12312</link>
            
添加:<link class="active" href="http://asda1.com" style="font-size: 14px">asdadasdad12312</link>
            
修改:<link class="active" href="http://asda2.com">asdadasdad12312</link>
        
添加:<link class="active" href="http://asda2.com" style="font-size: 14px">asdadasdad12312</link>

  attr、css 操作會直接修改對象本身

  5.3 remove

  remove 移除標籤

from pyquery import PyQuery as pq
html = '''
    <div href="wrap">
        hello nihao
        <ul class="s_from">
            asdasd
            <link class='active1 a123' href="http://asda.com"><a>asdadasdad12312</a></link>
            <link class='active2' href="http://asda1.com">asdadasdad12312</link>
            <link class='movie1' href="http://asda2.com">asdadasdad12312</link>
        </ul>
    </div>
'''

doc = pq(html)
its=doc("div")
print('移除前獲取文本結果:\n%s'%its.text())
it=its.remove('ul')
print('移除後獲取文本結果:\n%s'%it.text())

  運行結果

移除前獲取文本結果:
hello nihao
asdasd
asdadasdad12312
asdadasdad12312
asdadasdad12312
移除後獲取文本結果:
hello nihao

  其餘DOM方法參考:

  http://pyquery.readthedocs.io/en/latest/api.html

6.僞類選擇器

 

from pyquery import PyQuery as pq
html = '''
    <div href="wrap">
        hello nihao
        <ul class="s_from">
            asdasd
            <link class='active1 a123' href="http://asda.com"><a>helloasdadasdad12312</a></link>
            <link class='active2' href="http://asda1.com">asdadasdad12312</link>
            <link class='movie1' href="http://asda2.com">asdadasdad12312</link>
        </ul>
    </div>
'''

doc = pq(html)
its=doc("link:first-child")
print('第一個標籤:%s'%its)
its=doc("link:last-child")
print('最後一個標籤:%s'%its)
its=doc("link:nth-child(2)")
print('第二個標籤:%s'%its)
its=doc("link:gt(0)") #從零開始
print("獲取0之後的標籤:%s"%its)
its=doc("link:nth-child(2n-1)")
print("獲取奇數標籤:%s"%its)
its=doc("link:contains('hello')")
print("獲取文本包含hello的標籤:%s"%its)

 

  運行結果

第一個標籤:<link class="active1 a123" href="http://asda.com"><a>helloasdadasdad12312</a></link>
            
最後一個標籤:<link class="movie1" href="http://asda2.com">asdadasdad12312</link>
        
第二個標籤:<link class="active2" href="http://asda1.com">asdadasdad12312</link>
            
獲取0之後的標籤:<link class="active2" href="http://asda1.com">asdadasdad12312</link>
            <link class="movie1" href="http://asda2.com">asdadasdad12312</link>
        
獲取奇數標籤:<link class="active1 a123" href="http://asda.com"><a>helloasdadasdad12312</a></link>
            <link class="movie1" href="http://asda2.com">asdadasdad12312</link>
        
獲取文本包含hello的標籤:<link class="active1 a123" href="http://asda.com"><a>helloasdadasdad12312</a></link>

  更多css選擇器可以查看:

  http://www.w3school.com.cn/css/index.asp

相關文章
相關標籤/搜索