regexcss
靈活方便的網頁解析庫,高效,支持多種解析器。利用BeautifulSoup不用編寫正則表達式便可方便地實現網頁信息的提取。
解析器 | 使用方法 | 優點 | 劣勢 |
---|---|---|---|
python標準庫 | BeautifulSoup(markup, 'html.parser') | python內置標準庫,執行速度適中,文檔容錯能力強 | python2.7.3或3.2.2前的版本中文檔容錯能力差 |
lxml html解析 | BeautifulSoup(markup, 'lxml') | 速度快,文檔容錯能力強 | 須要安裝c語言庫 |
lxml xml解析 | BeautifulSoup(markup, 'xml') | 速度快,惟一支持xml的解析器 | 須要安裝c語言庫 |
html5lib | BeautifulSoup(markup, 'html5lib') | 最好的容錯性,以瀏覽器的方式解析文檔、生成HTML5格式的文檔 | 速度慢,須要安裝外部Python依賴庫 |
html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs
# Parse the sample document with the lxml backend.
soup = bs(html, "lxml")
# prettify() returns the parsed tree re-serialised with indentation.
print(soup.prettify())
# .string is an attribute, not a method; calling it raises TypeError.
print(soup.title.string)
複製代碼
html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, features='lxml')
# Accessing a tag name as an attribute yields the first tag with that name.
title_tag = soup.title
for shown in (title_tag, type(title_tag), soup.head, soup.p):
    print(shown)
複製代碼
html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, features='lxml')
# .name is the tag's own name.
title_tag = soup.title
print(title_tag.name)
複製代碼
html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
# Tag.attrs is the attribute dictionary (the original spelled it "atrs").
print(soup.p.attrs['name'])
# Indexing the tag directly is shorthand for the same lookup.
print(soup.p['name'])
複製代碼
# Sample document for the tag-content example.  The id="link2", id="link3"
# and final class="story" attributes were missing their closing quotes,
# which made the parser swallow the following markup into the attribute value.
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<p class="title" name="dromouse"> <b> The Dirmouse's story</b></p>
<p class="story"> Onece upon a time there were three little sisters; and their names were
<a href = "http://example.com/elsio" class = "sister" id="link1"></a>
<a href = "http://example.com/elsio" class = "sister" id="link2">Lacle</a>
<a href = "http://example.com/elsio" class = "sister" id="link3">Tittle</a>
and they lived at the bottom of a well.</p>
<p class="story"> ...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
# Print the .string of the first <p> tag.
print(soup.p.string)
複製代碼
html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
# Key point: tag lookups can be chained to walk down through nested nodes.
# (The original used a "//" comment, which is a syntax error in Python.)
print(soup.head.title.string)
複製代碼
html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, features='lxml')
# .contents is the list of the first <p> tag's direct children.
first_paragraph = soup.p
print(first_paragraph.contents)
複製代碼
html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
# .children is an iterator, so printing it shows the iterator object itself.
print(soup.p.children)
# Walk every direct child node of the first <p>.
for i, child in enumerate(soup.p.children):
    print(i, child)
複製代碼
html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
# .descendants is a generator, so printing it shows the generator object itself.
print(soup.p.descendants)
# Walk every descendant node (children, grandchildren, ...) of the first <p>.
for i, child in enumerate(soup.p.descendants):
    print(i, child)
複製代碼
html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
# .parent is the direct parent node of the first <a> tag.
print(soup.a.parent)
複製代碼
html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, features='lxml')
# .parents yields each ancestor of the first <a> tag in turn.
first_link = soup.a
numbered_ancestors = list(enumerate(first_link.parents))
print(numbered_ancestors)
複製代碼
html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, features='lxml')
# Following siblings first, then preceding siblings, of the first <a> tag.
first_link = soup.a
for siblings in (first_link.next_siblings, first_link.previous_siblings):
    print(list(enumerate(siblings)))
複製代碼
find_all(name, attrs, recursive, text, **kwargs)
可根據標籤名、屬性、內容查找文檔
html = """ <html><head><title>The Dommouse's story</title></head> <body> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ui class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ui> </div> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, features='lxml')
# find_all() returns every tag matching the given name.
unordered_lists = soup.find_all('ul')
print(unordered_lists)
複製代碼
html = """ <html><head><title>The Dommouse's story</title></head> <body> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ui class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ui> </div> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
# find_all() can be called again on each result to search inside it.
# The original searched for 'ul' inside each <ul>, which always printed [];
# the list items are what the nested search is meant to show.
for ul in soup.find_all('ul'):
    print(ul.find_all('li'))
複製代碼
html = """ <html><head><title>The Dommouse's story</title></head> <body> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ui class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ui> </div> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, features='lxml')
# Search by attribute dictionary, one filter at a time.
for wanted in ({'id': 'list-1'}, {'name': 'elements'}):
    print(soup.find_all(attrs=wanted))
複製代碼
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ui class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ui>
</div>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
# Attributes can be passed as keyword arguments.
print(soup.find_all(id='list-1'))
# "class" is a reserved word in Python, so bs4 spells the filter "class_".
print(soup.find_all(class_='element'))
複製代碼
find(name, attrs, recursive, text, **kwargs)
find返回單個元素,find_all返回全部的元素
html = """ <html><head><title>The Dommouse's story</title></head> <body> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ui class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ui> </div> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, features='lxml')
# find() returns a single result for each tag name searched.
for tag_name in ('ul', 'page'):
    print(soup.find(tag_name))
複製代碼
find_parents() find_parent()
find_next_siblings() find_next_sibling()
find_previous_siblings() find_previous_sibling()
find_all_next() find_next()
find_all_previous() 和 find_previous()
通過select()直接傳入css選擇器便可完成選擇
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ui class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ui>
</div>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, features='lxml')
# One CSS selector per query: by class, by nested tag, by id plus class.
for selector in ('.panel-body', 'ul li', '#list-2 .element'):
    print(soup.select(selector))
複製代碼
html = """ <html><head><title>The Dommouse's story</title></head> <body> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ui class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ui> </div> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
# select() can be called again on each result to search inside it.
# (The loop body had lost its indentation, which is a syntax error.)
for ul in soup.select('ul'):
    print(ul.select('li'))
複製代碼
html = """ <html><head><title>The Dommouse's story</title></head> <body> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ui class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ui> </div> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
# Read the id attribute of every selected <ul>, in both supported ways.
# The loop variable was named "ui" while the body used "ul" (NameError).
for ul in soup.select('ul'):
    print(ul['id'])
    print(ul.attrs['id'])
複製代碼
html = """ <html><head><title>The Dommouse's story</title></head> <body> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ui class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ui> </div> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
# Print the text of every selected <li>.
# (The loop body had lost its indentation, which is a syntax error.)
for li in soup.select("li"):
    print(li.get_text())
複製代碼
強大又靈活的網頁解析庫,如果熟悉jQuery的話,可以很快接入pyquery。不用寫麻煩的正則表達式。
pip install pyquery
html=""" <div> <ul> <li class="item-0">first item</li> <li class='item-1'><a href="link2.html">second item</a></li> <li class='item-0 active'><a href="link3.html"><span class="bold">third item</span></a></li> <li class='item-1 active'>< a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> """
from pyquery import PyQuery as pq
# Build the document from the HTML string, then select with a CSS selector.
doc = pq(html)
list_items = doc("li")
print(list_items)
複製代碼
from pyquery import PyQuery as pq
# Build the document from a URL.
doc = pq(url='http://www.baidu.com')
head_section = doc('head')
print(head_section)
複製代碼
from pyquery import PyQuery as pq
# Build the document from a local file.
doc = pq(filename="demo.html")
list_items = doc('li')
print(list_items)
複製代碼
html=""" <div id="container"> <ul> <li class="item-0">first item</li> <li class='item-1'><a href="link2.html">second item</a></li> <li class='item-0 active'><a href="link3.html"><span class="bold">third item</span></a></li> <li class='item-1 active'>< a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> """
# The class is spelled PyQuery; "PyQeury" raised ImportError.
from pyquery import PyQuery as pq
doc = pq(html)
# Descendant selector: id, then tag, then class.
print(doc("#container ul .item-0"))
複製代碼
find函數 find("li")
children()孩子節點
parent() 父元素
parents() 祖先節點
siblings() 全部兄弟元素
items() 全部元素
attr(name) 屬性
text() 文本
html() 獲取html內容
addClass(name) 添加css class
removeClass(name) 移除css class
attr("name", "link") 修改屬性值
css("font-size", "14px") 設置css值
item.remove() 移除元素
pyquery.readthedocs.io
自動化測試工具,支持多種瀏覽器,能夠驅動瀏覽器進行一系列的操作。爬蟲中主要用來解決JavaScript渲染問題。
pip install selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
browser = webdriver.Chrome()
try:
    # The original URL ended in ".con", which does not resolve.
    browser.get("https://www.baidu.com")
    # find_element(By.ID, ...) works on Selenium 3 and 4; the
    # find_element_by_* helpers were removed in Selenium 4.
    # Named search_box so the builtin input() is not shadowed.
    search_box = browser.find_element(By.ID, "kw")
    search_box.send_keys("Python")
    search_box.send_keys(Keys.ENTER)
    # Wait up to 10 seconds for the results container to appear in the DOM.
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    print(browser.current_url)
    print(browser.get_cookies())
    print(browser.page_source)
finally:
    # Always release the browser window, even if a step above fails.
    browser.close()
複製代碼
from selenium import webdriver
# One constructor per supported browser; each line rebinds `browser`,
# so in practice keep only the one you need.
browser = webdriver.Chrome()
browser = webdriver.Firefox()
browser = webdriver.Edge()
# Spelled PhantomJS (the original "PhantonJS" raised AttributeError).
# NOTE: PhantomJS support was removed in Selenium 4; this line needs Selenium 3.
browser = webdriver.PhantomJS()
browser = webdriver.Safari()
複製代碼
from selenium import webdriver
# Open the page, dump the rendered HTML, then close the window.
browser = webdriver.Chrome()
browser.get("https://www.taobao.com")
rendered_html = browser.page_source
print(rendered_html)
browser.close()
複製代碼
單個元素
from selenium import webdriver
browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
# Three equivalent ways to locate the same search box.
# NOTE: the find_element_by_* helpers were removed in Selenium 4;
# use find_element(By.ID, 'q') etc. there.
input_first = browser.find_element_by_id('q')
input_second = browser.find_element_by_css_selector('#q')
input_third = browser.find_element_by_xpath('//*[@id="q"]')
# The original printed the undefined name "inpout_firsta" (NameError).
print(input_first, input_second, input_third)
# Close the window, as the sibling examples do.
browser.close()
複製代碼
from selenium import webdriver
from selenium.webdriver.common.by import By
# Generic lookup: the locator strategy and its value are passed explicitly.
browser = webdriver.Chrome()
browser.get("https://www.taobao.com")
search_input = browser.find_element(by=By.ID, value='q')
print(search_input)
browser.close()
複製代碼
find_elements_by_css_selector
find_elements
from selenium import webdriver
import time
browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
# The original never located the search box, so send_keys() was called on
# the builtin input() function.  Look the element up first.
# NOTE: the find_element_by_* helpers were removed in Selenium 4.
search_box = browser.find_element_by_id('q')
search_box.send_keys("iPhone")
time.sleep(1)
# Clear the field before typing the second query.
search_box.clear()
search_box.send_keys('iPad')
button = browser.find_element_by_class_name('btn-search')
button.click()
複製代碼
更多操作: selenium-python.readthedocs.io/api.html
將動做附加到動做鏈中串行執行
from selenium import webdriver
from selenium.webdriver import ActionChains
browser = webdriver.Chrome()
# NOTE(review): the URL literal was truncated in the source ('http://www.r),
# leaving an unterminated string.  The value below is a reconstruction of a
# drag-and-drop demo page matching the selectors used here; confirm it.
url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
browser.get(url)
# switch_to.frame() replaces the deprecated switch_to_frame().
browser.switch_to.frame('iframeResult')
source = browser.find_element_by_css_selector('#draggable')
target = browser.find_element_by_css_selector('#droppable')
# NOTE(review): the source stopped after locating the two elements; the
# steps below are assumed from the ActionChains import and the section title.
# Actions are queued on the chain and only run when perform() is called.
actions = ActionChains(browser)
actions.drag_and_drop(source, target)
actions.perform()
複製代碼