python爬蟲學習-day006

時間 2019-12-02

標籤 python 爬蟲學習 day006 day 欄目 Python 简体版

原文原文鏈接

正則表達式

樣例網站

regexcss

用法講解

BeautifulSoup庫詳解

靈活方便的網頁解析庫，高效，支持多種解析器。利用bs不用編寫正則表達式便可方便地實現網頁信息的提取。

安裝

pip install beautifulsoup4

詳細用法

解析庫

解析器	使用方法	優點	劣勢
python標準庫	BeautifulSoup(markup, 'html.parser')	python內置標準庫，執行速度適中，文檔容錯能力強	python2.7.3或3.2.2前的版本中文容錯能力差
lxml html解析	BeautifulSoup(markup, 'lxml')	速度快，文檔容錯能力強	須要安裝c語言庫
lxml xml解析	BeautifulSoup(markup, 'xml')	速度快，惟一支持xml的解析器	須要安裝c語言庫
html5lib	BeautifulSoup(markup, 'html5lib')	最好的容錯性，以瀏覽器的方式解析文檔、生成HTML5格式的文檔	速度慢，不依賴外部擴展

基本使用

html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """

from bs4 import BeautifulSoup as bs

# Parse the sample markup with the lxml backend.
soup = bs(html, "lxml")
# prettify() returns the parse tree as an indented string.
print(soup.prettify())
# .string is an attribute, not a method; calling it raises TypeError.
print(soup.title.string)
複製代碼

標籤選擇器

選擇元素

html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs

soup = bs(html, 'lxml')
# Attribute-style access returns the first tag with that name.
title_tag = soup.title
print(title_tag)
print(type(title_tag))
for first_match in (soup.head, soup.p):
    print(first_match)

複製代碼

獲取名稱

html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs

soup = bs(html, 'lxml')
# .name holds the name of the tag itself.
title_tag = soup.title
print(title_tag.name)
複製代碼

獲取屬性

html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs

soup = bs(html, 'lxml')
# A tag's attributes live in its ``attrs`` dict ...
print(soup.p.attrs['name'])
# ... and can also be read by indexing the tag directly.
print(soup.p['name'])
複製代碼

獲取內容

# Sample document. Attribute quotes are closed, and the first <p> holds only
# the <b> element so that ``soup.p.string`` resolves to its text instead of None.
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<p class="title" name="dromouse"><b> The Dirmouse's story</b></p>
<p class="story"> Onece upon a time there were three little sisters; and their names were
<a href = "http://example.com/elsio" class = "sister" id="link1"></a>
<a href = "http://example.com/elsio" class = "sister" id="link2">Lacle</a>
<a href = "http://example.com/elsio" class = "sister" id="link3">Tittle</a>
and they lived at the bottom of a well.</p>
<p class="story"> ...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs

soup = bs(html, 'lxml')
# .string yields the text of a tag that has a single child chain down to one string.
print(soup.p.string)
複製代碼

嵌套選擇

html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs

soup = bs(html, 'lxml')
# Key point: tag access can be chained to walk down into nested nodes.
print(soup.head.title.string)
複製代碼

子節點和子孫節點

html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs

soup = bs(html, 'lxml')
# .contents is the list of direct children of the first <p>.
first_paragraph = soup.p
print(first_paragraph.contents)
複製代碼

html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs

soup = bs(html, 'lxml')
# .children is an iterator, so printing it shows the iterator object itself.
print(soup.p.children)
# Enumerate every direct child node of the first <p>.
for i, child in enumerate(soup.p.children):
    print(i, child)
複製代碼

html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs

soup = bs(html, 'lxml')
# .descendants is lazy, so printing it shows the generator object itself.
print(soup.p.descendants)
# Enumerate every descendant node (children, grandchildren, ...) of the first <p>.
for i, child in enumerate(soup.p.descendants):
    print(i, child)
複製代碼

父親和祖先節點

html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs

soup = bs(html, 'lxml')
# .parent is the direct parent node of the first <a>.
print(soup.a.parent)
複製代碼

html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs

soup = bs(html, 'lxml')
# Pair each ancestor of the first <a> with its position, innermost first.
indexed_ancestors = [(index, node) for index, node in enumerate(soup.a.parents)]
print(indexed_ancestors)
複製代碼

兄弟節點

html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs

soup = bs(html, 'lxml')
# Siblings after the first <a>, then siblings before it, each paired with an index.
following = [(index, node) for index, node in enumerate(soup.a.next_siblings)]
print(following)
preceding = [(index, node) for index, node in enumerate(soup.a.previous_siblings)]
print(preceding)
複製代碼

標準選擇器

find_all(name, attrs, recursive, text, **kwargs)

可根據標籤名、屬性、內容查找文檔

name

html = """ <html><head><title>The Dommouse's story</title></head> <body> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ui class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ui> </div> </body> </html> """
from bs4 import BeautifulSoup as bs

soup = bs(html, 'lxml')
# find_all returns every tag with the given name.
ul_tags = soup.find_all('ul')
print(ul_tags)

複製代碼

html = """ <html><head><title>The Dommouse's story</title></head> <body> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ui class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ui> </div> </body> </html> """
from bs4 import BeautifulSoup as bs

soup = bs(html, 'lxml')
for ul in soup.find_all('ul'):
    # find_all can be called on a tag: list the <li> items inside each <ul>.
    # (Searching for 'ul' inside a <ul> would always print an empty list here.)
    print(ul.find_all('li'))
複製代碼

attrs

html = """ <html><head><title>The Dommouse's story</title></head> <body> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ui class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ui> </div> </body> </html> """
from bs4 import BeautifulSoup as bs

soup = bs(html, 'lxml')
# Filter on arbitrary attributes by passing a dict through ``attrs``.
for wanted in ({'id': 'list-1'}, {'name': 'elements'}):
    print(soup.find_all(attrs=wanted))
複製代碼

html = """
<html><head><title>The Dommouse's story</title></head>
<body>
	<div class="panel-body">
		<ul class="list" id="list-1">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
			<li class="element">Jay</li>
		</ul>
		<ui class="list list-small" id="list-2">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
		</ui>
	</div>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs

soup = bs(html, 'lxml')
# Common attributes can be passed directly as keyword filters.
print(soup.find_all(id='list-1'))
# ``class`` is a reserved word in Python, so bs4 spells this filter ``class_``.
print(soup.find_all(class_='element'))
複製代碼

find(name, attrs, recursive, text, **kwargs)

find返回單個元素，find_all返回全部的元素

html = """ <html><head><title>The Dommouse's story</title></head> <body> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ui class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ui> </div> </body> </html> """
from bs4 import BeautifulSoup as bs

soup = bs(html, 'lxml')
# find returns only the first match; look up an existing and a non-existing tag name.
for tag_name in ('ul', 'page'):
    print(soup.find(tag_name))
複製代碼

find_parents() find_parent()
- find_parents() 返回全部祖先節點
- find_parent() 返回直接父親節點
find_next_siblings() find_next_sibling()
- find_next_siblings() 返回後面全部兄弟節點
- find_next_sibling() 返回後面第一個兄弟節點
find_previous_siblings() find_previous_sibling()
- find_previous_siblings() 返回前面全部兄弟節點
- find_previous_sibling() 返回前面第一個兄弟節點
find_all_next() find_next()
- find_all_next() 返回節點後全部符合條件的節點
- find_next()返回第一個符合條件的節點
find_all_previous() 和find_previous()
- find_all_previous() 返回節點前全部符合條件的節點
- find_previous() 返回第一個符合條件的節點

CSS選擇器

通過select()直接傳入css選擇器即可完成選擇

html = """
<html><head><title>The Dommouse's story</title></head>
<body>
	<div class="panel-body">
		<ul class="list" id="list-1">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
			<li class="element">Jay</li>
		</ul>
		<ui class="list list-small" id="list-2">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
		</ui>
	</div>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs

soup = bs(html, 'lxml')
# Class selector, descendant selector, then id + class combined.
for css_rule in ('.panel-body', 'ul li', '#list-2 .element'):
    print(soup.select(css_rule))
複製代碼

html = """ <html><head><title>The Dommouse's story</title></head> <body> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ui class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ui> </div> </body> </html> """
from bs4 import BeautifulSoup as bs

soup = bs(html, 'lxml')
# select can be called on each matched tag to search inside it.
for ul in soup.select('ul'):
    list_items = ul.select('li')
    print(list_items)
複製代碼

獲取屬性

html = """ <html><head><title>The Dommouse's story</title></head> <body> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ui class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ui> </div> </body> </html> """
from bs4 import BeautifulSoup as bs

soup = bs(html, 'lxml')
# The loop variable must match the name used in the body (was ``ui`` vs ``ul``).
for ul in soup.select('ul'):
    # Two equivalent ways to read an attribute from a selected tag.
    print(ul['id'])
    print(ul.attrs['id'])
複製代碼

獲取內容

html = """ <html><head><title>The Dommouse's story</title></head> <body> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ui class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ui> </div> </body> </html> """
from bs4 import BeautifulSoup as bs

soup = bs(html, 'lxml')
# Print the text content of every selected <li>.
for ui in soup.select("li"):
    item_text = ui.get_text()
    print(item_text)
複製代碼

總結

推薦使用lxml解析庫，必要時使用html.parser
標籤選擇篩選功能弱，但是速度快
建議使用find()，find_all()查詢匹配單個結果或多個結果
如果對css選擇器熟悉，建議使用select()
記住常用的獲取屬性和文本值的方法

PyQuery

強大又靈活的網頁解析庫，如果熟悉jQuery的話，可以很快接入pyquery。不用寫麻煩的正則

安裝

pip install pyquery

初始化

# Sample markup; the fourth item's anchor is written as a well-formed ``<a ...>`` tag.
html=""" <div> <ul> <li class="item-0">first item</li> <li class='item-1'><a href="link2.html">second item</a></li> <li class='item-0 active'><a href="link3.html"><span class="bold">third item</span></a></li> <li class='item-1 active'><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> """

from pyquery import PyQuery as pq

# Build the document from a string, then query it with a CSS selector.
doc = pq(html)
print(doc("li"))
複製代碼

URL初始化

from pyquery import PyQuery as pq

# Build the document directly from a URL and print its <head>.
doc = pq(url='http://www.baidu.com')
head_section = doc('head')
print(head_section)
複製代碼

文件初始化

from pyquery import PyQuery as pq

# Build the document from a local file and print its <li> elements.
doc = pq(filename="demo.html")
list_items = doc('li')
print(list_items)
複製代碼

基本CSS選擇器

# Sample markup; the fourth item's anchor is written as a well-formed ``<a ...>`` tag.
html=""" <div id="container"> <ul> <li class="item-0">first item</li> <li class='item-1'><a href="link2.html">second item</a></li> <li class='item-0 active'><a href="link3.html"><span class="bold">third item</span></a></li> <li class='item-1 active'><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> """
# The class is spelled PyQuery; the misspelled import raised ImportError.
from pyquery import PyQuery as pq

doc = pq(html)
# id selector, then descendant <ul>, then descendant elements with class item-0.
print(doc("#container ul .item-0"))
複製代碼

操作接口

find函數 find("li")
children()孩子節點
parent() 父元素
parents() 祖先節點
siblings() 全部兄弟元素
items() 全部元素
attr(name) 屬性
text() 文本
html() 獲取html內容
addClass(name) 添加css class
removeClass(name) 移除css class
attr("name", "link") 修改屬性值
css("font-size", "14px") 設置css值
item.remove() 移除元素

官方文檔

pyquery.readthedocs.io

Selenium庫

自動化測試工具，支持多種瀏覽器，可以驅動瀏覽器進行一系列的操作。爬蟲中主要用來解決JavaScript渲染問題。

安裝

pip install selenium

用法

基本使用

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


browser = webdriver.Chrome()
try:
    browser.get("https://www.baidu.com")
    # Locate the search box; named search_box so the builtin input() is not shadowed.
    search_box = browser.find_element(By.ID, "kw")
    search_box.send_keys("Python")
    search_box.send_keys(Keys.ENTER)
    # Wait up to 10 seconds for the results container to be present in the DOM.
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    print(browser.current_url)
    print(browser.get_cookies())
    print(browser.page_source)
finally:
    # Close the browser window even if one of the steps above fails.
    browser.close()
複製代碼

聲明瀏覽器對象

from selenium import webdriver

# One constructor per supported browser; use the one you need.
browser = webdriver.Chrome()
browser = webdriver.Firefox()
browser = webdriver.Edge()
# NOTE(review): PhantomJS support appears to have been dropped in Selenium 4 —
# confirm against the installed version.
browser = webdriver.PhantomJS()
browser = webdriver.Safari()
複製代碼

訪問頁面

from selenium import webdriver

# Open the page in Chrome, print the rendered HTML, then close the window.
browser = webdriver.Chrome()
browser.get("https://www.taobao.com")
rendered_html = browser.page_source
print(rendered_html)
browser.close()
複製代碼

查找元素

單個元素

from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
# Three ways to locate the same element: by id, by CSS selector, by XPath.
input_first = browser.find_element_by_id('q')
input_second = browser.find_element_by_css_selector('#q')
input_third = browser.find_element_by_xpath('//*[@id="q"]')
# Print the names actually assigned above (the misspelled name raised NameError).
print(input_first, input_second, input_third)
複製代碼

from selenium import webdriver
from selenium.webdriver.common.by import By

# Generic lookup: pass the locator strategy and its value to find_element.
browser = webdriver.Chrome()
browser.get("https://www.taobao.com")
locator = (By.ID, 'q')
input_first = browser.find_element(*locator)
print(input_first)
browser.close()
複製代碼

多個元素

find_elements_by_css_selector

find_elements

元素交互

from selenium import webdriver
import time

browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
# The search box must be located before typing; without this lookup the code
# called send_keys on Python's builtin input() function.
search_box = browser.find_element_by_id('q')
search_box.send_keys("iPhone")
time.sleep(1)
# Clear the previous text before entering the new query.
search_box.clear()
search_box.send_keys('iPad')
button = browser.find_element_by_class_name('btn-search')
button.click()
複製代碼

交互動做

將動做附加到動做鏈中串行執行

from selenium import webdriver
from selenium.webdriver import ActionChains

browser = webdriver.Chrome()
url= 'http://www.r
browser.switch_to_frame('iframeResult')
source = browser.find_element_by_css_selector('#draggable')
target = browser.find_element_by_css_selector('#droppable')

複製代碼