Python_selenium_phantomjs動態抓取

selenium:https://github.com/SeleniumHQ...
當前版本3.0.1
A browser automation framework and ecosystemjavascript

phantomjs:http://phantomjs.org/
是一個服務器端的 JavaScript API 的 WebKit。也能夠說是無界面瀏覽器。其支持各類Web標準: DOM 處理, CSS 選擇器, JSON, Canvas, 和 SVG.html

大部分的網頁抓取用urllib均可以搞定,可是涉及到JavaScript及Ajax渲染的時候,urlopen就徹底傻逼了,因此不得不用模擬瀏覽器,方法也有不少,此處採用的是selenium2+phantomjs
selenium2支持全部主流的瀏覽器和phantomjs這些無界面的瀏覽器。
安裝:java

pip install selenium

phantomjs不是python程序,去官網下載對應系統版本的安裝便可。python

from selenium import webdriver
import time
 
driver = webdriver.PhantomJS(executable_path=r'C:\Users\taojw\Desktop\pywork\phantomjs-2.1.1-windows\bin\phantomjs.exe')
driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
time.sleep(3)
print(driver.find_element_by_id("content").text)
driver.close()
from selenium import webdriver
 
driver = webdriver.PhantomJS(executable_path='C:\Users\Gentlyguitar\Desktop\phantomjs-1.9.7-windows\phantomjs.exe')
driver.set_window_size(1120, 550)
driver.get("http://duckduckgo.com/")
driver.find_element_by_id('search_form_input_homepage').send_keys("Nirvana")
driver.find_element_by_id("search_button_homepage").click()
print(driver.current_url)
driver.close()

get方法會一直等到頁面被徹底加載,而後纔會繼續程序,可是對於ajax是迫不得已的。
send_keys就是填充input表單git

等待頁面渲染完成

#等待頁面渲染完成
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
 
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
driver = webdriver.PhantomJS(executable_path=r'C:\Users\taojw\Desktop\pywork\phantomjs-2.1.1-windows\bin\phantomjs.exe', desired_capabilities=dcap)
driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
try:
    element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "loadedButton")))
finally:
    print(driver.find_element_by_id("content").text)
    driver.close()

處理Javascript重定向

#處理Javascript重定向
from selenium import webdriver
import time
from selenium.webdriver.remote.webelement import WebElement
from selenium.common.exceptions import StaleElementReferenceException
 
def waitForLoad(driver):
    elem = driver.find_element_by_tag_name("html")
    count = 0
    while True:
        count += 1
        if count > 20:
            print("Timing out after 10 seconds and returning")
            return
        time.sleep(.5)
        try:
            elem == driver.find_element_by_tag_name("html")
        #拋出StaleElementReferenceException異常說明elem元素已經消失了,也就說明頁面已經跳轉了。
        except StaleElementReferenceException:  
            return
 
driver = webdriver.PhantomJS(executable_path=r'C:\Users\taojw\Desktop\pywork\phantomjs-2.1.1-windows\bin\phantomjs.exe')
driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html")
waitForLoad(driver)
print(driver.page_source)

設置PHANTOMJS的USER-AGENT

有些網站的WebServer對User-Agent有限制,可能會拒毫不熟悉的User-Agent的訪問。
設置PhantomJS的user-agent,是要設置「phantomjs.page.settings.userAgent」這個desired_capability.github

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
 
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
 
 
driver = webdriver.PhantomJS(executable_path='./phantomjs.exe', desired_capabilities=dcap)
driver.get("http://dianping.com/")
cap_dict = driver.desired_capabilities  #查看全部可用的desired_capabilities屬性。
for key in cap_dict:
    print '%s: %s' % (key, cap_dict[key])
print driver.current_url
driver.quit()

Demo

githubweb

#pip install selenium
#安裝phantomjs

from selenium import webdriver
import time
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
 
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"

driver = webdriver.PhantomJS(executable_path=r'C:\Users\taojw\Desktop\pywork\phantomjs-2.1.1-windows\bin\phantomjs.exe', desired_capabilities=dcap)
driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
time.sleep(3)
print(driver.find_element_by_id("content").text)
driver.close()

#設置PHANTOMJS的USER-AGENT
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
 
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"

 
driver = webdriver.PhantomJS(executable_path='./phantomjs.exe', desired_capabilities=dcap)
driver.get("http://dianping.com/")

cap_dict = driver.desired_capabilities  #查看全部可用的desired_capabilities屬性。
for key in cap_dict:
    print('%s: %s' % (key, cap_dict[key]))
print(driver.current_url)
driver.quit()

#等待頁面渲染完成
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.PhantomJS(executable_path=r'C:\Users\taojw\Desktop\pywork\phantomjs-2.1.1-windows\bin\phantomjs.exe')
driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
try:
    element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "loadedButton")))
finally:
    print(driver.find_element_by_id("content").text)
    driver.close()

#處理Javascript重定向
from selenium import webdriver
import time
from selenium.webdriver.remote.webelement import WebElement
from selenium.common.exceptions import StaleElementReferenceException

def waitForLoad(driver):
    elem = driver.find_element_by_tag_name("html")
    count = 0
    while True:
        count += 1
        if count > 20:
            print("Timing out after 10 seconds and returning")
            return
        time.sleep(.5)
        try:
            elem == driver.find_element_by_tag_name("html")
        except StaleElementReferenceException:
            return

driver = webdriver.PhantomJS(executable_path=r'C:\Users\taojw\Desktop\pywork\phantomjs-2.1.1-windows\bin\phantomjs.exe')
driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html")
waitForLoad(driver)
print(driver.page_source)
##################################################################################
#模擬拖拽
from selenium import webdriver
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver import ActionChains

driver = webdriver.PhantomJS(executable_path='phantomjs/bin/phantomjs')
driver.get('http://pythonscraping.com/pages/javascript/draggableDemo.html')

print(driver.find_element_by_id("message").text)

element = driver.find_element_by_id("draggable")
target = driver.find_element_by_id("div2")
actions = ActionChains(driver)
actions.drag_and_drop(element, target).perform()

print(driver.find_element_by_id("message").text)
##################################################################################
#截屏
driver.get_screenshot_as_file('tmp/pythonscraping.png')

####
##################################################################################
#登錄知乎,而後能自動點擊頁面下方的「更多」,以載入更多的內容
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver import ActionChains
import time
import sys

driver = webdriver.PhantomJS(executable_path='C:\Users\Gentlyguitar\Desktop\phantomjs-1.9.7-windows\phantomjs.exe')
driver.get("http://www.zhihu.com/#signin")
#driver.find_element_by_name('email').send_keys('your email')
driver.find_element_by_xpath('//input[@name="password"]').send_keys('your password')
#driver.find_element_by_xpath('//input[@name="password"]').send_keys(Keys.RETURN)
time.sleep(2)
driver.get_screenshot_as_file('show.png')
#driver.find_element_by_xpath('//button[@class="sign-button"]').click()
driver.find_element_by_xpath('//form[@class="zu-side-login-box"]').submit()

try:
    #等待頁面加載完畢
    dr=WebDriverWait(driver,5)
    dr.until(lambda the_driver:the_driver.find_element_by_xpath('//a[@class="zu-top-nav-userinfo "]').is_displayed())
except:
    print('登陸失敗')
    sys.exit(0)
driver.get_screenshot_as_file('show.png')
#user=driver.find_element_by_class_name('zu-top-nav-userinfo ')
#webdriver.ActionChains(driver).move_to_element(user).perform() #移動鼠標到個人用戶名
loadmore=driver.find_element_by_xpath('//a[@id="zh-load-more"]')
actions = ActionChains(driver)
actions.move_to_element(loadmore)
actions.click(loadmore)
actions.perform()
time.sleep(2)
driver.get_screenshot_as_file('show.png')
print(driver.current_url)
print(driver.page_source)
driver.quit()
##################################################################################

參考:
http://www.cnblogs.com/chenqi...
http://www.realpython.com/blo...
http://selenium-python.readth...
http://www.cnblogs.com/paisen...
http://smilejay.com/2013/12/s...
更多參考:
selenium webdriver的各類driverajax

相關文章
相關標籤/搜索