selenium

selenium:

  -- 概念:一個基於瀏覽器自動化的模塊

  -- 基本使用流程:

    -- pip install selenium

    -- 下載對應驅動程序:http://chromedriver.storage.googleapis.com/index.html

    -- 實例化一個瀏覽器對象,將瀏覽器的驅動程序加載到該對象中

  

1.簡單示例

from selenium import webdriver
from lxml import etree
import time

# Minimal example: open a page in Chrome, parse the page source with lxml,
# and print the text of the first entry in the "gzlist" list.

# Create a browser instance; executable_path is the path to chromedriver.exe.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    # Have the browser request the target URL.
    bro.get('http://125.35.6.84:81/xk/')
    # Grab the page source as the browser currently sees it
    # ("what you see is what you get").
    page_text = bro.page_source
    tree = etree.HTML(page_text)
    # Because the source comes from the browser, dynamically loaded data can
    # be extracted as well. The [0] raises IndexError when nothing matches.
    name = tree.xpath('//*[@id="gzlist"]/li[1]/dl/a/text()')[0]
    print(name)
    time.sleep(2)
finally:
    # Always close the browser, even when loading or parsing fails, so no
    # browser/driver process is left behind.
    bro.quit()

 

2.相關行爲定製

打開淘寶並搜索相關內容:

from selenium import webdriver
import time

# Interaction example: open Taobao, type a query into the search box,
# scroll to the bottom of the page, and click the search button.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.get('https://www.taobao.com')
    # Locate elements with the find_* family of methods.
    input_text = bro.find_element_by_id('q')
    input_text.send_keys('mac')
    time.sleep(2)
    # Execute JavaScript in the page: scroll to the bottom of the document.
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    btn = bro.find_element_by_css_selector('.btn-search')
    btn.click()
    time.sleep(3)
finally:
    # Always close the browser, even if an element lookup or click raises,
    # so no browser/driver process is left behind.
    bro.quit()

 經常使用方法:

    get(url)

    find系列函數進行標籤定位

    send_keys('key')

    click()

    execute_script('js_code')

    page_source

    switch_to.frame('iframe_ID')

    quit()

    save_screenshot()

    a = ActionChains(bro)   a.click_and_hold(tag)

    a.move_by_offset(x,y).perform()

 

3.規避檢測

from selenium import webdriver
from lxml import etree
from selenium.webdriver import ChromeOptions
import time

# Same scrape as the simple example, but Chrome is started with the
# 'enable-automation' switch excluded; the original note says this is
# used to evade automation detection.
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
bro = webdriver.Chrome(executable_path='chromedriver.exe', options=option)
try:
    # Have the browser request the target URL.
    bro.get('http://125.35.6.84:81/xk/')
    # Grab the page source as the browser currently sees it
    # ("what you see is what you get").
    page_text = bro.page_source
    tree = etree.HTML(page_text)
    # Dynamically loaded data can be extracted from the browser's source.
    # The [0] raises IndexError when nothing matches.
    name = tree.xpath('//*[@id="gzlist"]/li[1]/dl/a/text()')[0]
    print(name)
    time.sleep(2)
finally:
    # Always close the browser, even when loading or parsing fails, so no
    # browser/driver process is left behind.
    bro.quit()

 

4.無頭瀏覽器

設置爲在瀏覽器不可見下進行爬取:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from lxml import etree
import time

# Same scrape as the simple example, run with Chrome in headless mode so
# that no browser window is shown.
chrome_options = Options()
# Make the browser invisible.
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
bro = webdriver.Chrome(executable_path='./chromedriver.exe', options=chrome_options)
try:
    # Have the browser request the target URL.
    bro.get('http://125.35.6.84:81/xk/')
    # Grab the page source as the browser currently sees it
    # ("what you see is what you get").
    page_text = bro.page_source
    # NOTE(review): this pause happens after page_source was already read, so
    # it does not give the page extra time to load — confirm whether it was
    # meant to come before the read.
    time.sleep(2)
    tree = etree.HTML(page_text)
    # Dynamically loaded data can be extracted from the browser's source.
    # The [0] raises IndexError when nothing matches.
    name = tree.xpath('//*[@id="gzlist"]/li[1]/dl/a/text()')[0]
    print(name)
    time.sleep(2)
finally:
    # Always close the browser, even when loading or parsing fails; a leaked
    # headless browser has no window and is easy to miss.
    bro.quit()
相關文章
相關標籤/搜索