selenium:
-- 概念:一個基於瀏覽器自動化的模塊
-- 基本使用流程:
-- pip install selenium
-- 下載對應驅動程序:http://chromedriver.storage.googleapis.com/index.html
-- 實例化一個瀏覽器對象,將瀏覽器的驅動程序加載到該對象中
import time

from lxml import etree
from selenium import webdriver

# Create a browser object; executable_path is the path to chromedriver.exe.
# NOTE(review): executable_path is the Selenium 3 style argument; newer
# Selenium releases expect a Service object instead - confirm against the
# installed version.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    # Have the browser request the target URL.
    bro.get('http://125.35.6.84:81/xk/')
    # Grab the page source as the browser currently holds it
    # ("what you can see is what you can get").
    page_text = bro.page_source
    tree = etree.HTML(page_text)
    # Per the original note, this source can include dynamically loaded data.
    name = tree.xpath('//*[@id="gzlist"]/li[1]/dl/a/text()')[0]
    print(name)
    time.sleep(2)
finally:
    # Always close the browser, even when a step above raises (for example
    # an IndexError when the XPath matches nothing).
    bro.quit()
打開淘寶並搜索相關內容:
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

# NOTE(review): executable_path is the Selenium 3 style argument; newer
# Selenium releases expect a Service object instead - confirm against the
# installed version.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.get('https://www.taobao.com')
    # Element location: find_element(By.<strategy>, value) replaces the
    # find_element_by_* helpers, which newer Selenium releases no longer ship.
    input_text = bro.find_element(By.ID, 'q')
    input_text.send_keys('mac')
    time.sleep(2)
    # Run JavaScript in the page: scroll to the bottom of the document.
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    btn = bro.find_element(By.CSS_SELECTOR, '.btn-search')
    btn.click()
    time.sleep(3)
finally:
    # Always close the browser, even when locating or clicking fails.
    bro.quit()
經常使用方法:
get(url)
find系列函數進行標籤定位
send_keys('key')
click()
execute_script('js_code')
page_source
switch_to.frame('iframe_ID')
quit()
save_screenshot()
a = ActionChains(bro)
a.click_and_hold(tag)
a.move_by_offset(x,y).perform()
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver import ChromeOptions

# Used to evade automation detection: drop the 'enable-automation' switch
# from the switches Chrome is started with.
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])

# NOTE(review): executable_path is the Selenium 3 style argument; newer
# Selenium releases expect a Service object instead - confirm against the
# installed version.
bro = webdriver.Chrome(executable_path='chromedriver.exe', options=option)
try:
    # Have the browser request the target URL.
    bro.get('http://125.35.6.84:81/xk/')
    # Grab the page source as the browser currently holds it
    # ("what you can see is what you can get").
    page_text = bro.page_source
    tree = etree.HTML(page_text)
    # Per the original note, this source can include dynamically loaded data.
    name = tree.xpath('//*[@id="gzlist"]/li[1]/dl/a/text()')[0]
    print(name)
    time.sleep(2)
finally:
    # Always close the browser, even when a step above raises (for example
    # an IndexError when the XPath matches nothing).
    bro.quit()
設置爲在瀏覽器不可見下進行爬取:
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
# Run the browser without a visible window.
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# NOTE(review): executable_path is the Selenium 3 style argument; newer
# Selenium releases expect a Service object instead - confirm against the
# installed version.
bro = webdriver.Chrome(executable_path='./chromedriver.exe', options=chrome_options)
try:
    # Have the browser request the target URL.
    bro.get('http://125.35.6.84:81/xk/')
    # Pause BEFORE reading the source so the page has time to load; the
    # original slept only after page_source had already been captured, so
    # the pause could not affect what was parsed.
    time.sleep(2)
    # Grab the page source as the browser currently holds it
    # ("what you can see is what you can get").
    page_text = bro.page_source
    tree = etree.HTML(page_text)
    # Per the original note, this source can include dynamically loaded data.
    name = tree.xpath('//*[@id="gzlist"]/li[1]/dl/a/text()')[0]
    print(name)
    time.sleep(2)
finally:
    # Always close the browser, even when a step above raises (for example
    # an IndexError when the XPath matches nothing).
    bro.quit()