selenium爬蟲

時間 2019-11-08

原文原文鏈接

Selenium 是 Web 自動化測試工具，可運行在瀏覽器中，根據指令操作瀏覽器。它只是工具，必須與第三方瀏覽器結合使用，相比於以前學的爬蟲只是慢了一點而已。並且這種方法爬取的東西不用在乎是否有 ajax 動態加載等反爬機制，所以找標籤可以直接按 F12 找，不用確定源碼中是否存在。

安裝

Linux: sudo pip3 install selenium

Windows: python -m pip install selenium

phantomjs瀏覽器

phantomjs瀏覽器又叫做無界面瀏覽器(又稱無頭瀏覽器)，在內存中進行頁面加載，運行高效。

安裝(phantomjs(無界面瀏覽器)、chromedriver(谷歌瀏覽器)、geckodriver(火狐瀏覽器))

Windows

一、下載對應版本的phantomjs、chromedriver、geckodriver

二、chromedriver下載與谷歌瀏覽器對應的版本，把chromedriver.exe拷貝到python安裝目錄的Scripts目錄下(添加到系統環境變量)，查看python安裝路徑: where python

三、驗證，cmd命令行: chromedriver

Linux

一、下載後解壓：tar -zxvf geckodriver.tar.gz

二、拷貝解壓後文件到 /usr/bin/ （添加環境變量）：sudo cp geckodriver /usr/bin/

三、更改權限

sudo -i

cd /usr/bin/

chmod 777 geckodriver

示例代碼一：使用 selenium+谷歌瀏覽器打開百度，並截圖百度首頁

from selenium import webdriver

# Open Baidu in Chrome, save a screenshot of the page, then shut the browser down.
HOME_URL = 'http://www.baidu.com/'
SCREENSHOT_FILE = 'baidu.png'

browser = webdriver.Chrome()                # create the browser object
browser.get(HOME_URL)                       # open Baidu
browser.save_screenshot(SCREENSHOT_FILE)    # take the screenshot
browser.quit()                              # quit the browser

示例代碼二：打開百度，搜索趙麗穎

from selenium import webdriver
import time

# Open Baidu, type a query into the search box and click the search button.
SEARCH_BOX_XPATH = '//*[@id="kw"]'
SEARCH_BUTTON_XPATH = '//*[@id="su"]'

# Creating the browser object already opens the browser.
browser = webdriver.Chrome()
browser.get('http://www.baidu.com/')

search_box = browser.find_element_by_xpath(SEARCH_BOX_XPATH)
search_box.send_keys('趙麗穎')      # send the query text to the search box
time.sleep(1)

# Locate the search button and click it once.
search_button = browser.find_element_by_xpath(SEARCH_BUTTON_XPATH)
search_button.click()
time.sleep(2)

browser.quit()      # close the browser

browser瀏覽器對象方法

browser = webdriver.Chrome(executable_path='path') path爲瀏覽器驅動地址
browser.get(url) 打開url地址
browser.page_source：查看響應內容（網頁源代碼）
browser.page_source.find('字符串')：從html源碼中搜索指定字符串,沒有找到返回：-1
browser.quit()：關閉瀏覽器

元素查找

單元素查找(1個節點對象)

browser.find_element_by_id('')
browser.find_element_by_name('')
browser.find_element_by_class_name('')
browser.find_element_by_xpath('')
browser.find_element_by_link_text('')
... ...

多元素查找([節點對象列表])

browser.find_elements_by_id('')
browser.find_elements_by_name('')
browser.find_elements_by_class_name('')
browser.find_elements_by_xpath('')
... ...

節點對象操做

.send_keys('') 搜索框發送內容
.click()　　　　點擊
.text 獲取文本內容
.get_attribute('src') 獲取屬性值
.find("") 　　查找響應中的字符串

from selenium import webdriver

# Print the text of the first 'content' node, then the text of every 'content' node.
SEPARATOR = '*************************'

browser = webdriver.Chrome()
browser.get('https://www.qiushibaike.com/text/')

# Single-element lookup: yields one node object.
first_post = browser.find_element_by_class_name('content')
print(first_post.text)

# Multi-element lookup: yields a list of node objects.
for post in browser.find_elements_by_class_name('content'):
    print(SEPARATOR)
    print(post.text)
    print(SEPARATOR)

browser.quit()  # quit the browser

京東爬蟲案例

目標網址：https://www.jd.com/
抓取目標：商品名稱、商品價格、評價數量、商品商家

思路提醒

打開京東，到商品搜索頁
匹配全部商品節點對象列表
把節點對象的文本內容取出來，查看規律，是否有更好的處理辦法？
提取完1頁後，判斷若是不是最後1頁，則點擊下一頁

實現步驟

找節點

首頁搜索框 : //*[@id="key"]
首頁搜索按鈕 ://*[@id="search"]/div/div[2]/button
商品頁的商品信息節點對象列表 ://*[@id="J_goodsList"]/ul/li

執行JS腳本，獲取動態加載數據

　　browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')

from selenium import webdriver
import time


class JdSpider(object):
    """Search JD with Chrome and print the products on every result page.

    Price, name, review count and shop are taken from the text lines of each
    product node. ``self.i`` counts the products that were printed.
    """

    def __init__(self):
        self.i = 0  # number of products printed so far
        self.url = 'https://www.jd.com/'
        self.browser = webdriver.Chrome()

    def get_html(self):
        """Open the home page, submit the search and wait for the result page."""
        self.browser.get(self.url)
        self.browser.find_element_by_xpath('//*[@id="key"]').send_keys('爬蟲書')  # type the keyword into the search box
        self.browser.find_element_by_xpath('//*[@id="search"]/div/div[2]/button').click()  # click the search button
        time.sleep(3)  # give the product page time to load

    @staticmethod
    def _extract_fields(info_list):
        """Pick ``(price, name, comment, shop)`` out of a node's text lines.

        The four fields are consecutive lines; a leading promotion banner
        shifts where they start. Returns ``None`` when the node does not have
        enough lines, so callers can skip it instead of hitting IndexError.
        """
        if not info_list:
            return None
        first_line = info_list[0]
        # Both the traditional and the simplified spelling of each banner
        # prefix are accepted, because the page text may use either script.
        if first_line.startswith(('每滿', '每满')) or (
                len(info_list) > 1 and info_list[1].startswith('￥')):
            start = 1
        elif first_line.startswith(('單件', '单件')):
            start = 3
        else:
            start = 0
        fields = info_list[start:start + 4]
        if len(fields) < 4:
            return None
        return tuple(fields)

    def parse_html(self):
        """Scroll to the bottom of the page and print every product on it."""
        # Run a JS snippet that scrolls to the bottom so the dynamically
        # loaded products are added to the page.
        self.browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        time.sleep(2)
        # All product nodes on the page (a list of <li> node objects).
        li_list = self.browser.find_elements_by_xpath('//*[@id="J_goodsList"]/ul/li')
        for li in li_list:
            fields = self._extract_fields(li.text.split('\n'))
            if fields is None:
                continue  # node without the expected lines
            price, name, comment, shop = fields
            print(price, comment, shop, name)
            self.i += 1

    def main(self):
        """Crawl every result page, print the product count, then quit Chrome."""
        try:
            self.get_html()
            while True:
                self.parse_html()
                # The marker 'pn-next disabled' appears only on the last page.
                if self.browser.page_source.find('pn-next disabled') != -1:
                    break
                self.browser.find_element_by_class_name('pn-next').click()
                time.sleep(2)
            print(self.i)
        finally:
            # Always release the browser, even when a lookup above fails.
            self.browser.quit()


if __name__ == '__main__':
    spider = JdSpider()
    spider.main()

chromedriver設置無界面模式

from selenium import webdriver

# Same screenshot example, but Chrome is started with the headless argument.
headless_options = webdriver.ChromeOptions()
headless_options.add_argument('--headless')   # add the headless argument
browser = webdriver.Chrome(options=headless_options)

browser.get('http://www.baidu.com/')
browser.save_screenshot('baidu.png')
browser.quit()

把上面的代碼改成無界面模式

from selenium import webdriver
import time


class JdSpider(object):
    """Headless-Chrome JD spider that reads each product field by XPath.

    ``self.i`` counts the products that were printed.
    """

    def __init__(self):
        self.url = 'https://www.jd.com/'
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('--headless')  # add the headless argument
        # The browser object is created as usual, just with the options attached.
        self.browser = webdriver.Chrome(options=self.options)
        self.i = 0  # number of products printed so far

    def get_html(self):
        """Open the home page, submit the search and wait for the result page."""
        self.browser.get(self.url)
        self.browser.find_element_by_xpath('//*[@id="key"]').send_keys('爬蟲書')  # type the keyword into the search box
        self.browser.find_element_by_xpath('//*[@id="search"]/div/div[2]/button').click()  # click the search button
        time.sleep(3)  # give the product page time to load

    def parse_html(self):
        """Scroll to the bottom of the page and print every product as a dict."""
        # Scroll to the bottom so that all dynamically loaded data is present.
        self.browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        time.sleep(2)  # wait for the dynamic data to finish loading

        # All product nodes on the page (a list of <li> node objects).
        li_list = self.browser.find_elements_by_xpath('//*[@id="J_goodsList"]/ul/li')
        for li in li_list:
            # A fresh dict per product: reusing one dict would make every
            # stored reference point at the data of the last product.
            item = {
                'name': li.find_element_by_xpath('.//div[@class="p-name"]/a/em').text.strip(),
                'price': li.find_element_by_xpath('.//div[@class="p-price"]').text.strip(),
                'comment': li.find_element_by_xpath('.//div[@class="p-commit"]/strong').text.strip(),
                'shop': li.find_element_by_xpath('.//div[@class="p-shopnum"]').text.strip(),
            }
            print(item)
            self.i += 1

    def main(self):
        """Crawl every result page, print the product count, then quit Chrome."""
        try:
            self.get_html()
            while True:
                self.parse_html()
                # The marker 'pn-next disabled' appears only on the last page.
                if self.browser.page_source.find('pn-next disabled') != -1:
                    break
                self.browser.find_element_by_class_name('pn-next').click()
                time.sleep(3)
            print('商品數量:', self.i)
        finally:
            # Always release the headless browser, even when a lookup fails.
            self.browser.quit()


if __name__ == '__main__':
    spider = JdSpider()
    spider.main()

View Code

鍵盤操做

from selenium import webdriver  # needed for webdriver.Chrome() below
from selenium.webdriver.common.keys import Keys

browser = webdriver.Chrome()
browser.get('http://www.baidu.com/')
# 1. Type the query text into the search box
browser.find_element_by_id('kw').send_keys('趙麗穎')
# 2. Type a space
browser.find_element_by_id('kw').send_keys(Keys.SPACE)
# 3. Ctrl+a: select all
browser.find_element_by_id('kw').send_keys(Keys.CONTROL, 'a')
# 4. Ctrl+c: copy
browser.find_element_by_id('kw').send_keys(Keys.CONTROL, 'c')
# 5. Ctrl+v: paste
browser.find_element_by_id('kw').send_keys(Keys.CONTROL, 'v')
# 6. Press Enter instead of clicking the search button
browser.find_element_by_id('kw').send_keys(Keys.ENTER)

鼠標操做

import time
from selenium import webdriver
# 導入鼠標事件
from selenium.webdriver import ActionChains

# Hover over a link in Baidu's top bar, then click an entry by its link text.
browser = webdriver.Chrome()
browser.get('http://www.baidu.com/')

# Locate the "settings" node in the top bar.
settings_link = browser.find_element_by_xpath('//*[@id="u1"]/a[8]')

# Queue a move-to-element action; nothing happens until perform() runs.
ActionChains(browser).move_to_element(settings_link).perform()
time.sleep(1)

# Click the link whose text matches the literal below.
# NOTE(review): if the live page uses simplified characters, this literal
# would need to be '高级搜索' to match — confirm against the page.
browser.find_element_by_link_text('高級搜索').click()

切換頁面

適用於頁面中點開鏈接會出現新頁面的網站，但是瀏覽器對象browser仍是之前頁面的對象

all_handles = browser.window_handles　　獲取當前全部句柄（窗口）

browser.switch_to.window(all_handles[1])　　切換browser到新的窗口，獲取新窗口對象

民政部網站

將民政區劃代碼爬取到數據庫中，按照層級關係（分表 -- 省表、市表、縣表）

數據庫中建表

# Create the database
create database govdb charset utf8;
use govdb;
# Create the tables
create table province(
        p_name varchar(20),
        p_code varchar(20)
        )charset=utf8;
create table city(
        c_name varchar(20),
        c_code varchar(20),
        c_father_code varchar(20)
        )charset=utf8;
create table county(
        x_name varchar(20),
        x_code varchar(20),
        x_father_code varchar(20)
        )charset=utf8;
# URL of the release crawled last; the spider compares against it for incremental crawling
create table version(
        url varchar(255)
        )charset=utf8;

思路

selenium+Chrome打開一級頁面，並提取二級頁面最新連接
增量爬取: 和數據庫version表中的記錄進行比對，確定之前是否爬過（是否有更新）
如果沒有更新，直接提示用戶，無須繼續爬取
如果有更新，則刪除之前表中數據，重新爬取並插入數據庫表
最終完成後: 斷開數據庫鏈接，關閉瀏覽器

from selenium import webdriver
import pymysql


class GovSpider(object):
    """Crawl administrative-division codes into the province/city/county tables.

    The newest release link on the index page is compared with the URL stored
    in the ``version`` table; the data page is crawled again only when that
    URL is not stored yet.
    """

    # Two-digit code prefixes that are both a province-level and a city-level
    # entry (the same prefixes the county branch treats specially).
    MUNICIPALITY_PREFIXES = ('11', '12', '31', '50')

    def __init__(self, host='localhost', user='root', password='123456', database='govdb'):
        """Start headless Chrome and connect to MySQL.

        Args:
            host: MySQL server host.
            user: MySQL user name.
            password: MySQL password.
            database: name of the database that holds the tables.
        """
        # Headless mode
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        self.browser = webdriver.Chrome(options=options)
        self.one_url = 'http://www.mca.gov.cn/article/sj/xzqh/2019/'
        try:
            # Keyword arguments: PyMySQL 1.x no longer accepts positional ones.
            self.db = pymysql.connect(host=host, user=user, password=password,
                                      database=database, charset='utf8')
        except Exception:
            # Do not leave a browser process behind when the DB is unreachable.
            self.browser.quit()
            raise
        self.cursor = self.db.cursor()
        # Row buffers for the three executemany() inserts
        self.province_list = []
        self.city_list = []
        self.county_list = []

    def get_incr_url(self):
        """Open the index page and crawl the newest release if it is new."""
        self.browser.get(self.one_url)
        # Newest release link; the title is matched in both traditional and
        # simplified characters because the page text may use either script.
        td = self.browser.find_element_by_xpath(
            '//td[@class="arlisttd"]/a[contains(@title,"代碼") or contains(@title,"代码")]')
        # get_attribute() returns the link completed to a full URL
        two_url = td.get_attribute('href')
        # execute() returns the number of affected rows
        result = self.cursor.execute('select url from version where url=%s', [two_url])
        if result:
            print('無須爬取')
            return

        td.click()
        # The link opens a new window: switch the driver to it
        all_handlers = self.browser.window_handles
        self.browser.switch_to.window(all_handlers[1])
        self.get_data()
        # Remember the crawled URL as the current version
        self.cursor.execute('delete from version')
        self.cursor.execute('insert into version values(%s)', [two_url])
        self.db.commit()

    def _add_record(self, name, code):
        """Append one (name, code) pair to the row buffer of its level.

        Codes ending in '0000' are provinces, other codes ending in '00' are
        cities, everything else is a county. Rows without a code are ignored.
        """
        if not code:
            return
        province_code = code[:2] + '0000'
        is_municipality = code[:2] in self.MUNICIPALITY_PREFIXES
        if code[-4:] == '0000':
            self.province_list.append([name, code])
            if is_municipality:
                # A municipality is also stored as a city whose parent is itself
                self.city_list.append([name, code, code])
        elif code[-2:] == '00':
            self.city_list.append([name, code, province_code])
        elif is_municipality:
            # Counties of a municipality hang directly under the province code
            self.county_list.append([name, code, province_code])
        else:
            self.county_list.append([name, code, code[:4] + '00'])

    def get_data(self):
        """Read every table row of the data page and store the result."""
        # Base xpath for the data rows
        tr_list = self.browser.find_elements_by_xpath('//tr[@height="19"]')
        for tr in tr_list:
            code = tr.find_element_by_xpath('./td[2]').text.strip()
            name = tr.find_element_by_xpath('./td[3]').text.strip()
            if not code:
                continue  # row without a code: nothing to store
            print(name, code)
            self._add_record(name, code)

        # After the loop: write all buffered rows in one go
        self.insert_mysql()

    def insert_mysql(self):
        """Replace the content of the three tables with the buffered rows.

        Raises:
            RuntimeError: when no row was scraped, so that the existing table
                content is not deleted for nothing.
        """
        if not (self.province_list or self.city_list or self.county_list):
            raise RuntimeError('no rows scraped; existing tables left untouched')
        # 1. Delete the old records first
        self.cursor.execute('delete from province')
        self.cursor.execute('delete from city')
        self.cursor.execute('delete from county')
        # 2. Insert the new data
        self.cursor.executemany('insert into province values(%s,%s)', self.province_list)
        self.cursor.executemany('insert into city values(%s,%s,%s)', self.city_list)
        self.cursor.executemany('insert into county values(%s,%s,%s)', self.county_list)
        # 3. Commit
        self.db.commit()
        print('數據抓取完成,成功存入數據庫')

    def main(self):
        """Run the crawl, then always close the DB connection and the browser."""
        try:
            self.get_incr_url()
        finally:
            try:
                self.cursor.close()
                self.db.close()
            finally:
                self.browser.quit()


if __name__ == '__main__':
    spider = GovSpider()
    spider.main()

SQL命令練習

1. 查詢全部省市縣信息（多表查詢實現）

select p.p_name, c.c_name, x.x_name
from province as p, city as c, county as x
where p.p_code = c.c_father_code
  and c.c_code = x.x_father_code;

2. 查詢全部省市縣信息（鏈接查詢實現）

select p.p_name, c.c_name, x.x_name
from province as p
inner join city as c on p.p_code = c.c_father_code
inner join county as x on c.c_code = x.x_father_code;

Web客戶端驗證

在URL地址中填入便可

url = 'http://用戶名:密碼@正常地址'

示例: 爬取某一天筆記

from selenium import webdriver

# The credentials are placed in the URL in the form user:password@address.
USERNAME = 'tarenacode'
PASSWORD = 'code_2013'
ADDRESS = 'code.tarena.com.cn/AIDCode/aid1904/15-spider/spider_day06_note.zip'

url = 'http://{}:{}@{}'.format(USERNAME, PASSWORD, ADDRESS)
browser = webdriver.Chrome()
browser.get(url)

iframe子框架

iframe子框架適用於網頁中嵌套了網頁，這種情況應該先切換到iframe子框架，然後再執行其他操作。

browser.switch_to.frame(iframe_element)

示例 - 登陸qq郵箱

import time
from selenium import webdriver

# Log in through the form inside the login iframe, then print the 'useralias' text.
browser = webdriver.Chrome()
browser.get('https://mail.qq.com/cgi-bin/loginpage')

# Find the iframe and switch the driver into it before touching its fields.
browser.switch_to.frame(browser.find_element_by_id('login_frame'))

# Account + password, then the login button.
for field_id, value in (('u', '帳號'), ('p', '密碼')):
    browser.find_element_by_id(field_id).send_keys(value)
browser.find_element_by_id('login_button').click()

time.sleep(5)   # leave time for the page to load

# Extract the data.
ele = browser.find_element_by_id('useralias')
print(ele.text)

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。