Selenium support for PhantomJS has been deprecated, please use headless versions of Chrome or Firefox instead

  今天在使用Selenium+PhantomJS動態抓取網頁時,出現以下報錯信息:

  

C:\Python36\lib\site-packages\selenium-3.11.0-py3.6.egg\selenium\webdriver\phantomjs\webdriver.py:49: UserWarning: Selenium support for PhantomJS has been deprecated, please use headless versions of Chrome or Firefox instead
  warnings.warn('Selenium support for PhantomJS has been deprecated, please use headless '

  意思就是Selenium已經放棄對PhantomJS的支持了,建議使用火狐或者谷歌的無界面(headless)瀏覽器。

  下載chromedriver

  

  

  

   要確保機器上已安裝谷歌瀏覽器

   把chromedriver.exe放在C:\Python27\Scripts路徑下

 

  Chrome-headless 模式,是 Google 針對 Chrome 瀏覽器 59版 新增加的一種模式,可以讓你在不打開UI界面的情況下使用 Chrome 瀏覽器,因此運行效果與 Chrome 保持完美一致。

  

  火狐驅動:https://github.com/mozilla/geckodriver/releases

  https://github.com/mozilla/geckodriver/releases/download/v0.19.1/geckodriver-v0.19.1-linux64.tar.gz

  Geckodriver版本與Firefox版本映射關係

  https://blog.csdn.net/u013250071/article/details/78803230

   下載驅動後,可以放在 Python27/Scripts 目錄下,也可以放在其它目錄,並把該目錄加入環境變量 PATH 裏面

 具體實現代碼:

  

        chrome_options = Options()
        # Chrome headless mode, added by Google in Chrome 59, runs the browser
        # without opening a UI window, so pages behave as they do in regular Chrome.
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(chrome_options=chrome_options)
        self.driver.set_page_load_timeout(10)
        self.driver.maximize_window()

  其它用法與 PhantomJS 相同

 

完整python代碼

# coding=utf-8
import os
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from datetime import datetime,timedelta
import time
from pyquery import PyQuery as pq
import re

import datetime

class consumer:
    """Scrape CCTV video-search results with a headless Firefox session.

    For each result anchor, the title and the timestamp embedded in it are
    appended as a ``title,time`` line to ``consumer/<YYYY-MM-DD>.txt`` under
    the current working directory.
    """

    # Matches timestamps such as "2018-06-05 00:12:21" embedded in a title.
    _TIMESTAMP = re.compile(r"\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}")

    def __init__(self):
        """Start headless Firefox with a 10 second page-load timeout."""
        options = webdriver.FirefoxOptions()
        # Pass the flag directly and use the ``options=`` keyword:
        # set_headless() and ``firefox_options=`` are deprecated Selenium APIs.
        options.add_argument('-headless')
        # NOTE(review): --disable-gpu is a Chrome switch kept from the
        # original; it is presumably a no-op for Firefox -- confirm.
        options.add_argument('--disable-gpu')
        self.driver = webdriver.Firefox(options=options)
        self.driver.set_page_load_timeout(10)
        self.driver.maximize_window()

    def WriteLog(self, message, date):
        """Append *message* to ``consumer/<date>.txt`` in the working directory.

        Args:
            message: Text appended verbatim; no newline is added.
            date: File name stem, e.g. ``'2018-06-05'``.
        """
        fileName = os.path.join(os.getcwd(), 'consumer', date + '.txt')
        # Create the output directory on first use instead of failing with
        # FileNotFoundError when it does not exist yet.
        os.makedirs(os.path.dirname(fileName), exist_ok=True)
        # Explicit UTF-8: the logged text is Chinese and the platform default
        # encoding may be unable to represent it.
        with open(fileName, 'a', encoding='utf-8') as f:
            f.write(message)

    def CatchData(self, url='http://search.cctv.com/search.php?qtext=%E6%B6%88%E8%B4%B9%E4%B8%BB%E5%BC%A0&type=video'):
        """Load the search page and log title/time pairs for result pages 1-5.

        The default *url* is the percent-encoded form of
        ``http://search.cctv.com/search.php?qtext=消費主張&type=video``.

        Args:
            url: Search page to open before paging through the results.

        Errors are reported on stdout and never raised: a failure inside one
        page skips the rest of that page, any other failure ends the run.
        """
        try:
            self.driver.get(url)

            filename = datetime.datetime.now().strftime('%Y-%m-%d')
            self.WriteLog('{0},{1}'.format('標題', '時間'), filename)

            for index in range(1, 6):
                # The page's own get_data() JavaScript function switches pages.
                script = "get_data('{0}', '消費主張', 'relevance', 'video', '-1', '1', '', '20', '1')".format(index)
                self.driver.execute_script(script)
                # NOTE(review): the DOM is read immediately after get_data();
                # if that call loads results asynchronously this may capture
                # the previous page -- confirm, and add an explicit wait if so.
                selenium_html = self.driver.execute_script("return document.documentElement.outerHTML")
                doc = pq(selenium_html)
                print(index)
                try:
                    for sub in doc("div[class='jvedio']").find("a").items():
                        title = sub.attr('title')
                        print(title)
                        if title is None:
                            # An anchor without a title used to raise
                            # TypeError and drop the rest of the page.
                            continue
                        matches = self._TIMESTAMP.findall(title)
                        # Only trust the timestamp when it is unambiguous.
                        strtime = matches[0] if len(matches) == 1 else ''
                        if strtime:
                            # Keep only the text that precedes the timestamp.
                            title = title[:title.index(strtime)]

                        self.WriteLog('\n{0},{1}'.format(title, strtime), filename)
                except Exception as e:
                    print("OS error: {0}".format(e))
        except Exception as e1:
            # Best-effort scraper: report the failure instead of silently
            # discarding it, but do not crash the caller.
            print("CatchData failed: {0}".format(e1))

# python "C:\Program Files (x86)\JetBrains\PyCharm 2016.2.3\helpers\pydev\setup_cython.py" build_ext --inplace


# Run the scrape once, then always shut the browser down so no headless
# Firefox / geckodriver process is left running after the script exits.
obj = consumer()
try:
    obj.CatchData()
finally:
    obj.driver.quit()
# obj.CatchContent('')
# obj.export('')
View Code
相關文章
相關標籤/搜索