維護一組瀏覽器,實現每分鐘1000次查詢。DriverPool使用變化版的、只初始化一次的單例模式,並維護每一個瀏覽器當前是否正在使用的狀態。
不須要等請求來了才臨時開瀏覽器,由於開一個瀏覽器會耽誤6秒鐘。
能夠在程序啓動後,隨便使用命令殺死selenium啓動的瀏覽器,不怕被別人殺死,不須要重啓程序就能保證長久正常運行。
主要使用了mixin繼承、變化版單例模式、鴨子類型、橋接模式、上下文管理器,引入了資源池的概念,自動選擇一個當前未被使用的瀏覽器。
使用池固定了瀏覽器的最大數量,避免了直接開孤立的selenium driver:當併發大的時候,代碼忽然啓動幾百上千個瀏覽器,會致使系統性能忽然衰竭。
# coding=utf8
"""
Browser resource pool.

Browsers are started up front instead of when a task arrives, because opening
a new browser costs about 6 seconds. The pool is meant to survive kills and
OOMs: the browsers can be killed in bulk after the program has started and the
pool starts new ones.
"""
import time
import os
from pathlib import Path
from threading import Lock
from urllib.error import URLError

from selenium.webdriver import DesiredCapabilities
from selenium.common.exceptions import WebDriverException
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait

from app.utils_ydf import LoggerMixin, BoundedThreadPoolExecutor, decorators, LogManager

# Mobile user agent shared by both browser builders.
MOBILE_USER_AGENT = (
    "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/63.0.3239.132 Mobile Safari/537.36")


class NoAvailableDriverError(Exception):
    """Raised when every browser in the pool is currently in use."""


class DriverItem:
    """A pooled browser together with its bookkeeping state."""

    def __init__(self, driver, ):
        self.driver = driver
        self.create_time = time.time()  # used by the pool to recycle old browsers
        self.is_using = False  # True while the browser is lent out
        self.last_use_time = time.time()  # updated when the browser is given back

    def __str__(self):
        created = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.create_time))
        last_used = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.last_use_time))
        return f"{created} {self.is_using} {last_used} {self.driver}"


class PhantomjsItemBuilder(LoggerMixin):
    """Builds ``DriverItem`` objects backed by PhantomJS."""

    # noinspection PyBroadException
    def create_a_driver_item(self):
        """Start a PhantomJS browser and wrap it in a ``DriverItem``.

        On posix the executable is first looked up via the environment and,
        if that fails, at ``/usr/local/bin/phantomjs``.

        :return: a new, unused ``DriverItem``.
        :raises Exception: whatever the last launch attempt raised, when no
            browser could be started.
        """
        t0 = time.time()
        capabilities = DesiredCapabilities.PHANTOMJS.copy()
        capabilities['platform'] = "WINDOWS"
        capabilities['version'] = "10"
        capabilities['phantomjs.page.settings.loadImages'] = False
        capabilities['phantomjs.page.settings.userAgent'] = MOBILE_USER_AGENT
        service_args = ['--load-images=no', '--disk-cache=yes', '--ignore-ssl-errors=true']
        self.logger_with_file.info('建立一個driver中。。。。。。')
        if os.name == 'posix':
            try:
                driver = webdriver.PhantomJS(desired_capabilities=capabilities, service_args=service_args)
            except Exception as e:
                self.logger.exception(f'從環境變量獲取driver路徑失敗,改成從/usr/local/bin文件夾獲取 {e}')
                try:
                    driver = webdriver.PhantomJS(executable_path='/usr/local/bin/phantomjs',
                                                 desired_capabilities=capabilities,
                                                 service_args=service_args)
                except Exception as e:
                    self.logger.exception(f'從/usr/local/bin/phantomjs啓動失敗 {e}')
                    # Both attempts failed: surface the real launch error instead of
                    # failing later with an AttributeError on a None driver.
                    raise
        else:
            driver = webdriver.PhantomJS(desired_capabilities=capabilities, service_args=service_args)
        driver.set_window_size(390, 713)
        driver.set_page_load_timeout(10)
        self.logger.info(f'建立一個瀏覽器耗時{time.time() - t0}')
        return DriverItem(driver)


class ChromeItemBuilder(LoggerMixin):
    """Builds ``DriverItem`` objects backed by headless Chrome."""

    def create_a_driver_item(self):
        """Start a headless Chrome browser and wrap it in a ``DriverItem``.

        :return: a new, unused ``DriverItem``.
        """
        self.logger.info('建立一個driver中。。。。。。')
        t0 = time.time()
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--disable-images')
        # NOTE(review): hard-coded Windows install path; this builder cannot work elsewhere as is.
        chrome_options.binary_location = r'C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chrome.exe'
        prefs = {
            'profile.default_content_setting_values': {
                'images': 2,  # do not load images
                'javascript': 1,  # 2 would disable JavaScript
                # NOTE(review): a User-Agent entry inside content-setting prefs is probably
                # ignored by Chrome - confirm, and consider a --user-agent argument instead.
                "User-Agent": MOBILE_USER_AGENT,
            }
        }
        chrome_options.add_experimental_option("prefs", prefs)
        # Per the original author, only this flag actually disables images;
        # the two image settings above had no effect.
        chrome_options.add_argument('blink-settings=imagesEnabled=false')
        driver = webdriver.Chrome(chrome_options=chrome_options)
        driver.set_window_size(390, 713)
        driver.set_page_load_timeout(100)
        driver.implicitly_wait(100)
        self.logger.info(f'建立一個瀏覽器耗時{time.time() - t0}')
        return DriverItem(driver)


class DriverPool(LoggerMixin):
    """Pool of browsers that is created and initialised only once.

    ``__new__`` always returns the same instance; constructor arguments are
    only honoured by the first call, which runs ``__custom_init__``.
    """

    lock = Lock()  # serialises borrow_a_driver_item

    def __new__(cls, *args, **kwargs):
        if not hasattr(cls, '_instance'):
            self = super().__new__(cls, )
            cls._instance = self
            self.__custom_init__(*args, **kwargs)
        return cls._instance

    def __custom_init__(self, driver_item_num=10, driver_name=1):
        """One-time initialisation, called from ``__new__`` for the first instance only.

        :param driver_item_num: number of browsers kept in the pool.
        :param driver_name: browser kind; 1 selects PhantomJS, any other value selects Chrome.
        """
        self.driver_item_list = list()
        self._driver_item_num = driver_item_num
        self.driver_item_builder = PhantomjsItemBuilder() if driver_name == 1 else ChromeItemBuilder()
        self.logger_with_file.info(f'準備初始化{driver_item_num}個瀏覽器')
        self._has_init_all_driver_item = False
        self._init_time = 0
        self._init_all_driver_item()

    def _init_all_driver_item(self):
        """Kill leftover PhantomJS processes and (re)create every browser in the pool.

        Does nothing when the previous (re)initialisation started less than
        60 seconds ago, so repeated failure reports do not rebuild the pool
        over and over.
        """
        if time.time() - self._init_time > 60:
            self._init_time = time.time()
            # Killing phantomjs processes that are in use elsewhere is accepted here:
            # DriverContext rebuilds the pool when it hits a killed browser.
            self.logger.warning('殺死殘留的phantomjs進程')
            if os.name == 'posix':
                os.system('ps -aux|grep phantomjs|grep -v grep|cut -c 9-15|xargs kill -9')
            else:
                os.system('taskkill /F /im phantomjs.exe')
            t0 = time.time()
            self.driver_item_list.clear()  # the old items must be dropped first

            def _inner(this: DriverPool):
                driver_item = this.driver_item_builder.create_a_driver_item()
                this.driver_item_list.append(driver_item)

            # Per the original author's measurement, creating the browsers from several
            # threads is much faster than creating them one after another.
            thread_pool = BoundedThreadPoolExecutor(self._driver_item_num)
            for _ in range(self._driver_item_num):
                thread_pool.submit(_inner, self)
            # presumably blocks until all submitted creations finish - confirm for BoundedThreadPoolExecutor
            thread_pool.shutdown()
            self._has_init_all_driver_item = True
            self.logger.info(f'全部瀏覽器初始化建立成功,耗時 {time.time() - t0}秒 {len(self.driver_item_list)} {self.driver_item_list}')

    def borrow_a_driver_item(self):
        """Mark the first idle browser as in use and return it.

        An idle browser older than one hour is closed and replaced by a fresh
        one before being handed out.

        :return: the borrowed ``DriverItem``.
        :raises NoAvailableDriverError: when every browser is in use.
        """
        with self.lock:
            current_using_number = sum(1 for item in self.driver_item_list if item.is_using)
            current_not_using_number = len(self.driver_item_list) - current_using_number
            self.logger.debug(f'當前正在使用的瀏覽器數量是{current_using_number},閒置的瀏覽器數量是{current_not_using_number}')
            for index, driver_item in enumerate(self.driver_item_list):
                if driver_item.is_using:
                    continue
                if time.time() - driver_item.create_time > 3600:
                    self.logger.debug('防止phantomjs內存泄漏,關閉並從新建立一個瀏覽器')
                    try:
                        driver_item.driver.quit()
                    except Exception as e:
                        # The browser may already be dead (killed / OOM); that must not
                        # prevent it from being replaced.
                        self.logger.warning(f'failed to quit the old browser, replacing it anyway: {e}')
                    # Replace the slot only after the new browser exists, so a failed
                    # creation leaves the pool size unchanged and the next borrow retries.
                    driver_item = self.driver_item_builder.create_a_driver_item()
                    self.driver_item_list[index] = driver_item
                driver_item.is_using = True
                return driver_item
            raise NoAvailableDriverError('當前沒有可用的瀏覽器。。。。。。。。。。。。')

    @staticmethod
    def give_back_a_driver_item(driver_item: DriverItem):
        """Mark ``driver_item`` as idle again and record the time it was returned."""
        driver_item.is_using = False
        driver_item.last_use_time = time.time()


class DriverContext:
    """Context manager that borrows a browser from the pool and always gives it back.

    ``with DriverContext() as driver:`` yields the underlying selenium driver.
    """

    def __init__(self):
        self.driver_pool = DriverPool()
        self.driver_item = None
        self.start_using_time = time.time()

    def __enter__(self):
        self.driver_item = self.driver_pool.borrow_a_driver_item()
        # Measure occupancy from the moment the browser is actually borrowed.
        self.start_using_time = time.time()
        self.driver_pool.logger_with_file.debug(f'當前使用的瀏覽器是 {self.driver_item}')
        return self.driver_item.driver

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.driver_pool.logger.info(f'此瀏覽器 {self.driver_item} 佔用時間爲 {time.time() - self.start_using_time}秒')
        self.driver_pool.give_back_a_driver_item(self.driver_item)
        if exc_type is URLError:
            # Per the original author, using a phantomjs that was killed manually or by
            # the OOM killer raises URLError; rebuild the whole pool in that case.
            self.driver_pool._init_all_driver_item()
        if exc_type and issubclass(exc_type, WebDriverException):
            self.driver_pool.logger.error(f'selenium發生錯誤 ,錯誤類型--> {exc_type} 錯誤緣由--> {exc_val}')
        # Nothing truthy is returned, so the exception still propagates to the caller.


if __name__ == '__main__':
    logger = LogManager('driver_pool_test').get_logger_and_add_handlers()
    DriverPool(50)
    Path('/picture').mkdir(exist_ok=True)


    @decorators.tomorrow_threads(40)
    def f():
        # Browsers should be used through the with statement; otherwise the caller has
        # to maintain the in-use state and handle failures by hand.
        with DriverContext() as driver:
            logger.debug(f'使用的瀏覽器是--> {driver}')
            driver.get('http://m.elong.com/ihotel/283904/?inDate=2018-12-12&outDate=2018-12-13&roomPerson=1|2')
            driver.save_screenshot(f'/picture/{time.time()}.png')
            WebDriverWait(driver, 10, 0.2).until(
                lambda driverx: driverx.find_element_by_css_selector(
                    '#detail-mapping-box > li:nth-child(1) > div.prodjh_list_box.clearfix > div.detail-mrooom-mapping-product > div.dprodtname'))
            logger.info(f'頁面內容長度是: {len(driver.page_source)}')
            driver.save_screenshot(f'/picture/{time.time()}.png')


    for _ in range(50000):
        time.sleep(0.1)
        f()
使用如圖,因爲不須要對每次請求都頻繁建立和摧毀瀏覽器,因此打開網頁速度很快。