for loop and multithreading + Selenium
Example 1
for loop
```python
# -*- coding: utf-8 -*-
"""
Datetime: 2019/6/22
Author: Zhang Yafei
Description:
"""
import functools
import time
from concurrent.futures import ThreadPoolExecutor

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument('--disable-gpu')


def timeit(func):
    """Decorator: report a function's execution time."""
    @functools.wraps(func)
    def inner(*args, **kwargs):
        start = time.time()
        ret = func(*args, **kwargs)
        end = time.time() - start
        if end < 60:
            print(f'Time taken:\t{round(end, 2)} s')
        else:
            minutes, seconds = divmod(end, 60)
            print(f'Time taken:\t{round(minutes)} min\t{round(seconds, 2)} s')
        return ret
    return inner


class PolicyUrlDownload(object):
    """Download policy article URLs from the listing pages."""

    def __init__(self, url, pages_num, output_file, a_xpath, headless: bool = True):
        self.url_list = [url.format(page) for page in range(1, pages_num + 1)]
        self.output_file = output_file
        self.a_xpath = a_xpath
        if headless:
            self.driver = webdriver.Chrome(options=chrome_options)
        else:
            self.driver = webdriver.Chrome()

    def start(self, page, url):
        with open(self.output_file, mode='a', encoding='utf-8') as file:
            print(f"make request to {url}")
            self.driver.get(url)
            # Selenium 3 API; on Selenium 4 use find_elements(By.XPATH, ...)
            titles = self.driver.find_elements_by_xpath(self.a_xpath)
            for title in titles:
                href = title.get_attribute('href')
                file.write(f'{page}\t{href}\n')
            print(f'{url} download completed')

    def run(self):
        for page, url in enumerate(self.url_list):
            self.start(page + 1, url)
        self.driver.close()


@timeit
def main(setting):
    policy_data = PolicyUrlDownload(**setting)
    policy_data.run()


if __name__ == '__main__':
    start_time = time.time()
    print('######################## Start download #########################')
    # Download the URL lists for each page configuration
    settings = [
        {
            'output_file': '藥品供應保障綜合的管理.txt',
            'url': 'http://cdsip.nhfpc.gov.cn/work/0-{}.html',
            'pages_num': 8,
            'a_xpath': '//div[@id="active0"]/ul/li/a'
        },
        {
            'output_file': '藥品供應保障綜合的管理.txt',
            'url': 'http://cdsip.nhfpc.gov.cn/policy/0-{}-0.html',
            'pages_num': 9,
            'a_xpath': '//div[@class="infoContent box-body"]/ul/li/a'
        }
    ]
    for setting in settings:
        main(setting)
    print('Download finished, total time', round(time.time() - start_time, 2), 's')
```
Result
Download finished, total time 28.46 s
Multithreading
```python
# -*- coding: utf-8 -*-
"""
Datetime: 2019/6/22
Author: Zhang Yafei
Description:
"""
import functools
import time
from concurrent.futures import ThreadPoolExecutor

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument('--disable-gpu')


def timeit(func):
    """Decorator: report a function's execution time."""
    @functools.wraps(func)
    def inner(*args, **kwargs):
        start = time.time()
        ret = func(*args, **kwargs)
        end = time.time() - start
        if end < 60:
            print(f'Time taken:\t{round(end, 2)} s')
        else:
            minutes, seconds = divmod(end, 60)
            print(f'Time taken:\t{round(minutes)} min\t{round(seconds, 2)} s')
        return ret
    return inner


class PolicyUrlDownload(object):
    """Download policy article URLs from the listing pages."""

    def __init__(self, url, pages_num, output_file, a_xpath, headless: bool = True):
        self.url_list = [url.format(page) for page in range(1, pages_num + 1)]
        self.output_file = output_file
        self.a_xpath = a_xpath
        if headless:
            self.driver = webdriver.Chrome(options=chrome_options)
        else:
            self.driver = webdriver.Chrome()

    def start(self, page, url):
        with open(self.output_file, mode='a', encoding='utf-8') as file:
            print(f"make request to {url}")
            self.driver.get(url)
            # Selenium 3 API; on Selenium 4 use find_elements(By.XPATH, ...)
            titles = self.driver.find_elements_by_xpath(self.a_xpath)
            for title in titles:
                href = title.get_attribute('href')
                file.write(f'{page}\t{href}\n')
            print(f'{url} download completed')

    def run(self):
        for page, url in enumerate(self.url_list):
            self.start(page + 1, url)
        self.driver.close()


@timeit
def main(setting):
    policy_data = PolicyUrlDownload(**setting)
    policy_data.run()


if __name__ == '__main__':
    start_time = time.time()
    print('######################## Start download #########################')
    # Download the URL lists for each page configuration
    settings = [
        {
            'output_file': '藥品供應保障綜合的管理.txt',
            'url': 'http://cdsip.nhfpc.gov.cn/work/0-{}.html',
            'pages_num': 8,
            'a_xpath': '//div[@id="active0"]/ul/li/a'
        },
        {
            'output_file': '藥品供應保障綜合的管理.txt',
            'url': 'http://cdsip.nhfpc.gov.cn/policy/0-{}-0.html',
            'pages_num': 9,
            'a_xpath': '//div[@class="infoContent box-body"]/ul/li/a'
        }
    ]
    # Each settings dict is handled by its own thread, and each thread builds its own driver
    with ThreadPoolExecutor() as pool:
        pool.map(main, settings)
    print('Download finished, total time', round(time.time() - start_time, 2), 's')
```
Result
Time taken: 18.04 s
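One caveat with this `pool.map` call: the returned iterator is never consumed, so any exception raised inside `main` (for example a missing chromedriver) is stored in its future and never reported. A minimal sketch of the same dispatch using `submit`/`as_completed` so worker errors are surfaced; it assumes the `main` function and `settings` list from the script above, and the variable names here are illustrative:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

with ThreadPoolExecutor() as pool:
    # one future per settings dict; each call to main() still builds its own driver
    futures = {pool.submit(main, setting): setting['url'] for setting in settings}
    for future in as_completed(futures):
        try:
            future.result()  # re-raises any exception that occurred in the worker thread
        except Exception as exc:
            print(f'{futures[future]} failed: {exc}')
```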
Example 2
Sequential execution
```python
# -*- coding: utf-8 -*-
import os
import time
from concurrent.futures import ThreadPoolExecutor
from hashlib import md5

import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service


class PolicyPageDownload(object):
    """Download the full HTML of each policy page."""

    def __init__(self, file, dir_name, url_list):
        self.file = file
        self.dir_name = dir_name
        self.urls = url_list
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=self.chrome_options)
        # self.driver = webdriver.Chrome()

    def start(self, url):
        """Download a single page and save it as <md5(url)>.html."""
        self.driver.get(url)
        response = self.driver.page_source
        print(f'make request to {url}')
        file_name = md5(bytes(url, encoding='utf-8')).hexdigest() + '.html'
        with open(f'{self.dir_name}/{file_name}', 'w', encoding='utf-8') as file:
            file.write(response)
        print(f'{url} download completed')

    def run(self):
        """Entry point: download every URL, then quit the driver."""
        for url in self.urls:
            self.start(url)
        self.driver.quit()


def filter_urls(dir_name, urls):
    """Filter out URLs that already have a non-empty downloaded file."""
    encode_urls = [md5(bytes(url, encoding='utf-8')).hexdigest() + '.html' for url in urls]
    has_file = [file for file in os.listdir(dir_name) if os.path.getsize(os.path.join(dir_name, file)) > 0]
    encode_urls = set(encode_urls) - set(has_file)
    down_urls = list(
        filter(lambda url: md5(bytes(url, encoding='utf-8')).hexdigest() + '.html' in encode_urls, urls))
    print(f'total {len(set(urls))}\tdownloaded {len(set(has_file))}\tremaining {len(encode_urls)}')
    return down_urls


def run(url_list):
    # Relies on the module-level `setting` dict defined under __main__
    policy = PolicyPageDownload(url_list=url_list, **setting)
    policy.run()


def main(file, dir_name):
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    with open(file, 'r', encoding='utf-8') as inputfile:
        urls = [line.strip().split('\t')[1] for line in inputfile]
    if os.path.exists(dir_name):
        urls = filter_urls(dir_name, urls)
    run(urls)


if __name__ == '__main__':
    start_time = time.time()
    setting = {
        'file': '藥品供應保障綜合的管理.txt',
        'dir_name': '藥品供應保障綜合的管理'
    }
    main(**setting)
    print('Download finished, total time', round(time.time() - start_time, 2), 's')
```
Multithreading
```python
# -*- coding: utf-8 -*-
import os
import time
from concurrent.futures import ThreadPoolExecutor
from hashlib import md5

import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service


class PolicyPageDownload(object):
    """Download the full HTML of each policy page."""

    def __init__(self, file, dir_name, url_list):
        self.file = file
        self.dir_name = dir_name
        self.urls = url_list
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=self.chrome_options)
        # self.driver = webdriver.Chrome()

    def start(self, url):
        """Download a single page and save it as <md5(url)>.html."""
        self.driver.get(url)
        response = self.driver.page_source
        print(f'make request to {url}')
        file_name = md5(bytes(url, encoding='utf-8')).hexdigest() + '.html'
        with open(f'{self.dir_name}/{file_name}', 'w', encoding='utf-8') as file:
            file.write(response)
        print(f'{url} download completed')

    def run(self):
        """Entry point: download every URL, then quit the driver."""
        for url in self.urls:
            self.start(url)
        self.driver.quit()


def filter_urls(dir_name, urls):
    """Filter out URLs that already have a non-empty downloaded file."""
    encode_urls = [md5(bytes(url, encoding='utf-8')).hexdigest() + '.html' for url in urls]
    has_file = [file for file in os.listdir(dir_name) if os.path.getsize(os.path.join(dir_name, file)) > 0]
    encode_urls = set(encode_urls) - set(has_file)
    down_urls = list(
        filter(lambda url: md5(bytes(url, encoding='utf-8')).hexdigest() + '.html' in encode_urls, urls))
    print(f'total {len(set(urls))}\tdownloaded {len(set(has_file))}\tremaining {len(encode_urls)}')
    return down_urls


def run(url_list):
    # Relies on the module-level `setting` dict defined under __main__
    policy = PolicyPageDownload(url_list=url_list, **setting)
    policy.run()


def main(file, dir_name):
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    with open(file, 'r', encoding='utf-8') as inputfile:
        urls = [line.strip().split('\t')[1] for line in inputfile]
    if os.path.exists(dir_name):
        urls = filter_urls(dir_name, urls)
    # Split the URL list into 4 chunks; each chunk gets its own driver inside run()
    with ThreadPoolExecutor() as pool:
        pool.map(run, np.array_split(urls, 4))


if __name__ == '__main__':
    start_time = time.time()
    setting = {
        'file': '藥品供應保障綜合的管理.txt',
        'dir_name': '藥品供應保障綜合的管理'
    }
    main(**setting)
    print('Download finished, total time', round(time.time() - start_time, 2), 's')
```
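The threaded variant hands `np.array_split(urls, 4)` to the pool, so the URL list is cut into four roughly equal chunks and each chunk is processed by its own `PolicyPageDownload` instance, and therefore its own Chrome driver. If you would rather not depend on NumPy just for the split, the `with ThreadPoolExecutor()` block inside `main` could use a plain-Python chunking helper instead; `chunked` below is a hypothetical stand-in, not part of the original script:

```python
def chunked(items, n):
    """Split items into n roughly equal lists (pure-Python stand-in for np.array_split)."""
    size, extra = divmod(len(items), n)
    chunks, start = [], 0
    for i in range(n):
        end = start + size + (1 if i < extra else 0)
        chunks.append(items[start:end])
        start = end
    return chunks


# drop-in replacement for the block inside main()
with ThreadPoolExecutor() as pool:
    pool.map(run, chunked(urls, 4))
```

Either way, the number of chunks decides how many Chrome instances run at once, which ties in with the conclusion below.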
Run results
```
# 50,  for loop:       download finished, total time 48.62 s
# 150, for loop:       total time 150.22 s
# 150, multithreading: total time 80.84 s
```
- Conclusion: creating a WebDriver instance is expensive, so create it once and reuse it as much as possible; threads running concurrently cannot share a single driver, so each one must create its own.
- Usage tip: create multiple threads, ideally no more than the number of CPU cores, and give each thread its own driver (see the sketch below).
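One common way to follow that tip is to keep one driver per worker thread with `threading.local()`: each thread creates its driver lazily on first use and then reuses it for every URL it handles. This is a minimal sketch of that pattern, not code from the scripts above; the `fetch` helper and the example URLs are illustrative:

```python
import os
import threading
from concurrent.futures import ThreadPoolExecutor

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

thread_local = threading.local()


def get_driver():
    """Return this thread's driver, creating it on first call (one driver per thread)."""
    if not hasattr(thread_local, 'driver'):
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        thread_local.driver = webdriver.Chrome(options=options)
    return thread_local.driver


def fetch(url):
    driver = get_driver()  # reused for every URL this thread processes
    driver.get(url)
    return url, len(driver.page_source)


if __name__ == '__main__':
    urls = ['http://cdsip.nhfpc.gov.cn/work/0-{}.html'.format(page) for page in range(1, 9)]
    # Roughly one thread per CPU core, as suggested above
    with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
        for url, size in pool.map(fetch, urls):
            print(url, size)
    # Note: the per-thread drivers are not quit in this sketch; in real code,
    # track them (e.g. in a lock-guarded list) and call quit() on each when done.
```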