import time

import requests
from lxml import etree
from selenium import webdriver


class DiffSpider:
    """Load a page in a Selenium-driven Chrome and save the rendered HTML.

    The page source reported by the browser is printed to stdout and written
    to ``chrome_sourse.html`` in the current working directory.
    """

    def __init__(self):
        """Start a maximized Chrome window and set the target URL."""
        self.baseurl = 'https://www.nst.com.my/actionline'
        # Alternative target kept from earlier experiments:
        # self.baseurl = 'https://v.youku.com/v_show/id_XNDE4MzQzOTA2NA==.html'
        self.options = webdriver.ChromeOptions()
        # Optional Chrome switches, currently disabled:
        # self.options.add_argument('--headless')
        # self.options.add_experimental_option('excludeSwitches', ['enable-automation'])
        self.driver = webdriver.Chrome(options=self.options)
        self.driver.maximize_window()
        # NOTE: these headers are stored but not referenced anywhere else in
        # this script; the browser sends its own request headers.
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }

    def gethtml(self):
        """Open ``self.baseurl``, then print and save the browser's page source."""
        self.driver.get(self.baseurl)
        # Fixed pause before the snapshot -- presumably to let page scripts
        # finish running; an explicit wait would be more reliable.
        time.sleep(2)
        chrome_html = self.driver.page_source
        print(chrome_html)
        self.writeto(chrome_html)

    def writeto(self, chrome_html):
        """Write *chrome_html* to ``chrome_sourse.html`` as UTF-8, overwriting it."""
        with open('chrome_sourse.html', 'w', encoding='utf-8') as f:
            f.write(chrome_html)


if __name__ == '__main__':
    spider = DiffSpider()
    try:
        spider.gethtml()
    finally:
        # Always end the WebDriver session so the Chrome and chromedriver
        # processes do not outlive the script, even if the fetch fails.
        spider.driver.quit()
攔截 HTML（mitmproxy 腳本）
from mitmproxy import ctx

# Ordered (search, replacement) pairs applied to the intercepted page body.
# The search strings are obfuscated JavaScript expressions found in the target
# page; each is swapped for a fixed literal. What each expression evaluates to
# in the page is not visible from this script -- presumably browser
# fingerprint lookups; verify against the page's own source.
_REPLACEMENTS = (
    ('k60.L70(+"49")', 'false'),
    ('k60.L70(+"232")', 'user-agent'),
    ('k60.X70(+"489")', '{ "runtime": {}}'),
    ('k60.X70("235" | 0)', '["zh-CN", "zh"]'),
    ('k60.L70(28)', 'false'),
    ('k60.L70("462" - 0)', 'Google Inc'),
    ('U1[k60.L70("303" | 0)]', '100'),
    ('U1[k60.X70("7" * 1)]', '40'),
)

# Disabled alternative kept for reference: blank out well-known webdriver
# marker names instead of patching the obfuscated expressions.
# for webdriver_key in ['webdriver', '__driver_evaluate', '__webdriver_evaluate', '__selenium_evaluate',
#                       '__fxdriver_evaluate', '__driver_unwrapped', '__webdriver_unwrapped', '__selenium_unwrapped',
#                       '__fxdriver_unwrapped', '_Selenium_IDE_Recorder', '_selenium', 'calledSelenium',
#                       '_WEBDRIVER_ELEM_CACHE', 'ChromeDriverw', 'driver-evaluate', 'webdriver-evaluate',
#                       'selenium-evaluate', 'webdriverCommand', 'webdriver-evaluate-response', '__webdriverFunc',
#                       '__webdriver_script_fn', '__$webdriverAsyncExecutor', '__lastWatirAlert',
#                       '__lastWatirConfirm', '__lastWatirPrompt', '$chrome_asyncScriptInfo',
#                       '$cdc_asdjflasutopfhvcZLmcfl_']:
#     ctx.log.info('Remove "{}" from {}.'.format(webdriver_key, flow.request.url))
#     flow.response.text = flow.response.text.replace('"{}"'.format(webdriver_key), '"NO-SUCH-ATTR"')


def response(flow):
    """Rewrite obfuscated expressions in responses for the target page.

    Responses whose request URL does not contain
    ``/www.nst.com.my/actionline`` are left untouched, as are responses
    that carry no body text.

    Args:
        flow: The HTTP flow whose ``response.text`` is rewritten in place.
    """
    if '/www.nst.com.my/actionline' not in flow.request.url:
        return

    body = flow.response.text
    if body is None:
        # No content to patch; calling .replace on None would raise.
        return

    # Read the body once, apply every substitution in order, and assign the
    # result once, rather than going through the text attribute per pattern.
    for needle, substitute in _REPLACEMENTS:
        body = body.replace(needle, substitute)
    flow.response.text = body


# Original article: https://blog.csdn.net/qq_24137739/article/details/93631569