1. Handling dynamically loaded data
When a page fills in its data with JavaScript after loading, the HTML returned by a plain request does not contain that data. A browser-automation tool such as selenium drives a real (or headless) browser, so the fully rendered page can be read back and parsed.
2. selenium
from selenium import webdriver
from time import sleep

# The argument is the path to your browser driver; the r'' prefix keeps
# backslashes in the path from being treated as escape sequences.
driver = webdriver.Chrome(r'path to your chromedriver')

# Open the Baidu home page
driver.get("http://www.baidu.com")

# Find the 設置 (settings) link on the page and click it
driver.find_elements_by_link_text('設置')[0].click()
sleep(2)

# In the settings menu, open 搜索設置 (search settings)
driver.find_elements_by_link_text('搜索設置')[0].click()
sleep(2)

# Set the number of results per page to 50
m = driver.find_element_by_id('nr')
sleep(2)
# These two XPath expressions are equivalent ways of selecting the third option
m.find_element_by_xpath('//*[@id="nr"]/option[3]').click()
m.find_element_by_xpath('.//option[3]').click()
sleep(2)

# Click the save button
driver.find_elements_by_class_name("prefpanelgo")[0].click()
sleep(2)

# Handle the confirmation alert: accept() confirms, dismiss() cancels
driver.switch_to.alert.accept()
sleep(2)

# Find Baidu's search box and type the keyword 美女
driver.find_element_by_id('kw').send_keys('美女')
sleep(2)

# Click the search button
driver.find_element_by_id('su').click()
sleep(2)

# In the result page, find the link 美女_百度圖片 and open it
driver.find_elements_by_link_text('美女_百度圖片')[0].click()
sleep(3)

# Close the browser
driver.quit()
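The fixed sleep() calls above are the simplest way to wait for the page, but they either waste time or break when the page is slow. A minimal sketch of the same idea with explicit waits (not part of the original code; the driver path is still a placeholder):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(r'path to your chromedriver')  # placeholder path
driver.get("http://www.baidu.com")

# Wait up to 10 seconds for the search box to appear instead of sleeping blindly
wait = WebDriverWait(driver, 10)
kw = wait.until(EC.presence_of_element_located((By.ID, "kw")))
kw.send_keys("美女")

# Wait until the search button is clickable, then click it
wait.until(EC.element_to_be_clickable((By.ID, "su"))).click()
driver.quit()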
Code walkthrough: the script opens Baidu, clicks 設置 and then 搜索設置, switches the results-per-page option to 50, accepts the confirmation alert, searches for 美女, and finally opens the 美女_百度圖片 link on the result page; the sleep() calls simply give each step time to finish loading.
3. phantomJS
from selenium import webdriver
import time

# Instantiate a PhantomJS browser object, pointing at the PhantomJS executable
bro = webdriver.PhantomJS(executable_path=r"D:\PhantomJS\phantomjs-2.1.1-windows\bin\phantomjs.exe")

# Send the request
bro.get(url="https://www.baidu.com")

# Take a screenshot
bro.save_screenshot("./1.jpg")

# Use the find_* family of methods to locate the search box
my_input = bro.find_element_by_id("kw")
# Type the keyword into the search box
my_input.send_keys("美女")

# Locate the 百度一下 (search) button and click it
my_button = bro.find_element_by_id("su")
my_button.click()

# Get the page source currently rendered by the browser
page_text = bro.page_source
bro.save_screenshot("./2.png")  # another screenshot
print(page_text)
bro.quit()
The same flow with a regular Chrome browser, driven through chromedriver:
from selenium import webdriver
import time

# Instantiate a Chrome browser object (i.e. "pretend" to be a Chrome browser)
bro = webdriver.Chrome(executable_path=r"D:\chrome\chromedriver.exe")

# Send the request
bro.get(url="https://www.baidu.com")
time.sleep(3)

# Use the find_* family of methods to locate the search box
my_input = bro.find_element_by_id("kw")
# Type the keyword into the search box
my_input.send_keys("美女")
time.sleep(3)

# Locate the 百度一下 (search) button
my_button = bro.find_element_by_id("su")
# Click to search
my_button.click()
time.sleep(3)

# Get the page source currently rendered by the browser
page_text = bro.page_source
print(page_text)

# Quit the browser
bro.quit()
Code for logging in to QQ空間 (Qzone):
from selenium import webdriver
from lxml import etree
import time
bro = webdriver.Chrome(executable_path=r"D:\chrome\chromedriver.exe")
url = "https://qzone.qq.com/"

# Send the request
bro.get(url=url)
time.sleep(1)

# Switch into the login iframe
bro.switch_to.frame("login_frame")

# Click the tab that switches to account/password login
bro.find_element_by_id("switcher_plogin").click()
time.sleep(1)

# Find the username input and type the account number
username = bro.find_element_by_id("u")
username.send_keys("937371049")

# Find the password input and type the password
password = bro.find_element_by_id("p")
password.send_keys("13633233754")

# Find the login button and click it
bro.find_element_by_id("login_button").click()
time.sleep(1)

# JavaScript that scrolls the window to the bottom of the page
js = "window.scrollTo(0, document.body.scrollHeight)"

# Scroll several times so more feed entries are loaded
bro.execute_script(js)
time.sleep(2)
bro.execute_script(js)
time.sleep(2)
bro.execute_script(js)
time.sleep(2)
bro.execute_script(js)
time.sleep(2)

page_text = bro.page_source
time.sleep(3)

# Parse: convert the page source into an lxml HTML tree
tree = etree.HTML(page_text)
div_list = tree.xpath('//div[@class="f-info qz_info_cut"] | //div[@class="f-info"]')
for div in div_list:
    text = div.xpath('.//text()')
    text = "".join(text)
    print(text)

bro.quit()
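The four repeated execute_script/sleep pairs above can be collapsed into a loop. A minimal sketch that reuses the bro driver from the block above (the scroll count and delay are arbitrary):

import time

js = "window.scrollTo(0, document.body.scrollHeight)"
for _ in range(4):
    # Scroll to the bottom repeatedly so Qzone lazily loads more feed entries
    bro.execute_script(js)
    time.sleep(2)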
4. Headless Chrome
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")

# Instantiate a headless Chrome browser
bro = webdriver.Chrome(executable_path=r"D:\chrome\chromedriver.exe", chrome_options=chrome_options)

# Send the request
bro.get(url="https://www.baidu.com")

# Use the find_* family of methods to locate the search box
my_input = bro.find_element_by_id("kw")
# Type the keyword into the search box
my_input.send_keys("美女")

# Locate the 百度一下 (search) button and click it
my_button = bro.find_element_by_id("su")
my_button.click()

# Get the page source currently rendered by the browser
page_text = bro.page_source
print(page_text)
bro.quit()
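A minimal sketch of the same headless setup for newer Selenium releases (4.x), where executable_path/chrome_options were replaced by service/options and the find_element_by_* helpers by find_element(By, ...); the chromedriver path is the same placeholder as above:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")      # run Chrome without a visible window
options.add_argument("--disable-gpu")

bro = webdriver.Chrome(service=Service(r"D:\chrome\chromedriver.exe"), options=options)
bro.get("https://www.baidu.com")

bro.find_element(By.ID, "kw").send_keys("美女")
bro.find_element(By.ID, "su").click()
print(bro.page_source)
bro.quit()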
5. UA pool and proxy pool
UA pool: a pool of User-Agent strings
The downloader middleware below holds a list of User-Agent strings and two lists of proxies (one for http, one for https). For every outgoing request it picks a random proxy that matches the request's scheme and a random User-Agent; enabling the middleware in settings.py is shown after the code.
from scrapy import signals
import random


class CrawlproSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class CrawlproDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    # Proxy pool for plain-HTTP requests
    proxy_http = [
        "http://113.128.10.121",
        "http://49.86.181.235",
        "http://121.225.52.143",
        "http://180.118.134.29",
        "http://111.177.186.27",
        "http://175.155.77.189",
        "http://110.52.235.120",
        "http://113.128.24.189",
    ]

    # Proxy pool for HTTPS requests
    proxy_https = [
        "https://93.190.143.59",
        "https://106.104.168.15",
        "https://167.249.181.237",
        "https://124.250.70.76",
        "https://119.101.115.2",
        "https://58.55.133.48",
        "https://49.86.177.193",
        "https://58.55.132.231",
        "https://58.55.133.77",
        "https://119.101.117.189",
        "https://27.54.248.42",
        "https://221.239.86.26",
    ]

    # UA pool: a random entry is attached to every intercepted request
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # Intercept requests: the request argument is the intercepted request
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        print("Downloader middleware intercepted request:", request)

        # Pick a proxy from the pool that matches the request's scheme
        if request.url.split(":")[0] == "http":
            request.meta["proxy"] = random.choice(self.proxy_http)
        else:
            request.meta["proxy"] = random.choice(self.proxy_https)

        # Attach a random User-Agent from the UA pool
        request.headers["User-Agent"] = random.choice(self.user_agent_list)
        print(request.meta["proxy"], request.headers["User-Agent"])
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
Proxy pool
The proxy pool works the same way: the proxy_http and proxy_https lists in the middleware above are the pool, and process_request() attaches a random entry to request.meta["proxy"] according to the scheme of each intercepted request.
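Free proxies like the ones listed above die quickly, so it can help to filter the pool before handing it to the middleware. A minimal sketch, not from the original post, using the requests library with an arbitrary test URL and timeout:

import requests

def filter_alive(proxies, test_url="http://www.baidu.com", timeout=5):
    # Keep only the proxies that can answer a test request within the timeout
    alive = []
    for p in proxies:
        try:
            requests.get(test_url, proxies={"http": p, "https": p}, timeout=timeout)
            alive.append(p)
        except requests.RequestException:
            pass
    return alive

# Example: prune the HTTP proxy pool defined in the middleware above
# print(filter_alive(CrawlproDownloaderMiddleware.proxy_http))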