"""Selenium-driven login and job-listing crawler for lagou.com."""
import json
import os
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver import ActionChains

LG_URL_Login = "https://passport.lagou.com/login/login.html"
cookies_path = "./cookies.json"


class MyException(Exception):
    """Application error carrying a status code and a message."""

    def __init__(self, status, msg):
        # Initialise the base class so str()/repr()/args behave normally.
        super().__init__(status, msg)
        self.status = status
        self.msg = msg


class LaGou:
    """Drive a Chrome session to log in to lagou.com and page through job results."""

    def __init__(self):
        self.login_status = False
        # Stays None if the driver fails to start, so __del__ can tell.
        self.browser = None
        self.__init_browser()

    def __init_browser(self):
        """Start Chrome, configure waits and open the login page."""
        options = Options()
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # NOTE(review): value 2 is presumably "block images" to speed up loads.
        options.add_experimental_option(
            "prefs", {"profile.managed_default_content_settings.images": 2})
        self.browser = webdriver.Chrome(options=options)
        self.browser.maximize_window()
        self.browser.implicitly_wait(3)
        self.wait = WebDriverWait(self.browser, 10)
        self.ac = ActionChains(self.browser)
        self.browser.get(LG_URL_Login)

    def __choose_login_mode(self):
        """Select the username/password login tab.

        It is the default tab, but it is clicked explicitly to be sure.
        """
        locator = (By.XPATH, "//*[@data-lg-tj-id='1Us0']")
        self.wait.until(EC.presence_of_element_located(locator))
        self.browser.find_element(*locator).click()

    def __input_user_pwd(self, username, password):
        """Fill in the account and password fields, prompting for missing values."""
        # NOTE(review): the placeholder texts must match the live page exactly;
        # verify them against the site before relying on these locators.
        pwd_locator = (By.XPATH, '//input[@placeholder="請輸入密碼"]')
        user_locator = (By.XPATH, '//input[@placeholder="請輸入經常使用手機號/郵箱"]')
        self.wait.until(EC.presence_of_element_located(pwd_locator))
        self.wait.until(EC.presence_of_element_located(user_locator))
        if not username:
            username = input("請輸入經常使用手機號/郵箱>>:").strip()
        if not password:
            password = input("請輸入密碼>>:").strip()
        # Look the fields up only after the (blocking) prompts have returned.
        phone_ele = self.browser.find_element(*user_locator)
        pwd_ele = self.browser.find_element(*pwd_locator)
        # Enter the account.
        phone_ele.clear()
        phone_ele.send_keys(username)
        # Enter the password.
        pwd_ele.clear()
        pwd_ele.send_keys(password)

    def __chick_submit(self):
        """Click the login submit button."""
        locator = (By.XPATH, "//*[@data-lg-tj-id='1j90']")
        self.wait.until(EC.presence_of_element_located(locator))
        self.browser.find_element(*locator).click()

    def __judge_login_successful(self):
        """Return True when the logged-in user dropdown is present on the page."""
        return self.__judge_ele_exist_by_xpath("//*[@class='user_dropdown']")

    def __pull_down_page(self):
        """Scroll once to the bottom of the page and return the page source.

        A single scroll is used because, per the original author, the result
        page is not lazy-loaded.
        """
        height = self.browser.execute_script("return document.body.scrollHeight;")
        js = "window.scrollTo(0, {});".format(height)
        self.browser.execute_script(js)
        return self.browser.page_source

    def __judge_ele_exist_by_xpath(self, xpath):
        """Return True if an element matching *xpath* exists, else False."""
        try:
            self.browser.find_element(By.XPATH, xpath)
        except NoSuchElementException:
            return False
        return True

    def __click_next_page(self):
        """Click the "next page" pager button."""
        locator = (By.XPATH, "//span[@class='pager_next ']")
        self.wait.until(EC.presence_of_element_located(locator))
        self.browser.find_element(*locator).click()

    def __search_job(self, job_name):
        """Type *job_name* into the search box and submit it."""
        # When not logged in, a city-selection popup appears first; choose the
        # nationwide site. When logged in the popup is absent, which is fine.
        try:
            self.browser.find_element(By.LINK_TEXT, "全國站").click()
        except NoSuchElementException:
            pass
        # The search box id differs between pages; try both.
        try:
            search_ele = self.browser.find_element(By.XPATH, "//*[@id='search_input']")
        except NoSuchElementException:
            search_ele = self.browser.find_element(By.XPATH, "//*[@id='keyword']")
        search_ele.click()
        search_ele.clear()
        search_ele.send_keys(job_name, Keys.ENTER)

    def __del__(self):
        """Shut the browser down when the object is garbage-collected."""
        # The attribute may be missing or None if start-up failed.
        browser = getattr(self, "browser", None)
        if browser is not None:
            # quit() ends the whole driver session; close() would only close
            # the current window and leave the driver process running.
            browser.quit()

    def login(self, username: str = None, password: str = None, load_cookies: bool = True):
        """Log in, first via saved cookies (optional), then via the login form.

        Args:
            username: Account (phone/e-mail); prompted for on stdin when empty.
            password: Password; prompted for on stdin when empty.
            load_cookies: Try the cookies stored at ``cookies_path`` first.

        Returns:
            True when the logged-in marker is found on the page, else False.
        """
        if load_cookies and os.path.exists(cookies_path):
            # Reuse the cookies saved by a previous successful login.
            with open(cookies_path, "r", encoding="utf-8") as f:
                cookies = json.load(f)
            for cookie in cookies:
                self.browser.add_cookie(cookie)
            # Re-open the login page: per the original author, valid cookies
            # redirect to the home page, stale ones leave us on the login page.
            self.browser.get(LG_URL_Login)
            if self.__judge_login_successful():
                self.login_status = True
                print("登陸成功....")
                return True
            print("cookies已經失效....")
            # Drop the stale cookies that were just added.
            self.browser.delete_all_cookies()
            self.browser.refresh()

        self.__choose_login_mode()
        self.__input_user_pwd(username, password)
        self.__chick_submit()

        # A Geetest slider captcha may pop up. It is not solved; instead the
        # panel is closed and the form resubmitted until an attempt goes
        # through without a captcha.
        # NOTE(review): this loop has no upper bound on retries.
        while True:
            time.sleep(1)
            if not self.__judge_ele_exist_by_xpath(
                    "//div[@class='geetest_panel_box geetest_panelshowslide']"):
                break
            self.browser.find_element(By.XPATH, "//a[@class='geetest_close']").click()
            time.sleep(1)
            self.browser.find_element(By.XPATH, "//*[@data-lg-tj-id='1j90']").click()

        if not self.__judge_login_successful():
            self.login_status = False
            print("登陸失敗,請檢查你的用戶名或密碼")
            return False

        self.login_status = True
        # Persist the cookies for the next run.
        with open(cookies_path, "w", encoding="utf-8") as f:
            json.dump(self.browser.get_cookies(), f)
        print("登陸成功")
        return True

    def get_job_info(self, job_name: str = None, is_filter: bool = False):
        """Search for *job_name* and walk through every result page.

        Args:
            job_name: Search keyword; prompted for on stdin when empty.
            is_filter: Reserved for result filtering; not implemented yet.
        """
        if not self.login_status:
            self.browser.get("https://www.lagou.com/")
        if not job_name:
            job_name = input("請輸入查詢job的名稱>>:").strip()
        self.__search_job(job_name)
        if is_filter:
            # Filtering is not implemented yet.
            pass

        # Page through the results.
        page = 1
        while True:
            print("爬取工做職位爲>>{} 第{}頁數據".format(job_name, page))
            # Fetch the fully scrolled page source; extraction would go here.
            page_source = self.__pull_down_page()
            print(page_source)
            # No "next" button means this was the last page.
            if not self.__judge_ele_exist_by_xpath("//span[@class='pager_next ']"):
                print("{} 工做職位爬取完畢...".format(job_name))
                break
            self.__click_next_page()
            time.sleep(2)
            # Anti-scraping measures may redirect to the login page.
            # NOTE(review): after re-login the browser may no longer be on the
            # result list; confirm the intended recovery flow.
            if self.browser.current_url == "https://passport.lagou.com/login/login.html":
                self.login()
            page += 1


if __name__ == '__main__':
    lagou = LaGou()
    # Leave empty to be prompted on stdin.
    username = ""
    password = ""
    lagou.login(username, password)
保存的cookies只能適用於本次瀏覽器訪問,你關閉瀏覽器後,再使用cookies登陸,會顯示失效。
但我手動登陸拉鉤,關閉瀏覽器。再次訪問仍是可以訪問到我自己的信息。cookies是沒有失效的,那估計就是我設置cookies那裏有問題吧。