Python3 爬蟲:通過 Selenium 登錄拉勾,爬取職位信息

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver import ActionChains
import time
import json
import os

LG_URL_Login = "https://passport.lagou.com/login/login.html"
cookies_path = "./cookies.json"


class MyException(Exception):
    '''Error that carries a status code together with a message.

    Attributes:
        status: the status value supplied by the raiser.
        msg: the accompanying message supplied by the raiser.
    '''

    def __init__(self, status, msg):
        # Keep both values on the instance so handlers can read them back.
        self.status, self.msg = status, msg


class LaGou:
    def __init__(self):
        self.login_status = False
        self.browser = None
        self.__init_browser()

    def __init_browser(self):
        '''初始化瀏覽器配置'''
        options = Options()
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
        self.browser = webdriver.Chrome(options=options)
        self.browser.maximize_window()
        self.browser.implicitly_wait(3)
        self.wait = WebDriverWait(self.browser, 10)
        self.ac = ActionChains(self.browser)
        self.browser.get(LG_URL_Login)

    def __choose_login_mode(self):
        '''經過用戶名,密碼去登錄'''
        # 雖然默認是用戶名密碼登錄去,確保無誤,仍是本身點擊一下
        self.wait.until(EC.presence_of_element_located((By.XPATH, "//*[@data-lg-tj-id='1Us0']")))
        self.browser.find_element_by_xpath("//*[@data-lg-tj-id='1Us0']").click()

    def __input_user_pwd(self, username, password):
        self.wait.until(EC.presence_of_element_located((By.XPATH, '//input[@placeholder="請輸入密碼"]')))
        self.wait.until(EC.presence_of_element_located((By.XPATH, '//input[@placeholder="請輸入經常使用手機號/郵箱"]')))
        if not username:
            username = input("請輸入經常使用手機號/郵箱>>:").strip()
        if not password:
            password = input("請輸入密碼>>:").strip()
        phone_ele = self.browser.find_element_by_xpath('//input[@placeholder="請輸入經常使用手機號/郵箱"]')
        pwd_ele = self.browser.find_element_by_xpath('//input[@placeholder="請輸入密碼"]')
        # 輸入帳號
        phone_ele.clear()
        phone_ele.send_keys(username)
        # 輸入密碼
        pwd_ele.clear()
        pwd_ele.send_keys(password)

    def __chick_submit(self):
        self.wait.until(EC.presence_of_element_located((By.XPATH, "//*[@data-lg-tj-id='1j90']")))
        self.browser.find_element_by_xpath("//*[@data-lg-tj-id='1j90']").click()

    def __judge_login_successful(self):
        '''判斷是否登錄成功'''
        # 判斷class屬性 user_dropdown
        try:
            self.browser.find_element_by_xpath("//*[@class='user_dropdown']")
            return True
        except NoSuchElementException:
            return False

    def __pull_down_page(self):
        '''首先拉鉤它是沒有懶加載的,因此咱們只須要下拉一次就行了,因此while循環能夠註釋掉'''
        height = self.browser.execute_script("return document.body.scrollHeight;")
        js = "window.scrollTo(0, {});".format(height)
        self.browser.execute_script(js)
        return self.browser.page_source
        # while True:
        #     now_height = self.browser.execute_script("return document.body.scrollHeight;")
        #     if height == now_height:
        #         return self.browser.page_source
        #     js = "window.scrollTo({}, {});".format(height, now_height)
        #     self.browser.execute_script(js)
        #     height = now_height

    def __judge_ele_exist_by_xpath(self, xpath):
        '''經過xpath,判斷是否存在這個元素'''
        try:
            self.browser.find_element_by_xpath(xpath)
            return True
        except NoSuchElementException:
            return False

    def __click_next_page(self):
        '''點擊下一頁'''
        self.wait.until(EC.presence_of_element_located((By.XPATH, "//span[@class='pager_next ']")))
        self.browser.find_element_by_xpath("//span[@class='pager_next ']").click()

    def __search_job(self, job_name):
        '''輸入查詢的job信息'''

        # 首先在搜索職位以前呢,會彈回一個框框,默認選擇全國站,
        # 之因此會有這個框框,那是由於你不是在登錄狀態下訪問這個url,若是是登錄的,那麼不會出現
        try:
            self.browser.find_element_by_link_text("全國站").click()
        except NoSuchElementException:
            pass

        # 搜索職位
        try:
            # self.wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='search_input']")))
            search_ele = self.browser.find_element_by_xpath("//*[@id='search_input']")
        except NoSuchElementException:
            search_ele = self.browser.find_element_by_xpath("//*[@id='keyword']")
        search_ele.click()
        search_ele.clear()
        search_ele.send_keys(job_name, Keys.ENTER)

    def __del__(self):
        # 10秒以後,關閉一些資源
        self.browser.close()

    def login(self, username: str = None, password: str = None, load_cookies: bool = True):

        if load_cookies and os.path.exists(cookies_path):
            # 使用保存再文件中的cookies去訪問頁面
            with open(cookies_path, "r", encoding="utf-8") as f:
                cookies = json.loads(f.read())

            # 將cookies添加進去
            for cookie in cookies:
                self.browser.add_cookie(cookie)

            # 訪問登錄頁面,若是是登錄頁面表示cookie失效了,cookies沒有的失效的狀況就是重定向到首頁
            self.browser.get(LG_URL_Login)
            if self.__judge_login_successful():
                print("登錄成功....")
                return True
            else:
                print("cookies已經失效....")
                # 刪除剛剛添加的cookies
                self.browser.delete_all_cookies()
        self.browser.refresh()
        self.__choose_login_mode()
        self.__input_user_pwd(username, password)
        self.__chick_submit()

        # 判斷是否有極驗驗證碼
        # 若是你多試幾回,你會發現某次登錄不須要滑動驗證碼,因此說咱們就利用這個,雖然並無徹底解決破解,可是目的最終仍是達到了
        while True:
            time.sleep(1)
            if self.__judge_ele_exist_by_xpath("//div[@class='geetest_panel_box geetest_panelshowslide']"):
                self.browser.find_element_by_xpath("//a[@class='geetest_close']").click()
                time.sleep(1)
                self.browser.find_element_by_xpath("//*[@data-lg-tj-id='1j90']").click()
                continue
            else:
                break

        if self.__judge_login_successful():
            self.login_status = True
            # 登錄成功,將cookies保存起來
            with open("./cookies.json", "w", encoding="utf-8") as f:
                f.write(json.dumps(self.browser.get_cookies()))
            print("登錄成功")
            return True
        else:
            print("登錄失敗,請檢查你的用戶名或密碼")
            return False

    def get_job_info(self, job_name: str = None, is_filter: bool = False):
        '''用於獲取到查詢的job'''
        if not self.login_status:
            self.browser.get("https://www.lagou.com/")
        if not job_name:
            job_name = input("請輸入查詢job的名稱>>:").strip()

        self.__search_job(job_name)

        if is_filter:
            # 過濾這個功能不忙實現
            pass
        # 這裏開始就是進行翻頁操做了,以及對數據的處理
        page = 1
        while True:
            print("爬取工做職位爲>>{}   第{}頁數據".format(job_name, page))
            # 獲取到完畢的頁面源碼,而後進行提取信息的操做
            page_source = self.__pull_down_page()
            print(page_source)
            # 信息提取完畢,進行翻頁操縱
            if not self.__judge_ele_exist_by_xpath("//span[@class='pager_next ']"):
                print("{} 工做職位爬取完畢...".format(job_name))
                break
            self.__click_next_page()
            time.sleep(2)

            # 點擊完畢下一頁,可能遇到一些反扒措施
            if self.browser.current_url == "https://passport.lagou.com/login/login.html":
                self.login()

            page += 1


if __name__ == '__main__':
    lagou = LaGou()

    # Fill these in to log in non-interactively; when left empty, login()
    # prompts for them on the console.
    username = ""
    password = ""
    # Pass the credentials through; previously they were assigned but never used.
    lagou.login(username, password)

 

保存的 cookies 只能適用於本次瀏覽器會話:關閉瀏覽器後,再使用這些 cookies 登錄,會顯示已失效。

但我手動登錄拉勾後關閉瀏覽器,再次訪問時還是能夠訪問到我自己的信息,說明 cookies 並沒有失效,那估計是我設置 cookies 的地方有問題。

相關文章
相關標籤/搜索