python3爬蟲-通過selenium登錄拉勾，爬取職位信息

時間 2019-12-09
標籤 python3 python 爬蟲經過 selenium 登錄拉鉤職位信息欄目 Python 简体版
原文鏈接
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver import ActionChains
import time
import json
import os

# Lagou login page; opened when the browser starts and revisited by login()
# to check whether previously saved cookies are still accepted.
LG_URL_Login = "https://passport.lagou.com/login/login.html"
# JSON file (relative to the current working directory) from which login()
# loads previously saved browser cookies.
cookies_path = "./cookies.json"


class MyException(Exception):
    """Exception that exposes a ``status`` value and a ``msg`` text as attributes."""

    def __init__(self, status, msg):
        # Keep both constructor arguments reachable by name on the instance.
        self.status, self.msg = status, msg


class LaGou:
    """Drive a Chrome session that logs in to lagou.com and pages through job search results.

    Creating an instance immediately starts Chrome and opens the login page.
    ``login_status`` records whether the last ``login()`` call ended in a
    logged-in session.
    """

    # XPath locators used in more than one place are kept together so that a
    # change in the page markup only has to be fixed once.
    _XPATH_PASSWORD_LOGIN_TAB = "//*[@data-lg-tj-id='1Us0']"
    _XPATH_SUBMIT = "//*[@data-lg-tj-id='1j90']"
    # NOTE(review): the placeholder texts and the "全國站" link text below are in
    # Traditional Chinese; confirm they match what the live site actually renders.
    _XPATH_PWD_INPUT = '//input[@placeholder="請輸入密碼"]'
    _XPATH_USER_INPUT = '//input[@placeholder="請輸入經常使用手機號/郵箱"]'
    _XPATH_USER_DROPDOWN = "//*[@class='user_dropdown']"
    _XPATH_NEXT_PAGE = "//span[@class='pager_next ']"
    _XPATH_CAPTCHA_PANEL = "//div[@class='geetest_panel_box geetest_panelshowslide']"
    _XPATH_CAPTCHA_CLOSE = "//a[@class='geetest_close']"

    def __init__(self):
        self.login_status = False
        # Stays None if Chrome fails to start, which __del__ relies on.
        self.browser = None
        self.__init_browser()

    def __init_browser(self):
        """Start Chrome, configure the waits and open the login page."""
        options = Options()
        # Drop Chrome's 'enable-automation' switch.
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # Presumably value 2 blocks image loading to speed up pages - confirm.
        options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
        self.browser = webdriver.Chrome(options=options)
        self.browser.maximize_window()
        self.browser.implicitly_wait(3)
        self.wait = WebDriverWait(self.browser, 10)
        self.ac = ActionChains(self.browser)
        self.browser.get(LG_URL_Login)

    def __wait_for_xpath(self, xpath):
        """Block (up to the explicit-wait timeout) until *xpath* is present and return the element."""
        return self.wait.until(EC.presence_of_element_located((By.XPATH, xpath)))

    def __choose_login_mode(self):
        """Select the username/password login mode.

        It is the default mode already, but clicking it makes sure.
        """
        self.__wait_for_xpath(self._XPATH_PASSWORD_LOGIN_TAB).click()

    def __input_user_pwd(self, username, password):
        """Fill in the account and password fields, prompting on stdin for any missing value."""
        self.__wait_for_xpath(self._XPATH_PWD_INPUT)
        self.__wait_for_xpath(self._XPATH_USER_INPUT)
        if not username:
            username = input("請輸入經常使用手機號/郵箱>>:").strip()
        if not password:
            password = input("請輸入密碼>>:").strip()
        # Look the inputs up only after the (possibly long) prompts so the
        # element handles are fresh when they are used.
        phone_ele = self.browser.find_element(By.XPATH, self._XPATH_USER_INPUT)
        pwd_ele = self.browser.find_element(By.XPATH, self._XPATH_PWD_INPUT)
        # Account
        phone_ele.clear()
        phone_ele.send_keys(username)
        # Password
        pwd_ele.clear()
        pwd_ele.send_keys(password)

    def __chick_submit(self):
        """Click the login form's submit button."""
        self.__wait_for_xpath(self._XPATH_SUBMIT).click()

    def __judge_login_successful(self):
        """Return True when the page contains the ``user_dropdown`` element, i.e. a logged-in page."""
        return self.__judge_ele_exist_by_xpath(self._XPATH_USER_DROPDOWN)

    def __pull_down_page(self):
        """Scroll to the bottom of the page once and return the page source.

        A single scroll is used because, per the original author, the result
        page does not lazy-load further content.
        """
        self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        return self.browser.page_source

    def __judge_ele_exist_by_xpath(self, xpath):
        """Return True if an element matching *xpath* can be found, else False."""
        try:
            self.browser.find_element(By.XPATH, xpath)
        except NoSuchElementException:
            return False
        return True

    def __click_next_page(self):
        """Click the "next page" pager control."""
        self.__wait_for_xpath(self._XPATH_NEXT_PAGE).click()

    def __search_job(self, job_name):
        """Type *job_name* into the search box and submit it with ENTER."""
        # When the site is visited without being logged in, a dialog asking
        # for the city site may pop up first; pick the nationwide site.
        try:
            self.browser.find_element(By.LINK_TEXT, "全國站").click()
        except NoSuchElementException:
            pass  # no dialog shown, nothing to dismiss

        # The search box has one of two ids depending on the page.
        try:
            search_ele = self.browser.find_element(By.XPATH, "//*[@id='search_input']")
        except NoSuchElementException:
            search_ele = self.browser.find_element(By.XPATH, "//*[@id='keyword']")
        search_ele.click()
        search_ele.clear()
        search_ele.send_keys(job_name, Keys.ENTER)

    def __del__(self):
        """Shut the browser down when the object is garbage-collected."""
        # `browser` is None (or missing) when Chrome never started.
        browser = getattr(self, "browser", None)
        if browser is None:
            return
        # Clear the reference first so a repeated call cannot quit twice.
        self.browser = None
        # quit() ends the whole driver session; close() would only close the
        # current window.
        browser.quit()

    def login(self, username: str = None, password: str = None, load_cookies: bool = True):
        """Log in to Lagou, first via saved cookies and then via username/password.

        Args:
            username: Phone number or e-mail; prompted for on stdin when empty.
            password: Account password; prompted for on stdin when empty.
            load_cookies: Try the cookies stored in ``cookies_path`` first.

        Returns:
            True when the session ends up logged in, False otherwise.
            ``login_status`` is updated to the same value.
        """
        if load_cookies and os.path.exists(cookies_path):
            # Re-use the cookies saved by an earlier successful login.
            with open(cookies_path, "r", encoding="utf-8") as f:
                cookies = json.load(f)

            # NOTE(review): cookies are added while the browser is on whatever
            # page is currently open; Selenium may reject cookies that belong
            # to a different domain - confirm against the saved file.
            for cookie in cookies:
                self.browser.add_cookie(cookie)

            # Open the login page again: with valid cookies the site is
            # expected to redirect to a logged-in page instead.
            self.browser.get(LG_URL_Login)
            if self.__judge_login_successful():
                self.login_status = True
                print("登錄成功....")
                return True
            print("cookies已經失效....")
            # Remove the stale cookies that were just added.
            self.browser.delete_all_cookies()

        self.browser.refresh()
        self.__choose_login_mode()
        self.__input_user_pwd(username, password)
        self.__chick_submit()

        # A Geetest slider captcha may appear. It is not solved; the panel is
        # closed and the form resubmitted until an attempt goes through
        # without one. This loops for as long as the captcha keeps appearing.
        while True:
            time.sleep(1)
            if not self.__judge_ele_exist_by_xpath(self._XPATH_CAPTCHA_PANEL):
                break
            self.browser.find_element(By.XPATH, self._XPATH_CAPTCHA_CLOSE).click()
            time.sleep(1)
            self.browser.find_element(By.XPATH, self._XPATH_SUBMIT).click()

        if self.__judge_login_successful():
            self.login_status = True
            # Persist the cookies so the next run can skip the login form.
            with open(cookies_path, "w", encoding="utf-8") as f:
                json.dump(self.browser.get_cookies(), f)
            print("登錄成功")
            return True

        self.login_status = False
        print("登錄失敗，請檢查你的用戶名或密碼")
        return False

    def get_job_info(self, job_name: str = None, is_filter: bool = False):
        """Search for *job_name* and print the page source of every result page.

        Args:
            job_name: Search keyword; prompted for on stdin when empty.
            is_filter: Reserved for result filtering; currently has no effect.
        """
        if not self.login_status:
            self.browser.get("https://www.lagou.com/")
        if not job_name:
            job_name = input("請輸入查詢job的名稱>>:").strip()

        self.__search_job(job_name)

        # Walk through the result pages until there is no "next page" control.
        page = 1
        while True:
            print("爬取工做職位爲>>{}   第{}頁數據".format(job_name, page))
            # Data extraction is not implemented yet; the raw source is printed.
            page_source = self.__pull_down_page()
            print(page_source)
            if not self.__judge_ele_exist_by_xpath(self._XPATH_NEXT_PAGE):
                print("{} 工做職位爬取完畢...".format(job_name))
                break
            self.__click_next_page()
            time.sleep(2)

            # Paging may bounce the session back to the login page as an
            # anti-scraping measure; prefix match so a query string appended
            # to the login URL is still recognised.
            if self.browser.current_url.startswith(LG_URL_Login):
                self.login()

            page += 1


if __name__ == '__main__':
    lagou = LaGou()

    # Fill these in to log in non-interactively; when left empty, login()
    # prompts for the missing value on stdin.
    username = ""
    password = ""
    lagou.login(username, password)
保存的cookies只能適用於本次瀏覽器訪問，你關閉瀏覽器後，再使用cookies登錄，會顯示失效。
但我手動登錄拉勾，關閉瀏覽器，再次訪問還是能夠訪問到我自己的信息。cookies是沒有失效的，那估計就是我設置cookies那裏有問題吧。
相關標籤/搜索
每日一句
每一个你不满意的现在，都有一个你没有努力的曾经。