selenium對51job進行職位爬蟲

selenium 爬蟲流程如下:
  一、對某職位進行爬蟲 ---如:自動化測試
  二、用到IDE爲 pycharm
  三、爬蟲職位導入到MongoDB數據庫中
  四、在線安裝 pip install pymongo
  五、本次使用到腳本化無頭瀏覽器 --- PhantomJS

MongoDB安裝說明鏈接:https://www.twblogs.net/a/5c27009bbd9eee16b3dba7bc/zh-cn
PhantomJS 下載地址:http://phantomjs.org/download.html ;API 文檔鏈接:http://phantomjs.org/api/

下載後添加path中 --- CMD窗口輸入 PhantomJS 按回車 --- 出現 phantomjs> 說明配置成功

以下爲 51job.py 截圖:

 

 

 

config 配置文件如下:

 

pycharm 運行結果:

 

MongoDB 數據庫截圖:

 

以下爲 config 配置文件的內容:


# Connection URI of the MongoDB server (here: 127.0.0.1, port 27017).
MONGO_URL = "mongodb://127.0.0.1:27017/"

# Database name and collection name the crawler stores scraped jobs in.
MONGO_DB = "自動化測試"
MONGO_TABLE = "自動化測試工程師"

# Search keyword the crawler types into the 51job search box.
KEYWORD = "自動化測試工程師"

# Command-line flags handed to the PhantomJS process.
SERVICE_ARGS = ["--load-images=false","--disk-cache=true"] # skip image loading; turn the disk cache on


爬蟲源碼以下:
# 51job.com job-listing crawler: searches for KEYWORD with a headless browser,
# walks every result page and stores each listing in MongoDB.
import re
import time as t

import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Settings come from the config module shown earlier in this document.
from Selenium_test.config import (
    KEYWORD,
    MONGO_DB,
    MONGO_TABLE,
    MONGO_URL,
    SERVICE_ARGS,
)

# How many times a flaky browser interaction is attempted before giving up.
MAX_ATTEMPTS = 3
# Fixed pause (seconds) that lets the city-picker overlay finish updating.
UI_PAUSE = 2

# MongoDB client and the database the results are written to.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# Headless browser instance.
# NOTE: the PhantomJS driver was removed in Selenium 4; on a newer Selenium
# replace this line with a headless Chrome/Firefox driver.
driver = webdriver.PhantomJS(service_args=SERVICE_ARGS)
driver.set_window_size(1400, 900)
driver.maximize_window()
driver.implicitly_wait(10)

# Explicit wait used for elements that appear after page updates.
wait = WebDriverWait(driver, 10)


def _retry(action, attempts=MAX_ATTEMPTS):
    """Call ``action()`` until it succeeds, at most ``attempts`` times.

    Only Selenium errors trigger a retry; the last one is re-raised so the
    caller sees why the step failed instead of recursing without bound.
    """
    for attempt in range(1, attempts + 1):
        try:
            return action()
        except WebDriverException:
            if attempt == attempts:
                raise


def _search_once():
    """Run one search attempt and return the text of the page-count element."""
    print("正在搜索")
    driver.get("https://www.51job.com/")
    keyword_box = wait.until(
        EC.presence_of_element_located((By.ID, "kwdselectid"))
    )
    keyword_box.send_keys(KEYWORD)

    # Open the city picker and untick every city that is pre-selected.
    driver.find_element(By.ID, "work_position_input").click()
    t.sleep(UI_PAUSE)
    selected_cities = driver.find_elements(
        By.CSS_SELECTOR,
        "#work_position_click_center_right_list_000000 em[class=on]",
    )
    for city in selected_cities:
        city.click()

    # Tick the target city (category id 040000) and confirm the picker.
    t.sleep(UI_PAUSE)
    driver.find_element(
        By.ID, "work_position_click_center_right_list_category_000000_040000"
    ).click()
    t.sleep(UI_PAUSE)
    driver.find_element(By.ID, "work_position_click_bottom_save").click()

    # Submit the search.
    t.sleep(UI_PAUSE)
    driver.find_element(
        By.CSS_SELECTOR,
        "body > div.content > div > div.fltr.radius_5 > div > button",
    ).click()

    # Element on the result page that holds the total number of pages.
    total = wait.until(
        EC.presence_of_element_located(
            (
                By.CSS_SELECTOR,
                "#resultList > div.dw_page > div > div > div > span:nth-child(2)",
            )
        )
    )
    return total.text


def search():
    """Search 51job for KEYWORD and return the page-count text.

    Returns:
        The text of the pagination element that contains the total page count.

    Raises:
        WebDriverException: if the search still fails after MAX_ATTEMPTS tries.
    """
    return _retry(_search_once)


def _go_to_page(page_number):
    """Jump to ``page_number`` via the pager input and scrape that page."""
    print("正在翻頁", page_number)
    jump_box = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#jump_page"))
    )
    jump_box.clear()
    jump_box.send_keys(str(page_number))
    submit = wait.until(
        EC.element_to_be_clickable(
            (
                By.CSS_SELECTOR,
                "#resultList > div.dw_page > div > div > div > span.og_but",
            )
        )
    )
    submit.click()
    # Wait until the highlighted pager item shows the requested page number.
    wait.until(
        EC.text_to_be_present_in_element(
            (
                By.CSS_SELECTOR,
                "#resultList > div.dw_page > div > div > div > ul > li.on",
            ),
            str(page_number),
        )
    )
    get_products()


def next_page(page_number):
    """Navigate to result page ``page_number`` and store its listings.

    Raises:
        WebDriverException: if the page cannot be loaded after MAX_ATTEMPTS
            tries.
    """
    _retry(lambda: _go_to_page(page_number))


def get_products():
    """Parse every job row on the current result page and save each one."""
    wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#resultList div[class=el]")
        )
    )
    doc = pq(driver.page_source)
    for item in doc("#resultList div[class=el]").items():
        product = {
            "職位": item.find(".t1").text(),
            "公司": item.find(".t2").text(),
            "工做點": item.find(".t3").text(),
            "薪資": item.find(".t4").text(),
            "發佈時間": item.find(".t5").text(),
        }
        print(product)
        save_to_mongo(product)


def save_to_mongo(result):
    """Insert one job record into the MONGO_TABLE collection.

    Storage is best-effort: a failure is reported and the crawl continues.
    """
    try:
        # insert_one replaces Collection.insert, which PyMongo 4 removed.
        db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        print("存儲到MONGODB失敗", result, exc)
    else:
        print("存儲到MONGODB成功", result)


def main():
    """Run the crawl: search, scrape page 1, then walk the remaining pages."""
    try:
        total_text = search()
        match = re.search(r"(\d+)", total_text)
        if match is None:
            raise ValueError(
                "cannot read the page count from %r" % (total_text,)
            )
        total = int(match.group(1))
        # The search lands on page 1, which the page loop below never visits.
        get_products()
        for page in range(2, total + 1):
            next_page(page)
    except Exception as exc:
        # Top-level boundary: report the cause, then always close the browser.
        print("出錯", exc)
    finally:
        driver.quit()


if __name__ == '__main__':
    main()
相關文章
相關標籤/搜索