Selenium 爬蟲流程如下:
一、對某職位進行爬蟲 ---如:自動化測試
二、用到IDE爲 pycharm
三、爬蟲職位導入到MongoDB數據庫中
四、在線安裝 pip install pymongo
五、本次使用到腳本化無頭瀏覽器 --- PhantomJS
MongoDB安裝說明鏈接:https://www.twblogs.net/a/5c27009bbd9eee16b3dba7bc/zh-cn
PhantomJS 下載地址和API鏈接:http://phantomjs.org/download.html , http://phantomjs.org/api/
下載後將 PhantomJS 所在目錄添加到系統 PATH 環境變量中 --- 在 CMD 窗口輸入 phantomjs 按回車 --- 出現 phantomjs> 提示符說明配置成功
以下爲 51job.py 截圖:
config 配置文件如下:
PyCharm 運行結果:
MongoDB 數據庫截圖:
以下爲 config 配置文件:
# MongoDB connection string and the database / collection that receive the scraped rows.
MONGO_URL = "mongodb://127.0.0.1:27017/"
MONGO_DB = "自動化測試"
MONGO_TABLE = "自動化測試工程師"

# Search keyword typed into the job-site search box.
KEYWORD = "自動化測試工程師"

# Command-line flags passed to PhantomJS: image loading is switched off and the
# disk cache is switched on.
# NOTE(review): the original note said the cache is ignored, but the flag value
# is "true" -- confirm which behaviour is intended.
SERVICE_ARGS = [
    "--load-images=false",
    "--disk-cache=true",
]
爬蟲源碼如下:
# Crawl the 51job search results for KEYWORD with Selenium and store every
# result row in MongoDB.
import re
import time as t

import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Settings come from the config module shown earlier in this article.
from Selenium_test.config import (
    KEYWORD,
    MONGO_DB,
    MONGO_TABLE,
    MONGO_URL,
    SERVICE_ARGS,
)

# CSS selectors used on the result page.
_RESULT_ROW_SELECTOR = "#resultList div[class=el]"
_PAGER_SELECTOR = "#resultList > div.dw_page > div > div > div"
_TOTAL_PAGES_SELECTOR = _PAGER_SELECTOR + " > span:nth-child(2)"
_JUMP_BUTTON_SELECTOR = _PAGER_SELECTOR + " > span.og_but"
_CURRENT_PAGE_SELECTOR = _PAGER_SELECTOR + " > ul > li.on"

# MongoDB client and target database.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# Browser instance. To debug with a visible browser, use webdriver.Chrome() instead.
driver = webdriver.PhantomJS(service_args=SERVICE_ARGS)
driver.set_window_size(1400, 900)
driver.maximize_window()
driver.implicitly_wait(10)

# Explicit wait shared by all page interactions (10 second timeout).
wait = WebDriverWait(driver, 10)


def _submit_search():
    """Run one search attempt and return the text of the total-pages element."""
    driver.get("https://www.51job.com/")
    keyword_box = wait.until(
        EC.presence_of_element_located((By.ID, "kwdselectid"))
    )
    keyword_box.send_keys(KEYWORD)

    # Open the city picker and deselect every city that is currently selected.
    driver.find_element(By.ID, "work_position_input").click()
    t.sleep(2)
    selected_cities = driver.find_elements(
        By.CSS_SELECTOR,
        "#work_position_click_center_right_list_000000 em[class=on]",
    )
    for city in selected_cities:
        city.click()

    # Select the target city and confirm the picker.
    t.sleep(2)
    driver.find_element(
        By.ID, "work_position_click_center_right_list_category_000000_040000"
    ).click()
    t.sleep(2)
    driver.find_element(By.ID, "work_position_click_bottom_save").click()

    # Click the search button.
    t.sleep(2)
    driver.find_element(
        By.CSS_SELECTOR,
        "body > div.content > div > div.fltr.radius_5 > div > button",
    ).click()

    # Pager element that holds the total number of pages.
    total = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, _TOTAL_PAGES_SELECTOR))
    )
    return total.text


def search(retries=3):
    """Search for KEYWORD and return the pager text containing the page count.

    Args:
        retries: Maximum number of attempts (values below 1 are treated as 1).

    Returns:
        The text of the pager's total-pages element.

    Raises:
        Exception: The error from the last attempt, once all attempts failed.
            The original retried by unbounded recursion, which never terminated
            on a persistent failure.
    """
    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        print("正在搜索")
        try:
            return _submit_search()
        except Exception:
            if attempt == attempts:
                raise


def _goto_page(page_number):
    """Type *page_number* into the jump box and wait until that page is active."""
    jump_box = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#jump_page"))
    )
    jump_box.clear()
    jump_box.send_keys(page_number)
    submit = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, _JUMP_BUTTON_SELECTOR))
    )
    submit.click()
    wait.until(
        EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, _CURRENT_PAGE_SELECTOR), str(page_number)
        )
    )


def next_page(page_number, retries=3):
    """Jump to result page *page_number* and store its rows.

    Args:
        page_number: Page to jump to.
        retries: Maximum number of navigation attempts (values below 1 are
            treated as 1).

    Raises:
        Exception: The error from the last navigation attempt, once all
            attempts failed, or any error raised while scraping the page.
    """
    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        print("正在翻頁", page_number)
        try:
            _goto_page(page_number)
        except Exception:
            if attempt == attempts:
                raise
        else:
            break
    # Scrape outside the retry loop so that a failure while saving cannot
    # re-insert rows that were already stored.
    get_products()


def get_products():
    """Parse every result row on the current page and save it to MongoDB."""
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, _RESULT_ROW_SELECTOR))
    )
    doc = pq(driver.page_source)
    for item in doc(_RESULT_ROW_SELECTOR).items():
        product = {
            "職位": item.find(".t1").text(),
            "公司": item.find(".t2").text(),
            "工做點": item.find(".t3").text(),
            "薪資": item.find(".t4").text(),
            "發佈時間": item.find(".t5").text(),
        }
        print(product)
        save_to_mongo(product)


def save_to_mongo(result):
    """Insert *result* into the MONGO_TABLE collection; report but do not raise on failure."""
    try:
        # insert_one replaces Collection.insert, which is deprecated and
        # removed in PyMongo 4.
        db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        # Best effort: one bad row must not stop the crawl.
        print("存儲到MONGODB失敗", result, exc)
    else:
        print("存儲到MONGODB成功", result)


def main():
    """Run the search, scrape every result page, and always close the browser."""
    try:
        total_text = search()
        match = re.search(r"(\d+)", total_text)
        if match is None:
            raise ValueError(f"no page count found in {total_text!r}")
        total = int(match.group(1))

        # The search itself lands on page 1, so scrape it before paging on.
        get_products()
        for page in range(2, total + 1):
            next_page(page)
    except Exception as exc:
        print("出錯", exc)
    finally:
        driver.quit()


if __name__ == '__main__':
    main()