模擬登錄+數據爬取 (python+selenium)

以下代碼是用來爬取 LinkedIn 網站上一些學者的經歷的,僅供參考。注意:不要一次性大量爬取,否則會被封號(不要問我爲何知道)。

#-*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup

# Fill these in before running.
# NOTE(review): the original script used undefined placeholder names here
# (用戶名 / 密碼 / 學者名), which made it a NameError even after fixing syntax.
LOGIN_EMAIL = 'your-email@example.com'
LOGIN_PASSWORD = 'your-password'
SCHOLAR_NAME = 'scholar name'


def login(driver, email, password):
    """Log in to LinkedIn by filling the landing-page login form.

    NOTE(review): find_element_by_* was removed in Selenium 4; this file
    targets the old Selenium 3 API. Port to driver.find_element(By.ID, ...)
    when upgrading.
    """
    driver.get('https://www.linkedin.com/')
    # Crude wait for the page to load; an explicit WebDriverWait would be
    # more robust, but this preserves the original approach.
    time.sleep(1)
    driver.find_element_by_id('login-email').send_keys(email)
    driver.find_element_by_id('login-password').send_keys(password)
    # Submit the form by sending ENTER to the submit button.
    driver.find_element_by_id('login-submit').send_keys(Keys.ENTER)
    time.sleep(1)


def search(driver, name):
    """Type *name* into the first input box on the page and submit."""
    box = driver.find_element_by_tag_name('input')
    box.send_keys(name)
    box.send_keys(Keys.ENTER)
    time.sleep(1)


def scrape_results(driver):
    """Visit every profile linked from the current results page and print
    each position's summary text, one numbered section per profile."""
    soup = BeautifulSoup(driver.page_source, 'lxml')
    wrappers = soup.findAll('div', {'class': 'search-result__wrapper'})
    for n, wrapper in enumerate(wrappers, start=1):
        link = wrapper.find(
            'div', {'class': 'search-result__image-wrapper'}).find('a')['href']
        driver.get('https://www.linkedin.com' + link)
        # Profile pages are heavy; give them extra time to render.
        time.sleep(3)
        profile = BeautifulSoup(driver.page_source, 'lxml')
        positions = profile.findAll(
            'li',
            {'class': 'pv-profile-section__card-item pv-position-entity ember-view'})
        print(str(n) + ':')
        for position in positions:
            summary = position.find('div', {'class': 'pv-entity__summary-info'})
            print(summary.get_text().replace('\n', ''))


def main():
    """Drive the whole flow: log in, search for the scholar, scrape results."""
    driver = webdriver.Chrome()
    try:
        login(driver, LOGIN_EMAIL, LOGIN_PASSWORD)
        search(driver, SCHOLAR_NAME)
        scrape_results(driver)
    finally:
        # Always release the browser, even if a lookup above raises.
        driver.close()


if __name__ == '__main__':
    main()
相關文章
相關標籤/搜索