# The following script scrapes the work-experience sections of some scholars'
# LinkedIn profiles; it is provided for reference only. NOTE: do not scrape
# in bulk, or the account will be banned (don't ask how I know).
# -*- coding: utf-8 -*-
"""Scrape the work-experience entries of scholars' LinkedIn profiles.

Logs in to LinkedIn, searches for a scholar by name, then visits each
profile found on the results page and prints its position entries.

NOTE(review): aggressive scraping will get the account banned — keep the
request volume low and respect the site's terms of service. The CSS class
names targeted here reflect LinkedIn's markup at the time of writing and
may have changed since.
"""
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# TODO: fill in real credentials and the scholar's name before running.
# (The original code referenced undefined variables here — NameError.)
USERNAME = 'your-email@example.com'
PASSWORD = 'your-password'
SCHOLAR_NAME = 'scholar name'


def main():
    driver = webdriver.Chrome()
    try:
        driver.get('https://www.linkedin.com/')
        time.sleep(1)  # wait for the page to finish loading

        # Log in. Selenium 4 removed find_element_by_id(); use By locators.
        driver.find_element(By.ID, 'login-email').send_keys(USERNAME)
        driver.find_element(By.ID, 'login-password').send_keys(PASSWORD)
        driver.find_element(By.ID, 'login-submit').send_keys(Keys.ENTER)
        time.sleep(1)

        # Search for the scholar by name.
        search_box = driver.find_element(By.TAG_NAME, 'input')
        search_box.send_keys(SCHOLAR_NAME)
        search_box.send_keys(Keys.ENTER)
        time.sleep(1)

        # Collect every candidate profile link on the results page.
        soup = BeautifulSoup(driver.page_source, 'lxml')
        results = soup.findAll('div', {'class': 'search-result__wrapper'})
        for n, result in enumerate(results, start=1):
            href = result.find(
                'div', {'class': 'search-result__image-wrapper'}
            ).find('a')['href']
            driver.get('https://www.linkedin.com' + href)
            time.sleep(3)  # profile pages load slowly; also be gentle

            profile = BeautifulSoup(driver.page_source, 'lxml')
            entries = profile.findAll(
                'li',
                {'class': 'pv-profile-section__card-item '
                          'pv-position-entity ember-view'},
            )
            print(str(n) + ':')
            for entry in entries:
                summary = entry.find('div',
                                     {'class': 'pv-entity__summary-info'})
                print(summary.get_text().replace('\n', ''))
    finally:
        # quit() (not close()) ends the whole WebDriver session even when
        # an earlier step raised, so no orphaned browser is left behind.
        driver.quit()


if __name__ == '__main__':
    main()