python 爬蟲系列09 - selenium + 拉勾網

使用selenium爬取拉勾網職位html

# Crawl lagou.com job listings for the keyword "雲計算" (cloud computing)
# with Selenium, paging through results and parsing each detail page.
from selenium import webdriver
from lxml import etree
import re
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


class LagouSpider(object):
    # Absolute path to the local chromedriver binary used to launch Chrome.
    driver_path = r"D:\driver\chromedriver.exe"
10 
11     def __init__(self): 12         self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path) 13         self.url = 'https://www.lagou.com/jobs/list_%E4%BA%91%E8%AE%A1%E7%AE%97?labelWords=&fromSearch=true&suginput='
14         self.positions = [] 15 
16     def run(self): 17  self.driver.get(self.url) 18         while True: 19             source = self.driver.page_source 20             WebDriverWait(driver=self.driver,timeout=10).until( 21                 EC.presence_of_element_located((By.XPATH, "//div[@class='pager_container']/span[last()]")) 22  ) 23  self.parse_list_page(source) 24             try: 25                 next_btn = self.driver.find_element_by_xpath("//div[@class='pager_container']/span[last()]") 26                 if "pager_next_disabled" in next_btn.get_attribute("class"): 27                     break
28                 else: 29  next_btn.click() 30             except: 31                 print(source) 32 
33             time.sleep(1) 34 
35     def parse_list_page(self,source): 36         html = etree.HTML(source) 37         links = html.xpath("//a[@class='position_link']/@href") 38         for link in links: 39  self.request_detail_page(link) 40             time.sleep(1) 41 
42     def request_detail_page(self,url): 43         # self.driver.get(url)
44         print() 45         print(url) 46         print() 47         self.driver.execute_script("window.open('%s')" % url) 48         self.driver.switch_to.window(self.driver.window_handles[1]) 49         WebDriverWait(self.driver,timeout=10).until( 50             EC.presence_of_element_located((By.XPATH,"//div[@class='job-name']/span[@class='name']")) 51  ) 52         source = self.driver.page_source 53  self.parse_detail_page(source) 54  self.driver.close() 55  self.driver.switch_to.window(self.driver.window_handles[0]) 56 
57     def parse_detail_page(self,source): 58         html = etree.HTML(source) 59         position_name = html.xpath("//span[@class='name']/text()")[0] 60         job_request_spans = html.xpath("//dd[@class='job_request']//span") 61         salary = job_request_spans[0].xpath('.//text()')[0].strip() 62         city = job_request_spans[1].xpath(".//text()")[0].strip() 63         city = re.sub(r"[\s/]", "", city) 64         work_years = job_request_spans[2].xpath(".//text()")[0].strip() 65         work_years = re.sub(r"[\s/]", "", work_years) 66         education = job_request_spans[3].xpath(".//text()")[0].strip() 67         education = re.sub(r"[\s/]", "", education) 68         desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip() 69         company_name = html.xpath("//h2[@class='f1']/text()") 70         position = { 71             'name': position_name, 72             'company_name': company_name, 73             'salary': salary, 74             'city': city, 75             'work_years': work_years, 76             'education': education, 77             'desc': desc 78  } 79  self.positions.append(position) 80         print(position) 81 if __name__ == '__main__': 82     spider = LagouSpider() 83     spider.run()
相關文章
相關標籤/搜索