待爬取的牛客網的實習信息
https://www.nowcoder.com/job/center
首先在 Eclipse 新建一個 Maven 項目
一、在 Maven 的 pom.xml 文件中加入如下的代碼
<!-- Maven build descriptor for the crawler example; its only dependency is jsoup. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.wu</groupId>
  <artifactId>TopEssay</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <properties>
    <!-- The Java sources contain non-ASCII comments; pin the encoding so the
         build does not depend on the platform default charset. -->
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!-- The crawler uses the diamond operator (new ArrayList<>()), which needs
         source level 7 or later. Without an explicit level, older
         maven-compiler-plugin defaults (and Eclipse m2e) fall back to 1.5. -->
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <!-- HTML fetching and CSS-selector based parsing. -->
    <!-- NOTE(review): 1.11.3 is an old release; consider upgrading after
         checking the jsoup changelog. -->
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.11.3</version>
    </dependency>
  </dependencies>
</project>
二、提取所需要的信息
這裏編寫 CSS 選擇器規則有點麻煩,我們可以利用瀏覽器自帶的開發者工具,幫助我們快速選擇所需要的元素
比如我們這裏的標題,通過這種方法得到的選擇器爲 body > div.nk-container > div.nk-main.clearfix > div.nk-content > div > div.module-body > ul > li:nth-child(1) > div > div.reco-job-cont > a
然後我們可以在上面這個基礎上進行相應的修改,有效節省了我們的時間。
package com.jsoup; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import com.entity.JobInfo; public class NiuKeSpider { private static final String url = "https://www.nowcoder.com/job/center"; public static void main(String[] args) { try { // 獲取網頁的源代碼 Document document = Jsoup.connect(url).get(); // 篩選出和職位有關的網頁源碼 Elements jobs = document.getElementsByClass("reco-job-main"); System.out.println(jobs.size()); List<JobInfo> lists = new ArrayList<>(); //工做描述+公司+地點+工資+url for(Element element : jobs) { JobInfo jobInfo = new JobInfo(); jobInfo.setJobContent(element.getElementsByClass("reco-job-cont").text()); jobInfo.setUrl(element.select("div.reco-job-cont > a").attr("abs:href")); jobInfo.setCompany(element.getElementsByClass("reco-job-com").text()); jobInfo.setAddress(element.getElementsByClass("job-address").text()); jobInfo.setSalary(element.select("div.reco-job-info > div:nth-child(1) > span:nth-child(2)").text().trim()); lists.add(jobInfo); } for(JobInfo job : lists) { System.out.println(job); } } catch (IOException e) { e.printStackTrace(); } } }
三、封裝所需的信息
package com.entity; /** * 職位有關的信息 * @author Administrator * */ public class JobInfo { private String jobContent; private String url; private String company; private String address; private String Salary; public String getJobContent() { return jobContent; } public void setJobContent(String jobContent) { this.jobContent = jobContent; } public String getUrl() { return url; } public void setUrl(String url) { this.url = url; } public String getCompany() { return company; } public void setCompany(String company) { this.company = company; } public String getAddress() { return address; } public void setAddress(String address) { this.address = address; } public String getSalary() { return Salary; } public void setSalary(String salary) { Salary = salary; } @Override public String toString() { return "job [jobContent=" + jobContent + ", url=" + url + ", company=" + company + ", address=" + address + ", Salary=" + Salary + "]"; } }
四、運行結果:
總結: