第一個入手的爬蟲小任務(Java):
Maven 工程的 pom.xml:
<!-- Maven build descriptor for the pachong01 crawler example. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.zhaowu</groupId>
  <artifactId>pachong01</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <properties>
    <!-- The Java sources contain non-ASCII string literals, so the source
         encoding is pinned instead of relying on the platform default. -->
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!-- Pin the language level explicitly; older maven-compiler-plugin
         versions otherwise default to 1.5/1.6. -->
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <!-- HTTP client used to fetch the page. -->
    <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.3</version>
    </dependency>

    <!-- HTML parser used to extract the links. -->
    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.11.2</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
      <version>2.6</version>
    </dependency>
  </dependencies>
</project>
代碼實現(Apache HttpClient + jsoup):
package com.zhaowu.renwu1;

import java.io.IOException;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Minimal crawler example: downloads the Baidu News front page with Apache
 * HttpClient, parses it with jsoup and prints the text and {@code href} of
 * every anchor element that has an {@code href} attribute.
 */
public class News {

    /** Page that is fetched and scanned for links. */
    private static final String START_URL = "https://news.baidu.com/";

    /** Connect and read timeout, in milliseconds (10 seconds). */
    private static final int TIMEOUT_MILLIS = 10000;

    /**
     * Browser-like User-Agent sent with the request.
     *
     * <p>The original literal was truncated with an ellipsis character
     * ("W…"), which is not a valid header character; it is replaced here with
     * a complete Firefox 59 / Windows 7 string. NOTE(review): whether the
     * original platform token was Win64 or WOW64 cannot be recovered.
     */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0";

    /**
     * Fetches {@link #START_URL} and prints every link found on the page.
     *
     * @param args unused
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException if the request fails or the response has no body
     */
    public static void main(String[] args) throws ClientProtocolException, IOException {
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(TIMEOUT_MILLIS) // connection timeout
                .setSocketTimeout(TIMEOUT_MILLIS)  // read timeout
                .build();

        HttpGet httpGet = new HttpGet(START_URL);
        httpGet.setConfig(config);
        // Set the User-Agent header so the request looks like it comes from a browser.
        httpGet.setHeader("User-Agent", USER_AGENT);

        String content;
        // try-with-resources closes the response and the client (and with them
        // the underlying connection) even when an exception is thrown.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
                CloseableHttpResponse response = httpClient.execute(httpGet)) {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                // EntityUtils.toString rejects a null entity; fail with a clearer message.
                throw new IOException("No response body from " + START_URL
                        + " (" + response.getStatusLine() + ")");
            }
            // Decode the body as UTF-8 when the response does not declare a charset.
            content = EntityUtils.toString(entity, "utf-8");
        }

        // Parse the page into a document and select all <a> elements with an href.
        Document doc = Jsoup.parse(content);
        Elements hrefElements = doc.select("a[href]");
        for (Element e : hrefElements) {
            System.out.println("新聞標題:" + e.text());
            System.out.println("新聞地址:" + e.attr("href"));
            System.out.println("------------------------");
        }
    }
}