Crawler Task 1: Use HttpClient to crawl the news titles and URLs from the Baidu News homepage (the page encoding is UTF-8)

The first small crawler task to get started with.

Maven project (pom.xml):

<project xmlns="http://maven.apache.org/POM/4.0.0"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.zhaowu</groupId>
    <artifactId>pachong01</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.2</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.6</version>
        </dependency>


    </dependencies>
</project>
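The commons-io dependency declared above is not actually used in the example below; here is a minimal sketch of what it is typically pulled in for, assuming you want to persist the fetched HTML to a local file (the class name SavePage and the file name baidu-news.html are illustrations only, not part of the original task):

package com.zhaowu.renwu1;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.commons.io.FileUtils;

public class SavePage {
    public static void main(String[] args) throws IOException {
        // In practice this would be the page content returned by HttpClient
        String html = "<html><body>example</body></html>";
        // Write the content to a local file in UTF-8
        FileUtils.writeStringToFile(new File("baidu-news.html"), html, StandardCharsets.UTF_8);
    }
}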

Code implementation:

package com.zhaowu.renwu1;

import java.io.IOException;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class News {
    public static void main(String[] args) throws ClientProtocolException, IOException {
        // Create an HttpClient instance
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Create an HttpGet instance
        HttpGet httpGet = new HttpGet("https://news.baidu.com/");
        
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(10000) // connection timeout: 10 seconds (in milliseconds)
                .setSocketTimeout(10000)  // socket (read) timeout: 10 seconds
                .build();
        httpGet.setConfig(config);
        // Set the User-Agent request header to mimic a browser
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; W…) Gecko/20100101 Firefox/59.0");
        // Execute the GET request
        CloseableHttpResponse response = httpClient.execute(httpGet);
        // Get the response entity
        HttpEntity entity = response.getEntity();
        // Read the entity content as a string (the page is UTF-8 encoded)
        String content = EntityUtils.toString(entity, "utf-8");
        // System.out.println("Page content: " + content);

        // Parse the page and obtain a Document object
        Document doc = Jsoup.parse(content);    
        
        Elements hrefElements = doc.select("a[href]"); // select all <a> elements that have an href attribute
        for (Element e : hrefElements) {
            System.out.println("新聞標題:" + e.text());
            System.out.println("新聞地址:" + e.attr("href"));
            System.out.println("------------------------");
        }

        // Release the connection resources
        response.close();
        httpClient.close();
    }
}
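Note that the a[href] selector above matches every link on the page, including navigation and footer links. Below is a small self-contained sketch of one way to narrow the output, assuming you only want anchors with an absolute http(s) URL and non-empty text; the class name NewsFiltered and the inline HTML snippet are illustrations only:

package com.zhaowu.renwu1;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class NewsFiltered {
    public static void main(String[] args) {
        // A tiny HTML snippet standing in for the page content fetched above
        String content = "<a href='https://news.baidu.com/item?id=1'>Example headline</a>"
                + "<a href='#'>skip me</a>";
        Document doc = Jsoup.parse(content);
        // Only keep anchors whose href starts with "http" (absolute links)
        for (Element e : doc.select("a[href^=http]")) {
            String title = e.text().trim();
            if (!title.isEmpty()) { // skip image-only or empty anchors
                System.out.println("News title: " + title);
                System.out.println("News URL: " + e.attr("href"));
            }
        }
    }
}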