httpClient爬蟲

 

 

package httpClient.client;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.UUID;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Minimal image crawler built on Apache HttpClient and jsoup.
 *
 * <p>{@link #main} downloads the listing page, follows every link found under the element
 * with id {@code pins}, looks up the image inside the {@code main-image} element of each
 * linked page and saves it into the working directory under a random UUID-based file name.
 */
public class HttpClinet {

    /** Listing page the crawl starts from. */
    private static final String START_URL = "https://www.mzitu.com/";

    /**
     * Crawls the listing page and downloads one image per linked post.
     *
     * @param args unused
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException if a request fails, a page has no body, or a file cannot be written
     */
    public static void main(String[] args) throws ClientProtocolException, IOException {
        HttpClinet crawler = new HttpClinet();
        // try-with-resources: the client (and its connection pool) is released even when
        // the crawl aborts with an exception.
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            Document listing = Jsoup.parse(crawler.fetchHtml(httpClient, START_URL));
            // Like in JavaScript: look the post list up by its id.
            Element postList = listing.getElementById("pins");
            if (postList == null) {
                // getElementById returns null when the id is absent (layout change, error page).
                System.err.println("No element with id \"pins\" found at " + START_URL);
                return;
            }

            // Consecutive anchors with the same href are visited only once.
            String previousUrl = "";
            for (Element link : postList.select("li a")) {
                String pageUrl = link.attr("href").trim();
                if (pageUrl.isEmpty() || pageUrl.equals(previousUrl)) {
                    continue;
                }
                previousUrl = pageUrl;

                Document page = Jsoup.parse(crawler.fetchHtml(httpClient, pageUrl));
                String imageUrl = findImageUrl(page);
                if (imageUrl.isEmpty()) {
                    // attr() yields "" rather than null, so emptiness is the real "not found" signal.
                    System.err.println("No image found on " + pageUrl);
                    continue;
                }
                System.out.println(imageUrl);
                crawler.ulr(imageUrl, httpClient);
            }
        }
    }

    /**
     * Downloads {@code url} and stores the body in the working directory as
     * {@code <random UUID><extension>}, where the extension is taken from the URL path.
     * Does nothing when the response carries no body.
     *
     * @param url        address of the resource to download
     * @param httpClient client used to perform the request
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException if the request fails or the file cannot be written
     */
    public void ulr(String url, CloseableHttpClient httpClient) throws ClientProtocolException, IOException {
        String extension = extensionOf(url);
        HttpEntity entity = this.linek(httpClient, url); // response body, may be null
        if (entity == null) {
            return;
        }
        // The Content-Type header is optional, so guard before dereferencing it.
        if (entity.getContentType() != null) {
            System.out.println("Content-Type:" + entity.getContentType().getValue());
        }
        // copyToFile (Commons IO) leaves the source stream open; closing it here is what
        // hands the underlying connection back to the pool, also when the copy fails.
        try (InputStream inputStream = entity.getContent()) {
            FileUtils.copyToFile(inputStream, new File(UUID.randomUUID() + extension));
        }
    }

    /**
     * Closes the given response and client; either argument may be {@code null}.
     * The client is closed even if closing the response fails.
     *
     * @param response   response to close, or {@code null}
     * @param httpClient client to close, or {@code null}
     * @throws IOException if closing either resource fails
     */
    public void close(CloseableHttpResponse response, CloseableHttpClient httpClient) throws IOException {
        try {
            if (response != null) {
                response.close();
            }
        } finally {
            if (httpClient != null) {
                httpClient.close();
            }
        }
    }

    /**
     * Issues a GET request for {@code url} with fixed {@code If-None-Match}, {@code Referer}
     * and {@code User-Agent} headers and returns the response entity.
     *
     * <p>The response itself is not closed here when it has a body: the caller must fully
     * consume the returned entity or close its content stream.
     *
     * @param httpClient client used to perform the request
     * @param url        address to request
     * @return the response entity, or {@code null} if the response has no body
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException if the request fails
     */
    public HttpEntity linek(CloseableHttpClient httpClient, String url) throws ClientProtocolException, IOException {
        HttpGet httpGet = new HttpGet(url);
        // NOTE(review): this ETag value has an opening quote but no closing one, and a
        // conditional request may presumably be answered without a body - confirm whether
        // the header is needed at all.
        httpGet.setHeader("If-None-Match", "W/\"5cc2cd8f-2c58");
        httpGet.setHeader("Referer", "http://www.mzitu.com/all/");
        httpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36");
        CloseableHttpResponse response = httpClient.execute(httpGet);
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            // No body for a caller to consume, so close the response right away.
            response.close();
        }
        return entity;
    }

    /** Fetches {@code url} and returns its body decoded as UTF-8; fails if there is no body. */
    private String fetchHtml(CloseableHttpClient httpClient, String url) throws IOException {
        HttpEntity entity = linek(httpClient, url);
        if (entity == null) {
            throw new IOException("No response body received from " + url);
        }
        // toString reads the entity to the end and closes its stream.
        return EntityUtils.toString(entity, "UTF-8");
    }

    /** Returns the {@code src} of the image inside the {@code main-image} element, or "" if absent. */
    private static String findImageUrl(Document page) {
        Element container = page.getElementsByClass("main-image").first();
        if (container == null) {
            return "";
        }
        return container.select("p a img").attr("src").trim();
    }

    /**
     * Returns the file extension (including the leading dot) of the last path segment of
     * {@code url}, ignoring any query string or fragment; returns "" when there is none.
     */
    private static String extensionOf(String url) {
        String path = url;
        int query = path.indexOf('?');
        if (query >= 0) {
            path = path.substring(0, query);
        }
        int fragment = path.indexOf('#');
        if (fragment >= 0) {
            path = path.substring(0, fragment);
        }
        // Only look at the last segment so a dot in the host name is never mistaken
        // for an extension (and no '/' can leak into the file name).
        String segment = path.substring(path.lastIndexOf('/') + 1);
        int dot = segment.lastIndexOf('.');
        if (dot < 0 || dot == segment.length() - 1) {
            return "";
        }
        return segment.substring(dot);
    }
}
相關文章
相關標籤/搜索