JAVA爬蟲實踐(實踐二:博客園)

分析博客園網站的請求可以發現,博客園的分頁請求爲POST方式,和知乎的滾動加載相似。

不同的是,請求響應返回的是HTML而不是JSON。

這樣可以套用上一篇爬知乎的代碼,需要修改的部分就是POST方法傳的參數(直接用Map),還有解析HTML的部分。

模擬POST請求

/**
 * Sends a POST request to the cnblogs post-list endpoint and returns the response body.
 *
 * @param args request parameters; every entry is added to the request as a name/value pair
 * @return the response body as read by {@code convertStreamToString}, or an empty string
 *         when the response carries no entity
 * @throws Exception if the request cannot be built or executed, or the body cannot be read
 */
public String doPost(Map<String, String> args) throws Exception {
    HttpClient httpClient = new DefaultHttpClient();
    try {
        RequestBuilder builder = RequestBuilder.post()
                .setUri("http://www.cnblogs.com/mvc/AggSite/PostList.aspx");
        // Iterate entries directly instead of looking every key up a second time.
        for (Map.Entry<String, String> arg : args.entrySet()) {
            builder.addParameter(arg.getKey(), arg.getValue());
        }

        HttpUriRequest httpUriRequest = builder.build();

        // Headers copied from a real browser session; the Cookie value is a placeholder
        // that has to be replaced with your own cookie.
        httpUriRequest.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0");
        httpUriRequest.setHeader("Cookie", "這裏的仍是要用本身的Cookie");
        httpUriRequest.setHeader("DNT", "1");
        httpUriRequest.setHeader("Connection", "keep-alive");
        httpUriRequest.setHeader("Upgrade-Insecure-Requests", "1");
        // NOTE(review): a fixed If-Modified-Since makes this a conditional request; confirm the
        // endpoint never answers it with an empty "not modified" response.
        httpUriRequest.setHeader("If-Modified-Since", "Wed, 12 Apr 2017 03:10:29 GMT");

        HttpResponse response = httpClient.execute(httpUriRequest);

        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return "";
        }
        InputStream instreams = entity.getContent();
        return convertStreamToString(instreams);
    } finally {
        // A new client is created per call, so release its connections when the call ends.
        httpClient.getConnectionManager().shutdown();
    }
}

HTML內容的提取部分

由於HTML中所有標籤元素的id唯一,可以找到一個距離較近的帶id的元素,再向下取到內容。

這裏還是較多地用get(0)來取元素。

/**
 * Extracts the post entries from a cnblogs listing page.
 *
 * <p>For every element whose {@code class} attribute is {@code post_item} the recommendation
 * count, title link, title text and summary are printed to standard output and appended to
 * the returned text. Entries without a {@code post_item_body} child are skipped; any other
 * missing piece is reported as an empty string instead of aborting the whole page.
 *
 * @param html source HTML; {@code null} is treated as an empty page
 * @return the formatted entries, or an empty string when nothing was found
 */
public String unparsedData(String html) {
    if (html == null) {
        return "";
    }

    Document doc = Jsoup.parse(html);
    Elements elements = doc.getElementsByAttributeValue("class",
            "post_item");

    // StringBuilder avoids re-copying the accumulated text for every entry.
    StringBuilder writeStr = new StringBuilder();
    for (Element element : elements) {
        // first() yields null on an empty match, where get(0) would throw.
        Element postItemBody = element.getElementsByAttributeValue("class", "post_item_body").first();
        if (postItemBody == null) {
            continue;
        }

        // Recommendation count
        Element diggBox = element.getElementsByAttributeValue("class", "digg").first();
        String digg = diggBox == null ? "" : diggBox.getElementsByTag("span").text().trim();

        // Title: resolve the link element once and read both href and text from it
        Element heading = postItemBody.getElementsByTag("h3").first();
        Element titleLink = heading == null
                ? null
                : heading.getElementsByAttributeValue("class", "titlelnk").first();
        String titleHref = titleLink == null ? "" : titleLink.attr("href");
        String titleText = titleLink == null ? "" : titleLink.text().trim();

        // Summary
        Element summary = postItemBody.getElementsByAttributeValue("class", "post_item_summary").first();
        String contentText = summary == null ? "" : summary.text().trim();

        System.out.println("--------------------");
        System.out.println("-----標題-----");
        System.out.println("推薦:" + digg);
        System.out.println("連接:" + titleHref);
        System.out.println("內容:" + titleText);
        System.out.println("-----內容-----");
        System.out.println("內容:" + contentText);
        System.out.println("--------------------");

        writeStr.append("--------------------\n-----標題-----推薦:").append(digg).append("\n")
                .append(titleHref).append("\n")
                .append(titleText).append("\n-----內容-----\n")
                .append(contentText).append("\n--------------------\n\n\n");
    }
    return writeStr.toString();
}

完整代碼

package spider;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.DefaultHttpClient;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;

/**
 * Crawler for the cnblogs home page listing.
 *
 * <p>The first page is fetched with a GET request, the following pages with POST requests to
 * the post-list endpoint. Every page is reduced to a plain-text summary of its posts, which is
 * printed to standard output and written to a text file.
 */
@SuppressWarnings("deprecation")
public class CnblogsSpider {

    /** Address of the first listing page. */
    private static final String HOME_URL = "http://www.cnblogs.com";

    /** Endpoint that returns the HTML fragment of a later listing page. */
    private static final String POST_LIST_URL = "http://www.cnblogs.com/mvc/AggSite/PostList.aspx";

    /** File that receives the extracted text. */
    private static final String OUTPUT_PATH = "D:\\testFile\\cnblogs.txt";

    /** Charset of the output file; fixed so the result does not depend on the platform default. */
    private static final String OUTPUT_CHARSET = "UTF-8";

    /** First page index requested through POST (the GET request covers the page before it). */
    private static final int FIRST_POST_PAGE = 2;

    /** Last page index requested through POST. */
    private static final int LAST_POST_PAGE = 200;

    /** Pause between two POST requests, in milliseconds. */
    private static final long REQUEST_DELAY_MILLIS = 200L;

    /**
     * Downloads the listing pages and stores the extracted post data in {@link #OUTPUT_PATH}.
     *
     * @throws Exception if a request fails, the output file cannot be written, or the thread
     *                   is interrupted while pausing between requests
     */
    @Test
    public void downloadFile() throws Exception {

        // Fetch and parse the first page before touching the output file, so a failed
        // request does not truncate an existing result.
        String firstPage = unparsedData(doGet());

        File outputFile = new File(OUTPUT_PATH);
        File parentDir = outputFile.getParentFile();
        if (parentDir != null && !parentDir.exists()) {
            // A failure here surfaces as FileNotFoundException when the writer is opened.
            parentDir.mkdirs();
        }

        // One writer for the whole run; it is closed in finally even when a request fails.
        PrintWriter printWriter = new PrintWriter(outputFile, OUTPUT_CHARSET);
        try {
            printWriter.write(firstPage);
            printWriter.flush();

            Map<String, String> args = new HashMap<String, String>();
            args.put("CategoryId", "808");
            args.put("CategoryType", "\"SiteHome\"");
            args.put("ItemListActionName", "\"PostList\"");
            args.put("ParentCategoryId", "0");
            args.put("TotalPostCount", "4000");
            for (int pageIndex = FIRST_POST_PAGE; pageIndex <= LAST_POST_PAGE; pageIndex++) {
                // Throttle the requests; tune the delay as needed.
                Thread.sleep(REQUEST_DELAY_MILLIS);
                args.put("PageIndex", String.valueOf(pageIndex));

                String pageHtml = doPost(args);
                printWriter.write(unparsedData(pageHtml));
                // Flush per page so the pages fetched so far survive a later failure.
                printWriter.flush();
            }
        } finally {
            printWriter.close();
        }
    }

    /**
     * Fetches the home page with a GET request.
     *
     * @return the response body (HTML), or an empty string when the response has no entity
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException             if the connection fails or the body cannot be read
     */
    public String doGet() throws ClientProtocolException, IOException {
        return execute(new HttpGet(HOME_URL));
    }

    /**
     * Fetches one listing page with a POST request.
     *
     * @param args request parameters; every entry is added to the request as a name/value pair
     * @return the response body (HTML), or an empty string when the response has no entity
     * @throws Exception if the request cannot be built or executed, or the body cannot be read
     */
    public String doPost(Map<String, String> args) throws Exception {
        RequestBuilder builder = RequestBuilder.post().setUri(POST_LIST_URL);
        for (Map.Entry<String, String> arg : args.entrySet()) {
            builder.addParameter(arg.getKey(), arg.getValue());
        }
        return execute(builder.build());
    }

    /**
     * Adds the shared headers, runs the request on a fresh client and reads the whole body.
     */
    private String execute(HttpUriRequest request)
            throws ClientProtocolException, IOException {
        HttpClient httpClient = new DefaultHttpClient();
        try {
            applyBrowserHeaders(request);
            HttpResponse response = httpClient.execute(request);

            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return "";
            }
            return convertStreamToString(entity.getContent());
        } finally {
            // A new client is created per call, so release its connections when the call ends.
            httpClient.getConnectionManager().shutdown();
        }
    }

    /**
     * Sets the request headers used for both the GET and the POST request.
     */
    private static void applyBrowserHeaders(HttpUriRequest request) {
        request.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0");
        // Placeholder value: replace it with your own cookie.
        request.setHeader("Cookie", "這裏的仍是要用本身的Cookie");
        request.setHeader("DNT", "1");
        request.setHeader("Connection", "keep-alive");
        request.setHeader("Upgrade-Insecure-Requests", "1");
        // NOTE(review): a fixed If-Modified-Since makes this a conditional request; confirm the
        // server never answers it with an empty "not modified" response.
        request.setHeader("If-Modified-Since", "Wed, 12 Apr 2017 03:10:29 GMT");
    }

    /**
     * Reads the stream to its end as UTF-8 text and closes it.
     *
     * <p>Lines are joined with {@code '\n'}; a trailing {@code '\n'} follows the last line.
     *
     * @param is stream to read; closed before this method returns
     * @return the decoded content
     * @throws IOException if reading fails; no partial content is returned in that case
     */
    public static String convertStreamToString(InputStream is)
            throws IOException {

        BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF8"));
        try {
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append('\n');
            }
            return sb.toString();
        } finally {
            try {
                reader.close();
            } catch (IOException ignored) {
                // Closing is best effort: either the content is already complete or a read
                // failure is propagating and must not be masked by the close failure.
            }
        }
    }

    /**
     * Extracts the post entries from a listing page.
     *
     * <p>For every element whose {@code class} attribute is {@code post_item} the
     * recommendation count, title link, title text and summary are printed to standard output
     * and appended to the returned text. Entries without a {@code post_item_body} child are
     * skipped; any other missing piece is reported as an empty string.
     *
     * @param html source HTML; {@code null} is treated as an empty page
     * @return the formatted entries, or an empty string when nothing was found
     */
    public String unparsedData(String html) {
        if (html == null) {
            return "";
        }

        Document doc = Jsoup.parse(html);
        Elements postItems = doc.getElementsByAttributeValue("class", "post_item");

        // StringBuilder avoids re-copying the accumulated text for every entry.
        StringBuilder writeStr = new StringBuilder();
        for (Element postItem : postItems) {
            Element postItemBody = firstByClass(postItem, "post_item_body");
            if (postItemBody == null) {
                continue;
            }

            // Recommendation count
            Element diggBox = firstByClass(postItem, "digg");
            String digg = diggBox == null
                    ? ""
                    : diggBox.getElementsByTag("span").text().trim();

            // Title: resolve the link element once and read both href and text from it
            Element heading = postItemBody.getElementsByTag("h3").first();
            Element titleLink = heading == null ? null : firstByClass(heading, "titlelnk");
            String titleHref = titleLink == null ? "" : titleLink.attr("href");
            String titleText = titleLink == null ? "" : titleLink.text().trim();

            // Summary
            Element summary = firstByClass(postItemBody, "post_item_summary");
            String contentText = summary == null ? "" : summary.text().trim();

            System.out.println("--------------------");
            System.out.println("-----標題-----");
            System.out.println("推薦:" + digg);
            System.out.println("連接:" + titleHref);
            System.out.println("內容:" + titleText);
            System.out.println("-----內容-----");
            System.out.println("內容:" + contentText);
            System.out.println("--------------------");

            writeStr.append("--------------------\n-----標題-----推薦:").append(digg).append("\n")
                    .append(titleHref).append("\n")
                    .append(titleText).append("\n-----內容-----\n")
                    .append(contentText).append("\n--------------------\n\n\n");
        }
        return writeStr.toString();
    }

    /**
     * Returns the first element under {@code root} whose {@code class} attribute has the given
     * value, or {@code null} when there is none.
     */
    private static Element firstByClass(Element root, String className) {
        return root.getElementsByAttributeValue("class", className).first();
    }

}
View Code
相關文章
相關標籤/搜索