Crawling all the answers to a Zhihu question with Java

I suddenly felt like crawling the answers to a Zhihu question, so I started studying the Zhihu page. At first I crawled the browser-rendered page, parsed the DOM, and looked for the specific tags that hold the answers.

Then I found that each pass only gives you the few answers the page has already loaded; to get more you have to scroll down so the browser loads another batch. That makes parsing the rendered DOM a dead end: it never yields all the answers.

After searching around, I found that you can instead simulate the browser's requests to the server, take the server's response, and parse the data it returns.

With a plan in hand, the next step was analyzing the network requests. I use Firefox: press F12 and open the Network tab.

Scroll the request list down to its current bottom.

Then scroll the page down; once new answers seem to have loaded, you can stop. By that point the URL that requested the new content is guaranteed to be in the list, and all that is left is to pick it out.

One way is to read meaning into each URL, which is hard to do; the other is to simply paste candidate URLs into the browser and look at what comes back. The request URL I ended up with is the /api/v4/questions/.../answers endpoint that appears in the code below.

Pasting that link into the browser's address bar returns the raw JSON that holds the answers.
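Two parts of that JSON matter for the code below: paging.totals, the total number of answers, and the data array, where each element's content field holds one answer as HTML. Paging is controlled by two query parameters, limit (how many answers per request; at most 20 worked in my tests) and offset (the starting position), so the requests walk through offset = 0, 20, 40, ... until limit + offset reaches totals. As a minimal sketch of the parsing approach used later (the class name and the trimmed-down JSON sample here are made up), Gson can turn the whole response into a Map whose nested objects come back as LinkedTreeMap and whose numbers come back as Double:

import com.google.gson.Gson;
import com.google.gson.internal.LinkedTreeMap;

import java.util.HashMap;
import java.util.Map;

public class PagingProbe {
    public static void main(String[] args) {
        // made-up, trimmed-down response; only the fields the crawler needs are shown
        String json = "{\"paging\":{\"totals\":100},\"data\":[{\"content\":\"<p>an answer</p>\"}]}";
        Gson gson = new Gson();
        Map<String, Object> map = new HashMap<String, Object>();
        map = gson.fromJson(json, map.getClass());
        // nested JSON objects become LinkedTreeMap, JSON numbers become Double
        LinkedTreeMap<String, Object> paging = (LinkedTreeMap<String, Object>) map.get("paging");
        System.out.println("totals = " + (Double) paging.get("totals")); // prints totals = 100.0
    }
}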

Next comes the code.

import com.google.gson.Gson;
import com.google.gson.internal.LinkedTreeMap;

import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


public class ZhiHu {
    public static void main(String[] args) {
        // request URL
        String url = "https://www.zhihu.com/api/v4/questions/62209505/answers?" +
                "include=data[*].is_normal,admin_closed_comment,reward_info," +
                "is_collapsed,annotation_action,annotation_detail,collapse_reason," +
                "is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content," +
                "editable_content,voteup_count,reshipment_settings,comment_permission," +
                "created_time,updated_time,review_info,relevant_info,question,excerpt," +
                "relationship.is_authorized,is_author,voting,is_thanked,is_nothelp;data[*].mark_infos[*].url;" +
                "data[*].author.follower_count,badge[?(type=best_answerer)].topics&sort_by=default";
        // fetch all the answers
        StringBuffer stringBuffer = sendGet(url, 20, 0);
        // print the result with HTML tags stripped
        System.out.println(delHTMLTag(new String(stringBuffer)));
    }

    public static StringBuffer sendGet(String baseUrl, int limit, int offset) {
        // holds the raw response of this request
        String responseResult = "";
        // reader for the server's response stream
        BufferedReader bufferedReader = null;
        // accumulates all answer contents
        StringBuffer stringBuffer = new StringBuffer();
        // number of answers returned by this request
        int num = 0;
        try {
            // limit sets how many answers a request returns and offset sets the starting position;
            // the previous limit + offset is the next starting position. In my tests the API returns at most 20 answers per request.
            String urlToConnect = baseUrl + "&limit=" + limit + "&offset=" + offset;
            URL url = new URL(urlToConnect);
            // open a connection to the URL
            URLConnection connection = url.openConnection();
            // set the request headers; the values can be copied from the request shown in the network panel above
            connection.setRequestProperty("Referer", "https://www.zhihu.com/question/276275499");
            connection.setRequestProperty("origin", "https://www.zhihu.com");
            connection.setRequestProperty("x-udid", "replace with your own udid");
            connection.setRequestProperty("Cookie", "replace with your own cookie");
            connection.setRequestProperty("accept", "application/json, text/plain, */*");
            connection.setRequestProperty("connection", "Keep-Alive");
            connection.setRequestProperty("Host", "www.zhihu.com");
            connection.setRequestProperty("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0");
            // establish the actual connection
            connection.connect();
            // read the response with a BufferedReader
            bufferedReader = new BufferedReader(new InputStreamReader(connection.getInputStream()));
            String line = null;
            while ((line = bufferedReader.readLine()) != null) {
                responseResult += line;
            }
            // parse the JSON response into a map
            Gson gson = new Gson();
            Map<String, Object> map = new HashMap<String, Object>();
            map = gson.fromJson(responseResult, map.getClass());
            // get the paging info
            LinkedTreeMap<String, Object> pageList = (LinkedTreeMap<String, Object>) map.get("paging");
            // total number of answers
            double totals = (Double) pageList.get("totals");
            // zero means this request fetched the last batch of fewer than 20 answers
            if (totals - (limit + offset) != 0) {
                if (totals - (limit + offset) > 20) {
                    // more than 20 answers remain, so recurse for the next 20
                    stringBuffer.append(sendGet(baseUrl, 20, limit + offset));
                    stringBuffer.append("\r\n"); // line break for formatting
                } else {
                    // 20 or fewer remain: this is the final batch, so shrink limit to exactly what is left
                    stringBuffer.append(sendGet(baseUrl, (int) (totals - (limit + offset)), limit + offset));
                    stringBuffer.append("\r\n");
                }
            }
            // the "data" array holds the answers of this batch
            ArrayList<LinkedTreeMap<String, String>> dataList = (ArrayList<LinkedTreeMap<String, String>>) map.get("data");
            // append each answer's content to the result
            for (LinkedTreeMap<String, String> contentLink : dataList) {
                stringBuffer.append(contentLink.get("content") + "\r\n\r\n");
                num++; // count the answers returned by this request
            }
            System.out.println("Answers in this batch: " + num);
        } catch (Exception e) {
            System.out.println("Exception while sending the GET request! " + e);
            e.printStackTrace();
        } finally {
            // close the input stream
            try {
                if (bufferedReader != null) {
                    bufferedReader.close();
                }
            } catch (Exception e2) {
                e2.printStackTrace();
            }
        }
        // return all answers collected by this call and its recursive calls
        return stringBuffer;
    }

    private static final String regEx_script = "<script[^>]*?>[\\s\\S]*?<\\/script>"; // regex for script blocks
    private static final String regEx_style = "<style[^>]*?>[\\s\\S]*?<\\/style>"; // regex for style blocks
    private static final String regEx_html = "<[^>]+>"; // regex for HTML tags
    private static final String regEx_space = "\\s*|\t|\r|\n"; // regex for whitespace and line breaks

    /**
     * Strips HTML tags from a string.
     * @param htmlStr the HTML string
     * @return the plain-text string
     */
    public static String delHTMLTag(String htmlStr) {
        Pattern p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
        Matcher m_script = p_script.matcher(htmlStr);
        htmlStr = m_script.replaceAll(""); // strip script blocks
        Pattern p_style = Pattern.compile(regEx_style, Pattern.CASE_INSENSITIVE);
        Matcher m_style = p_style.matcher(htmlStr);
        htmlStr = m_style.replaceAll(""); // strip style blocks
        Pattern p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);
        Matcher m_html = p_html.matcher(htmlStr);
        htmlStr = m_html.replaceAll(""); // strip remaining HTML tags
        return htmlStr.replaceAll(" ", ""); // drop residual spaces and return the plain text
    }
}

The output:

The more answers there are, the longer the output takes. The order also differs from the page: because each sendGet call appends the result of its recursive call (the later answers) before appending its own batch, the answers print from bottom to top, in groups of 20.

To cut down the wait, I switched to multiple threads.

ZhiHuSpider.java first gets the total number of answers, then creates threads in a loop, increasing offset by 20 each time.
import com.google.gson.Gson;
import com.google.gson.internal.LinkedTreeMap;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.Map;

public class ZhiHuSpider {
    public static void main(String[] args){
        String baseUrl= "https://www.zhihu.com/api/v4/questions/62209505/answers?" +
                "include=data[*].is_normal,admin_closed_comment,reward_info," +
                "is_collapsed,annotation_action,annotation_detail,collapse_reason," +
                "is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content," +
                "editable_content,voteup_count,reshipment_settings,comment_permission," +
                "created_time,updated_time,review_info,relevant_info,question,excerpt," +
                "relationship.is_authorized,is_author,voting,is_thanked,is_nothelp;data[*].mark_infos[*].url;" +
                "data[*].author.follower_count,badge[?(type=best_answerer)].topics&sort_by=default";
        // holds the raw JSON response of the first request
        String responseResult = "";
        BufferedReader bufferedReader = null;
        // not used in this class; each SpiderThread builds and prints its own result
        StringBuffer stringBuffer = new StringBuffer();
        // the connection used to fetch the paging info
        URLConnection connection = null;
        try {
            String urlToConnect = baseUrl + "&limit=" + 20 + "&offset=" + 0;
            URL url = new URL(urlToConnect);
            // open a connection to the URL
            connection = url.openConnection();
            // set the request headers (copied from the browser's network panel)
            connection.setRequestProperty("Referer", "https://www.zhihu.com/question/276275499");
            connection.setRequestProperty("origin", "https://www.zhihu.com");
            connection.setRequestProperty("x-udid", "replace with your own udid");
            connection.setRequestProperty("Cookie", "replace with your own cookie");
            connection.setRequestProperty("accept", "application/json, text/plain, */*");
            connection.setRequestProperty("connection", "Keep-Alive");
            connection.setRequestProperty("Host", "www.zhihu.com");
            connection.setRequestProperty("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0");
            // establish the actual connection
            connection.connect();
            // read the response with a BufferedReader
            bufferedReader = new BufferedReader(new InputStreamReader(connection.getInputStream()));
            String line = null;
            while ((line = bufferedReader.readLine()) != null) {
                responseResult += line;
            }
            // parse the JSON response into a map
            Gson gson = new Gson();
            Map<String, Object> map = new HashMap<String, Object>();
            map = gson.fromJson(responseResult, map.getClass());
            // get the paging info
            LinkedTreeMap<String, Object> pageList = (LinkedTreeMap<String, Object>) map.get("paging");
            // total number of answers
            double totals = (Double) pageList.get("totals");
            // one thread per batch of 20 answers
            for (int offset = 0; offset < totals; offset += 20) {
                SpiderThread spiderThread = new SpiderThread(baseUrl, 20, offset);
                new Thread(spiderThread).start();
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (bufferedReader != null) {
                    bufferedReader.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
SpiderThread.java   
import com.google.gson.Gson;
import com.google.gson.internal.LinkedTreeMap;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


public class SpiderThread implements Runnable{
    private String baseUrl;
    private int limit;
    private int offset;
    SpiderThread(String baseUrl, int limit, int offset) {
        this.baseUrl = baseUrl;
        this.limit = limit;
        this.offset = offset;
    }

    public void run() {
        System.out.println(delHTMLTag(new String(sendGet(baseUrl, limit, offset))));
    }

    public StringBuffer sendGet(String baseUrl, int limit, int offset) {
        // holds the raw response of this request
        String responseResult = "";
        BufferedReader bufferedReader = null;
        // accumulates the answer contents of this batch
        StringBuffer stringBuffer = new StringBuffer();
        // number of answers returned by this request
        int num = 0;
        try {
            // limit sets how many answers a request returns and offset sets the starting position;
            // the previous limit + offset is the next starting position. The API returns at most 20 answers per request.
            String urlToConnect = baseUrl + "&limit=" + limit + "&offset=" + offset;
            URL url = new URL(urlToConnect);
            // open a connection to the URL
            URLConnection connection = url.openConnection();
            // set the request headers
            connection.setRequestProperty("Referer", "https://www.zhihu.com/question/276275499");
            connection.setRequestProperty("origin", "https://www.zhihu.com");
            connection.setRequestProperty("x-udid", "replace with your own udid");
            connection.setRequestProperty("Cookie", "replace with your own cookie");
            connection.setRequestProperty("accept", "application/json, text/plain, */*");
            connection.setRequestProperty("connection", "Keep-Alive");
            connection.setRequestProperty("Host", "www.zhihu.com");
            connection.setRequestProperty("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0");
            // establish the actual connection
            connection.connect();
            // read the response with a BufferedReader
            bufferedReader = new BufferedReader(new InputStreamReader(connection.getInputStream()));
            String line = null;
            while ((line = bufferedReader.readLine()) != null) {
                responseResult += line;
            }
            // parse the JSON response into a map
            Gson gson = new Gson();
            Map<String, Object> map = new HashMap<String, Object>();
            map = gson.fromJson(responseResult, map.getClass());
            // the "data" array holds the answers of this batch
            ArrayList<LinkedTreeMap<String, String>> dataList = (ArrayList<LinkedTreeMap<String, String>>) map.get("data");
            // append each answer's content to the result
            for (LinkedTreeMap<String, String> contentLink : dataList) {
                stringBuffer.append(contentLink.get("content") + "\r\n\r\n");
                num++; // count the answers returned by this request
            }
            System.out.println("Answers in this batch: " + num);
        } catch (Exception e) {
            System.out.println("Exception while sending the GET request! " + e);
            e.printStackTrace();
        } finally {
            // close the input stream
            try {
                if (bufferedReader != null) {
                    bufferedReader.close();
                }
            } catch (Exception e2) {
                e2.printStackTrace();
            }
        }
        // return all answers fetched by this request
        return stringBuffer;
    }

    private static final String regEx_script = "<script[^>]*?>[\\s\\S]*?<\\/script>"; // regex for script blocks
    private static final String regEx_style = "<style[^>]*?>[\\s\\S]*?<\\/style>"; // regex for style blocks
    private static final String regEx_html = "<[^>]+>"; // regex for HTML tags
    private static final String regEx_space = "\\s*|\t|\r|\n"; // regex for whitespace and line breaks

    /**
     * Strips HTML tags from a string.
     * @param htmlStr the HTML string
     * @return the plain-text string
     */
    public static String delHTMLTag(String htmlStr) {
        Pattern p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
        Matcher m_script = p_script.matcher(htmlStr);
        htmlStr = m_script.replaceAll(""); // strip script blocks
        Pattern p_style = Pattern.compile(regEx_style, Pattern.CASE_INSENSITIVE);
        Matcher m_style = p_style.matcher(htmlStr);
        htmlStr = m_style.replaceAll(""); // strip style blocks
        Pattern p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);
        Matcher m_html = p_html.matcher(htmlStr);
        htmlStr = m_html.replaceAll(""); // strip remaining HTML tags
        return htmlStr.replaceAll(" ", ""); // drop residual spaces and return the plain text
    }
}
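One side effect of starting bare threads is that the 20-answer batches print in whatever order the threads happen to finish, so the result is again not in page order. If order matters, one possible variation (a sketch of my own, not part of the original code; the class and method names are made up, and it assumes it lives in the same package as SpiderThread) is to submit the batches as Callable tasks to an ExecutorService and read the Futures in submission order, reusing SpiderThread.sendGet and delHTMLTag:

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class OrderedSpider {
    // baseUrl and totals are assumed to be obtained exactly as in ZhiHuSpider.main
    public static void printInOrder(final String baseUrl, double totals) throws Exception {
        ExecutorService pool = Executors.newFixedThreadPool(8); // up to 8 requests in flight
        List<Future<StringBuffer>> futures = new ArrayList<Future<StringBuffer>>();
        for (int offset = 0; offset < totals; offset += 20) {
            final int start = offset;
            // each task fetches one batch of up to 20 answers and returns it instead of printing it
            futures.add(pool.submit(new Callable<StringBuffer>() {
                public StringBuffer call() {
                    return new SpiderThread(baseUrl, 20, start).sendGet(baseUrl, 20, start);
                }
            }));
        }
        // the Futures are read in submission order, so the batches print in page order
        for (Future<StringBuffer> future : futures) {
            System.out.println(SpiderThread.delHTMLTag(future.get().toString()));
        }
        pool.shutdown();
    }
}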

A problem I ran into:

I tried it out: the other requests all work fine, and totals really is the total number of answers; only this one request has the problem.

The link is: https://www.zhihu.com/api/v4/questions/62209505/answers?include=data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,relevant_info,question,excerpt,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp;data[*].mark_infos[*].url;data[*].author.follower_count,badge[?(type=best_answerer)].topics&limit=20&offset=2020&sort_by=default

2018.6.1

Added a method that strips the HTML tags, so the output looks a bit nicer.
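For example (a tiny illustration of my own; the class name and the HTML fragment are made up), the stripping behaves like this:

public class DelHtmlTagDemo {
    public static void main(String[] args) {
        // hypothetical answer fragment, just to show what delHTMLTag removes
        String html = "<p>Hello<script>alert(1)</script> <b>Zhihu</b></p>";
        // prints HelloZhihu: script/style blocks, all tags, and residual spaces are stripped
        System.out.println(ZhiHu.delHTMLTag(html));
    }
}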
