Crawling all the answers to a Zhihu question with Java

I suddenly felt like crawling the answers to a Zhihu question, so I started studying the Zhihu page. At first I crawled the browser-rendered page, parsed the DOM, and looked for the specific tags that hold the answers.

Then I found that each pass only gives you the few answers the page has already loaded; to get more you have to scroll down so the browser loads another batch. That makes parsing the rendered DOM a dead end: it never yields all the answers.

After searching around, I found that you can instead simulate the browser's requests to the server, take the server's response, and parse the data it returns.

With a plan in hand, the next step was analyzing the network requests. I use Firefox: press F12 and open the Network tab.

Scroll the request list down to its current bottom.

Then scroll the page down; once new answers seem to have loaded, you can stop. By that point the URL that requested the new content is guaranteed to be in the list, and all that is left is to pick it out.

One way is to read meaning into each URL, which is hard to do; the other is to simply paste candidate URLs into the browser and look at what comes back. The request URL I ended up with is the /api/v4/questions/.../answers endpoint that appears in the code below.

Pasting that link into the browser's address bar returns the raw JSON that holds the answers.
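Two parts of that JSON matter for the code below: paging.totals, the total number of answers, and the data array, where each element's content field holds one answer as HTML. Paging is controlled by two query parameters, limit (how many answers per request; at most 20 worked in my tests) and offset (the starting position), so the requests walk through offset = 0, 20, 40, ... until limit + offset reaches totals. As a minimal sketch of the parsing approach used later (the class name and the trimmed-down JSON sample here are made up), Gson can turn the whole response into a Map whose nested objects come back as LinkedTreeMap and whose numbers come back as Double:

import com.google.gson.Gson;
import com.google.gson.internal.LinkedTreeMap;

import java.util.HashMap;
import java.util.Map;

public class PagingProbe {
    public static void main(String[] args) {
        // made-up, trimmed-down response; only the fields the crawler needs are shown
        String json = "{\"paging\":{\"totals\":100},\"data\":[{\"content\":\"<p>an answer</p>\"}]}";
        Gson gson = new Gson();
        Map<String, Object> map = new HashMap<String, Object>();
        map = gson.fromJson(json, map.getClass());
        // nested JSON objects become LinkedTreeMap, JSON numbers become Double
        LinkedTreeMap<String, Object> paging = (LinkedTreeMap<String, Object>) map.get("paging");
        System.out.println("totals = " + (Double) paging.get("totals")); // prints totals = 100.0
    }
}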

Next comes the code.

import com.google.gson.Gson;
import com.google.gson.internal.LinkedTreeMap;

import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


public class ZhiHu {
    public static void main(String[] args) {
        // request URL
        String url = "https://www.zhihu.com/api/v4/questions/62209505/answers?" +
                "include=data[*].is_normal,admin_closed_comment,reward_info," +
                "is_collapsed,annotation_action,annotation_detail,collapse_reason," +
                "is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content," +
                "editable_content,voteup_count,reshipment_settings,comment_permission," +
                "created_time,updated_time,review_info,relevant_info,question,excerpt," +
                "relationship.is_authorized,is_author,voting,is_thanked,is_nothelp;data[*].mark_infos[*].url;" +
                "data[*].author.follower_count,badge[?(type=best_answerer)].topics&sort_by=default";
        // fetch all the answers
        StringBuffer stringBuffer = sendGet(url, 20, 0);
        // print the result with HTML tags stripped
        System.out.println(delHTMLTag(new String(stringBuffer)));
    }

    public static StringBuffer sendGet(String baseUrl, int limit, int offset) {
        // holds the raw response of this request
        String responseResult = "";
        // reader for the server's response stream
        BufferedReader bufferedReader = null;
        // accumulates all answer contents
        StringBuffer stringBuffer = new StringBuffer();
        // number of answers returned by this request
        int num = 0;
        try {
            // limit sets how many answers a request returns and offset sets the starting position;
            // the previous limit + offset is the next starting position. In my tests the API returns at most 20 answers per request.
            String urlToConnect = baseUrl + "&limit=" + limit + "&offset=" + offset;
            URL url = new URL(urlToConnect);
            // open a connection to the URL
            URLConnection connection = url.openConnection();
            // set the request headers; the values can be copied from the request shown in the network panel above
            connection.setRequestProperty("Referer", "https://www.zhihu.com/question/276275499");
            connection.setRequestProperty("origin", "https://www.zhihu.com");
            connection.setRequestProperty("x-udid", "replace with your own udid");
            connection.setRequestProperty("Cookie", "replace with your own cookie");
            connection.setRequestProperty("accept", "application/json, text/plain, */*");
            connection.setRequestProperty("connection", "Keep-Alive");
            connection.setRequestProperty("Host", "www.zhihu.com");
            connection.setRequestProperty("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0");
            // establish the actual connection
            connection.connect();
            // read the response with a BufferedReader
            bufferedReader = new BufferedReader(new InputStreamReader(connection.getInputStream()));
            String line = null;
            while ((line = bufferedReader.readLine()) != null) {
                responseResult += line;
            }
            // parse the JSON response into a map
            Gson gson = new Gson();
            Map<String, Object> map = new HashMap<String, Object>();
            map = gson.fromJson(responseResult, map.getClass());
            // get the paging info
            LinkedTreeMap<String, Object> pageList = (LinkedTreeMap<String, Object>) map.get("paging");
            // total number of answers
            double totals = (Double) pageList.get("totals");
            // zero means this request fetched the last batch of fewer than 20 answers
            if (totals - (limit + offset) != 0) {
                if (totals - (limit + offset) > 20) {
                    // more than 20 answers remain, so recurse for the next 20
                    stringBuffer.append(sendGet(baseUrl, 20, limit + offset));
                    stringBuffer.append("\r\n"); // line break for formatting
                } else {
                    // 20 or fewer remain: this is the final batch, so shrink limit to exactly what is left
                    stringBuffer.append(sendGet(baseUrl, (int) (totals - (limit + offset)), limit + offset));
                    stringBuffer.append("\r\n");
                }
            }
            // the "data" array holds the answers of this batch
            ArrayList<LinkedTreeMap<String, String>> dataList = (ArrayList<LinkedTreeMap<String, String>>) map.get("data");
            // append each answer's content to the result
            for (LinkedTreeMap<String, String> contentLink : dataList) {
                stringBuffer.append(contentLink.get("content") + "\r\n\r\n");
                num++; // count the answers returned by this request
            }
            System.out.println("Answers in this batch: " + num);
        } catch (Exception e) {
            System.out.println("Exception while sending the GET request! " + e);
            e.printStackTrace();
        } finally {
            // close the input stream
            try {
                if (bufferedReader != null) {
                    bufferedReader.close();
                }
            } catch (Exception e2) {
                e2.printStackTrace();
            }
        }
        // return all answers collected by this call and its recursive calls
        return stringBuffer;
    }

    private static final String regEx_script = "<script[^>]*?>[\\s\\S]*?<\\/script>"; // regex for script blocks
    private static final String regEx_style = "<style[^>]*?>[\\s\\S]*?<\\/style>"; // regex for style blocks
    private static final String regEx_html = "<[^>]+>"; // regex for HTML tags
    private static final String regEx_space = "\\s*|\t|\r|\n"; // regex for whitespace and line breaks

    /**
     * Strips HTML tags from a string.
     * @param htmlStr the HTML string
     * @return the plain-text string
     */
    public static String delHTMLTag(String htmlStr) {
        Pattern p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
        Matcher m_script = p_script.matcher(htmlStr);
        htmlStr = m_script.replaceAll(""); // strip script blocks
        Pattern p_style = Pattern.compile(regEx_style, Pattern.CASE_INSENSITIVE);
        Matcher m_style = p_style.matcher(htmlStr);
        htmlStr = m_style.replaceAll(""); // strip style blocks
        Pattern p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);
        Matcher m_html = p_html.matcher(htmlStr);
        htmlStr = m_html.replaceAll(""); // strip remaining HTML tags
        return htmlStr.replaceAll(" ", ""); // drop residual spaces and return the plain text
    }
}

The output:

The more answers there are, the longer the output takes. The order also differs from the page: because each sendGet call appends the result of its recursive call (the later answers) before appending its own batch, the answers print from bottom to top, in groups of 20.

To cut down the wait, I switched to multiple threads.

ZhiHuSpider.java first gets the total number of answers, then creates threads in a loop, increasing offset by 20 each time.
import com.google.gson.Gson;
import com.google.gson.internal.LinkedTreeMap;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.Map;

public class ZhiHuSpider {
    public static void main(String[] args){
        String baseUrl= "https://www.zhihu.com/api/v4/questions/62209505/answers?" +
                "include=data[*].is_normal,admin_closed_comment,reward_info," +
                "is_collapsed,annotation_action,annotation_detail,collapse_reason," +
                "is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content," +
                "editable_content,voteup_count,reshipment_settings,comment_permission," +
                "created_time,updated_time,review_info,relevant_info,question,excerpt," +
                "relationship.is_authorized,is_author,voting,is_thanked,is_nothelp;data[*].mark_infos[*].url;" +
                "data[*].author.follower_count,badge[?(type=best_answerer)].topics&sort_by=default";
        // holds the raw JSON response of the first request
        String responseResult = "";
        BufferedReader bufferedReader = null;
        // not used in this class; each SpiderThread builds and prints its own result
        StringBuffer stringBuffer = new StringBuffer();
        // the connection used to fetch the paging info
        URLConnection connection = null;
        try {
            String urlToConnect = baseUrl + "&limit=" + 20 + "&offset=" + 0;
            URL url = new URL(urlToConnect);
            // open a connection to the URL
            connection = url.openConnection();
            // set the request headers (copied from the browser's network panel)
            connection.setRequestProperty("Referer", "https://www.zhihu.com/question/276275499");
            connection.setRequestProperty("origin", "https://www.zhihu.com");
            connection.setRequestProperty("x-udid", "replace with your own udid");
            connection.setRequestProperty("Cookie", "replace with your own cookie");
            connection.setRequestProperty("accept", "application/json, text/plain, */*");
            connection.setRequestProperty("connection", "Keep-Alive");
            connection.setRequestProperty("Host", "www.zhihu.com");
            connection.setRequestProperty("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0");
            // establish the actual connection
            connection.connect();
            // read the response with a BufferedReader
            bufferedReader = new BufferedReader(new InputStreamReader(connection.getInputStream()));
            String line = null;
            while ((line = bufferedReader.readLine()) != null) {
                responseResult += line;
            }
            // parse the JSON response into a map
            Gson gson = new Gson();
            Map<String, Object> map = new HashMap<String, Object>();
            map = gson.fromJson(responseResult, map.getClass());
            // get the paging info
            LinkedTreeMap<String, Object> pageList = (LinkedTreeMap<String, Object>) map.get("paging");
            // total number of answers
            double totals = (Double) pageList.get("totals");
            // one thread per batch of 20 answers
            for (int offset = 0; offset < totals; offset += 20) {
                SpiderThread spiderThread = new SpiderThread(baseUrl, 20, offset);
                new Thread(spiderThread).start();
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (bufferedReader != null) {
                    bufferedReader.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
SpiderThread.java   
import com.google.gson.Gson;
import com.google.gson.internal.LinkedTreeMap;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


public class SpiderThread implements Runnable{
    private String baseUrl;
    private int limit;
    private int offset;
    SpiderThread(String baseUrl, int limit, int offset) {
        this.baseUrl = baseUrl;
        this.limit = limit;
        this.offset = offset;
    }

    public void run() {
        System.out.println(delHTMLTag(new String(sendGet(baseUrl, limit, offset))));
    }

    public StringBuffer sendGet(String baseUrl, int limit, int offset) {
        // holds the raw response of this request
        String responseResult = "";
        BufferedReader bufferedReader = null;
        // accumulates the answer contents of this batch
        StringBuffer stringBuffer = new StringBuffer();
        // number of answers returned by this request
        int num = 0;
        try {
            // limit sets how many answers a request returns and offset sets the starting position;
            // the previous limit + offset is the next starting position. The API returns at most 20 answers per request.
            String urlToConnect = baseUrl + "&limit=" + limit + "&offset=" + offset;
            URL url = new URL(urlToConnect);
            // open a connection to the URL
            URLConnection connection = url.openConnection();
            // set the request headers
            connection.setRequestProperty("Referer", "https://www.zhihu.com/question/276275499");
            connection.setRequestProperty("origin", "https://www.zhihu.com");
            connection.setRequestProperty("x-udid", "replace with your own udid");
            connection.setRequestProperty("Cookie", "replace with your own cookie");
            connection.setRequestProperty("accept", "application/json, text/plain, */*");
            connection.setRequestProperty("connection", "Keep-Alive");
            connection.setRequestProperty("Host", "www.zhihu.com");
            connection.setRequestProperty("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0");
            // establish the actual connection
            connection.connect();
            // read the response with a BufferedReader
            bufferedReader = new BufferedReader(new InputStreamReader(connection.getInputStream()));
            String line = null;
            while ((line = bufferedReader.readLine()) != null) {
                responseResult += line;
            }
            // parse the JSON response into a map
            Gson gson = new Gson();
            Map<String, Object> map = new HashMap<String, Object>();
            map = gson.fromJson(responseResult, map.getClass());
            // the "data" array holds the answers of this batch
            ArrayList<LinkedTreeMap<String, String>> dataList = (ArrayList<LinkedTreeMap<String, String>>) map.get("data");
            // append each answer's content to the result
            for (LinkedTreeMap<String, String> contentLink : dataList) {
                stringBuffer.append(contentLink.get("content") + "\r\n\r\n");
                num++; // count the answers returned by this request
            }
            System.out.println("Answers in this batch: " + num);
        } catch (Exception e) {
            System.out.println("Exception while sending the GET request! " + e);
            e.printStackTrace();
        } finally {
            // close the input stream
            try {
                if (bufferedReader != null) {
                    bufferedReader.close();
                }
            } catch (Exception e2) {
                e2.printStackTrace();
            }
        }
        // return all answers fetched by this request
        return stringBuffer;
    }

    private static final String regEx_script = "<script[^>]*?>[\\s\\S]*?<\\/script>"; // regex for script blocks
    private static final String regEx_style = "<style[^>]*?>[\\s\\S]*?<\\/style>"; // regex for style blocks
    private static final String regEx_html = "<[^>]+>"; // regex for HTML tags
    private static final String regEx_space = "\\s*|\t|\r|\n"; // regex for whitespace and line breaks

    /**
     * Strips HTML tags from a string.
     * @param htmlStr the HTML string
     * @return the plain-text string
     */
    public static String delHTMLTag(String htmlStr) {
        Pattern p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
        Matcher m_script = p_script.matcher(htmlStr);
        htmlStr = m_script.replaceAll(""); // strip script blocks
        Pattern p_style = Pattern.compile(regEx_style, Pattern.CASE_INSENSITIVE);
        Matcher m_style = p_style.matcher(htmlStr);
        htmlStr = m_style.replaceAll(""); // strip style blocks
        Pattern p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);
        Matcher m_html = p_html.matcher(htmlStr);
        htmlStr = m_html.replaceAll(""); // strip remaining HTML tags
        return htmlStr.replaceAll(" ", ""); // drop residual spaces and return the plain text
    }
}
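One side effect of starting bare threads is that the 20-answer batches print in whatever order the threads happen to finish, so the result is again not in page order. If order matters, one possible variation (a sketch of my own, not part of the original code; the class and method names are made up, and it assumes it lives in the same package as SpiderThread) is to submit the batches as Callable tasks to an ExecutorService and read the Futures in submission order, reusing SpiderThread.sendGet and delHTMLTag:

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class OrderedSpider {
    // baseUrl and totals are assumed to be obtained exactly as in ZhiHuSpider.main
    public static void printInOrder(final String baseUrl, double totals) throws Exception {
        ExecutorService pool = Executors.newFixedThreadPool(8); // up to 8 requests in flight
        List<Future<StringBuffer>> futures = new ArrayList<Future<StringBuffer>>();
        for (int offset = 0; offset < totals; offset += 20) {
            final int start = offset;
            // each task fetches one batch of up to 20 answers and returns it instead of printing it
            futures.add(pool.submit(new Callable<StringBuffer>() {
                public StringBuffer call() {
                    return new SpiderThread(baseUrl, 20, start).sendGet(baseUrl, 20, start);
                }
            }));
        }
        // the Futures are read in submission order, so the batches print in page order
        for (Future<StringBuffer> future : futures) {
            System.out.println(SpiderThread.delHTMLTag(future.get().toString()));
        }
        pool.shutdown();
    }
}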

A problem I ran into:

I tried it out: the other requests all work fine, and totals really is the total number of answers; only this one request has the problem.

The link is: https://www.zhihu.com/api/v4/questions/62209505/answers?include=data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,relevant_info,question,excerpt,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp;data[*].mark_infos[*].url;data[*].author.follower_count,badge[?(type=best_answerer)].topics&limit=20&offset=2020&sort_by=default

2018.6.1

Added a method that strips the HTML tags, so the output looks a bit nicer.
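For example (a tiny illustration of my own; the class name and the HTML fragment are made up), the stripping behaves like this:

public class DelHtmlTagDemo {
    public static void main(String[] args) {
        // hypothetical answer fragment, just to show what delHTMLTag removes
        String html = "<p>Hello<script>alert(1)</script> <b>Zhihu</b></p>";
        // prints HelloZhihu: script/style blocks, all tags, and residual spaces are stripped
        System.out.println(ZhiHu.delHTMLTag(html));
    }
}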
