分析博客園網站的請求可以發現,博客園的分頁請求爲POST方式,和知乎的滾動加載相似。
不同的是,請求響應返回的是HTML而不是JSON。
這樣可以套用上一篇爬知乎的代碼,需要修改的部分就是POST方法傳的參數(直接用Map),還有解析HTML的部分。
模擬POST請求
public String doPost(Map<String, String> args) throws Exception { HttpClient httpClient = new DefaultHttpClient(); RequestBuilder builder = RequestBuilder.post() .setUri("http://www.cnblogs.com/mvc/AggSite/PostList.aspx"); Set<String> keys = args.keySet(); for (String key : keys) { builder.addParameter(key,args.get(key)); } HttpUriRequest httpUriRequest = builder.build(); // 添加必要的頭信息 httpUriRequest.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"); httpUriRequest.setHeader("Cookie", "這裏的仍是要用本身的Cookie"); httpUriRequest.setHeader("DNT", "1"); httpUriRequest.setHeader("Connection", "keep-alive"); httpUriRequest.setHeader("Upgrade-Insecure-Requests", "1"); httpUriRequest.setHeader("If-Modified-Since", "Wed, 12 Apr 2017 03:10:29 GMT"); HttpResponse response = httpClient.execute(httpUriRequest); String str = ""; HttpEntity entity = response.getEntity(); if (entity != null) { InputStream instreams = entity.getContent(); str = convertStreamToString(instreams); } return str; }
HTML內容的提取部分
由於HTML的所有標籤元素id唯一,可以找到一個距離較近的帶id的元素,向下取到內容。
這裏還是較多地用get(0)來取元素。
public String unparsedData(String html) { Document doc = Jsoup.parse(html); Elements elements = doc.getElementsByAttributeValue("class", "post_item"); String writeStr = ""; for (Element element : elements) { //推薦數量 Elements diggs = element.getElementsByAttributeValue("class", "digg"); String digg = diggs.get(0).getElementsByTag("span").text().trim(); Elements postItemBodys = element.getElementsByAttributeValue("class", "post_item_body"); //標題 String titleHref = postItemBodys.get(0).getElementsByTag("h3").get(0).getElementsByAttributeValue("class", "titlelnk").get(0).attr("href"); String titleText = postItemBodys.get(0).getElementsByTag("h3").get(0).getElementsByAttributeValue("class", "titlelnk").get(0).text().trim(); //摘要 String contentText = postItemBodys.get(0).getElementsByAttributeValue("class", "post_item_summary").get(0).text().trim(); System.out.println("--------------------"); System.out.println("-----標題-----"); System.out.println("推薦:" + digg); System.out.println("連接:" + titleHref); System.out.println("內容:" + titleText); System.out.println("-----內容-----"); System.out.println("內容:" + contentText); System.out.println("--------------------"); writeStr += "--------------------\n-----標題-----推薦:"+digg+"\n" + titleHref + "\n" + titleText + "\n-----內容-----\n" + contentText + "\n--------------------\n\n\n"; } return writeStr; }
完整代碼
package spider;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.DefaultHttpClient;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;

/**
 * Crawls the cnblogs home page and its paginated post list, and saves the recommendation
 * count, link, title and summary of every post to a text file.
 */
@SuppressWarnings("deprecation")
public class CnblogsSpider {

    /** Address fetched with GET for the first page. */
    private static final String HOME_URL = "http://www.cnblogs.com";

    /** Address that receives the POST paging requests. */
    private static final String POST_LIST_URL = "http://www.cnblogs.com/mvc/AggSite/PostList.aspx";

    /** File the extracted posts are written to. */
    private static final String OUTPUT_PATH = "D:\\testFile\\cnblogs.txt";

    /** First page index requested through POST; the GET request supplies the first page. */
    private static final int FIRST_POSTED_PAGE = 2;

    /** Last page index requested through POST (inclusive). */
    private static final int LAST_PAGE = 200;

    /** Pause between two paging requests, in milliseconds. */
    private static final long REQUEST_DELAY_MILLIS = 200L;

    /**
     * Downloads the home page and the following pages and writes the extracted posts to
     * {@code OUTPUT_PATH}.
     *
     * <p>The output file is opened once, written as UTF-8 and flushed after every page, so
     * the pages fetched so far are on disk even if a later request fails. The writer is
     * always closed.
     *
     * @throws Exception if a request fails, the file cannot be opened, or the thread is
     *         interrupted while waiting between requests
     */
    @Test
    public void downloadFile() throws Exception {
        // Fetch and parse the first page before touching the file, so a failed first
        // request does not truncate an existing output file.
        String firstPage = unparsedData(doGet());

        PrintWriter printWriter = new PrintWriter(OUTPUT_PATH, "UTF-8");
        try {
            printWriter.write(firstPage);
            printWriter.flush();

            Map<String, String> args = new HashMap<String, String>();
            args.put("CategoryId", "808");
            args.put("CategoryType", "\"SiteHome\"");
            args.put("ItemListActionName", "\"PostList\"");
            args.put("ParentCategoryId", "0");
            args.put("TotalPostCount", "4000");

            for (int page = FIRST_POSTED_PAGE; page <= LAST_PAGE; page++) {
                // Throttle the requests.
                Thread.sleep(REQUEST_DELAY_MILLIS);
                args.put("PageIndex", String.valueOf(page));
                // Imitate the POST request the page's script sends, then append the result.
                printWriter.write(unparsedData(doPost(args)));
                printWriter.flush();
            }
        } finally {
            printWriter.close();
        }
    }

    /**
     * Sends an HTTP GET request for the cnblogs home page.
     *
     * @return the HTML returned by the request, or an empty string when the response
     *         carries no entity
     * @throws ClientProtocolException if the HTTP exchange violates the protocol
     * @throws IOException if the request fails or the response cannot be read
     */
    public String doGet() throws ClientProtocolException, IOException {
        return fetchBody(new HttpGet(HOME_URL));
    }

    /**
     * Sends an HTTP POST request to the cnblogs post-list endpoint.
     *
     * @param args request parameters; every entry is added to the request as one parameter
     * @return the HTML returned by the request, or an empty string when the response
     *         carries no entity
     * @throws Exception if the request fails or the response cannot be read
     */
    public String doPost(Map<String, String> args) throws Exception {
        RequestBuilder builder = RequestBuilder.post().setUri(POST_LIST_URL);
        for (Map.Entry<String, String> arg : args.entrySet()) {
            builder.addParameter(arg.getKey(), arg.getValue());
        }
        return fetchBody(builder.build());
    }

    /** Adds the shared request headers, executes the request and returns the response body. */
    private String fetchBody(HttpUriRequest request) throws IOException {
        request.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0");
        request.setHeader("Cookie", "這裏的仍是要用本身的Cookie");
        request.setHeader("DNT", "1");
        request.setHeader("Connection", "keep-alive");
        request.setHeader("Upgrade-Insecure-Requests", "1");
        request.setHeader("If-Modified-Since", "Wed, 12 Apr 2017 03:10:29 GMT");

        HttpClient httpClient = new DefaultHttpClient();
        try {
            HttpResponse response = httpClient.execute(request);
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return "";
            }
            return convertStreamToString(entity.getContent());
        } finally {
            // One client is created per request, so its connections are released here.
            httpClient.getConnectionManager().shutdown();
        }
    }

    /**
     * Reads a stream to its end as UTF-8 text and closes it.
     *
     * <p>Every line read is followed by a single {@code '\n'} in the result, whatever line
     * terminator the stream used.
     *
     * @param is stream to read; it is closed before this method returns
     * @return the text of the stream
     * @throws IOException if reading fails; partial content is not returned
     */
    public static String convertStreamToString(InputStream is) throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
        StringBuilder sb = new StringBuilder();
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append('\n');
            }
        } finally {
            try {
                reader.close();
            } catch (IOException ignored) {
                // A failure while closing must neither hide a read error nor discard
                // text that was read successfully.
            }
        }
        return sb.toString();
    }

    /**
     * Extracts the posts from a cnblogs list page, prints each one to standard output and
     * returns all of them formatted as text.
     *
     * <p>A post item is used only if it contains a {@code digg} element, a
     * {@code post_item_body} element with an {@code h3} holding a {@code titlelnk} link,
     * and a {@code post_item_summary} element. Items missing any of these are skipped
     * instead of aborting the whole page with an {@link IndexOutOfBoundsException}.
     *
     * @param html source HTML of the list page; {@code null} is treated as an empty page
     * @return the formatted posts, or an empty string when no complete post item is found
     */
    public String unparsedData(String html) {
        if (html == null) {
            return "";
        }
        Document doc = Jsoup.parse(html);
        Elements postItems = doc.getElementsByAttributeValue("class", "post_item");
        // StringBuilder avoids re-copying the accumulated text on every iteration.
        StringBuilder writeStr = new StringBuilder();
        for (Element postItem : postItems) {
            Elements diggs = postItem.getElementsByAttributeValue("class", "digg");
            Elements postItemBodys = postItem.getElementsByAttributeValue("class", "post_item_body");
            if (diggs.isEmpty() || postItemBodys.isEmpty()) {
                continue;
            }
            Element postItemBody = postItemBodys.get(0);
            Elements headings = postItemBody.getElementsByTag("h3");
            Elements summaries =
                    postItemBody.getElementsByAttributeValue("class", "post_item_summary");
            if (headings.isEmpty() || summaries.isEmpty()) {
                continue;
            }
            Elements titleLinks = headings.get(0).getElementsByAttributeValue("class", "titlelnk");
            if (titleLinks.isEmpty()) {
                continue;
            }

            // Recommendation count.
            String digg = diggs.get(0).getElementsByTag("span").text().trim();
            // Title: look the link up once and read both its target and its text.
            Element titleLink = titleLinks.get(0);
            String titleHref = titleLink.attr("href");
            String titleText = titleLink.text().trim();
            // Summary.
            String contentText = summaries.get(0).text().trim();

            System.out.println("--------------------");
            System.out.println("-----標題-----");
            System.out.println("推薦:" + digg);
            System.out.println("連接:" + titleHref);
            System.out.println("內容:" + titleText);
            System.out.println("-----內容-----");
            System.out.println("內容:" + contentText);
            System.out.println("--------------------");

            writeStr.append("--------------------\n-----標題-----推薦:").append(digg).append("\n")
                    .append(titleHref).append("\n")
                    .append(titleText).append("\n-----內容-----\n")
                    .append(contentText).append("\n--------------------\n\n\n");
        }
        return writeStr.toString();
    }
}