爬蟲順序
1.分析網站網絡請求
通過瀏覽器F12開發者工具查看網站的內容獲取方式。
2.模擬HTTP請求,獲取網頁內容。
可以採用HttpClient,利用JAVA HttpClient工具可以模擬HTTP GET、POST請求,用來獲取爬蟲需要的數據。JAVA的一些爬蟲框架底層用到的獲取網頁方式也都是HttpClient。
3.解析網頁HTML內容,獲取可用數據和下一條請求鏈接。
可以採用jsoup、正則表達式、xpath等。
實踐一:知乎
查看開發者工具可以看到知乎首頁的內容獲取有兩種:
一種是GET請求,請求地址爲https://www.zhihu.com/
一種是POST請求,請求地址爲https://www.zhihu.com/node/TopStory2FeedList
第一種GET請求即現實中用戶直接從瀏覽器地址欄輸入知乎的網址或點擊連接進行請求,這時知乎會響應返回一個只有數條內容的首頁給用戶。
第二種POST請求即現實中用戶向下滾動頁面,瀏覽器持續加載新內容。
第一種GET請求沒有參數,響應也是HTML,較爲簡單。
第二種POST請求能夠在開發者工具中查看它的參數和響應。
能夠看到有兩個請求參數
params:"{"offset":21,"start":"19"}"
method:"next"
響應爲一段JSON,咱們要的是下面的msg數組,因此代碼中會用到json-lib這個jar包方便咱們解析json。
分析完網站的網絡請求後就能夠進行下一步,模擬HTTP請求
首先模擬GET請求
public String doGet() throws ClientProtocolException, IOException { String str = ""; // 建立HttpClient實例 HttpClient httpClient = new DefaultHttpClient(); // 建立Get方法實例 HttpUriRequest httpUriRequest = new HttpGet("http://www.zhihu.com"); // 添加必要的頭信息 httpUriRequest.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"); httpUriRequest.setHeader("Cookie", "這裏的Cookie拷貝複製登陸後請求頭裏的Cookie值"); httpUriRequest.setHeader("DNT", "1"); httpUriRequest.setHeader("Connection", "keep-alive"); httpUriRequest.setHeader("Upgrade-Insecure-Requests", "1"); httpUriRequest.setHeader("Cache-Control", "max-age=0"); HttpResponse response = httpClient.execute(httpUriRequest); HttpEntity entity = response.getEntity(); if (entity != null) { InputStream inputStream = entity.getContent(); str = convertStreamToString(inputStream); } return str; }
convertStreamToString爲一個將流轉換爲字符串的方法
/**
 * Reads the whole stream as UTF-8 text, one line at a time, and closes it.
 *
 * <p>Every line read is followed by a single {@code '\n'} in the result, so the original
 * line terminators are normalised and a non-empty result always ends with a newline.
 *
 * @param is the stream to read; it is closed before this method returns
 * @return the decoded text, or an empty string when the stream has no content
 * @throws IOException if reading or closing the stream fails; the failure is reported to
 *         the caller instead of silently returning truncated text
 */
public static String convertStreamToString(InputStream is) throws IOException {
    StringBuilder sb = new StringBuilder();
    // Closing the reader also closes the wrapped stream, on success and on failure alike.
    try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(is, java.nio.charset.StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
            sb.append(line).append('\n');
        }
    }
    return sb.toString();
}
模擬POST請求(兩個參數即爲請求參數裏的兩個變量)
public String doPost(int offset, int start) throws Exception { HttpClient httpClient = new DefaultHttpClient(); HttpUriRequest httpUriRequest = RequestBuilder .post() .setUri("https://www.zhihu.com/node/TopStory2FeedList") .addParameter("params", "{\"offset\":" + offset + ",\"start\":\"" + start + "\"}").addParameter("method", "next").build(); // 添加必要的頭信息 httpUriRequest.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"); httpUriRequest.setHeader("X-Xsrftoken", "這裏的X-Xsrftoken拷貝複製登陸後請求頭裏的X-Xsrftoken值"); httpUriRequest.setHeader("X-Requested-With", "XMLHttpRequest"); httpUriRequest.setHeader("Referer", "https://www.zhihu.com/"); httpUriRequest.setHeader("Cookie", "這裏的Cookie拷貝複製登陸後請求頭裏的Cookie值"); httpUriRequest.setHeader("DNT", "1"); httpUriRequest.setHeader("Connection", "keep-alive"); httpUriRequest.setHeader("Cache-Control", "max-age=0"); HttpResponse response = httpClient.execute(httpUriRequest); String str = ""; HttpEntity entity = response.getEntity(); if (entity != null) { InputStream instreams = entity.getContent(); str = convertStreamToString(instreams); } return str; }
最後走一波main方法將數據保存至TXT文件中,在這以前要提取一下HTML中的數據
根據HTML解析數據
這裏用到的Document Elements Element 都是jsoup裏的元素
這段代碼首先拿到類名爲feed-item-inner的HTML元素
然後遍歷所有feed-item-inner,拿到類名爲feed-title的標題和標籤類型爲textarea的內容
public String unparsedData(String html) { Document doc = Jsoup.parse(html); Elements feeds = doc.getElementsByAttributeValue("class", "feed-item-inner"); String writeStr = ""; for (Element feed : feeds) { Elements title = new Elements(); Elements feedTitles = feed.getElementsByAttributeValue("class", "feed-title"); for (Element feedTitle : feedTitles) { title = feedTitle.getElementsByTag("a"); } Elements content = feed.getElementsByTag("textarea"); String titleHref = title.attr("href"); String titleText = title.text().trim(); String contentText = content.text().trim(); // if(!titleText.contains("人民的名義")){ // continue; // } System.out.println("--------------------"); System.out.println("-----標題-----"); System.out.println("連接:" + titleHref); System.out.println("內容:" + titleText); System.out.println("-----內容-----"); System.out.println("內容:" + contentText); System.out.println("--------------------"); writeStr += "--------------------\n-----標題-----\n" + titleHref + "\n" + titleText + "\n-----內容-----\n" + contentText + "\n--------------------\n\n\n"; } return writeStr; }
最後Main方法
public void downloadFile() throws Exception { // 模擬HTTP GET請求 String responseBody = doGet(); // 解析數據 String writeStr = unparsedData(responseBody); // 建立新文件 String path = "D:\\testFile\\zhihu.txt"; PrintWriter printWriter = null; printWriter = new PrintWriter(new FileWriter(new File(path))); // 寫內容 printWriter.write(writeStr); printWriter.close(); int offset = 10; int start = 9; for (int time = 0; time <= 100; time++) { // 模擬POST請求 JSONObject jsonObject = JSONObject.fromObject(doPost(offset, start)); // 解析數據(只拿JSON數據裏的msg數組) String addWriteStr = ""; JSONArray jsonArray = jsonObject.getJSONArray("msg"); Object[] arrays = jsonArray.toArray(); for (Object array : arrays) { addWriteStr += unparsedData(array.toString()); } // 追加文本 printWriter = new PrintWriter(new FileWriter(path, true)); printWriter.write(addWriteStr); printWriter.close(); // 延時,調整參數 Thread.currentThread().sleep(1000);// 毫秒 offset = offset + 10; start = start + 10; } }
完整代碼
package spider;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;

import net.sf.json.JSONArray;
import net.sf.json.JSONObject;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.DefaultHttpClient;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;

/**
 * Small crawler for the Zhihu home feed.
 *
 * <p>It fetches the home page with a GET request, pages through the feed with POST
 * requests, extracts the title and content of every feed entry with jsoup and saves the
 * result to a text file. The Cookie and X-Xsrftoken values are placeholders that must be
 * replaced with the values of a logged-in browser session.
 */
@SuppressWarnings("deprecation") // DefaultHttpClient and getConnectionManager() are deprecated
public class ZhihuSpider {

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0";
    private static final String COOKIE = "這裏的Cookie拷貝複製登陸後請求頭裏的Cookie值";
    private static final String XSRF_TOKEN = "這裏的X-Xsrftoken拷貝複製登陸後請求頭裏的X-Xsrftoken值";

    /**
     * Simulates the browser's HTTP GET request for the Zhihu home page.
     *
     * @return the response body as text, or an empty string when the response has no entity
     * @throws ClientProtocolException if {@code HttpClient.execute} reports an HTTP protocol error
     * @throws IOException if the request fails or the response body cannot be read
     */
    public String doGet() throws ClientProtocolException, IOException {
        // HTTPS (as in doPost) so the session cookie set below is not sent in clear text.
        HttpUriRequest request = new HttpGet("https://www.zhihu.com");
        request.setHeader("User-Agent", USER_AGENT);
        request.setHeader("Cookie", COOKIE);
        request.setHeader("DNT", "1");
        request.setHeader("Connection", "keep-alive");
        request.setHeader("Upgrade-Insecure-Requests", "1");
        request.setHeader("Cache-Control", "max-age=0");
        return execute(request);
    }

    /**
     * Reads the whole stream as UTF-8 text, one line at a time, and closes it.
     *
     * <p>Every line read is followed by a single {@code '\n'} in the result.
     *
     * @param is the stream to read; it is closed before this method returns
     * @return the decoded text, or an empty string when the stream has no content
     * @throws IOException if reading or closing the stream fails; the failure is reported
     *         to the caller instead of silently returning truncated text
     */
    public static String convertStreamToString(InputStream is) throws IOException {
        StringBuilder sb = new StringBuilder();
        // Closing the reader also closes the wrapped stream, on success and on failure alike.
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append('\n');
            }
        }
        return sb.toString();
    }

    /**
     * Crawls the Zhihu feed and saves the extracted entries to {@code D:\testFile\zhihu.txt}.
     *
     * <p>The home page creates (or overwrites) the file. After that, 101 feed pages are
     * requested; the entries found in the {@code msg} array of each JSON response are
     * appended to the file, with a one-second pause between requests.
     *
     * @throws Exception if a request, the JSON parsing, a file write or the pause fails
     */
    @Test
    public void downloadFile() throws Exception {
        String path = "D:\\testFile\\zhihu.txt";

        // First page: plain HTML from the GET request, written to a fresh file.
        String responseBody = doGet();
        writeText(path, unparsedData(responseBody), false);

        int offset = 10;
        int start = 9;
        for (int time = 0; time <= 100; time++) {
            // Only the "msg" array of the JSON response carries the HTML fragments.
            JSONObject jsonObject = JSONObject.fromObject(doPost(offset, start));
            JSONArray jsonArray = jsonObject.getJSONArray("msg");
            StringBuilder addWriteStr = new StringBuilder();
            for (Object array : jsonArray.toArray()) {
                addWriteStr.append(unparsedData(array.toString()));
            }
            // Written page by page so earlier pages are already on disk if a later request fails.
            writeText(path, addWriteStr.toString(), true);

            // Pause between requests (milliseconds), then advance the paging parameters.
            Thread.sleep(1000);
            offset = offset + 10;
            start = start + 10;
        }
    }

    /**
     * Extracts the feed entries from a piece of HTML and formats them as plain text.
     *
     * <p>For every element whose {@code class} attribute is {@code feed-item-inner}, the
     * title is taken from the {@code a} tags of its {@code feed-title} element (when
     * several exist, the last one wins) and the content from its {@code textarea} tags.
     * Each entry is also echoed to standard output.
     *
     * @param html the source HTML
     * @return the formatted entries in document order; empty when no entry is found
     */
    public String unparsedData(String html) {
        Document doc = Jsoup.parse(html);
        Elements feeds = doc.getElementsByAttributeValue("class", "feed-item-inner");
        // StringBuilder avoids re-copying the whole accumulated text for every feed entry.
        StringBuilder writeStr = new StringBuilder();
        for (Element feed : feeds) {
            Elements title = new Elements();
            for (Element feedTitle : feed.getElementsByAttributeValue("class", "feed-title")) {
                title = feedTitle.getElementsByTag("a");
            }
            Elements content = feed.getElementsByTag("textarea");

            String titleHref = title.attr("href");
            String titleText = title.text().trim();
            String contentText = content.text().trim();

            System.out.println("--------------------");
            System.out.println("-----標題-----");
            System.out.println("連接:" + titleHref);
            System.out.println("內容:" + titleText);
            System.out.println("-----內容-----");
            System.out.println("內容:" + contentText);
            System.out.println("--------------------");

            writeStr.append("--------------------\n-----標題-----\n")
                    .append(titleHref).append("\n")
                    .append(titleText)
                    .append("\n-----內容-----\n")
                    .append(contentText)
                    .append("\n--------------------\n\n\n");
        }
        return writeStr.toString();
    }

    /**
     * Simulates the HTTP POST request the browser sends while the user scrolls the feed.
     *
     * <p>Two form parameters are sent: {@code params}, a JSON string built from
     * {@code offset} and {@code start}, and {@code method=next}.
     *
     * @param offset value of the {@code offset} field inside {@code params}
     * @param start value of the {@code start} field inside {@code params} (sent as a JSON string)
     * @return the response body as text, or an empty string when the response has no entity
     * @throws Exception if the request fails or the response body cannot be read
     */
    public String doPost(int offset, int start) throws Exception {
        String params = "{\"offset\":" + offset + ",\"start\":\"" + start + "\"}";
        HttpUriRequest request = RequestBuilder
                .post()
                .setUri("https://www.zhihu.com/node/TopStory2FeedList")
                .addParameter("params", params)
                .addParameter("method", "next")
                .build();
        request.setHeader("User-Agent", USER_AGENT);
        request.setHeader("X-Xsrftoken", XSRF_TOKEN);
        request.setHeader("X-Requested-With", "XMLHttpRequest");
        request.setHeader("Referer", "https://www.zhihu.com/");
        request.setHeader("Cookie", COOKIE);
        request.setHeader("DNT", "1");
        request.setHeader("Connection", "keep-alive");
        request.setHeader("Cache-Control", "max-age=0");
        return execute(request);
    }

    /** Sends the request with a fresh client, returns the body as text and releases the client. */
    private String execute(HttpUriRequest request) throws IOException {
        HttpClient httpClient = new DefaultHttpClient();
        try {
            HttpResponse response = httpClient.execute(request);
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return "";
            }
            return convertStreamToString(entity.getContent());
        } finally {
            // A new client is created on every call, so release its connections before returning.
            httpClient.getConnectionManager().shutdown();
        }
    }

    /** Writes {@code text} to {@code path} as UTF-8, appending or overwriting, and closes the file. */
    private static void writeText(String path, String text, boolean append) throws IOException {
        // Explicit UTF-8: the platform default charset may not be able to encode the Chinese text.
        try (Writer out = new OutputStreamWriter(
                new FileOutputStream(path, append), StandardCharsets.UTF_8)) {
            out.write(text);
        }
    }
}