目的: 得到目標背單詞網站中的單詞, 寫了一個簡單的小爬蟲, 使用 JDK 11 (java.net.http.HttpClient) 和 jsoup。
到此, 思路明確!
第一步, 把冰箱門...., 串詞了, Sorry!!
第一步, 調用登錄接口, 拿到 sessionid!
第二步, 帶着 sessionid 到單詞列表頁, 拿到 body, 轉成 Document, 開始"借鑑"單詞!
是不是 So easy!
package com.***;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;

import java.io.IOException;
import java.net.URI;
import java.net.URLEncoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Optional;

/**
 * Logs in to cikuang.me, then fetches one learning-set page and prints every
 * word found in its {@code wordListTable} element.
 *
 * @author jqw1122@foxmail.com
 * @description crawl, crawl, crawl
 * @date 2/23/2019 17:14
 */
public class Crawler {

    private static final String LOGIN_URL = "http://www.cikuang.me/login";
    private static final String WORD_SET_URL = "http://www.cikuang.me/member/learningset?id=4573";
    private static final String FORM_CONTENT_TYPE = "application/x-www-form-urlencoded";
    private static final String SESSION_COOKIE = "sid";

    /**
     * Runs the two-step crawl: POST the login form, then GET the word list
     * with the {@code sid} cookie returned by the login response.
     *
     * @throws java.util.concurrent.CompletionException if a request fails, the login
     *         response carries no {@code sid} cookie, or the word table is missing
     */
    @Test
    public void crawler() {
        // NOTE(review): credentials are hard-coded in source; move them to
        // configuration or environment variables before sharing this code.
        // Form values are URL-encoded because the body is sent as
        // application/x-www-form-urlencoded ('@' becomes %40).
        String formBody = "username=" + urlEncode("jqw1122@foxamil.com")
                + "&password=" + urlEncode("qweqwe123");

        HttpClient httpClient = HttpClient.newBuilder().build();
        HttpRequest loginRequest = HttpRequest.newBuilder()
                .uri(URI.create(LOGIN_URL))
                .header("Content-Type", FORM_CONTENT_TYPE)
                .POST(HttpRequest.BodyPublishers.ofString(formBody))
                .build();

        httpClient.sendAsync(loginRequest, HttpResponse.BodyHandlers.ofString())
                // The response sets several cookies; only sid is needed.
                .thenApply(response -> findCookie(response.headers().allValues("set-cookie"), SESSION_COOKIE)
                        .orElseThrow(() -> new IllegalStateException(
                                "login response did not set a '" + SESSION_COOKIE + "' cookie")))
                // Chain the second request instead of blocking inside a callback.
                .thenCompose(sid -> httpClient.sendAsync(
                        wordSetRequest(sid), HttpResponse.BodyHandlers.ofString()))
                .thenApply(HttpResponse::body)
                .thenAccept(Crawler::printWords)
                .join();
    }

    /** URL-encodes one form value as UTF-8. */
    private static String urlEncode(String value) {
        return URLEncoder.encode(value, StandardCharsets.UTF_8);
    }

    /** Builds the GET request for the word-set page, authenticated with the given sid. */
    private static HttpRequest wordSetRequest(String sid) {
        return HttpRequest.newBuilder()
                .uri(URI.create(WORD_SET_URL))
                .header("Content-Type", FORM_CONTENT_TYPE)
                .header("Cookie", "sid=" + sid)
                .GET()
                .build();
    }

    /**
     * Finds the value of the named cookie among {@code Set-Cookie} header values.
     * Only the leading {@code name=value} pair of each header is inspected; the
     * remaining segments are attributes (Path, Expires, ...), not cookies.
     * If the cookie is set more than once, the last value wins.
     *
     * @param setCookieHeaders raw {@code Set-Cookie} header values, possibly empty
     * @param name             cookie name to look for
     * @return the non-empty cookie value, or empty if it was not set
     */
    static Optional<String> findCookie(List<String> setCookieHeaders, String name) {
        String value = null;
        for (String header : setCookieHeaders) {
            String pair = header.split(";", 2)[0];
            // Split on the first '=' only, so values containing '=' stay intact.
            int eq = pair.indexOf('=');
            if (eq > 0 && pair.substring(0, eq).trim().equals(name)) {
                value = pair.substring(eq + 1).trim();
            }
        }
        return Optional.ofNullable(value).filter(v -> !v.isEmpty());
    }

    /**
     * Parses the word-set page and prints one line per table row.
     * Rows without two cells, or whose first cell has no child element, are skipped.
     *
     * @param html body of the word-set page
     * @throws IllegalStateException if the page has no {@code wordListTable} element
     */
    private static void printWords(String html) {
        Document htmlDocument = Jsoup.parse(html);
        Element wordListTable = htmlDocument.getElementById("wordListTable");
        if (wordListTable == null) {
            throw new IllegalStateException(
                    "page has no element with id 'wordListTable' (possibly the session is not valid)");
        }
        for (Element row : wordListTable.getElementsByTag("tr")) {
            Elements cells = row.children();
            if (cells.size() < 2 || cells.get(0).children().isEmpty()) {
                continue;
            }
            String en = cells.get(0).child(0).text();
            String cn = cells.get(1).text();
            System.out.println("單詞---->>> " + en + ":" + cn);
        }
    }
}
爬蟲2: 目的: 獲取 KMF 中託福-聽力-全部練習題的題目音頻
/** * @author jqw1122@foxmail.com * @description * @date 2/23/2019 17:14 */ public class Crawler { @Test public void crawlerKMF() { String mainUrl= "https://toefl.kmf.com"; String mainUrl1= "https://toefl.kmf.com/listen/ets/order/"; String localFilePath = "C:\\kmf_audio\\"; HttpClient httpClient = HttpClient.newBuilder().build(); List<String> detailUrlList = new ArrayList<>(); e:for (int i = 0; i <= 5; i++) { for (int j = 1; j <= 4; j++) { String url = mainUrl1 + i + "/0/" + j; HttpRequest request = HttpRequest.newBuilder() .uri(URI.create(url)) .header("Content-Type","application/x-www-form-urlencoded") .GET() .build(); httpClient.sendAsync(request, HttpResponse.BodyHandlers.ofString()).thenApply(HttpResponse::body).thenAccept(bodyString -> { Document htmlDocument = Jsoup.parse(bodyString); Elements elements = htmlDocument.getElementsByAttributeValue("class", "check-links js-check-link"); elements.forEach(tagA -> { String href = tagA.attr("href"); detailUrlList.add(href); }); System.out.println("page detail number:" + elements.size()); }).join(); //test // if (1==1) break e; } } System.out.println("page/file number: "+detailUrlList.size()); var fileList = new ArrayList<Map<String, String>>(); System.out.println(LocalTime.now().toString() + " start get audio file url in detail page"); detailUrlList.parallelStream().forEach(href -> { HttpRequest request = HttpRequest.newBuilder() .uri(URI.create(mainUrl + href)) .header("Content-Type","application/x-www-form-urlencoded") .GET() .build(); httpClient.sendAsync(request, HttpResponse.BodyHandlers.ofString()).thenApply(HttpResponse::body).thenAccept(bodyString -> { Document htmlDocument = Jsoup.parse(bodyString); Elements bts = htmlDocument.getElementsByAttributeValue("class", "i-title js-top-title"); String fileName = bts.get(0).text(); Elements audios = htmlDocument.getElementsByAttributeValue("class", "question-audio-cont js-question-audio g-player-control video-left-content js-player-record"); String fileUrl = 
audios.get(0).attr("data-url"); fileList.add(Map.of("fileName", fileName.toLowerCase().replace(" ", "_") + ".mp3", "fileUrl", fileUrl)); // System.out.println(fileName+ "--"+fileUrl); }).join(); }); System.out.println(LocalTime.now().toString() + " finish get audio file url in detail page! start downloading files to local!"); fileList.parallelStream().forEach(t -> { try (InputStream ins = new URL(t.get("fileUrl")).openStream()) { Path target = Paths.get(localFilePath, t.get("fileName")); // Files.createDirectories(target.getParent()); Files.copy(ins, target, StandardCopyOption.REPLACE_EXISTING); } catch (IOException e) { System.out.println("download failed! fileName:" + t.get("fileName") + " fileUrl:" + t.get("fileUrl")); e.printStackTrace(); } }); System.out.println(LocalTime.now().toString() + " download completed"); } }
下載成功了....