爬蟲Jsoup

/**
 * Fetches the runoob.com home page with browser-like request headers, then
 * walks every {@code <li>} inside the element with id {@code index-nav}.
 * For each entry it prints the item text and the raw {@code href} of its
 * anchor; entries whose link is absolute or protocol-relative are then
 * downloaded and their parsed {@code <body>} is printed.
 *
 * @throws IOException if the home page or any followed link cannot be fetched
 */
@Test
    public void jsoup1() throws IOException {
        String targetUrl = "https://www.runoob.com/";

        // Build the request with browser-like headers so it looks like a regular Chrome visit.
        Connection.Response response = Jsoup.connect(targetUrl)
                .header("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
                // "br" is intentionally not advertised: Jsoup decodes gzip/deflate responses,
                // and a brotli-compressed body would be parsed as garbage.
                // NOTE(review): re-add "br" only if the Jsoup version in use is confirmed to decode it.
                .header("accept-encoding", "gzip, deflate")
                .header("accept-language", "zh-CN,zh;q=0.9")
                .header("cache-control", "max-age=0")
                .header("user-agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36")
                // Do not throw on 4xx/5xx responses; the returned body is parsed regardless.
                .ignoreHttpErrors(true)
                .method(Connection.Method.GET)
                .execute();

        Document document = response.parse();

        // Because HTTP errors are ignored above, the parsed page may be an error page
        // without the navigation element; fail with a clear message instead of an NPE.
        Element nav = document.body().getElementById("index-nav");
        if (nav == null) {
            throw new IllegalStateException(
                    "Element with id 'index-nav' not found in " + targetUrl
                            + " (HTTP status " + response.statusCode() + ")");
        }

        for (Element item : nav.getElementsByTag("li")) {
            // text() returns the visible text of the element.
            System.out.println(item.text());

            // attr() yields an empty string when the <li> contains no anchor.
            String href = item.getElementsByTag("a").attr("href");
            System.out.println(href);

            if (href.startsWith("//")) {
                // Protocol-relative link: pin it to https.
                href = "https:" + href;
            } else if (!href.startsWith("http://") && !href.startsWith("https://")) {
                // Relative paths, fragments, javascript: links etc. are not followed.
                continue;
            }
            System.out.println(href);

            System.out.println(Jsoup.connect(href).execute().parse().body());
            System.out.println("-------------------------------------------------");
        }

    }
相關文章
相關標籤/搜索