java爬蟲系列第三講-獲取頁面中絕對路徑的各類方法

在使用webmagic的過程中,很多時候我們需要抓取鏈接的絕對路徑,這裏總結了幾種方法,示例代碼放在最後。

以和訊網的一個頁面(http://industry.hexun.com/c193_59.shtml)爲例:

xpath方式獲取

// Select the div with id "cyldata" via XPath, then log every link collected from it by links().
log.info("{}", page.getHtml().xpath("//div[@id='cyldata']").links().all());
// Same container, but read each anchor's href through the "abs:" attribute prefix,
// which this article uses to obtain the absolute form of the URL.
log.info("{}", page.getHtml().xpath("//div[@id='cyldata']//a//@abs:href").all());
複製代碼

xpath+css選擇器方式獲取

// Narrow to div#cyldata with XPath, then use a CSS selector on that result to pull the
// "abs:href" attribute (absolute link, per this article) of every <a> element.
log.info("{}", page.getHtml().xpath("//div[@id='cyldata']").css("a", "abs:href").all());
複製代碼

css選擇器方式獲取

// Four CSS-selector-only variants; the article presents all of them as ways to get
// the absolute URLs of the links inside div#cyldata.
// 1) Select the container, then select its anchors and read "abs:href".
log.info("{}", page.getHtml().css("div[id='cyldata']").css("a", "abs:href").all());
// 2) Select the container and let links() collect the links inside it.
log.info("{}", page.getHtml().css("div[id='cyldata']").links().all());
// 3) Select the anchors directly with a descendant selector, then call links().
log.info("{}", page.getHtml().css("div[id='cyldata'] a").links().all());
// 4) Select the anchors directly and read "abs:href" in the same css() call.
log.info("{}", page.getHtml().css("div[id='cyldata'] a", "abs:href").all());
複製代碼

jsoup方式獲取

// Parse the raw page text with jsoup, passing the request URL as the base URI so that
// relative hrefs can be resolved, then visit every <a> inside the element with id "cyldata".
for (Element element : Jsoup.parse(page.getRawText(), page.getRequest().getUrl()).select("#cyldata a")) {
    // Two jsoup spellings for the absolute URL: the "abs:" attribute prefix ...
    log.info("{}", element.attr("abs:href"));
    // ... and the dedicated absUrl() accessor.
    log.info("{}", element.absUrl("href"));
}
複製代碼

jsoup中stringutil工具類方式獲取

// Same element selection as the plain jsoup approach, but the raw (possibly relative) href
// is resolved explicitly against the request URL with jsoup's StringUtil.resolve(base, relative).
for (Element element : Jsoup.parse(page.getRawText(), page.getRequest().getUrl()).select("#cyldata a")) {
    log.info("{}", StringUtil.resolve(page.getRequest().getUrl(), element.attr("href")));
}
複製代碼

示例代碼

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the java-pachong crawler demo: Spring Boot starter + webmagic + common utility libraries. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.1.4.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>com.ady01</groupId>
    <artifactId>java-pachong</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>java-pachong</name>
    <description>java爬蟲項目</description>

    <properties>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter</artifactId>
        </dependency>

        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>

        <!-- webmagic start -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
            <!-- Each artifact needs to be excluded only once. fastjson and commons-io are
                 declared explicitly further down with pinned versions; the log4j artifacts are
                 presumably excluded to avoid clashing with Spring Boot's logging setup. -->
            <exclusions>
                <exclusion>
                    <artifactId>fastjson</artifactId>
                    <groupId>com.alibaba</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>commons-io</artifactId>
                    <groupId>commons-io</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>log4j</artifactId>
                    <groupId>log4j</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>slf4j-log4j12</artifactId>
                    <groupId>org.slf4j</groupId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-selenium</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>net.minidev</groupId>
            <artifactId>json-smart</artifactId>
            <version>2.2.1</version>
        </dependency>
        <!-- webmagic end -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.49</version>
        </dependency>
        <dependency>
            <groupId>commons-lang</groupId>
            <artifactId>commons-lang</artifactId>
            <version>2.6</version>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.6</version>
        </dependency>
        <dependency>
            <groupId>commons-codec</groupId>
            <artifactId>commons-codec</artifactId>
            <version>1.11</version>
        </dependency>
        <dependency>
            <groupId>commons-collections</groupId>
            <artifactId>commons-collections</artifactId>
            <version>3.2.2</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>

</project>
複製代碼
package com.ady01.demo3;

import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Element;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * Demo {@link PageProcessor} that logs the links found inside the element with id
 * {@code cyldata} using several different selection techniques, all of which are meant
 * to yield absolute URLs.
 * <p>
 * <b>description</b>: obtaining absolute paths in webmagic <br>
 * <b>time</b>: 2019/4/22 10:42 <br>
 * <b>author</b>: WeChat official account 路人甲Java, focused on sharing Java technology
 * (crawlers, distributed transactions, asynchronous messaging, task scheduling,
 * database/table sharding, big data, and more).
 */
@Slf4j
public class AbsHrefPageProcessor implements PageProcessor {
    /** Crawl configuration handed out by {@link #getSite()}; created with a sleep time of 1000. */
    Site site = Site.me().setSleepTime(1000);

    /**
     * Logs the links under {@code div#cyldata} once per technique: XPath, XPath combined
     * with a CSS selector, CSS selectors alone, plain jsoup, and jsoup's {@code StringUtil}.
     *
     * @param page the downloaded page to inspect
     */
    @Override
    public void process(Page page) {
        // Ways of obtaining the absolute URL of hyperlinks
        log.info("----------------------xpath方式獲取------------------------");
        // XPath only
        log.info("{}", page.getHtml().xpath("//div[@id='cyldata']").links().all());
        log.info("{}", page.getHtml().xpath("//div[@id='cyldata']//a//@abs:href").all());

        // XPath + CSS selector
        log.info("----------------------xpath+css選擇器方式獲取------------------------");
        log.info("{}", page.getHtml().xpath("//div[@id='cyldata']").css("a", "abs:href").all());

        // CSS selector only
        log.info("----------------------css選擇器方式獲取------------------------");
        log.info("{}", page.getHtml().css("div[id='cyldata']").css("a", "abs:href").all());
        log.info("{}", page.getHtml().css("div[id='cyldata']").links().all());
        log.info("{}", page.getHtml().css("div[id='cyldata'] a").links().all());
        log.info("{}", page.getHtml().css("div[id='cyldata'] a", "abs:href").all());

        // Plain jsoup
        log.info("----------------------jsoup方式獲取------------------------");
        // Parse the raw text only once and share the selected anchors between both jsoup
        // demonstrations below. The request URL is passed as the base URI so jsoup can
        // resolve relative hrefs.
        String baseUrl = page.getRequest().getUrl();
        Iterable<Element> anchors = Jsoup.parse(page.getRawText(), baseUrl).select("#cyldata a");
        for (Element anchor : anchors) {
            log.info("{}", anchor.attr("abs:href"));
            log.info("{}", anchor.absUrl("href"));
        }

        // jsoup's StringUtil helper: resolve the raw href against the request URL explicitly
        log.info("----------------------jsoup中stringutil工具類方式獲取------------------------");
        for (Element anchor : anchors) {
            log.info("{}", StringUtil.resolve(baseUrl, anchor.attr("href")));
        }
    }

    /**
     * @return the {@link Site} configuration held by this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs a spider with this processor against a single fixed hexun.com page.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Request request = new Request("http://industry.hexun.com/c193_59.shtml");
        Spider.create(new AbsHrefPageProcessor()).addRequest(request).run();
    }
}
複製代碼

執行結果:

相關文章
相關標籤/搜索