按照以上幾點,推薦一款非常好用的 Java 爬蟲框架:WebMagic。
使用 WebMagic 爬取「愛電影」的電影列表資源信息
示例源碼地址
<!-- Maven dependencies for the WebMagic crawler demo (Spring Boot project). -->
<dependencies>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter</artifactId>
    </dependency>
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <optional>true</optional>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-test</artifactId>
        <scope>test</scope>
    </dependency>

    <!-- webmagic start -->
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-core</artifactId>
        <version>0.7.3</version>
        <exclusions>
            <!-- fastjson and commons-io are declared explicitly further down
                 with pinned versions, so the transitive copies are excluded. -->
            <exclusion>
                <groupId>com.alibaba</groupId>
                <artifactId>fastjson</artifactId>
            </exclusion>
            <exclusion>
                <groupId>commons-io</groupId>
                <artifactId>commons-io</artifactId>
            </exclusion>
            <!-- NOTE(review): presumably excluded to avoid clashing with the
                 logging binding supplied by spring-boot-starter; confirm. -->
            <exclusion>
                <groupId>log4j</groupId>
                <artifactId>log4j</artifactId>
            </exclusion>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-extension</artifactId>
        <version>0.7.3</version>
    </dependency>
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-selenium</artifactId>
        <version>0.7.3</version>
    </dependency>
    <dependency>
        <groupId>net.minidev</groupId>
        <artifactId>json-smart</artifactId>
        <version>2.2.1</version>
    </dependency>
    <!-- webmagic end -->

    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.49</version>
    </dependency>
    <dependency>
        <groupId>commons-lang</groupId>
        <artifactId>commons-lang</artifactId>
        <version>2.6</version>
    </dependency>
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>2.6</version>
    </dependency>
    <dependency>
        <groupId>commons-codec</groupId>
        <artifactId>commons-codec</artifactId>
        <version>1.11</version>
    </dependency>
    <dependency>
        <groupId>commons-collections</groupId>
        <artifactId>commons-collections</artifactId>
        <version>3.2.2</version>
    </dependency>
</dependencies>
在谷歌瀏覽器中訪問「愛電影」的動作片列表
按 F12 打開開發者工具,發現列表頁中的數據是通過一個 AJAX 請求獲取的,我們獲取該請求的地址:
http://m.ady01.com/rs/film/listJson/1/2?_=1555726508180
編寫抓取代碼
package com.ady01.demo1; import lombok.extern.slf4j.Slf4j; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; /** * <b>description</b>:第一個爬蟲示例,爬去動做片列表信息 <br> * <b>time</b>:2019/4/20 10:58 <br> * <b>author</b>:ready likun_557@163.com */ @Slf4j public class Ady01comPageProcessor implements PageProcessor { @Override public void process(Page page) { log.info("爬取成功!"); log.info("爬取的內容:" + page.getRawText()); } @Override public Site getSite() { return Site.me().setSleepTime(1000).setRetryTimes(3); } public static void main(String[] args) { String url = "http://m.ady01.com/rs/film/listJson/1/2?_=1555726508180"; Spider.create(new Ady01comPageProcessor()).addUrl(url).thread(1).run(); } }
運行 Ady01comPageProcessor 中的 main 方法,執行結果如下:
springboot