上一篇:Java網絡爬蟲實操(3)
本篇文章繼續圍繞NetDiscovery框架中pipeline的用法,結合另外一個專門爬圖片的框架PicCrawler,實現圖片的批量下載和信息的存儲。順便介紹一下Vert.X框架中的mongo基礎操作。
// Dependencies to add for this article, on top of those introduced in the earlier articles
implementation 'io.vertx:vertx-mongo-client:3.5.0'
implementation 'com.cv4j.piccrawler:crawler:1.0.0'
複製代碼
package com.sinkinka.parser;
import com.cv4j.netdiscovery.core.domain.Page;
import com.cv4j.netdiscovery.core.domain.ResultItems;
import com.cv4j.netdiscovery.core.parser.Parser;
import com.cv4j.netdiscovery.core.parser.selector.Selectable;
import java.util.ArrayList;
import java.util.List;
/**
 * Parser that collects the image URLs found on one gallery page.
 *
 * <p>Each list item matched by {@link #ITEM_XPATH} is queried for the {@code src} of an
 * {@code <img>}; the collected URLs are stored in the page's {@link ResultItems} under the key
 * {@code "needDownloadImage"}.
 */
public class GirlParser implements Parser {

    /** XPath selecting every list item of the gallery on the page. */
    private static final String ITEM_XPATH = "//div[@class='contLeftA']/ul[@class='artCont cl']/li";

    /**
     * Extracts the image URLs of the page and publishes them as {@code "needDownloadImage"}.
     *
     * @param page the downloaded page to parse
     */
    @Override
    public void process(Page page) {
        List<Selectable> liList = page.getHtml().xpath(ITEM_XPATH).nodes();
        List<String> imgUrlList = new ArrayList<>(liList.size());
        for (Selectable li : liList) {
            String imageUrl = li.xpath("//img/@src").get();
            // A list item without an image yields no usable URL (presumably null from get() —
            // confirm against the Selectable contract); never hand such entries to the downloader.
            if (imageUrl == null || imageUrl.trim().isEmpty()) {
                continue;
            }
            imgUrlList.add(imageUrl);
        }
        page.getResultItems().put("needDownloadImage", imgUrlList);
    }
}
複製代碼
package com.sinkinka.pipeline;
import com.cv4j.netdiscovery.core.domain.ResultItems;
import com.cv4j.netdiscovery.core.pipeline.Pipeline;
import com.cv4j.piccrawler.PicCrawlerClient;
import com.cv4j.piccrawler.download.strategy.FileGenType;
import com.cv4j.piccrawler.download.strategy.FileStrategy;
import java.util.List;
/**
 * Pipeline that downloads the images whose URLs are stored under {@code "needDownloadImage"}
 * and records how many URLs were submitted under {@code "savecount"}.
 */
public class SaveGirlImage implements Pipeline {

    /**
     * Downloads every URL found in {@code "needDownloadImage"} and stores the number of
     * submitted URLs in {@code "savecount"}.
     *
     * <p>When the entry is missing or empty nothing is downloaded and {@code "savecount"} is
     * set to 0, instead of failing with a {@link NullPointerException}.
     *
     * @param resultItems the shared result container of the current page
     */
    @Override
    public void process(ResultItems resultItems) {
        List<String> urls = resultItems.get("needDownloadImage");
        if (urls == null || urls.isEmpty()) {
            // Nothing was extracted for this page: skip the downloader but still publish a count.
            resultItems.put("savecount", 0);
            return;
        }

        // 1. Download the images.
        PicCrawlerClient.get()
                .timeOut(5000)
                .fileStrategy(new FileStrategy() {
                    @Override
                    public String filePath() {
                        return "temp"; // folder the images are saved to
                    }

                    @Override
                    public String picFormat() {
                        return "jpg"; // format the images are saved in
                    }

                    @Override
                    public FileGenType genType() {
                        return FileGenType.AUTO_INCREMENT; // rule used to generate file names
                    }
                })
                .build()
                .autoReferer() // set the referer automatically
                .downloadPics(urls); // the key call: hand over the whole list

        // 2. Publish the count for a later pipeline. Note this is the number of URLs submitted,
        //    not a verified number of files written.
        resultItems.put("savecount", urls.size());
    }
}
複製代碼
package com.sinkinka.pipeline;
import com.cv4j.netdiscovery.core.domain.ResultItems;
import com.cv4j.netdiscovery.core.pipeline.Pipeline;
import io.vertx.core.AsyncResult;
import io.vertx.core.Handler;
import io.vertx.core.json.JsonObject;
import io.vertx.ext.mongo.MongoClient;
import java.util.Date;
/**
 * Pipeline that writes a small log document ({@code savecount}, {@code savetime}) to a MongoDB
 * collection through the Vert.x {@link MongoClient}.
 */
public class SaveGirlImageLog implements Pipeline {

    /** Vert.x based Mongo client used to store the log document. */
    private final MongoClient mongoClient;

    /** Name of the collection the log document is saved to. */
    private final String collectionName;

    /**
     * @param mongoClient    Vert.x Mongo client used for saving
     * @param collectionName target collection name
     */
    public SaveGirlImageLog(MongoClient mongoClient, String collectionName) {
        this.mongoClient = mongoClient;
        this.collectionName = collectionName;
    }

    /**
     * Builds the log document from {@code "savecount"} and the current time and saves it.
     * The outcome is reported from the handler passed to {@code save}.
     *
     * @param resultItems the shared result container of the current page
     * @throws NumberFormatException if {@code "savecount"} holds a non-numeric value
     */
    @Override
    public void process(ResultItems resultItems) {
        // Assemble the data to store.
        JsonObject jsonObject = new JsonObject();
        jsonObject.put("savecount", readSaveCount(resultItems));
        jsonObject.put("savetime", new Date().getTime());

        // 1: store into MongoDB, written as an anonymous Handler.
        mongoClient.save(collectionName, jsonObject, new Handler<AsyncResult<String>>() {
            @Override
            public void handle(AsyncResult<String> response) {
                if (response.succeeded()) {
                    System.out.println("save success, new id=" + response.result());
                } else {
                    System.out.println("save failure");
                    response.cause().printStackTrace();
                }
            }
        });

        // 2: the same call written as a lambda expression (kept for illustration).
        // mongoClient.save(collectionName, jsonObject, response -> {
        //     if (response.succeeded()) {
        //         System.out.println("save success, new id=" + response.result());
        //     } else {
        //         System.out.println("save failure");
        //         response.cause().printStackTrace();
        //     }
        // });
    }

    /**
     * Reads {@code "savecount"} without assuming it is present: a missing entry counts as 0,
     * a {@link Number} is used directly, anything else is parsed from its string form.
     */
    private static int readSaveCount(ResultItems resultItems) {
        Object raw = resultItems.get("savecount");
        if (raw == null) {
            return 0;
        }
        if (raw instanceof Number) {
            return ((Number) raw).intValue();
        }
        return Integer.parseInt(raw.toString());
    }
}
複製代碼
package com.sinkinka;
import com.cv4j.netdiscovery.core.Spider;
import com.sinkinka.parser.GirlParser;
import com.sinkinka.pipeline.SaveGirlImage;
import com.sinkinka.pipeline.SaveGirlImageLog;
import io.vertx.core.Vertx;
import io.vertx.core.json.JsonObject;
import io.vertx.ext.mongo.MongoClient;
/**
 * Entry point of the example: crawls one gallery page, downloads its images and writes a log
 * document to MongoDB.
 */
public class GirlSpider {

    /**
     * Creates the Vert.x Mongo client, wires parser and pipelines into a spider and runs it.
     * The pipelines are registered as {@code SaveGirlImage} first, {@code SaveGirlImageLog} second.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Vert.x Mongo client; log documents go to the "SaveLog" collection.
        Vertx vertx = Vertx.vertx();
        JsonObject mongoConfig = getDatabaseConfig();
        MongoClient logStore = MongoClient.createShared(vertx, mongoConfig);

        String startUrl = "http://www.woyaogexing.com/touxiang/nv/2018/586210.html";

        Spider.create()
                .name("getGirlImage")
                .url(startUrl)
                .parser(new GirlParser())
                .pipeline(new SaveGirlImage())
                .pipeline(new SaveGirlImageLog(logStore, "SaveLog"))
                .run();
    }

    /**
     * Builds the configuration passed to the Mongo client: a local server and the {@code test}
     * database. Credentials are left out (see the commented entries).
     *
     * @return the client configuration
     */
    public static JsonObject getDatabaseConfig() {
        return new JsonObject()
                .put("connection_string", "mongodb://127.0.0.1:27017")
                .put("db_name", "test");
        // .put("username", "")
        // .put("password", "")
    }
}
複製代碼
本篇的mongo操作使用的類是:io.vertx.ext.mongo.MongoClient。Vert.X的MongoClient提供的方法都是異步非阻塞的,非常靈活。
利用框架我們能快捷地實現一個圖片爬蟲程序,本地有開發環境的話,幾分鐘就能搞定。以上例子僅僅是拋磚引玉,大家自由發揮其中的使用場景吧。
圖片爬蟲框架PicCrawler還有很多強大的用法,大家有興趣可以去github上詳細瞭解吧。
下一篇:Java網絡爬蟲實操(5)