public class DemoImageCrawler extends BreadthCrawler { //用於保存圖片的文件夾 File downloadDir; //原子性int,用於生成圖片文件名 AtomicInteger imageId; /** * * @param crawlPath 用於維護URL的文件夾 * @param downloadPath 用於保存圖片的文件夾 */ public DemoImageCrawler(String crawlPath, String downloadPath) { super(crawlPath, true); downloadDir = new File(downloadPath); if(!downloadDir.exists()){ downloadDir.mkdirs(); } computeImageId(); } @Override public void visit(Page page, CrawlDatums next) { //根據http頭中的Content-Type信息來判斷當前資源是網頁仍是圖片 String contentType = page.getResponse().getContentType(); if(contentType==null){ return; }else if (contentType.contains("html")) { //若是是網頁,則抽取其中包含圖片的URL,放入後續任務 Elements imgs = page.select("img[src]"); for (Element img : imgs) { String imgSrc = img.attr("abs:src"); next.add(imgSrc); } } else if (contentType.startsWith("image")) { //若是是圖片,直接下載 String extensionName=contentType.split("/")[1]; String imageFileName=imageId.incrementAndGet()+"."+extensionName; File imageFile=new File(downloadDir,imageFileName); try { FileUtils.writeFile(imageFile, page.getContent()); System.out.println("保存圖片 "+page.getUrl()+" 到 "+imageFile.getAbsolutePath()); } catch (IOException ex) { throw new RuntimeException(ex); } } } public static void main(String[] args) throws Exception { DemoImageCrawler demoImageCrawler = new DemoImageCrawler("crawl", "download"); //添加種子URL demoImageCrawler.addSeed("http://***.com/"); //限定爬取範圍 demoImageCrawler.addRegex("http://***.com/.*"); //設置爲斷點爬取,不然每次開啓爬蟲都會從新爬取 demoImageCrawler.setResumable(true); demoImageCrawler.setThreads(30); Config.MAX_RECEIVE_SIZE = 1000 * 1000 * 10; demoImageCrawler.start(3); } public void computeImageId(){ int maxId=-1; for(File imageFile:downloadDir.listFiles()){ String fileName=imageFile.getName(); String idStr=fileName.split("\\.")[0]; int id=Integer.valueOf(idStr); if(id>maxId){ maxId=id; } } imageId=new AtomicInteger(maxId); } }
此爲代碼 demo,下圖爲所需的 jar 包;已測試,能夠爬取圖片資源。
所需要的 jar 包爲: