使用 WebCollector 爬取圖片

時間 2019-11-19

標籤 webcollection 圖片简体版

原文鏈接

/**
 * Crawler that collects the {@code <img>} URLs found in HTML pages and saves
 * every image response into a local download directory.
 *
 * <p>Images are written as {@code <id>.<extension>}: the id comes from an
 * atomic counter that continues after the highest numeric file name already
 * present in the download directory, and the extension is derived from the
 * subtype of the response's Content-Type header.
 */
public class DemoImageCrawler extends BreadthCrawler {

    /** Extension used when the Content-Type carries no usable subtype. */
    private static final String FALLBACK_EXTENSION = "img";

    // Directory in which downloaded images are stored.
    File downloadDir;

    // Atomic counter used to generate image file names.
    AtomicInteger imageId;

    /**
     * Creates the crawler and prepares the download directory.
     *
     * @param crawlPath directory used to maintain the crawl's URL state
     * @param downloadPath directory in which images are saved; created if it
     *     does not exist yet
     */
    public DemoImageCrawler(String crawlPath, String downloadPath) {
        // NOTE(review): the second argument is presumably WebCollector's
        // auto-parse switch - confirm against the library version in use.
        super(crawlPath, true);
        downloadDir = new File(downloadPath);
        if (!downloadDir.exists()) {
            downloadDir.mkdirs();
        }
        // Initialise the counter directly rather than through the public,
        // overridable computeImageId(), so no subclass code runs before this
        // object is fully constructed.
        imageId = new AtomicInteger(findMaxImageId(downloadDir));
    }

    /**
     * Handles one fetched resource: for HTML pages, queues every resolvable
     * image URL as a follow-up task; for image responses, writes the content
     * to the download directory. Any other content type is ignored.
     *
     * @param page the fetched resource
     * @param next collector for follow-up URLs
     * @throws RuntimeException wrapping the {@link IOException} if the image
     *     cannot be written
     */
    @Override
    public void visit(Page page, CrawlDatums next) {
        // Decide from the Content-Type header whether this is a page or an image.
        String contentType = page.getResponse().getContentType();
        if (contentType == null) {
            return;
        }
        // Media types are case-insensitive, so compare on a lower-cased copy.
        String normalizedType = contentType.toLowerCase(java.util.Locale.ROOT);

        if (normalizedType.contains("html")) {
            // HTML page: extract the image URLs and queue them as follow-up tasks.
            Elements imgs = page.select("img[src]");
            for (Element img : imgs) {
                String imgSrc = img.attr("abs:src");
                // "abs:src" yields an empty string when the URL cannot be
                // made absolute; queuing that would only produce a failed task.
                if (!imgSrc.isEmpty()) {
                    next.add(imgSrc);
                }
            }
        } else if (normalizedType.startsWith("image")) {
            // Image: save it straight to disk.
            String extensionName = extensionOf(normalizedType);
            String imageFileName = imageId.incrementAndGet() + "." + extensionName;
            File imageFile = new File(downloadDir, imageFileName);
            try {
                FileUtils.writeFile(imageFile, page.getContent());
                System.out.println("保存圖片 "+page.getUrl()+" 到 "+imageFile.getAbsolutePath());
            } catch (IOException ex) {
                throw new RuntimeException(
                        "Failed to save image " + page.getUrl() + " to " + imageFile.getAbsolutePath(), ex);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        DemoImageCrawler demoImageCrawler = new DemoImageCrawler("crawl", "download");
        // Add the seed URL.
        demoImageCrawler.addSeed("http://***.com/");
        // Restrict the crawl scope.
        demoImageCrawler.addRegex("http://***.com/.*");
        // Resume from the previous crawl state; otherwise every run starts over.
        demoImageCrawler.setResumable(true);
        demoImageCrawler.setThreads(30);
        // NOTE(review): presumably the maximum response size in bytes (10,000,000) - confirm.
        Config.MAX_RECEIVE_SIZE = 1000 * 1000 * 10;
        demoImageCrawler.start(3);
    }

    /**
     * Resets the image counter to the highest numeric id found among the file
     * names in the download directory, or to -1 when there is none, so the
     * next saved image gets the following id.
     */
    public void computeImageId() {
        imageId = new AtomicInteger(findMaxImageId(downloadDir));
    }

    /**
     * Returns the largest integer that appears before the first dot of any
     * file name in {@code dir}, or -1 if there is no such file.
     */
    private static int findMaxImageId(File dir) {
        int maxId = -1;
        // listFiles() returns null when dir is not a readable directory.
        File[] existing = dir.listFiles();
        if (existing == null) {
            return maxId;
        }
        for (File file : existing) {
            String fileName = file.getName();
            int dot = fileName.indexOf('.');
            String idStr = dot < 0 ? fileName : fileName.substring(0, dot);
            try {
                int id = Integer.parseInt(idStr);
                if (id > maxId) {
                    maxId = id;
                }
            } catch (NumberFormatException ignored) {
                // Not a file written by this crawler (e.g. ".DS_Store"); skip it.
            }
        }
        return maxId;
    }

    /**
     * Derives a file extension from the subtype of a media type, e.g.
     * {@code "image/jpeg; charset=x"} gives {@code "jpeg"}. Parameters after
     * ';' are dropped and only letters, digits, '+' and '-' are kept, because
     * the header value is controlled by the remote server and ends up in a
     * local file name.
     */
    private static String extensionOf(String contentType) {
        int slash = contentType.indexOf('/');
        if (slash < 0) {
            return FALLBACK_EXTENSION;
        }
        StringBuilder extension = new StringBuilder();
        for (int i = slash + 1; i < contentType.length(); i++) {
            char c = contentType.charAt(i);
            if (c == ';' || c == '/') {
                break;
            }
            boolean safe = (c >= 'a' && c <= 'z')
                    || (c >= 'A' && c <= 'Z')
                    || (c >= '0' && c <= '9')
                    || c == '+'
                    || c == '-';
            if (safe) {
                extension.append(c);
            }
        }
        return extension.length() == 0 ? FALLBACK_EXTENSION : extension.toString();
    }

}

此爲代碼 demo，下圖爲 jar 包；已測試，能夠爬取圖片資源。

所須要的jar包爲java

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。