webmagic 二次開發爬蟲爬取網站圖片

時間 2019-12-02

原文原文鏈接

webmagic是一個無須配置、便於二次開發的爬蟲框架，它提供簡單靈活的API，只需少許代碼便可實現一個爬蟲。

webmagic的使用文檔：http://webmagic.io/docs/

webmagic的設計文檔：webmagic的設計機制及原理-如何開發一個Java爬蟲

1.編寫一個核心的url過濾類

 1 package com.xwer.spider.main;
 2 
 3 import java.util.List;
 4 
 5 import org.apache.log4j.Logger;
 6 
 7 import us.codecraft.webmagic.Page;
 8 import us.codecraft.webmagic.Site;
 9 import us.codecraft.webmagic.processor.PageProcessor;
10 import us.codecraft.webmagic.utils.UrlUtils;
11 
12 /**
13  * 定製爬蟲邏輯的核心類
14  * @author xwer
15  *
16  */
17 public class MM_Processor implements PageProcessor {
18     private Logger logger = Logger.getLogger(this.getClass());
19     // 部分一：抓取網站的相關配置，包括編碼、抓取間隔、重試次數等
20     private Site site = Site.me().setRetryTimes(5).setSleepTime(1000);
21     // 網頁匹配規則
22     private String urlPattern;
23     public MM_Processor(String startUrl, String urlPattern) {
24         // 設置所屬域
25         this.site = Site.me().setDomain(UrlUtils.getDomain(startUrl));
26         this.urlPattern = urlPattern;
27     }
28 
29     @Override
30     // process是定製爬蟲邏輯的核心接口，在這裏編寫抽取邏輯
31     public void process(Page page) {
32         site.setUserAgent("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)");
33         //圖片抓取規則
34         String imgRegex3 = "http://mm.howkuai.com/wp-content/uploads/20[0-9]{2}[a-z]/[0-9]{2}/[0-9]{2}/[0-9]{1,4}.jpg";
35         // 獲取目標連接 例如 http://www.meizitu.com/a/5535.html
36         List<String> requests = page.getHtml().links().regex(urlPattern).all();
37         logger.info("獲取到的目標連接是: "+requests);
38         logger.info("添加連接( "+requests.size()+" )條到集合");
39                   
40          //將獲取的連接存入到targetRequests中(list集合)
41         page.addTargetRequests(requests);
42         logger.info("隊列中存儲的連接數是: "+page.getResultItems().getAll().size());
43         
44         // 圖片的title,標題名稱,用於設定文件夾的名稱 
45          String imgHostFileName = page.getHtml().xpath("//title/text()").replace("\\p{Punct}", "").toString();
46         logger.info("獲取的標題是"+imgHostFileName);
47         
48         List<String> listProcess = page.getHtml().regex(imgRegex3).all();
49         logger.info("存入的圖片地址: "+listProcess);
50         // 此處將標題一併抓取，以後提取出來做爲文件名
51         listProcess.add(0, imgHostFileName);
52         logger.info("存入的圖片連接數量是: "+listProcess.size());
53         //將獲取到的頁面的數據放到resultItems集合中(map)
54         page.putField("img", listProcess);
55     }
56     @Override
57     public Site getSite() {
58         return site;
59     }
60 }

2.對獲取的結果進行持久化處理

 1 package com.xwer.spider.main;
 2 
 3 import java.util.ArrayList;
 4 import java.util.List;
 5 import java.util.Map;
 6 import org.apache.log4j.Logger;
 7 import com.xwer.spider.utils.DownLoadUtils;
 8 import us.codecraft.webmagic.ResultItems;
 9 import us.codecraft.webmagic.Task;
10 import us.codecraft.webmagic.pipeline.Pipeline;
11 
12 /**
13  * 處理
14  * @author xwer
15  *
16  */
17 public class MM_Pipeline implements Pipeline {
18     private Logger logger = Logger.getLogger(this.getClass());
19     private String path;
20 
21     public MM_Pipeline() {
22         setPath("/MM/");
23     }
24 
25     public MM_Pipeline(String path) {
26         setPath(path);
27     }
28 
29     public void setPath(String path) {
30         this.path = path;
31     }
32 
33     // 處理下載的方法
34     @Override
35     public void process(ResultItems resultItems, Task task) {
36         logger.info("到了process" + resultItems);
37         String fileStorePath = this.path;
38         for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
39             if (entry.getValue() instanceof List) {
40                 List<String> list = new ArrayList<String>((List) entry.getValue());    
41                 //取出以前存的網頁的標題,拼接成一個新的文件夾名稱
42                 fileStorePath = new StringBuffer(fileStorePath)
43                                     .append(list.get(0))
44                                     .append("\\").toString();
45                 //遍歷圖片連接list
46                 for (int i = 1; i < list.size(); i++) {
47                     // 獲取文件惟一名字
48                     String realname = DownLoadUtils.subFileName(list.get(i));
49                     String uuidname = DownLoadUtils.generateRandonFileName(realname);
50                     // 這裏經過本身寫的下載工具前抓取到的圖片網址，並放在對應的文件中
51                     try {
52                         DownLoadUtils.download(list.get(i), uuidname, fileStorePath);
53                         logger.info("文件" + uuidname +"已經下載完畢");
54                     } catch (Exception e) {
55                         logger.warn("文件下載異常" + list.get(i));
56                         e.printStackTrace();
57                     }
58                 }
59             }
60             else {
61                 System.out.println(entry.getKey() + ":\t" + entry.getValue());
62             }
63         }
64     }
65 }

3.編寫一個下載圖片的工具類

 1 package com.xwer.spider.utils;
 2 
 3 import java.io.File;
 4 import java.io.FileOutputStream;
 5 import java.io.InputStream;
 6 import java.io.OutputStream;
 7 import java.net.URL;
 8 import java.net.URLConnection;
 9 import java.util.UUID;
10 
/**
 * Helpers for downloading files and deriving local file names.
 *
 * @author xwer
 */
public class DownLoadUtils {

    /** User-Agent header sent with every download request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)";

    /** Maximum time to establish the connection, in milliseconds. */
    private static final int CONNECT_TIMEOUT_MS = 5 * 1000;

    /** Maximum time to wait for data once connected, in milliseconds. */
    private static final int READ_TIMEOUT_MS = 30 * 1000;

    /**
     * Downloads a resource and stores it as {@code savePath/filename}.
     *
     * @param urlString address of the resource to download
     * @param filename  name of the file to create
     * @param savePath  directory to store the file in; created when missing
     * @throws Exception if the URL is malformed, the directory cannot be created,
     *                   or reading/writing fails
     */
    public static void download(String urlString, String filename, String savePath) throws Exception {
        URL url = new URL(urlString);
        URLConnection con = url.openConnection();
        con.addRequestProperty("User-Agent", USER_AGENT);
        con.setConnectTimeout(CONNECT_TIMEOUT_MS);
        // Without a read timeout a stalled server would block the worker forever.
        con.setReadTimeout(READ_TIMEOUT_MS);

        InputStream is = con.getInputStream();
        try {
            File dir = new File(savePath);
            // mkdirs() also returns false when another thread created the directory
            // first, so only fail when the directory really does not exist.
            if (!dir.mkdirs() && !dir.isDirectory()) {
                throw new java.io.IOException("Cannot create directory: " + dir);
            }
            // File(parent, child) inserts the platform's separator.
            OutputStream os = new FileOutputStream(new File(dir, filename));
            try {
                byte[] buffer = new byte[8192];
                int len;
                while ((len = is.read(buffer)) != -1) {
                    os.write(buffer, 0, len);
                }
            } finally {
                os.close();
            }
        } finally {
            is.close();
        }
    }

    /**
     * Returns the last segment of a path or URL.
     *
     * @param fileName a file path or URL, using {@code /} or {@code \} as separator
     * @return the text after the last separator, or the input itself when it
     *         contains no separator
     */
    public static String subFileName(String fileName) {
        // URLs use '/', Windows paths use '\'; cut after whichever comes last.
        int index = Math.max(fileName.lastIndexOf('/'), fileName.lastIndexOf('\\'));
        return fileName.substring(index + 1);
    }

    /**
     * Builds a random, collision-free file name that keeps the original extension.
     *
     * @param fileName original file name, path or URL
     * @return 32 hexadecimal characters followed by the extension of
     *         {@code fileName} (including the dot), or by nothing when the last
     *         path segment has no extension
     */
    public static String generateRandonFileName(String fileName) {
        // Only look at the last segment so a dot in the host or a directory name
        // is never mistaken for the extension.
        String baseName = subFileName(fileName);
        int dot = baseName.lastIndexOf('.');
        String ext = dot == -1 ? "" : baseName.substring(dot);
        return UUID.randomUUID().toString().replace("-", "") + ext;
    }
}

4.配置一個日誌的輸出文件(用於打印日誌)

 1 ### direct log messages to stdout ###
 2 log4j.appender.stdout=org.apache.log4j.ConsoleAppender
 3 log4j.appender.stdout.Target=System.out
 4 log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
 5 log4j.appender.stdout.layout.ConversionPattern=%d{ABSOLUTE} %5p %c{1}:%L - %m%n
 6 
 7 ### direct messages to file c:/mylog3.log ###
 8 log4j.appender.file=org.apache.log4j.FileAppender
 9 log4j.appender.file.File=c:/mylog3.log
10 log4j.appender.file.layout=org.apache.log4j.PatternLayout
11 log4j.appender.file.layout.ConversionPattern=%d{ABSOLUTE} %5p %c{1}:%L - %m%n
12 
13 ### set log levels - for more verbose logging change 'info' to 'debug' ###
14 log4j.rootLogger=info, stdout ,file

5.編寫程序的入口類

 1 package com.xwer.spider.main;
 2 
 3 import java.util.regex.Pattern;
 4 
 5 import org.junit.Test;
 6 
 7 
 8 
 9 import us.codecraft.webmagic.Spider;
10 import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
11 
12 public class MM_test {
13     public static void main(String[] args) {
14         //圖片的存放路徑,PiPline須要用到
15         String fileStorePath = "D:\\test\\";
16         
17         //過濾網頁的正則  http://www.meizitu.com/a/more_1.html
18         String urlPattern = "http://www.meizitu.com/[a-z]/[0-9]{1,4}.html";
19         //自定義的解析器核心
20         MM_Processor mmSprider = new MM_Processor("http://www.meizitu.com/", urlPattern);
21         
22         //設置一些種子連接
23         String[] urls ={"http://www.meizitu.com/",
24                         "http://www.meizitu.com/a/4221.html",
25                         "http://www.meizitu.com/a/4467.html",
26                         "http://www.meizitu.com/a/5467.html",
27                         "http://www.meizitu.com/a/5065.html",
28                         "http://www.meizitu.com/a/4278.html",
29                         "http://www.meizitu.com/a/699.html",
30                         };
31         //啓動爬蟲
32         Spider.create(mmSprider).addUrl(urls)
33                                 .setScheduler(new FileCacheQueueScheduler("D:\\webmagic\\cach"))
34                                 .addPipeline(new MM_Pipeline(fileStorePath))
35                                 .thread(10)
36                                 .run();
37     }
38     
39     
40