Saving crawl results to an Excel spreadsheet
The official documentation does not provide a tutorial on exporting to Excel, so here is one.
Out of personal preference I like Gradle, so the Gradle configuration comes first.
// crawler packages
compile group: 'us.codecraft', name: 'webmagic-core', version: '0.7.3'
compile group: 'us.codecraft', name: 'webmagic-extension', version: '0.7.3'
// POI package for Office file operations
compile group: 'org.apache.poi', name: 'poi', version: '4.0.1'
Here is the Maven version as well.
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>4.0.1</version>
</dependency>
<!-- crawler packages -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>
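Before wiring POI into the crawler, you may want to confirm the dependency resolves and writes a valid .xls file. A minimal standalone sketch (class name, sheet name, and output file name are arbitrary examples, not part of the tutorial's code) could look like this:

import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;

import java.io.FileOutputStream;

public class PoiSmokeTest {
    public static void main(String[] args) throws Exception {
        // Create a workbook with one sheet and a single row
        HSSFWorkbook workbook = new HSSFWorkbook();
        HSSFSheet sheet = workbook.createSheet("test");
        HSSFRow row = sheet.createRow(0);
        row.createCell(0).setCellValue("hello poi");
        // Write the .xls file into the working directory
        try (FileOutputStream out = new FileOutputStream("smoke-test.xls")) {
            workbook.write(out);
        }
        workbook.close();
    }
}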
Here I implement the Pipeline interface, which is where WebMagic hands over the extracted results; the saving of the data is done inside it.
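For reference, the Pipeline interface in webmagic-core 0.7.3 declares a single callback, roughly of the following shape; WebMagic invokes it once per successfully processed page, potentially from several worker threads:

public interface Pipeline {
    // Receives the fields that the PageProcessor extracted for one page
    void process(ResultItems resultItems, Task task);
}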
The save() method is declared synchronized so that concurrent calls from multiple crawler threads do not cause thread-safety problems while writing the workbook.
The full Java code is shown below.
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.FilePersistentBase;

import java.io.FileOutputStream;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;

/**
 * Export crawl results to Excel.
 *
 * extends:
 *   FilePersistentBase - base class for file persistence; provides helpers such as directory creation
 *
 * implements:
 *   PageProcessor - the crawler's page-handling logic
 *   Pipeline      - handling of the extracted results
 */
public class WebmagicAndPoiDemo extends FilePersistentBase implements PageProcessor, Pipeline {

    private String filename;
    private int rows = 0;
    private HSSFWorkbook workbook;
    private HSSFSheet sheet;
    private Site site;
    private Logger logger;

    public WebmagicAndPoiDemo() {
        logger = LoggerFactory.getLogger(getClass());
        site = Site.me().setTimeOut(1000).setRetryTimes(3);
        // Set the save path
        setPath("G:\\IdeaProjects\\WebMagicDemo\\Temp\\");
        filename = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()) + ".xls";
        // Create the workbook (the sheet name can also be set here)
        workbook = new HSSFWorkbook();
        // Create the worksheet
        sheet = workbook.createSheet("crawl results");
        // Create the header row
        HSSFRow row = sheet.createRow(rows);
        row.createCell(0).setCellValue("id");
        row.createCell(1).setCellValue("name");
        row.createCell(2).setCellValue("link");
        rows++;
    }

    @Override
    public Site getSite() {
        return site;
    }

    /** Pipeline method: receives the extracted fields of each page. */
    @Override
    public void process(ResultItems resultItems, Task task) {
        List<String> hrefs = resultItems.get("href");
        List<String> texts = resultItems.get("text");
        logger.debug(hrefs.toString());
        logger.debug(texts.toString());
        for (int i = 0; i < hrefs.size(); i++) {
            // Append one row per extracted link
            HSSFRow row = sheet.createRow(rows);
            row.createCell(0).setCellValue(rows);
            row.createCell(2).setCellValue(hrefs.get(i));
            row.createCell(1).setCellValue(texts.get(i));
            rows++;
        }
        save();
    }

    /** Write the workbook to disk; synchronized to keep multi-threaded runs safe. */
    private synchronized void save() {
        try {
            FileOutputStream out = new FileOutputStream(getFile(this.path).getPath() + "\\" + filename);
            workbook.write(out);
            out.close();
            logger.info(this.path + "\\" + filename + " saved");
        } catch (IOException e) {
            logger.warn("save failed", e);
        }
    }

    /** PageProcessor method: extracts the post titles and links from the page. */
    @Override
    public void process(Page page) {
        Selectable html = page.getHtml();
        Selectable href = html.$(".postTitle2", "href");
        Selectable text = html.$(".postTitle2", "text");
        page.putField("href", href.all());
        page.putField("text", text.all());
    }

    public static void main(String[] args) {
        // Crawl the posts of my own blog
        WebmagicAndPoiDemo app = new WebmagicAndPoiDemo();
        Spider.create(app).addPipeline(app)
                .addUrl("https://www.cnblogs.com/xiaoshuai123/").thread(1).run();
    }
}
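If you prefer the newer .xlsx format, the same pattern works with XSSFWorkbook from the poi-ooxml artifact (an additional dependency that is not listed above). The following is only a minimal sketch of a standalone pipeline under that assumption; the class name, sheet name, and output file name are placeholders:

import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;

public class XlsxPipeline implements Pipeline {
    private final XSSFWorkbook workbook = new XSSFWorkbook();
    private final Sheet sheet = workbook.createSheet("results");
    private int rows = 0;

    @Override
    public synchronized void process(ResultItems resultItems, Task task) {
        List<String> hrefs = resultItems.get("href");
        List<String> texts = resultItems.get("text");
        if (hrefs == null || texts == null) {
            return; // the page yielded no extracted fields
        }
        for (int i = 0; i < hrefs.size(); i++) {
            // One row per extracted link: title first, URL second
            Row row = sheet.createRow(rows++);
            row.createCell(0).setCellValue(texts.get(i));
            row.createCell(1).setCellValue(hrefs.get(i));
        }
        save();
    }

    private void save() {
        try (FileOutputStream out = new FileOutputStream("results.xlsx")) {
            workbook.write(out);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

A standalone pipeline like this can be combined with any PageProcessor via Spider.create(processor).addPipeline(new XlsxPipeline()), which keeps the page-parsing and result-saving concerns in separate classes.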