常規網頁展現部分通常分爲列表頁和詳情頁。Tumblr站是請求後臺api返回json,例如整站搜索接口分頁請求,參數加密暫時無解;可是對應blog的列表,可採用不需加密的接口,因此仍是有辦法作對應blog列表的爬蟲處理。
Tumblr站後端json返回格式各類亂,有的json內含有html字符串,格式化時要注意容錯,各類坑。
還有一個簡單去重處理:每一個列表信息均作入庫處理,但下載時會用視頻封面作一次md5比較去重,重複則不下載當前視頻,但仍將重複視頻數據入庫。(下期會細說不一樣來源視頻下載的區別,以及彙總處理。)
本人的springboot採用多線程定時器,分別定時跑爬蟲數據和下載爬蟲資源,之後會在這裏慢慢列出解決方法。
SpiderTumblrService爲一些數據的入庫處理;SslDownloader爲webmagic處理https請求。
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <!--<version>0.7.2</version>-->
    <version>0.5.2</version>
    <exclusions>
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
    </exclusions>
</dependency>
package win.raychow.modules.spider.base.processor;

import com.alibaba.fastjson.JSON;
import org.json.XML;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import win.raychow.core.base.dao.CacheKey;
import win.raychow.core.base.service.HtmlTool;
import win.raychow.demo.spider.tool.SslDownloader;
import win.raychow.modules.spider.base.dao.SpiderTumblr;
import win.raychow.modules.spider.base.domain.TumblrRecModel;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Crawls Tumblr blog video lists through the unauthenticated
 * {@code /api/read?type=video} XML endpoint, converts each post into a
 * {@link SpiderTumblr} row and hands the batch to {@link TumblrPipeLine}.
 *
 * Created by ray on 2017/11/19.
 */
@Service
public class TumblrProcessor implements PageProcessor {

    private final Logger logger = LoggerFactory.getLogger(this.getClass());

    @Autowired
    TumblrPipeLine pipeLine;

    // Comma-style (CacheKey.Split) separated blog-host prefixes per category.
    @Value("${spider.tumblr.prefixSexList}")
    private String prefixSexList;

    @Value("${spider.tumblr.prefixAnimalList}")
    private String prefixAnimalList;

    // Shared URL tail: page size is fixed at 20, "start=" is appended later.
    // NOTE(review): name kept as "bashUrl" (likely a typo for baseUrl) because
    // it is public and may be referenced elsewhere.
    public final static String bashUrl = ".tumblr.com/api/read?type=video&num=20&start=";

    /**
     * Maps a seed URL (or blog prefix) to a category constant by checking
     * which configured prefix list it contains.
     *
     * @param url seed URL or blog-host prefix
     * @return SpiderTumblr.Category_AV / Category_Animal, or Category_Null
     *         when no configured prefix matches
     */
    private String getCategory(String url) {
        for (String id : prefixSexList.split(CacheKey.Split)) {
            if (url.contains(id)) {
                return SpiderTumblr.Category_AV;
            }
        }
        for (String id : prefixAnimalList.split(CacheKey.Split)) {
            if (url.contains(id)) {
                return SpiderTumblr.Category_Animal;
            }
        }
        return SpiderTumblr.Category_Null;
    }

    @Override
    public Site getSite() {
        //HttpHost httpHost = new HttpHost("127.0.0.1", 1087);
        return Site.me()
                //.setHttpProxy(httpHost)
                .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36")
                .setSleepTime(30 * 1000)   // throttle: 30s between requests
                .setTimeOut(20 * 1000)
                .setRetryTimes(3)
                .setCycleRetryTimes(3);
    }

    /**
     * Parses one list page: the endpoint returns XML, which is converted to
     * JSON (XML.toJSONObject) and bound to {@link TumblrRecModel}. On the
     * first page (marked with the "1&fffff=0" placeholder) follow-up page
     * requests are scheduled from the reported total; every post's embedded
     * video-player HTML is then scraped for id/poster/source attributes.
     */
    @Override
    public void process(Page page) {
        String pageUrl = page.getUrl().toString();
        logger.info(pageUrl);
        if (!pageUrl.contains(bashUrl)) {
            return;
        }
        try {
            String xml = page.getJson().toString();
            String json = XML.toJSONObject(xml).toString();
            TumblrRecModel rec = JSON.parseObject(json, TumblrRecModel.class);
            List<TumblrRecModel.Post> posts = rec.getTumblr().getPosts().getPost();
            TumblrRecModel.Tumblelog tumblelog = rec.getTumblr().getTumblelog();

            // First page only: fan out requests for the remaining pages.
            if (pageUrl.contains("1&fffff=0")) {
                addPagingRequests(page, pageUrl, rec);
            }

            if (posts.isEmpty()) {
                return;
            }

            List<SpiderTumblr> list = new ArrayList<>();
            for (TumblrRecModel.Post post : posts) {
                String str = "";
                try {
                    str = post.getVideoPlayer().get(0);
                    // Normalize quotes so the attribute matcher sees one style.
                    str = str.replace("\"", "'");
                    list.add(buildEntity(pageUrl, tumblelog, post, str));
                } catch (Exception e) {
                    // One malformed post must not abort the whole page.
                    logger.error("xml to data error :" + str, e);
                }
            }
            if (!list.isEmpty()) {
                page.putField("type", 0);
                page.putField("data", list);
            }
        } catch (Exception e) {
            logger.error("url:" + pageUrl, e);
        }
    }

    /** Schedules requests for pages 2..N based on the reported post total (20 per page). */
    private void addPagingRequests(Page page, String pageUrl, TumblrRecModel rec) {
        List<String> requestUrls = new ArrayList<>();
        long total = Long.valueOf(rec.getTumblr().getPosts().getTotal());
        long pageMax = total / 20 + 1;
        for (int j = 1; j < pageMax; j++) {
            // Replace the first-page placeholder with the real start offset.
            requestUrls.add(pageUrl.replace("1&fffff=0", String.valueOf(20 * j)));
        }
        page.addTargetRequests(requestUrls);
    }

    /** Scrapes one post's video-player HTML into a SpiderTumblr entity. */
    private SpiderTumblr buildEntity(String pageUrl, TumblrRecModel.Tumblelog tumblelog,
                                     TumblrRecModel.Post post, String str) {
        String id = HtmlTool.match(str, "video", "id").get(0);
        String poster = HtmlTool.match(str, "video", "poster").get(0);
        String optionsJson = HtmlTool.match(str, "video", "data-crt-options").get(0);
        TumblrRecModel.Options optionsRec = JSON.parseObject(optionsJson, TumblrRecModel.Options.class);
        String file = HtmlTool.match(str, "source", "src").get(0);

        // Container type from the MIME type embedded in the player HTML.
        String lower = str.toLowerCase();
        String type = "";
        if (lower.contains("video/mp4")) {
            type = "mp4";
        } else if (lower.contains("video/ogg")) {
            type = "ogg";
        } else if (lower.contains("video/webm")) {
            type = "webm";
        }

        // Category was smuggled through the seed URL as a fake query param.
        String category = pageUrl.split("&ggggg=")[1].toLowerCase();

        // Prefer the HD source when the options blob provides one.
        if (optionsRec.getHdUrl().length() > 10) {
            file = optionsRec.getHdUrl();
        }

        String videoCaption = HtmlTool.removeHtmlTag(post.getVideoCaption());
        // Last path segment of the post URL, without the leading slash.
        String videoId = "tumblr_" + post.getUrl().substring(post.getUrl().lastIndexOf("/")).substring(1);

        SpiderTumblr tumblr = new SpiderTumblr();
        tumblr.setVideoId(videoId);
        tumblr.setPosterImage(poster);
        tumblr.setVideoImage(optionsRec.getFilmstrip().getUrl());
        tumblr.setVideoUrl(file);
        tumblr.setVideoType(type);
        tumblr.setTitle(videoCaption);
        tumblr.setBaseUrl(post.getUrl());
        tumblr.setCategory(category);
        tumblr.setBlogTitle(tumblelog.getTitle());
        return tumblr;
    }

    /**
     * Builds the seed URL list from both configured prefix lists and runs the
     * spider synchronously.
     */
    public void run() {
        // BUGFIX: was Spider.create(new TumblrProcessor()) — a fresh, non-Spring
        // instance whose @Autowired/@Value fields are never injected, so
        // process() would NPE on prefixSexList. Reuse this managed bean instead.
        Spider spider = Spider.create(this)
                //.setDownloader(new HttpClientDownloader())
                //.setDownloader(new HttpDownloader())
                .setDownloader(new SslDownloader())
                //.addPipeline(new ConsolePipeline()) // print to console for debugging
                .addPipeline(pipeLine);

        addSeedUrls(spider, prefixAnimalList); // animal
        addSeedUrls(spider, prefixSexList);    // sex

        spider.run();
    }

    /** Adds one first-page seed URL per blog prefix, tagging it with its category. */
    private void addSeedUrls(Spider spider, String prefixList) {
        for (String prefix : prefixList.split(CacheKey.Split)) {
            spider.addUrl("https://" + prefix + bashUrl + "1&fffff=0" + "&ggggg=" + this.getCategory(prefix));
        }
    }
}
package win.raychow.modules.spider.base.processor;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import win.raychow.modules.spider.base.dao.SpiderTumblr;
import win.raychow.modules.spider.base.dao.SpiderTumblrService;

import java.util.List;

/**
 * Webmagic pipeline that persists the {@link SpiderTumblr} batches produced
 * by {@link TumblrProcessor}. Result type 0 carries a list page; type 1 is
 * reserved (currently unhandled).
 *
 * Created by ray on 2017/11/19.
 */
@Service
public class TumblrPipeLine implements Pipeline {

    private static final Logger logger = LoggerFactory.getLogger(TumblrPipeLine.class);

    @Autowired
    SpiderTumblrService service;

    @Override
    public void process(ResultItems resultItems, Task task) {
        // Pages that produced no fields (e.g. non-list URLs) are skipped.
        if (resultItems.getAll().isEmpty()) {
            return;
        }
        int type = resultItems.get("type");
        if (type == 0) {
            // List-page content: derive blogName from the post URL host
            // ("https://<blog>.tumblr.com/..." -> "<blog>") and upsert each row.
            List<SpiderTumblr> list = resultItems.get("data");
            for (SpiderTumblr tumblr : list) {
                try {
                    String blogName = tumblr.getBaseUrl()
                            .replace("https://", "")
                            .replace("http://", "")
                            .split("\\.")[0];
                    tumblr.setBlogName(blogName);
                } catch (Exception e) {
                    // BUGFIX: was an empty catch — log the bad URL instead of
                    // silently dropping the failure; the row is still persisted.
                    logger.warn("could not derive blog name from url: " + tumblr.getBaseUrl(), e);
                }
                service.updateBySpider(tumblr);
            }
        } else if (type == 1) {
            // Reserved for detail-page results; intentionally empty.
        }
    }
}
原文:http://raychow.linkfun.top/2017/12/15/archives/9_javaSpring/spriderTumblr/index/api