常規網頁的展現部分通常分爲列表頁和詳情頁。bilibili 站是請求後臺 API 返回 JSON,而後使用 JS 生成網頁,所以主要的解析方法爲 JSON 解析處理。
B 站後端返回的 JSON 格式相對 N 站更結構化、更清晰,可是要注意:字段能用字符串類型的就儘可能用字符串類型,兼容性更好。
本人的 Spring Boot 項目採用多線程定時器,分別定時跑爬蟲數據和下載爬蟲資源,之後會在這裏慢慢列出解決方法。
B 站的評論和列表能夠用 API 請求得出,可是詳情頁的真實地址都通過加密,不能直接獲取,能夠藉助第三方處理獲取真實數據(http://flvurl.cn/ )。另外,B 站的圖片和視頻均有簡單的防盜鏈:圖片須要修改或刪除 Referer(這裏還有 H5 的小技巧,之後慢慢說);視頻須要帶上 Referer: http://www.bilibili.com/
SpiderBiliService 負責一些數據的入庫處理,SslDownloader 負責 WebMagic 的 HTTPS 請求處理。
<!-- Maven dependency on webmagic-core 0.5.2 (the 0.7.2 version line is commented out). The slf4j-log4j12 artifact is excluded; presumably this avoids a clash with the application's own SLF4J binding - TODO confirm. -->
<dependency> <groupId>us.codecraft</groupId> <artifactId>webmagic-core</artifactId> <!--<version>0.7.2</version>--> <version>0.5.2</version> <exclusions> <exclusion> <groupId>org.slf4j</groupId> <artifactId>slf4j-log4j12</artifactId> </exclusion> </exclusions> </dependency>
package win.raychow.modules.spider.base.processor;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import win.raychow.demo.spider.tool.SslDownloader;
import win.raychow.modules.spider.base.dao.SpiderBili;
import win.raychow.modules.spider.base.domain.BilibiliReplyModel;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * Created by ray on 2017/7/2.
 *
 * <p>Crawler processor for the Bilibili JSON APIs. It handles two kinds of pages:
 * <ul>
 *   <li>list pages (URL contains {@code bashUrl}): every entry under {@code data.archives}
 *       is converted to a {@link SpiderBili}; the list is published as field {@code "data"}
 *       with {@code "type" = 0}, and one reply request per entry is queued;</li>
 *   <li>reply pages (URL contains {@code replyUrl}): the raw JSON text is published as field
 *       {@code "data"} with {@code "type" = 1} and the video id as field {@code "aid"}.</li>
 * </ul>
 * Detail pages (URL contains {@code detailUrl}) are recognised but not processed yet.
 */
@Service
public class BilibiliProcessor implements PageProcessor {

    private static final Logger logger = LoggerFactory.getLogger(BilibiliProcessor.class);

    // List API; the page number is appended, e.g.
    // https://api.bilibili.com/archive_rank/getarchiverankbypartion?type=jsonp&tid=20&pn=1
    private final String bashUrl = "https://api.bilibili.com/archive_rank/getarchiverankbypartion?type=jsonp&tid=20&pn="; // + i

    // Detail page; the video id is appended, e.g. https://www.bilibili.com/video/av11778873/
    private final String detailUrl = "https://www.bilibili.com/video/av";

    // Reply API; the video id is appended, e.g.
    // https://api.bilibili.com/x/v2/reply?jsonp=jsonp&pn=1&type=1&oid=11253064
    private final String replyUrl = "https://api.bilibili.com/x/v2/reply?jsonp=jsonp&pn=1&type=1&oid=";

    // Exclusive upper bound of the list page numbers queued by run().
    @Value("${spider.bilibili.maxSize}")
    int maxSize;

    @Autowired
    BilibiliPipeLine pipeLine;

    // Crawl settings: 10 s sleep between requests, 20 s timeout, 3 retries, 3 cycle retries.
    private final Site site = Site.me()
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36")
            .setSleepTime(10 * 1000)
            .setTimeOut(20 * 1000)
            .setRetryTimes(3)
            .setCycleRetryTimes(3);

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Dispatches a downloaded page to the matching handler based on its URL.
     * Any failure is logged together with its cause and the page is skipped, so a single
     * bad response does not abort the crawl.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String pageUrl = page.getUrl().toString();
        logger.info(pageUrl);
        try {
            if (pageUrl.contains(bashUrl)) {
                // list page
                processListPage(page, pageUrl);
            } else if (pageUrl.contains(replyUrl)) {
                // reply (comment) page
                processReplyPage(page, pageUrl);
            } else if (pageUrl.contains(detailUrl)) {
                // detail page: not handled yet
            }
        } catch (Exception e) {
            // Pass the exception along so the stack trace is not lost.
            logger.error("url:" + pageUrl, e);
        }
    }

    /** Extracts the archive entries of a list page and queues one reply request per entry. */
    private void processListPage(Page page, String pageUrl) {
        String text = page.getJson().toString();
        JSONObject root = JSON.parseObject(text);
        JSONObject data = root == null ? null : root.getJSONObject("data");
        JSONObject archives = data == null ? null : data.getJSONObject("archives");
        if (archives == null) {
            // Explicit check instead of relying on a NullPointerException.
            logger.warn("no data.archives in list response, url:{}", pageUrl);
            return;
        }

        List<SpiderBili> list = new ArrayList<>();
        List<String> reqList = new ArrayList<>();
        for (Map.Entry<String, Object> entry : archives.entrySet()) {
            Object value = entry.getValue();
            if (value == null) {
                // Skip an empty entry rather than losing the whole page.
                continue;
            }
            SpiderBili item = JSON.parseObject(value.toString(), SpiderBili.class);
            list.add(item);
            reqList.add(replyUrl + item.getAid());
        }

        if (!list.isEmpty()) {
            page.putField("type", 0);
            page.putField("data", list);
            page.addTargetRequests(reqList);
        }
    }

    /** Publishes the raw reply JSON of a reply page together with the video id taken from the URL. */
    private void processReplyPage(Page page, String pageUrl) {
        String text = page.getJson().toString();
        if (text.length() > 10) {
            // The parsed model is not used; the call is kept so that a payload which cannot be
            // mapped to BilibiliReplyModel still throws and the page is skipped, as before.
            JSON.parseObject(text, BilibiliReplyModel.class);
            String aid = pageUrl.replace(replyUrl, "");
            page.putField("type", 1);
            page.putField("data", text);
            page.putField("aid", aid);
        }
    }

    /**
     * Builds a spider with the SSL-capable downloader and the injected pipeline, seeds it with
     * the list pages {@code 1 .. maxSize - 1} (plus page 32, which is always queued) and runs it
     * on the calling thread. Failures are logged instead of being silently discarded.
     */
    public void run() {
        Spider spider = Spider.create(new BilibiliProcessor())
                .setDownloader(new SslDownloader())
                .addPipeline(pipeLine);
                //.addPipeline(new ConsolePipeline()); // print to console

        // NOTE(review): page 32 is always seeded explicitly; the reason is not visible here.
        spider.addUrl("https://api.bilibili.com/archive_rank/getarchiverankbypartion?type=jsonp&tid=20&pn=32");
        for (int i = 1; i < maxSize; i++) { // 4000
            spider.addUrl(bashUrl + i);
        }

        try {
            spider.run();
        } catch (Exception e) {
            logger.error("bilibili spider run failed", e);
        }
    }
}
package win.raychow.modules.spider.base.processor;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import win.raychow.modules.spider.base.dao.SpiderBili;
import win.raychow.modules.spider.base.dao.SpiderBiliService;

import java.util.List;

/**
 * Created by ray on 2017/6/18.
 *
 * <p>Crawler pipeline: persists the fields of a crawled page through {@link SpiderBiliService}.
 * The {@code "type"} field selects the handling: {@code 0} means {@code "data"} holds a list of
 * {@link SpiderBili} entries, {@code 1} means {@code "data"} holds reply JSON text for the
 * video whose id is in {@code "aid"}.
 */
@Service
public class BilibiliPipeLine implements Pipeline {

    @Autowired
    SpiderBiliService service;

    /**
     * Stores the extracted fields of one page.
     *
     * @param resultItems fields extracted from the page; ignored when empty
     * @param task        the crawl task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        // Nothing was extracted for this page.
        if (resultItems.getAll().isEmpty()) {
            return;
        }

        int type = resultItems.get("type");
        switch (type) {
            case 0: {
                // list content: hand every entry to the service
                List<SpiderBili> entries = resultItems.get("data");
                for (SpiderBili entry : entries) {
                    service.updateBySpider(entry);
                }
                break;
            }
            case 1: {
                // reply content: re-serialize the JSON, then attach it to the stored record
                int aid = Integer.parseInt(resultItems.get("aid"));
                String rawReply = resultItems.get("data");
                JSONObject parsedReply = JSON.parseObject(rawReply);
                String replyJson = JSON.toJSONString(parsedReply);

                List<SpiderBili> matches = service.findByAid(aid);
                // only update when a record for this video already exists
                if (!matches.isEmpty()) {
                    SpiderBili first = matches.get(0);
                    first.setReplyJson(replyJson);
                    service.update(first);
                }
                break;
            }
            default:
                break;
        }
    }
}
原文:http://raychow.linkfun.top/2017/12/08/archives/9_javaSpring/spriderBiliBili/index/