點擊上方 java項目開發,選擇 設爲星標
優質文章,及時送達
案例功能效果圖
爬取數據的平臺頁面
這個案例能爬取的平臺太多了,我沒有全部截圖出來,想看的大家自己下載源碼自己跑起來!
爬取的熱榜數據效果圖
環境介紹
前端:vue+h5
後端:springboot+webMagic
jdk:1.8及以上
數據庫:mysql
完整源碼獲取方式
源碼獲取方式
掃碼關注回覆【psj】獲取完整源碼
如果你在運行這個代碼的過程中遇到問題,請加小編微信xxf960513,我拉你進對應的微信學習羣!!幫助你快速掌握這個功能代碼!
核心代碼介紹
pom.xml
<!-- WebMagic crawler core; its slf4j-log4j12 binding is excluded.
     https://mvnrepository.com/artifact/us.codecraft/webmagic-core -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
    <exclusions>
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
    </exclusions>
</dependency>

<!-- WebMagic extension module.
     https://mvnrepository.com/artifact/us.codecraft/webmagic-extension -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>

<!-- https://mvnrepository.com/artifact/com.google.guava/guava -->
<dependency>
    <groupId>com.google.guava</groupId>
    <artifactId>guava</artifactId>
    <version>18.0</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.4</version>
</dependency>

<!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.4</version>
</dependency>

<!-- Lombok, a boilerplate-reduction tool.
     https://mvnrepository.com/artifact/org.projectlombok/lombok -->
<dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
    <version>1.18.8</version>
    <scope>provided</scope>
</dependency>

<!-- https://mvnrepository.com/artifact/junit/junit -->
<dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.12</version>
    <scope>test</scope>
</dependency>

<!-- swagger2: API model plus the bundled UI -->
<dependency>
    <groupId>io.springfox</groupId>
    <artifactId>springfox-swagger2</artifactId>
    <version>2.9.1</version>
</dependency>
<dependency>
    <groupId>io.springfox</groupId>
    <artifactId>springfox-swagger-ui</artifactId>
    <version>2.9.1</version>
</dependency>
application.yml
# Application configuration for the crawler backend.
#
# NOTE(review): the published listing collapsed this file onto one line and dropped
# four keys (each survives only as "<value> :"). The structure below is a
# reconstruction; every key marked "reconstructed" must be checked against the real
# project file before use.
server:
  port: 9004

spring:
  jackson:
    serialization:
      # reconstructed key — only the value `true` survived; presumably indent-output.
      indent-output: true
  datasource:
    driverClassName: com.mysql.cj.jdbc.Driver
    url: jdbc:mysql://feimeidehuoji:3306/feimeidehuoji?autoReconnect=true&useUnicode=true&characterEncoding=UTF-8&useSSL=false&useLegacyDatetimeCode=false&serverTimezone=UTC
    username: feimeidehuoji
    password: feimeidehuoji
  jpa:
    database: MySQL
    # reconstructed key — only the value `true` survived; presumably show-sql.
    show-sql: true
    hibernate:
      # reconstructed key — only the value `update` survived; presumably ddl-auto.
      ddl-auto: update
    # reconstructed key — only the dialect class name survived; presumably database-platform.
    database-platform: org.hibernate.dialect.MySQL5InnoDBDialect

# Crawler settings: the start URL and the HTTP proxy the downloader goes through.
spiderUrl: https://tophub.today
proxyUrl: 61.160.210.234
proxyPort: 808
NodeController.java
package cn.cesi.webMagic.webMagic;import cn.cesi.webMagic.pieline.SpringPieline;import cn.cesi.webMagic.pojo.Node;import cn.cesi.webMagic.service.NodeService;import cn.cesi.webMagic.util.Result;import cn.cesi.webMagic.util.StatusCode;import io.swagger.annotations.Api;import io.swagger.annotations.ApiOperation;import io.swagger.annotations.ApiParam;import org.springframework.beans.factory.annotation.Autowired;import org.springframework.beans.factory.annotation.Value;import org.springframework.data.domain.Page;import org.springframework.scheduling.annotation.Scheduled;import org.springframework.web.bind.annotation.CrossOrigin;import org.springframework.web.bind.annotation.RequestMapping;import org.springframework.web.bind.annotation.RequestParam;import org.springframework.web.bind.annotation.RestController;import us.codecraft.webmagic.Spider;import us.codecraft.webmagic.downloader.HttpClientDownloader;import us.codecraft.webmagic.proxy.Proxy;import us.codecraft.webmagic.proxy.SimpleProxyProvider;import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;import us.codecraft.webmagic.scheduler.QueueScheduler;import javax.annotation.Resource;import java.util.List;import java.util.Map;
public class NodeController {
private String url;
private String proxyUrl;
private Integer proxyPort;
NodeService nodeService;
SpringPieline springPieline;
public Result getData( String typeName , String secondTitle , Integer page , Integer size){ Page<Node> nodes = nodeService.searchData(typeName, secondTitle,page, size); Result result = new Result(); result.setFlag(true); result.setCode(StatusCode.OK); result.setMsg("查詢成功!"); result.setData(nodes); return result; }
public Result getData(){ List<Map<String,String>> list = nodeService.findType(); Result result = new Result(); result.setFlag(true); result.setCode(StatusCode.OK); result.setMsg("查詢成功!"); result.setData(list); return result; } //1000*60*8 任務執行完成後10分鐘繼續執行 public void tasks(){ System.out.println("定時任務開始——————————————————————————————————"); //設置代理服務器 HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); httpClientDownloader.setProxyProvider(SimpleProxyProvider.from(new Proxy(proxyUrl,proxyPort))); Spider.create(new WebProcess()) .addUrl(url) .setDownloader(httpClientDownloader) .thread(2) //線程(程序爬取速度) .addPipeline(springPieline) //指定pieline接口 .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(10000*10))) .run();
System.out.println("定時任務結束——————————————————————————————————"); }}
WebProcess.java
package cn.cesi.webMagic.webMagic;import cn.cesi.webMagic.pieline.SpringPieline;import cn.cesi.webMagic.util.NodeEntity;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.springframework.beans.factory.annotation.Autowired;import org.springframework.beans.factory.annotation.Value;import org.springframework.scheduling.annotation.Scheduled;import org.springframework.stereotype.Component;import us.codecraft.webmagic.Page;import us.codecraft.webmagic.Site;import us.codecraft.webmagic.Spider;import us.codecraft.webmagic.downloader.HttpClientDownloader;import us.codecraft.webmagic.processor.PageProcessor;import us.codecraft.webmagic.proxy.Proxy;import us.codecraft.webmagic.proxy.SimpleProxyProvider;import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;import us.codecraft.webmagic.scheduler.QueueScheduler;import us.codecraft.webmagic.selector.Selectable;
import org.jsoup.select.Elements;import java.util.*;
@Componentpublic class WebProcess implements PageProcessor {
@Override public void process(Page page) { System.out.println(page.getHtml()); //page頁面對象,getHtml()獲取頁面的html ,css()選擇器 div#Sortable 獲取id爲Sortable的div元素 nodes()轉爲集合 List<Selectable> list = page.getHtml().css("div.bc div#Sortable div.cc-cd div").nodes();
List<NodeEntity> nodes = new ArrayList<>(); for(Selectable selectable : list){ //regex 正則表達式// String name = Jsoup.parse(selectable.css("div.cc-cd-ih div a div span").regex(".*微博.*").all().toString()).text(); //標題 //Jsoup.parse解析html爲dom元素(對象)語法同js語法 text()爲js語法很少解釋 //獲取title大標題 String s = selectable.css("div.cc-cd-ih div a div span").toString(); String title = ""; if(s != null){ title = Jsoup.parse(s).text(); } //獲取logo String logo = selectable.css("div.cc-cd-ih div a div img").toString(); String logoSrc = ""; if(logo != null){ Document document = Jsoup.parse(logo); Elements imgTags = document.select("img[src]"); logoSrc = imgTags.attr("src"); }
//獲取第二層小標題的集合 List<Selectable> list2 = selectable.css("div.cc-cd-cb div a").nodes(); List<Map<String,String>> maps = new ArrayList<>(); for(Selectable selectable2 :list2){ Map<String,String> map = new HashMap<>(); //獲取二級標題的連接 String url = selectable2.links().toString(); //獲取二級標題 String secondTitle = Jsoup.parse(selectable2.css("div span.t").toString()).text(); //獲取文章熱度 String hot = ""; if(selectable2.css("div span.e") != null){ hot = Jsoup.parse(selectable2.css("div span.e").toString()).text(); }
map.put("url",url); map.put("secondTitle",secondTitle); map.put("hot",hot); maps.add(map);
//將鏈接添加入任務中 //page.addTargetRequest(url); } NodeEntity node = new NodeEntity(); node.setTitle(title); node.setLogo(logoSrc); node.setMaps(maps); nodes.add(node); }
//給page對象綁定對象 page.putField("nodes",nodes);
}
private Site site = Site.me() .setSleepTime(2)//抓取間隔時間,能夠解決一些反爬限制 .setRetryTimes(3) //重試次數 .setRetrySleepTime(10000) //重試時間 .setTimeOut(60000) //超時時間 1000*60 1分鐘 .setCharset("utf8"); @Override public Site getSite() { return site; }}
SpringPieline.java
package cn.cesi.webMagic.pieline;import cn.cesi.webMagic.pojo.Node;import cn.cesi.webMagic.service.NodeService;import cn.cesi.webMagic.util.IdWorker;import cn.cesi.webMagic.util.NodeEntity;import org.springframework.beans.factory.annotation.Autowired;import org.springframework.stereotype.Component;import us.codecraft.webmagic.ResultItems;import us.codecraft.webmagic.Task;import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.*;
//存入數據庫@Componentpublic class SpringPieline implements Pipeline { @Autowired NodeService nodeService;
@Autowired IdWorker idWorker;
@Override public void process(ResultItems resultItems, Task task) { List<NodeEntity> nodes = resultItems.get("nodes"); try{ for(NodeEntity entity : nodes){ Node node = new Node(); String title = entity.getTitle(); node.setTitle(title); String logo = entity.getLogo(); node.setLogo(logo); List<Map<String,String>> list = entity.getMaps(); for(Map<String,String> map : list){ node.setId(idWorker.nextId()+""); String secondTitle = map.get("secondTitle").trim(); node.setSecondTitle(secondTitle); node.setUrl(map.get("url")); node.setCreateDate(new Date()); node.setHot(map.get("hot")); System.out.println(secondTitle); if(!secondTitle.equals("") && !title.equals("")){ List<Node> byTitleAndSecondTitle = nodeService.findByTitleAndSecondTitle(title, secondTitle); if(byTitleAndSecondTitle.size() <= 0){ nodeService.save(node); } }
} } }catch (Exception e){ System.out.println(e); }
}}
index.vue
<!--
  Home page: a search box plus either the list of all hot-list categories or the
  results of the last search.
-->
<template>
  <div class="tab__content">
    <h1 class="page__title">摸魚熱榜</h1>
    <van-search
      v-model="value"
      placeholder="請輸入搜索關鍵詞"
      @search="onSearch"
      @clear="onClear"
    />

    <!-- Category list: shown until a search has completed -->
    <div v-if="!searched">
      <div class="tab__tips">
        仿今日熱榜!,關注java項目開發,學習更多案例!
      </div>
      <div class="cells-block">
        <div>
          <div class="cells__title">所有熱榜</div>
          <div class="cells">
            <div
              v-for="(item, index) in typeList"
              :key="index"
              class="cell-row"
            >
              <div class="cell" @click="goDateils(item)">
                <div class="cell__hd">
                  <img
                    :src="item.logo"
                    :alt="item.title"
                    @error="imgError(item)"
                  />
                </div>
                <div class="cell__bd">{{ item.title }}</div>
                <div class="cell__ft">
                  <svg-icon
                    iconClass="index_right"
                    className="icon_search"
                  ></svg-icon>
                </div>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>

    <!--
      Search results. The original wrapped this in v-if="listData.length", which made
      the empty state below unreachable; it is now keyed on whether a search has run.
    -->
    <div v-else>
      <search-list v-if="listData.length" :list="listData" />
      <van-empty v-else description="暫無相關內容!" />
    </div>
  </div>
</template>

<script>
import SvgIcon from '@/components/icon/SvgIcon';
import searchList from '@/components/searchList/list';

export default {
  components: {
    SvgIcon,
    searchList
  },
  data() {
    return {
      // Current search keyword.
      value: '',
      // True once a search has completed; switches the page to the results view.
      searched: false,
      // Search results.
      listData: [],
      // All hot-list categories.
      typeList: [],
      // Fallback image for a category whose logo fails to load.
      defaultUrl: 'https://file.ipadown.com/tophub/assets/images/logo.png'
    };
  },
  computed: {},
  created() {
    this.getAllType();
  },
  mounted() {},
  methods: {
    // Loads every hot-list category.
    getAllType() {
      this.$api.getAllType().then(res => {
        if (res.code === 0) {
          this.typeList = res.data;
        }
      });
    },

    // Opens the detail page of a category; the item travels as a JSON query parameter.
    goDateils(item) {
      this.$router.push({
        name: 'details',
        query: {
          item: JSON.stringify(item)
        }
      });
    },

    // Searches all categories for the given keyword.
    onSearch(e) {
      const params = {
        typeName: '所有',
        size: 10000,
        secondTitle: e
      };
      this.$api.getAllInfoGzip(params).then(res => {
        if (res.code === 0) {
          const rows = res.data.content;
          // Decorate before assigning so the added fields are part of the rendered data.
          this.handleData(rows);
          this.listData = rows;
          this.searched = true;
        }
      });
    },

    // Clearing the search box returns to the category list.
    onClear(e) {
      this.listData = [];
      this.searched = false;
    },

    // Adds the "new" flag and the display time derived from createDate to each row.
    handleData(data) {
      data.forEach(item => {
        const diff = this.util.getDateDiff(item.createDate / 1000);
        item.new = diff.new;
        item.CreateTime = diff.Time;
      });
    },

    // Replaces a logo that failed to load with the default image.
    imgError(item) {
      item.logo = this.defaultUrl;
    }
  }
};
</script>
details.vue
<!--
  Category detail page: a header with the category logo and title, followed by a
  pull-to-refresh, infinitely scrolling list of that category's entries.
-->
<template>
  <div class="topic-list">
    <!-- Header -->
    <div class="info-top">
      <img class="info-bg" :src="details.logo" @error="imgError" alt="" />
      <div class="info-content">
        <div class="top-column">
          <p @click="$router.push('/')">摸魚熱榜</p>
        </div>
        <img class="pic-icon" :src="details.logo" @error="imgError" alt="" />
        <h1 class="info-title">{{ details.title }}</h1>
      </div>
    </div>

    <!-- Entry list -->
    <div class="divider">
      <van-pull-refresh v-model="refreshing" @refresh="onRefresh">
        <van-list
          v-model="loading"
          :finished="finished"
          @load="onLoad"
          :immediate-check="false"
        >
          <div class="panel_bd">
            <a
              v-for="(item, index) in listData"
              :key="item.id"
              :href="item.url"
              class="media-box van-hairline--bottom"
            >
              <div class="media-box__bd">
                <h4 class="media-box__title">
                  {{ index + 1 }}、{{ item.secondTitle }}
                </h4>
                <div class="dec-row">
                  <span class="tag" v-if="item.hot">
                    <span>{{ item.hot }}</span>
                  </span>
                  <span class="time">
                    <span>{{ item.CreateTime }}</span>
                  </span>
                  <span class="new" v-if="item.new">新</span>
                </div>
              </div>
            </a>
          </div>
        </van-list>
      </van-pull-refresh>
    </div>

    <!-- End-of-list marker -->
    <div class="footer-flag flex-center" v-if="finished">
      <p class="flex-center">我是有底線的</p>
    </div>
  </div>
</template>
<script>
// Category detail page: loads the entries of one category page by page.
export default {
  data() {
    return {
      // Current page number.
      page: 1,
      // Pull-to-refresh in progress.
      refreshing: false,
      // Scroll-to-load in progress.
      loading: false,
      // True when there are no more pages.
      finished: false,
      // Entries loaded so far.
      listData: [],
      // The category being shown.
      details: {},
      // Fallback image for a logo that fails to load.
      defaultUrl: 'https://file.ipadown.com/tophub/assets/images/logo.png'
    };
  },
  computed: {},
  created() {},
  mounted() {
    // The category arrives as a JSON string in the `item` query parameter.
    this.details = JSON.parse(this.$route.query.item);
    this.getList(this.details, this.page);
  },
  methods: {
    // Fetches one page of the category and appends it to the list.
    getList(item, page, loading = true) {
      // Captured before the request so the new page is appended to this snapshot.
      const previous = this.listData;
      const params = {
        typeName: item.title,
        size: 50,
        page
      };
      this.$api.getAllInfoGzip(params, loading).then(res => {
        if (res.code == 0) {
          this.listData = previous.concat(res.data.content);
          this.handleData(this.listData);
          // End the scroll-to-load state.
          if (this.loading) {
            this.loading = false;
          }
          // End the pull-to-refresh state.
          if (this.refreshing) {
            this.refreshing = false;
          }
          // No more pages.
          if (this.page >= res.data.totalPages) {
            this.finished = true;
          }
        }
      });
    },

    // Scroll-to-load: request the next page.
    onLoad() {
      this.loading = true;
      this.getList(this.details, ++this.page, false);
    },

    // Pull-to-refresh: reset the list and reload the first page.
    onRefresh() {
      this.finished = false;
      this.loading = true;
      this.listData = [];
      this.page = 1;
      this.getList(this.details, 1, false);
    },

    // Adds the "new" flag and the display time derived from createDate to each row.
    handleData(data) {
      data.forEach(item => {
        const diff = this.util.getDateDiff(item.createDate / 1000);
        item.new = diff.new;
        item.CreateTime = diff.Time;
      });
    },

    // Replaces a logo that failed to load with the default image. The template binds
    // details.logo; the original assigned details.img, so the fallback never showed.
    imgError() {
      this.details.logo = this.defaultUrl;
    }
  }
};
</script>
xxx.sql
-- Schema for the crawler's single table.
-- In the published listing the "--" banner and the statements shared one line, so
-- DROP TABLE and CREATE TABLE were commented out; each statement now has its own line.
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for node
-- ----------------------------
DROP TABLE IF EXISTS `node`;
CREATE TABLE `node` (
  `id` varchar(255) NOT NULL,
  `create_date` datetime DEFAULT NULL,
  `hot` varchar(1024) DEFAULT NULL,
  `second_title` longtext,
  `title` varchar(1024) DEFAULT NULL,
  `url` longtext,
  `logo` varchar(1024) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;

-- Re-enable the foreign-key checks disabled at the top of the script.
SET FOREIGN_KEY_CHECKS = 1;
推薦案例
溫暖提示
請長按識別二維碼
想學習更多的java功能案例請關注
Java項目開發
如果你覺得這個案例以及我們的分享思路不錯,對你有幫助,請分享給身邊更多需要學習的朋友。別忘了《留言+點在看》給作者一個鼓勵哦!
本文分享自微信公衆號 - web項目開發(javawebkaifa)。
若有侵權,請聯繫 support@oschina.cn 刪除。
本文參與「OSC源創計劃」,歡迎正在閱讀的你也加入,一起分享。