設計思路
關鍵代碼
public void afterJFinalStart() {
// TODO Auto-generated method stub
super.afterJFinalStart();
QuartzManagerKit qm = new QuartzManagerKit();
qm.initJob();
}node
public void beforeJFinalStop() {web
super.beforeJFinalStop();
//關閉定時任務管理器
QuartzManagerKit qm = new QuartzManagerKit();
qm.shutdown();express
}apache
public class DynamicJob3 implements Job{
private static final ResourceService srv=ResourceService.me;
private static final ArticleService srv_article=ArticleService.me;
public void execute(JobExecutionContext context) throws JobExecutionException {
SpiderResource sr=srv.getone();
if(sr!=null&&sr.getStatus()==0){
srv.updatestatus(sr.getId(), 1);
webCrawler(sr);
srv.updatestatus(sr.getId(), 2);
}
}瀏覽器
後邊:QuartzManagerKit.java 代碼
具體代碼
/**
*
*/
package cn.jdou.spider;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.UUID;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.quartz.Job;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;
import com.jfinal.kit.JsonKit;
import com.jfinal.kit.PropKit;
import cn.jdou.common.model.SpiderArticle;
import cn.jdou.common.model.SpiderResource;
import cn.jdou.spider.article.ArticleService;
import cn.jdou.spider.resource.ResourceService;
/***類描述:
*@author: raifei
*@date: 日期:2018年8月20日 時間:下午5:14:26
*@version 1.0
*/
public class DynamicJob3 implements Job{
private static final ResourceService srv=ResourceService.me;
private static final ArticleService srv_article=ArticleService.me;
public void execute(JobExecutionContext context) throws JobExecutionException {
SpiderResource sr=srv.getone();
if(sr!=null&&sr.getStatus()==0){
srv.updatestatus(sr.getId(), 1);
webCrawler(sr);
srv.updatestatus(sr.getId(), 2);
}
}
public static void webCrawler(SpiderResource sr) {
try {
//獲取全部連接
for(int i=sr.getPagestart();i<=sr.getPageend();i=i+sr.getStepnum()){
System.out.print(i);
String url_=sr.getUrl().replace("{page}", String.valueOf(i));
webCrawler(sr,url_);
}
} catch (Exception e) {
e.printStackTrace();
}
}
public static void webCrawler(SpiderResource sr,String url) throws IOException {
Document document = Jsoup.connect(url)
//須要加上userAgent才能假裝成瀏覽器而不會被網站屏蔽IP
//(這種作法可能也會被某些網站拉黑IP一段時間,因爲不太穩定究竟是不是代碼的問題,還在測試中...)
.userAgent("User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11")
//加上cookie信息
.cookie("auth", "token")
//設置超時
.timeout(30000)
//用get()方式請求網址,也能夠post()方式
.get();
//此處能夠文檔處理
// document = Jsoup.parse(document.toString());
//獲取列表
Elements elements = document.select(sr.getRegions());
//獲取列表循環元素
for (Element tr : elements.select(sr.getList())) {
//循環元素二次篩選,篩選到 a標籤
Elements tds = tr.select(sr.getAhref());
//獲取a標籤,跳轉抓取詳情
String href = tds.attr("abs:href");
//提早捕獲標題嘛
if(sr.getIscatchtittle()==1){
String title=tr.select(sr.getTitle()).text();
}
//
//System.out.print(href);
List<String> imglist=new ArrayList<String>();
//查找縮略圖,查找圖片
if(sr.getIscatchimg()==1){
Elements imgs=tr.select("img");
for (Element element : imgs) {
//獲取每一個img標籤URL "abs:"表示絕對路徑
String imgSrc = element.attr("abs:src");
//String affix_name=imgSrc;
String fileExt = imgSrc.substring(imgSrc.lastIndexOf(".") + 1).toLowerCase();
String imgpath=UUID.randomUUID().toString().replaceAll("-", "")+"."+fileExt;
// 打印URL
System.out.println(imgSrc);
imglist.add(imgpath);
//下載圖片到本地
downImages(PropKit.get("fileservice"), imgSrc,imgpath);
}
webCrawler(href,sr,imglist);
System.out.print(imgs);
}
}
}
public static void webCrawler(String uri,SpiderResource sr,List<String> imgList) {
try {
//獲取整個頁面文件
Document document = Jsoup.connect(uri)
//須要加上userAgent才能假裝成瀏覽器而不會被網站屏蔽IP
//(這種作法可能也會被某些網站拉黑IP一段時間,因爲不太穩定究竟是不是代碼的問題,還在測試中...)
.userAgent("User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11")
//加上cookie信息
.cookie("auth", "token")
//設置超時
.timeout(30000)
//用get()方式請求網址,也能夠post()方式
.get();
//此處能夠文檔處理
// document = Jsoup.parse(document.toString());
//獲取文章標題
String source=uri;//文章來源
String title= document.select(sr.getTitle()).text();//文章標題
String author="";//獲取來源
if(!"".equals(sr.getAuthor())){
author=document.select(sr.getAuthor()).text();
}
String details="";
//獲取詳情
Elements elements = document.select(sr.getDetails());
//抓取圖片並替換連接
Elements imgs = elements.select("img[src]");
for (Element img : imgs){
String affix_name=img.attr("abs:src");
String fileExt = affix_name.substring(affix_name.lastIndexOf(".") + 1).toLowerCase();
String imgpath=UUID.randomUUID().toString().replaceAll("-", "")+"."+fileExt;
img.attr("src",PropKit.get("fileservice")+"//"+imgpath);
//下載圖像
downImages(PropKit.get("fileservice"),affix_name,imgpath);
}
details=elements.html();
SpiderArticle sa=new SpiderArticle().setArticlename(title).setCategoryid(sr.getId()).setSource(source).setArticledetails(details).setLitimg(JsonKit.toJson(imgList));
srv_article.save(sa);
//System.out.print(details);
//獲取列表循環元素
} catch (IOException e) {
e.printStackTrace();
}
}
public static void downImages(String filePath, String imgUrl,String newname) {
// 若指定文件夾沒有,則先建立
File dir = new File(filePath);
if (!dir.exists()) {
dir.mkdirs();
}
// 截取圖片文件名
String fileName =imgUrl.substring(imgUrl.lastIndexOf('/') + 1, imgUrl.length());
try {
// 文件名裏面可能有中文或者空格,因此這裏要進行處理。但空格又會被URLEncoder轉義爲加號
String urlTail = URLEncoder.encode(fileName, "UTF-8");
// 所以要將加號轉化爲UTF-8格式的%20
imgUrl = imgUrl.substring(0, imgUrl.lastIndexOf('/') + 1) + urlTail.replaceAll("\\+", "\\%20");
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
// 寫出的路徑
File file = new File(filePath + File.separator + ("".equals(newname)?fileName:newname));
try {
// 獲取圖片URL
URL url = new URL(imgUrl);
// 得到鏈接
URLConnection connection = url.openConnection();
// 設置10秒的相應時間
connection.setConnectTimeout(10 * 1000);
// 得到輸入流
InputStream in = connection.getInputStream();
// 得到輸出流
BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file));
// 構建緩衝區
byte[] buf = new byte[1024];
int size;
// 寫入到文件
while (-1 != (size = in.read(buf))) {
out.write(buf, 0, size);
}
out.close();
in.close();
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}
QuartzManagerKit.java
package cn.jdou.common.Kit;
//package top.rushpeak.edu03.admin.util;
import static org.quartz.CronScheduleBuilder.cronSchedule;
import static org.quartz.JobBuilder.newJob;
import static org.quartz.TriggerBuilder.newTrigger;
import java.util.List;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.quartz.Job;
import org.quartz.JobDetail;
import org.quartz.JobKey;
import org.quartz.Scheduler;
import org.quartz.SchedulerException;
import org.quartz.Trigger;
import org.quartz.TriggerKey;
import org.quartz.impl.StdSchedulerFactory;
import com.jfinal.plugin.activerecord.Db;
import com.jfinal.plugin.activerecord.Record;
//import top.rushpeak.edu03.admin.job.DynamicJob;//這個是後面的任務實現類
/**
 * Small facade over a Quartz {@link Scheduler}: loads job definitions from the
 * {@code job_manager} table and offers add / remove / pause / resume / reschedule
 * operations keyed by job name and group.
 *
 * <p>No public method throws on scheduler problems; failures are written to the log.
 * The trigger of a job always uses the same name and group as the job itself.
 */
public class QuartzManagerKit {

    private static final Logger log = LogManager.getLogger(QuartzManagerKit.class);

    /** Stays {@code null} when the factory could not provide a scheduler. */
    private Scheduler scheduler = null;

    /** Obtains the scheduler from a {@link StdSchedulerFactory}; a failure is only logged. */
    public QuartzManagerKit() {
        try {
            scheduler = new StdSchedulerFactory().getScheduler();
            log.info("初始化調度器 ");
        } catch (SchedulerException ex) {
            log.error("初始化調度器=> [失敗]:" + ex.getLocalizedMessage(), ex);
        }
    }

    /**
     * Loads every enabled row of {@code job_manager}, registers it as a cron job and
     * then starts the scheduler. Columns read: {@code clazz}, {@code name},
     * {@code group}, {@code cron_expression}.
     *
     * <p>A row whose class cannot be loaded or is not a {@link Job} is logged and
     * skipped; the other rows are still scheduled.
     */
    public void initJob() {
        // The query already restricts the result to is_enabled = 'Y'.
        List<Record> jobs = Db.find("SELECT * FROM job_manager WHERE 1=1 AND is_enabled = 'Y'");
        for (Record job : jobs) {
            String className = job.getStr("clazz");
            Class<? extends Job> jobClazz;
            try {
                jobClazz = Class.forName(className).asSubclass(Job.class);
            } catch (Exception e) {
                System.out.println(className + "沒有繼承job,e==" + e);
                log.error(className + "沒有繼承job,e==" + e, e);
                continue;
            }
            String name = job.getStr("name");
            String group = job.getStr("group");
            String cronExpression = job.getStr("cron_expression");
            this.addJob(name, group, jobClazz, cronExpression);
        }
        this.start();
    }

    /**
     * Schedules a job with a cron trigger.
     *
     * @param name           job and trigger name
     * @param group          job and trigger group
     * @param clazz          job implementation
     * @param cronExpression Quartz cron expression
     */
    public void addJob(String name, String group, Class<? extends Job> clazz, String cronExpression) {
        String label = describe(name, group);
        try {
            JobDetail job = newJob(clazz).withIdentity(name, group).build();
            Trigger trg = newTrigger().withIdentity(name, group).withSchedule(cronSchedule(cronExpression)).build();
            requireScheduler().scheduleJob(job, trg);
            log.info("建立做業=> " + label + " ");
            System.out.println("建立做業=> " + label + " ");
        } catch (SchedulerException e) {
            log.error("建立做業=> " + label + "=> [失敗]", e);
        } catch (RuntimeException e) {
            // cronSchedule() and withIdentity() reject bad input with unchecked
            // exceptions; one bad row must not stop the remaining jobs.
            log.error("建立做業=> " + label + "=> [失敗]", e);
        }
    }

    /**
     * Pauses and removes the trigger, then deletes the job.
     *
     * @param name  job and trigger name
     * @param group job and trigger group
     */
    public void removeJob(String name, String group) {
        String label = describe(name, group);
        try {
            Scheduler s = requireScheduler();
            TriggerKey tk = TriggerKey.triggerKey(name, group);
            s.pauseTrigger(tk);   // stop the trigger
            s.unscheduleJob(tk);  // remove the trigger
            s.deleteJob(JobKey.jobKey(name, group)); // delete the job
            log.info("刪除做業=> " + label + " ");
            System.out.println("刪除做業=> " + label + " ");
        } catch (SchedulerException e) {
            log.error("刪除做業=> " + label + "=> [失敗]", e);
        }
    }

    /**
     * Pauses the job.
     *
     * @param name  job name
     * @param group job group
     */
    public void pauseJob(String name, String group) {
        String label = describe(name, group);
        try {
            requireScheduler().pauseJob(JobKey.jobKey(name, group));
            log.info("暫停做業=> " + label + " ");
        } catch (SchedulerException e) {
            log.error("暫停做業=> " + label + "=> [失敗]", e);
        }
    }

    /**
     * Resumes a paused job.
     *
     * @param name  job name
     * @param group job group
     */
    public void resumeJob(String name, String group) {
        String label = describe(name, group);
        try {
            requireScheduler().resumeJob(JobKey.jobKey(name, group));
            log.info("恢復做業=> " + label + " ");
        } catch (SchedulerException e) {
            log.error("恢復做業=> " + label + "=> [失敗]", e);
        }
    }

    /**
     * Replaces the job's trigger with a new cron trigger of the same name and group.
     *
     * @param name           job and trigger name
     * @param group          job and trigger group
     * @param cronExpression new Quartz cron expression
     */
    public void modifyTime(String name, String group, String cronExpression) {
        String label = describe(name, group);
        try {
            TriggerKey tk = TriggerKey.triggerKey(name, group);
            Trigger trg = newTrigger()
                    .withIdentity(name, group)
                    .withSchedule(cronSchedule(cronExpression))
                    .build();
            // rescheduleJob returns null when no trigger with that key exists.
            if (requireScheduler().rescheduleJob(tk, trg) == null) {
                log.error("修改做業觸發時間=> " + label + "=> [失敗]: 觸發器不存在");
                return;
            }
            log.info("修改做業觸發時間=> " + label + " ");
        } catch (SchedulerException e) {
            log.error("修改做業觸發時間=> " + label + "=> [失敗]", e);
        } catch (RuntimeException e) {
            // An invalid cron expression surfaces as an unchecked exception.
            log.error("修改做業觸發時間=> " + label + "=> [失敗]", e);
        }
    }

    /** Starts the scheduler. */
    public void start() {
        try {
            requireScheduler().start();
            log.info("啓動調度器 ");
            System.out.println("啓動調度器 ");
        } catch (SchedulerException e) {
            log.error("啓動調度器=> [失敗]", e);
        }
    }

    /** Shuts the scheduler down. */
    public void shutdown() {
        try {
            requireScheduler().shutdown();
            log.info("中止調度器 ");
            System.out.println("中止調度器 ");
        } catch (SchedulerException e) {
            log.error("中止調度器=> [失敗]", e);
        }
    }

    /** Returns the scheduler, or fails with a SchedulerException when construction did not obtain one. */
    private Scheduler requireScheduler() throws SchedulerException {
        if (scheduler == null) {
            throw new SchedulerException("scheduler was not initialised");
        }
        return scheduler;
    }

    /** Shared "[name group]" fragment used in every log message. */
    private static String describe(String name, String group) {
        return "[做業名稱:" + name + " 做業組:" + group + "]";
    }
}