網站:www.cnblogs.com/
分析:
需要的數據:標題、摘要、原文地址、發佈時間
存儲數據庫
標題、摘要、原文地址、發佈時間
文章表:id 主鍵、title 標題、summary 摘要、detailurl 詳細地址、pubtime 發佈時間、ctime 建立時間
SQL腳本:
-- Schema for the crawled cnblogs articles.
create database db_data1906;
use db_data1906;
-- pubtime and ctime are DATETIME rather than DATE so that the hour and minute
-- parsed by the crawler (and the now() value written on insert) are not truncated.
create table t_bkyarticle(
    id int primary key auto_increment,
    title varchar(100),
    summary text,
    detailurl varchar(200),
    pubtime datetime,
    ctime datetime
);
技術棧:SpringBoot
一、新建項目
SpringBoot
二、依賴jar
三、逐層編寫代碼
實體層
@TableName("t_bkyarticle")
@Data
public class BkyArticle {
@TableId(type = IdType.AUTO)
private Integer id;
private String title;
private String summary;
private String detailurl;
private Date pubtime;
private Date ctime;
}複製代碼
持久層
public interface BkyArticleDao extends BaseMapper<BkyArticle> {
@Insert("insert into t_bkyarticle(title,summary,detailurl,pubtime,ctime) values(#{title},#{summary},#{detailurl},#{pubtime},now())")
int save(BkyArticle article);
}複製代碼
業務邏輯層
public interface BkyArticleService extends IService<BkyArticle> {
boolean saveEntity(BkyArticle article);
}複製代碼
@Service
public class BkyArticleServiceImpl extends ServiceImpl<BkyArticleDao, BkyArticle> implements BkyArticleService {
@Override
public boolean saveEntity(BkyArticle article) {
return getBaseMapper().save(article)>0;
}
}複製代碼
四、編寫爬蟲核心代碼
自定義頁面處理器
@Service
public class BkyArticlePage implements PageProcessor {
private String baseUrl="https://www.cnblogs.com/";
@Override
public void process(Page page) {
//一、解析當前頁面的內容
List<String> titles=page.getHtml().xpath("div[@id='post_list']/div[@class='post_item']/div[@class='post_item_body']/h3/a/text()").all();
List<String> urls=page.getHtml().xpath("div[@id='post_list']/div[@class='post_item']/div[@class='post_item_body']/h3/a/@href").all();
List<String> infos=page.getHtml().xpath("div[@id='post_list']/div[@class='post_item']/div[@class='post_item_body']/p[@class='post_item_summary']/text()").all();
List<String> times=page.getHtml().xpath("div[@id='post_list']/div[@class='post_item']/div[@class='post_item_body']/div[@class='post_item_foot']/a/text()").all();
//二、組裝解析的結果
List<BkyArticle> articles=new ArrayList<>();
for(int i=0;i<titles.size();i++){
BkyArticle article=new BkyArticle();
article.setTitle(titles.get(i));
article.setSummary(infos.get(i));
article.setDetailurl(urls.get(i));
article.setPubtime(parseTime(getTimeStr(times.get(i))));
articles.add(article);
}
//三、傳遞給告終果處理器
page.putField("list",articles);
//四、分頁查詢 獲取分頁的路徑並標記繼續爬取
if(page.getUrl().get().equals(baseUrl)){
//計算全部的分頁請路徑
List<String> pageurls=new ArrayList<>();
List<String>allpages=page.getHtml().xpath("div[@id='paging_block']/div[@class='pager']/a/text()").all();
int maxPage=Integer.parseInt(allpages.get(allpages.size()-2));
for(int i=2;i<=maxPage;i++){
pageurls.add(baseUrl+"/sitehome/p/"+i);
}
//設置繼續爬取的網頁
page.addTargetRequests(pageurls);
}
}
private String getTimeStr(String s){
String s1=s.trim();
if(s1.indexOf(" ")>0){
return s.substring(s.indexOf(' ')+1);
}else {
return null;
}
}
private Date parseTime(String time){
if(time!=null) {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm");
try {
return sdf.parse(time);
} catch (ParseException e) {
e.printStackTrace();
return new Date();
}
}else {
return new Date();
}
}
private Site site=Site.me().setTimeOut(6000).setSleepTime(2000);
@Override
public Site getSite() {
return site;
}
}複製代碼
結果處理器
@Repository
public class BkyArticPipeline implements Pipeline {
@Autowired
private BkyArticleDao bkyArticleDao;
@Override
public void process(ResultItems resultItems, Task task) {
List<BkyArticle> articleList=resultItems.get("list");
System.out.println("爬取數據:"+articleList.size());
for(BkyArticle a:articleList){
bkyArticleDao.save(a);
}
}
}複製代碼
五、編寫啓動接口
控制器 實現爬取的運行
@Api
@RestController
public class BkyArticController {
@Autowired
private BkyArticleService bkyArticleService;
@Autowired
private BkyArticlePage page;
@Autowired
private BkyArticPipeline pipeline;
//啓動爬蟲
@GetMapping("/api/spider/start.do")
public R start(){
Spider.create(page).addPipeline(pipeline).addUrl("https://www.cnblogs.com/").thread(5).run();
return R.ok("爬取已經啓動");
}
//查詢爬取數據
@GetMapping("api/bkyartic/all.do")
public R all(){
return R.ok(bkyArticleService.list());
}
}複製代碼
六、配置Swagger
@Configuration //配置文件
public class SwaggerConfig {
//建立文檔說明
public ApiInfo createAI(){
ApiInfo apiInfo=new ApiInfoBuilder().title("文章接口").description("實現一款基於爬蟲實現的數據接口").contact(new Contact("Feri","http://www.17feri.top","xingfei_work@163.com")).build();
return apiInfo;
}
//建立Swagger掃描信息
@Bean
public Docket createD(){
return new Docket(DocumentationType.SWAGGER_2).apiInfo(createAI()).select().
apis(RequestHandlerSelectors.basePackage("com.feri.point.controller")).build();
}
}複製代碼
七、啓動測試