第一次在掘金發博客, 感覺爽爽的, 簡書和掘金之間我還是選擇掘金了, 因爲掘金才是開發者的平臺, 簡書大部分還是作者吧!(個人觀點, 賢者勿噴)
本次帶來的是一個用 Java 寫的爬蟲, 用來爬取吾愛破解網(大家都懂的, 不是什麼不正經的網站哈, 不過也是福利)最新更新的資源, 畢竟此網站一直不定時更新牛×哄哄的資源, 這個爬蟲就是專門爬取最新分享的資源的(什麼XX軟件啊, 某馬教程視頻啊....)
(PS:有木有懂前端(喜歡開發UI)的來指導指導我啊!)
這樣只要在微信就可以直接看到最新的資源了(不僅是資源哦, 還有鏈接, 回覆, 鏈接狀態等等)
GetInfo.java
package test;
import com.mtl.pojo.Item;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.ibatis.io.Resources;
import org.apache.ibatis.session.SqlSession;
import org.apache.ibatis.session.SqlSessionFactory;
import org.apache.ibatis.session.SqlSessionFactoryBuilder;
import org.junit.Test;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
/**
 * Crawls the "newest threads" guide pages of 52pojie.cn, parses every page with
 * {@link ParseHtml} and stores the parsed {@link Item}s through the MyBatis mapper.
 */
public class GetInfo {
    /** Title of the first thread handled by the previous run; parsing stops once it is met again. */
    private static String lastTopic;
    /** Title of the first thread parsed in the current run; copied into {@link #lastTopic} at the end. */
    private static String thisTopic;

    /**
     * Fetches guide pages 1 to 8, inserts the items of each page and stops early as soon as
     * the parser reports that the thread remembered in {@code lastTopic} was reached.
     * Any exception aborts the run and is printed; {@code lastTopic} is then left unchanged.
     */
    @Test
    public void getInfo(){
        try {
            OkHttpClient client = new OkHttpClient();
            for (int i = 1; i <= 8; i++) {
                // Query parameters are simply appended to the URL.
                Request request = new Request.Builder()
                        .url("https://www.52pojie.cn/forum.php?mod=guide&view=newthread&page=" + i)
                        .build();
                // try-with-resources: the response is released even when the status is not successful.
                try (Response response = client.newCall(request).execute()) {
                    if (!response.isSuccessful()) {
                        continue;
                    }
                    String html = response.body().string();
                    ParseHtml parseHtml = new ParseHtml();
                    List<Item> items = parseHtml.getCurrentPageItems(html, lastTopic);
                    testInsert(items);
                    // Page 1 may yield no items (nothing new since the last run); only then is
                    // there no first title to remember.
                    if (i == 1 && !items.isEmpty()) {
                        thisTopic = items.get(0).getTitle();
                    }
                    if (parseHtml.isFind()) {
                        break;
                    }
                }
            }
            lastTopic = thisTopic;
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Inserts the given items with the {@code com.mtl.mapper.ItemMapper.insertItems} statement
     * and commits. A {@code null} or empty list is ignored because there is nothing to insert.
     *
     * @param items items to persist
     */
    public void testInsert(List<Item> items){
        if (items == null || items.isEmpty()) {
            return;
        }
        try (InputStream resourceAsStream = Resources.getResourceAsStream("mybatis.xml")) {
            SqlSessionFactory build = new SqlSessionFactoryBuilder().build(resourceAsStream);
            // The session is closed even if the insert or the commit throws.
            try (SqlSession sqlSession = build.openSession()) {
                int insert = sqlSession.insert("com.mtl.mapper.ItemMapper.insertItems", items);
                System.out.println("insert = " + insert);
                sqlSession.commit();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
複製代碼
ParseHtml.java
用來解析 html 字符串的工具類(不過並沒有設置靜態方法, 爲了以後交給 spring 管理, 哈哈)
package test;
import com.mtl.pojo.Item;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Parses the HTML of a 52pojie.cn guide page into {@link Item} objects and, for threads
 * without an authority level, fetches the thread page to extract Baidu pan links.
 */
public class ParseHtml {
    /** Site root that the relative hrefs taken from the listing are appended to. */
    private static final String SITE_ROOT = "https://www.52pojie.cn/";

    /**
     * Baidu pan share link (group 1) followed by its four-character code (group 3).
     * NOTE(review): the literal after the link is Traditional Chinese - confirm it matches
     * the text the site actually serves.
     */
    private static final Pattern PAN_LINK = Pattern.compile("[^\"](https://pan.baidu.com/s/[\\w\\-0-9_]+[a-zA-Z_0-9])((?!https).)+密碼: ?([a-zA-Z0-9]{4})[^a-zA-Z0-9]");

    /** Notice text matched by {@link #testLink()} to detect that a read permission is required. */
    private static final Pattern AUTH_REQUIRED = Pattern.compile("抱歉,本帖要求閱讀權限高於 \\d+ 才能瀏覽");

    /** Shared by every request so that a new client is not built for each thread page. */
    private static final OkHttpClient HTTP_CLIENT = new OkHttpClient();

    /** Set once the title passed as {@code lastTitle} has been reached; the caller's stop condition. */
    private boolean isFind = false;

    /**
     * Collects all items of the given guide page, stopping at the thread titled {@code lastTitle}.
     *
     * @param html      HTML text of the current page
     * @param lastTitle title of the thread at which parsing stops; may be {@code null}
     * @return the parsed items; empty when the expected listing is not present in the page
     * @throws IOException raised by okhttp while fetching a thread page
     */
    public List<Item> getCurrentPageItems(String html, String lastTitle) throws IOException {
        ArrayList<Item> items = new ArrayList<>();
        Document parse = Jsoup.parse(html);
        Element body = parse.body();
        // The listing table is the sibling that follows the marker div.
        Element marker = body.selectFirst("div#forumnew");
        if (marker == null) {
            return items;
        }
        Element table = marker.nextElementSibling();
        if (table == null) {
            return items;
        }
        Elements tbodys = table.select("tbody");
        for (int j = 0; j < tbodys.size(); j++) {
            Element row = tbodys.get(j);
            Element titleLink = row.selectFirst("a.xst");
            if (titleLink == null) {
                // Not a thread row (no title link); nothing to parse here.
                continue;
            }
            String title = titleLink.html();
            if (title.equals(lastTitle)){
                // Reached the thread handled by the previous run: flag it and stop.
                isFind = true;
                break;
            }
            Item item = new Item();
            item.setTitle(title);
            // The original looked up "tbody" inside the row; a jsoup query may match the
            // context element itself, so the cells are selected from the row directly.
            Elements tds = row.select("td");
            for (int i = 0; i < tds.size(); i++) {
                Element td = tds.get(i);
                switch (i){
                    case 0:
                        item.setUrl(SITE_ROOT + td.selectFirst("a").attr("href"));
                        Element span = td.selectFirst("span");
                        if (span != null) {
                            item.setAuthorityLevel(span.html());
                        }
                        break;
                    case 1:
                        item.setPartition(td.selectFirst("a").html());
                        break;
                    case 2:
                        item.setAuther(td.selectFirst("a").html());
                        item.setPublishTime(td.selectFirst("span").html());
                        break;
                    case 3:
                        item.setReplyNum(td.selectFirst("a").html());
                        item.setViewNum(td.selectFirst("em").html());
                        break;
                    case 4:
                        item.setLastReplyName(td.selectFirst("a").html());
                        item.setLastReplyTime(td.selectFirst("em").selectFirst("a").html());
                        item.setLastReplyUrl(SITE_ROOT + td.selectFirst("a").attr("href"));
                        break;
                }
            }
            parseLink(item);
            items.add(item);
        }
        return items;
    }

    /**
     * Fetches the thread page of the item and stores the Baidu pan links and codes found in it.
     * Items that carry an authority level are skipped.
     *
     * @param item item whose thread page is inspected
     * @throws IOException raised by okhttp while fetching the page
     */
    private void parseLink(Item item) throws IOException {
        if (item.getAuthorityLevel() != null) {
            return;
        }
        Request request = new Request.Builder()
                .url(item.getUrl())
                .build();
        // try-with-resources: the response is released on every path, including non-2xx.
        try (Response response = HTTP_CLIENT.newCall(request).execute()) {
            if (!response.isSuccessful()) {
                return;
            }
            String page = response.body().string();
            Matcher matcher = PAN_LINK.matcher(page);
            // Exact-match de-duplication; a substring search would drop a link that is
            // merely a prefix of one already collected.
            List<String> seenLinks = new ArrayList<>();
            StringBuilder links = new StringBuilder();
            StringBuilder pwds = new StringBuilder();
            while (matcher.find()){
                String link = matcher.group(1);
                if (!seenLinks.contains(link)) {
                    seenLinks.add(link);
                    links.append(link).append(";");
                    pwds.append(matcher.group(3)).append(";");
                }
            }
            if (links.length() > 0){
                item.setLinksAndPwdsStr(links.toString() + "#;#" + pwds.toString());
            }
        }
    }

    /**
     * Prints the body returned for a thread that needs a read permission, in preparation
     * for a later automatic login.
     *
     * @throws IOException raised by okhttp while fetching the page
     */
    @Test
    public void testLink() throws IOException {
        Request request = new Request.Builder()
                .url("https://www.52pojie.cn/thread-719615-1-1.html")
                .build();
        try (Response response = HTTP_CLIENT.newCall(request).execute()) {
            if (response.isSuccessful()){
                String string = response.body().string();
                Matcher authLevel = AUTH_REQUIRED.matcher(string);
                System.out.println(string);
                if (authLevel.find()) {
                    System.out.println("須要權限");
                }else {
                    Matcher matcher = PAN_LINK.matcher(string);
                    while (matcher.find()){
                        System.out.println("match = " + matcher.group(1) + "--" + matcher.group(3));
                    }
                }
            }
        }
    }

    public boolean isFind() {
        return isFind;
    }

    public void setFind(boolean find) {
        isFind = find;
    }
}
複製代碼
Item.java
實體類
package com.mtl.pojo;
/**
 * One thread of the forum listing together with the Baidu pan links found in it.
 * The links and their codes are kept both as arrays and as one joined string
 * ({@code link1;link2;#;#code1;code2;}); the arrays are derived from the string in
 * {@link #setLinksAndPwdsStr(String)}.
 */
public class Item {
    /** Separator between the links half and the codes half of the joined string. */
    private static final String HALVES_SEPARATOR = "#;#";
    /** Separator between the entries inside one half. */
    private static final String ENTRY_SEPARATOR = ";";

    private String title; // thread title
    private String url; // thread link
    private String[] links; // Baidu pan links
    private String[] pwds; // codes belonging to the Baidu pan links
    private String linksAndPwdsStr; // links and codes joined into a single string
    private String publishTime; // publish time
    private String authorityLevel; // permission needed to view the thread
    private String partition; // forum section of the thread
    private String auther; // thread author
    private String replyNum; // number of replies
    private String viewNum; // number of views
    private String lastReplyName; // account of the last reply
    private String lastReplyTime; // time of the last reply
    private String lastReplyUrl; // link of the last reply
    private String firstPageReply; // reply contents of the first page
    private boolean isNeedReply; // whether a reply is needed before the download link is shown
    private int searchLinkTimes; // how often links were searched, so a later threshold can stop the search

    public String getLinksAndPwdsStr() {
        return linksAndPwdsStr;
    }

    /**
     * Stores the joined string and refreshes the {@code links} and {@code pwds} arrays from it.
     * A {@code null} or empty string yields two empty arrays. A string without the
     * {@code #;#} separator, or with an empty half, yields an empty array for the missing
     * part instead of failing.
     *
     * @param linksAndPwdsStr joined links and codes, e.g. {@code link1;link2;#;#code1;code2;}
     */
    public void setLinksAndPwdsStr(String linksAndPwdsStr) {
        if (linksAndPwdsStr == null || linksAndPwdsStr.isEmpty()){
            links = new String[]{};
            pwds = new String[]{};
        }else {
            // Limit -1 keeps a trailing empty half, so halves[0] always exists.
            String[] halves = linksAndPwdsStr.split(HALVES_SEPARATOR, -1);
            links = splitEntries(halves[0]);
            pwds = halves.length > 1 ? splitEntries(halves[1]) : new String[]{};
        }
        this.linksAndPwdsStr = linksAndPwdsStr;
    }

    /** Splits one half into its entries; an empty half has no entries. */
    private static String[] splitEntries(String joined) {
        return joined.isEmpty() ? new String[]{} : joined.split(ENTRY_SEPARATOR);
    }

    public String[] getLinks() {
        return links;
    }

    public String[] getPwds() {
        return pwds;
    }

    public String getFirstPageReply() {
        return firstPageReply;
    }

    public void setFirstPageReply(String firstPageReply) {
        this.firstPageReply = firstPageReply;
    }

    public boolean isNeedReply() {
        return isNeedReply;
    }

    public void setNeedReply(boolean needReply) {
        isNeedReply = needReply;
    }

    public int getSearchLinkTimes() {
        return searchLinkTimes;
    }

    public void setSearchLinkTimes(int searchLinkTimes) {
        this.searchLinkTimes = searchLinkTimes;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getPublishTime() {
        return publishTime;
    }

    public void setPublishTime(String publishTime) {
        this.publishTime = publishTime;
    }

    public String getAuthorityLevel() {
        return authorityLevel;
    }

    public void setAuthorityLevel(String authorityLevel) {
        this.authorityLevel = authorityLevel;
    }

    public String getPartition() {
        return partition;
    }

    public void setPartition(String partition) {
        this.partition = partition;
    }

    public String getAuther() {
        return auther;
    }

    public void setAuther(String auther) {
        this.auther = auther;
    }

    public String getReplyNum() {
        return replyNum;
    }

    public void setReplyNum(String replyNum) {
        this.replyNum = replyNum;
    }

    public String getViewNum() {
        return viewNum;
    }

    public void setViewNum(String viewNum) {
        this.viewNum = viewNum;
    }

    public String getLastReplyName() {
        return lastReplyName;
    }

    public void setLastReplyName(String lastReplyName) {
        this.lastReplyName = lastReplyName;
    }

    public String getLastReplyTime() {
        return lastReplyTime;
    }

    public void setLastReplyTime(String lastReplyTime) {
        this.lastReplyTime = lastReplyTime;
    }

    public String getLastReplyUrl() {
        return lastReplyUrl;
    }

    public void setLastReplyUrl(String lastReplyUrl) {
        this.lastReplyUrl = lastReplyUrl;
    }
}
複製代碼
因爲用數據庫存儲數組很麻煩, 所以我想了一個折中的辦法, 在實體類上動了手腳, 有興趣的小夥伴可以看一下