This update mainly handles older pages whose download links are mostly Thunder (thunder://) and FTP links, adds de-duplication (every list page carries a recommendation list that repeats detail-page links), and adds support for another page layout. Two methods were updated:
```java
public static void spider(int pa) {
    List<String> page = getPage(pa);
    // Detail pages already crawled in previous runs; drop them from this run
    String[] abc = "http://www.***.net/ys/20170620/37704.htm, http://www.***.net/ys/20170727/38028.htm, http://www.***.net/ys/20170810/38113.htm, http://www.***.net/ys/20170703/37769.htm, http://www.***.net/ys/20170615/37680.htm, http://www.***.net/ys/20170615/37678.htm, http://www.***.net/ys/20170727/38027.htm, http://www.***.net/ys/20170802/38060.htm, http://www.***.net/ys/20170515/37385.htm, http://www.***.net/ys/20170725/38001.htm, http://www.***.net/ys/20170608/37614.htm, http://www.***.net/ys/20170802/38059.htm, http://www.***.net/ys/20170629/37742.htm, http://www.***.net/ys/20170512/37323.htm, http://www.***.net/ys/20170426/37219.htm, http://www.***.net/ys/20170727/38026.htm, http://www.***.net/ys/20170730/38046.htm, http://www.***.net/ys/20170804/38082.htm, http://www.***.net/ys/20170714/37848.htm, http://www.***.net/ys/20180819/40982.htm, http://www.***.net/ys/20180819/40981.htm, http://www.***.net/ys/20180818/40980.htm, http://www.***.net/ys/20180818/40979.htm, http://www.***.net/ys/20180818/40978.htm, http://www.***.net/ys/20180818/40977.htm, http://www.***.net/ys/20180817/40975.htm, http://www.***.net/ys/20180817/40974.htm".split(", ");
    List<String> list = Arrays.asList(abc);
    page.removeAll(list);
    output(page.size());
    // De-duplicate: the recommendation list on every page repeats detail-page links
    Set<String> truelist = new HashSet<>(page);
    truelist.forEach(p -> {
        try {
            getMovieInfo(p);
            sleep(getRandomInt(3) + 3); // random delay between requests
        } catch (Exception e) {
            output(p); // log the URL that failed
        }
    });
}

public static void spider(String text) {
    // Same crawl, but the list-page HTML is passed in directly
    List<String> page = getPage(text);
    Set<String> truelist = new HashSet<>(page);
    truelist.forEach(p -> {
        try {
            getMovieInfo(p);
            sleep(getRandomInt(3));
        } catch (Exception e) {
            output(p);
        }
    });
}

public static List<String> getPage(int page) {
    // List pages are /ys/index_<n>.htm, except the first page, which is /ys/
    String url = "http://www.***.net/ys/index_" + page + ".htm";
    if (page == 1) url = "http://www.***.net/ys/";
    output(url);
    HttpGet httpGet = getHttpGet(url);
    JSONObject response = getHttpResponse(httpGet);
    String content = response.getString("content");
    byte[] bytes = content.getBytes(UTF_8);
    String all = new String(bytes, UTF_8);
    // Extract every detail-page link from the list page
    return regexAll(all, "http://www.***.net/ys/\\d+/\\d+.htm");
}

public static List<String> getPage(String page) {
    byte[] bytes = page.getBytes(UTF_8);
    String all = new String(bytes, UTF_8);
    return regexAll(all, "http://www.***.net/ys/\\d+/\\d+.htm");
}

public static boolean getMovieInfo(int day, int index) {
    // String url = "http://www.***.net/ys/20180819/40981.htm";
    String url = "http://www.***.net/ys/" + day + "/" + index + ".htm";
    getMovieInfo(url);
    return true;
}

public static boolean getMovieInfo(String url) {
    HttpGet httpGet = getHttpGet(url);
    JSONObject response = getHttpResponse(httpGet);
    String s = response.getString("content");
    // Page was removed: the site returns a "content not found" notice
    if (s.contains("您查詢的內容不存在,請返回首頁從新搜索")) return false;
    byte[] bytes = s.getBytes(UTF_8);
    String all = new String(bytes, UTF_8);
    String name = EMPTY, tname = EMPTY, year = EMPTY, language = EMPTY,
            date = EMPTY, score = EMPTY, length = EMPTY, author = EMPTY;
    if (all.contains("◎")) {
        // Old page layout: the fields sit in a "◎" block that ends at the <hr>
        int i = all.indexOf("◎");
        int i1 = all.indexOf("<hr");
        String info = s.substring(i, i1);
        name = getInfo(info, "片 名 ");
        tname = getInfo(info, "譯 名 ");
        year = getInfo(info, "年 代 ");
        language = getInfo(info, "語 言 ");
        date = getInfo(info, "上映日期 ");
        score = getInfo(info, "豆瓣評分 ");
        length = getInfo(info, "片 長 ");
        author = getInfo(info, "導 演 ");
    } else {
        // Newer page layout: the name comes from the <title>, fields are "label: value" pairs
        name = getInfo(all, "<title>");
        if (name.contains("_")) name = name.substring(0, name.indexOf("_"));
        length = getInfo(all, "片長: ");
        date = getInfo(all, "上映日期: ");
        author = getInfo(all, "導演: ");
        language = getInfo(all, "語言: ");
    }
    // Download links: magnet and ed2k first, then fall back to ftp and thunder on old pages
    List<String> magnets = regexAll(all, "magnet:.+?>");
    List<String> ed2ks = regexAll(all, "ed2k:.+?>");
    if (ed2ks.size() == 0) ed2ks = regexAll(all, "ftp://.+?>");
    if (ed2ks.size() == 0) ed2ks = regexAll(all, "thunder://.+?>");
    List<String> pans = regexAll(all, "http(s)*://pan.baidu.com/.+?</td>");
    String sql = "INSERT INTO movie (name,tname,year,language,date,score,length,author,magnet,ed2k,pan) VALUES(\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\");";
    sql = String.format(sql, name, tname, year, language, date, score, length, author,
            magnets.toString().replace("\"", EMPTY),
            ed2ks.toString().replace("\"", EMPTY),
            pans.toString().replace("\"", EMPTY));
    // Only store rows that actually carry a download link
    if (ed2ks.size() != 0) MySqlTest.sendWork(sql);
    output(magnets.toString().length(), ed2ks.toString().length(), pans.toString().length());
    output(sql);
    return true;
}

public static String getInfo(String text, String start) {
    // Grab the text between the field label and the next '<'
    String value = EMPTY;
    List<String> nameinfo = regexAll(text, start + ".+?<");
    if (nameinfo.size() > 0) value = nameinfo.get(0).replace(start, EMPTY).replace("<", EMPTY);
    return value;
}
```
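The methods above rely on helpers inherited from the author's ApiLibrary base class (getHttpGet, getHttpResponse, regexAll, output, sleep, getRandomInt, plus the EMPTY and UTF_8 constants), none of which are shown in the post. As a rough idea of what regexAll presumably does, here is a minimal sketch built on java.util.regex; the name and signature are taken from the calls above, everything else is an assumption:

```java
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RegexHelper {
    /** Returns every non-overlapping match of regex in text (hypothetical stand-in for ApiLibrary.regexAll). */
    public static List<String> regexAll(String text, String regex) {
        List<String> matches = new ArrayList<>();
        Matcher m = Pattern.compile(regex).matcher(text);
        while (m.find()) {
            matches.add(m.group());
        }
        return matches;
    }
}
```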
---------------------------------- Divider ----------------------------------
While experimenting with HttpClient crawlers, I wanted to scrape the download links from a movie site I follow. After some trial and error I managed to crawl the information and download links (eD2k and magnet links) for several hundred popular movies. Along the way I ran into problems with character encodings, inconsistent regex matches, and filtering out duplicate links, and solved them one by one. The code is attached below for reference.
Key details are masked; the overall approach, for reference: first visit a list page and collect the detail-page links, de-duplicate them, then visit each detail page to extract the movie information and download addresses, and store everything in the database.
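To make that first step concrete (fetch a list page and pull out the de-duplicated detail-page links), here is a minimal standalone sketch that talks to Apache HttpClient 4.x directly instead of going through ApiLibrary; the masked domain, the GB2312 charset, and the link pattern come from the code below, while the class name and structure are assumptions:

```java
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class ListPageDemo {
    public static void main(String[] args) throws Exception {
        // First list page; later pages follow the /ys/index_<n>.htm pattern
        String url = "http://www.***.net/ys/";
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            // The site is GB2312-encoded
            String html = EntityUtils.toString(
                    client.execute(new HttpGet(url)).getEntity(), "GB2312");
            // Detail pages look like /ys/<yyyyMMdd>/<id>.htm; a Set drops the duplicates
            // repeated in the recommendation list ("***" stands for the masked host)
            Set<String> links = new LinkedHashSet<>();
            Matcher m = Pattern.compile("http://www\\.\\*\\*\\*\\.net/ys/\\d+/\\d+\\.htm").matcher(html);
            while (m.find()) {
                links.add(m.group());
            }
            links.forEach(System.out::println);
        }
    }
}
```

The author's full implementation, which hides the HTTP and regex plumbing behind ApiLibrary helpers, follows: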
```java
// java.util collections and Apache HttpClient's HttpGet are imported below.
// ApiLibrary, MySqlTest, JSONObject and helpers such as getHttpGet/getHttpResponse/
// regexAll/output/sleep/getRandomInt belong to the author's own test framework and are not shown.
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.http.client.methods.HttpGet;

public class MyTest extends ApiLibrary {

    public static void main(String[] args) {
        DEFAULT_CHARSET = GB2312; // the site is GB2312-encoded
        for (int i = 0; i < 10; i++) {
            spider(1);
        }
        testOver();
    }

    public static void spider(int pa) {
        List<String> page = getPage(pa); // detail-page links found on the list page
        // Links already crawled in earlier runs; skip them this time
        String[] abc = "http://www.***.net/ys/20170620/37704.htm, http://www.***.net/ys/20170727/38028.htm, http://www.***.net/ys/20170810/38113.htm, http://www.***.net/ys/20170703/37769.htm, http://www.***.net/ys/20170615/37680.htm, http://www.***.net/ys/20170615/37678.htm, http://www.***.net/ys/20170727/38027.htm, http://www.***.net/ys/20170802/38060.htm, http://www.***.net/ys/20170515/37385.htm, http://www.***.net/ys/20170725/38001.htm, http://www.***.net/ys/20170608/37614.htm, http://www.***.net/ys/20170802/38059.htm, http://www.***.net/ys/20170629/37742.htm, http://www.***.net/ys/20170512/37323.htm, http://www.***.net/ys/20170426/37219.htm, http://www.***.net/ys/20170727/38026.htm, http://www.***.net/ys/20170730/38046.htm, http://www.***.net/ys/20170804/38082.htm, http://www.***.net/ys/20170714/37848.htm, http://www.***.net/ys/20180819/40982.htm, http://www.***.net/ys/20180819/40981.htm, http://www.***.net/ys/20180818/40980.htm, http://www.***.net/ys/20180818/40979.htm, http://www.***.net/ys/20180818/40978.htm, http://www.***.net/ys/20180818/40977.htm, http://www.***.net/ys/20180817/40975.htm, http://www.***.net/ys/20180817/40974.htm".split(", ");
        List<String> list = Arrays.asList(abc);
        page.removeAll(list);
        // De-duplicate the remaining links
        Set<String> truelist = new HashSet<>(page);
        truelist.forEach(p -> {
            try {
                getMovieInfo(p);
                sleep(getRandomInt(3)); // random delay between requests
            } catch (Exception e) {
                output(p); // log the URL that failed
            }
        });
    }

    public static List<String> getPage(int page) {
        // List pages are /ys/index_<n>.htm, except the first page, which is /ys/
        String url = "http://www.***.net/ys/index_" + page + ".htm";
        if (page == 1) url = "http://www.***.net/ys/";
        output(url);
        HttpGet httpGet = getHttpGet(url);
        JSONObject response = getHttpResponse(httpGet);
        String content = response.getString("content");
        output(content);
        byte[] bytes = content.getBytes(UTF_8);
        String all = new String(bytes, UTF_8);
        return regexAll(all, "http://www.***.net/ys/\\d+/\\d+.htm");
    }

    public static boolean getMovieInfo(int day, int index) {
        // String url = "http://www.***.net/ys/20180819/40981.htm";
        String url = "http://www.***.net/ys/" + day + "/" + index + ".htm";
        HttpGet httpGet = getHttpGet(url);
        JSONObject response = getHttpResponse(httpGet);
        String s = response.getString("content");
        // Page was removed: the site returns a "content not found" notice
        if (s.contains("您查詢的內容不存在,請返回首頁從新搜索")) return false;
        byte[] bytes = s.getBytes(UTF_8);
        String all = new String(bytes, UTF_8);
        // Assumes the "◎" info block is present (the updated version above handles pages without it)
        int i = all.indexOf("◎");
        int i1 = all.indexOf("<hr");
        String info = s.substring(i, i1);
        String name = getInfo(info, "片 名 ");
        String tname = getInfo(info, "譯 名 ");
        String year = getInfo(info, "年 代 ");
        String language = getInfo(info, "語 言 ");
        String date = getInfo(info, "上映日期 ");
        String score = getInfo(info, "豆瓣評分 ");
        String length = getInfo(info, "片 長 ");
        String author = getInfo(info, "導 演 ");
        List<String> magnets = regexAll(all, "magnet:.+?>");
        List<String> ed2ks = regexAll(all, "ed2k:.+?>");
        List<String> pans = regexAll(all, "http(s)*://pan.baidu.com/.+?</td>");
        String sql = "INSERT INTO movie (name,tname,year,language,date,score,length,author,magnet,ed2k,pan) VALUES(\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\");";
        sql = String.format(sql, name, tname, year, language, date, score, length, author,
                magnets.toString().replace("\"", EMPTY),
                ed2ks.toString().replace("\"", EMPTY),
                pans.toString().replace("\"", EMPTY));
        MySqlTest.sendWork(sql);
        return true;
    }

    public static boolean getMovieInfo(String url) {
        HttpGet httpGet = getHttpGet(url);
        JSONObject response = getHttpResponse(httpGet);
        String s = response.getString("content");
        if (s.contains("您查詢的內容不存在,請返回首頁從新搜索")) return false;
        byte[] bytes = s.getBytes(UTF_8);
        String all = new String(bytes, UTF_8);
        int i = all.indexOf("◎");
        int i1 = all.indexOf("<hr");
        String info = s.substring(i, i1);
        String name = getInfo(info, "片 名 ");
        String tname = getInfo(info, "譯 名 ");
        String year = getInfo(info, "年 代 ");
        String language = getInfo(info, "語 言 ");
        String date = getInfo(info, "上映日期 ");
        String score = getInfo(info, "豆瓣評分 ");
        String length = getInfo(info, "片 長 ");
        String author = getInfo(info, "導 演 ");
        List<String> magnets = regexAll(all, "magnet:.+?>");
        List<String> ed2ks = regexAll(all, "ed2k:.+?>");
        List<String> pans = regexAll(all, "http(s)*://pan.baidu.com/.+?</td>");
        String sql = "INSERT INTO movie (name,tname,year,language,date,score,length,author,magnet,ed2k,pan) VALUES(\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\");";
        sql = String.format(sql, name, tname, year, language, date, score, length, author,
                magnets.toString().replace("\"", EMPTY),
                ed2ks.toString().replace("\"", EMPTY),
                pans.toString().replace("\"", EMPTY));
        MySqlTest.sendWork(sql);
        output(magnets.toString().length(), ed2ks.toString().length(), pans.toString().length());
        output(sql);
        return true;
    }

    public static String getInfo(String text, String start) {
        // Grab the text between the field label and the next '<'
        String value = EMPTY;
        List<String> nameinfo = regexAll(text, start + ".+?<");
        if (nameinfo.size() > 0) value = nameinfo.get(0).replace(start, EMPTY).replace("<", EMPTY);
        return value;
    }
}
```
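MySqlTest.sendWork is not shown either; all it needs to do is execute the generated INSERT against a `movie` table whose columns match the statement above. A minimal sketch using plain JDBC follows; the connection URL, credentials, and column types are assumptions, and only the column names are taken from the INSERT in the code:

```java
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class MySqlSketch {
    // Hypothetical connection settings; adjust to the real database
    private static final String URL = "jdbc:mysql://localhost:3306/spider?useUnicode=true&characterEncoding=utf8";
    private static final String USER = "root";
    private static final String PASSWORD = "password";

    /** Executes one SQL statement, e.g. the INSERT built by getMovieInfo. */
    public static void sendWork(String sql) throws Exception {
        try (Connection conn = DriverManager.getConnection(URL, USER, PASSWORD);
             Statement stmt = conn.createStatement()) {
            stmt.execute(sql);
        }
    }

    /** Creates a table compatible with the INSERT; every extracted field is stored as text. */
    public static void createTable() throws Exception {
        String ddl = "CREATE TABLE IF NOT EXISTS movie ("
                + "id INT AUTO_INCREMENT PRIMARY KEY,"
                + "name VARCHAR(255), tname VARCHAR(255), year VARCHAR(32), language VARCHAR(64),"
                + "date VARCHAR(64), score VARCHAR(32), length VARCHAR(64), author VARCHAR(255),"
                + "magnet TEXT, ed2k TEXT, pan TEXT)";
        sendWork(ddl);
    }
}
```

In practice a PreparedStatement with placeholders would be more robust than String.format, since a title containing a double quote would break the generated SQL; the code above guards against this only for the link lists by stripping double quotes.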
Below is a screenshot of the data stored in the database: