import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.net.URLConnection; import java.text.SimpleDateFormat; import java.util.Date; import java.util.Map; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class TopitMe { private static final String URL_HOME = "http://www.topit.me"; private static final Logger LOGGER = LoggerFactory.getLogger(TopitMe.class); //解決重定向 private Document getHtml(String theme,String num) throws IOException{ Map<String, String> cookies = Jsoup.connect(URL_HOME) .execute().cookies(); cookies.put("item-tip", "true"); cookies.put("tip_global_1", "true"); cookies.put("is_click", "1"); Document doc = Jsoup.connect(URL_HOME+ "/" + theme + "/" + num).cookies(cookies).get(); return doc; } //專輯 public void getAlbum(String num) throws IOException{ int count = 0; int success = 0; int fail = 0; int ii = 1; String title = ""; //這個循環是爲了分頁,寫死最多10頁,懶得去單獨取了。 for (int i = 1; i < 10; i++) { ii = i-1; Document doc = null; try { doc = getHtml("album" , num +"?p=" + i); } catch (Exception e1) { //e1.printStackTrace(); LOGGER.error("打開頁面失敗,編號={},頁碼={} ",num,i); break; } if(doc == null) break; //找不到標題就提早退出 Element element = doc.select("div.userinfo_blk h2").first(); if(element == null) { LOGGER.error("抓取標題失敗,編號={},頁碼={} ",num,i); break; } //專輯名稱 title = element.text(); //校驗文件名是否合法 if(!filterName(title) || title.equals("")){ SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd"); title = dateFormat.format(new Date()); } Elements imgs = doc.select("div.catalog img[src]"); if(imgs.isEmpty()) { LOGGER.error("抓取圖片標籤失敗,編號={},頁碼={} ",num,i); break; } for (int j = 0; j < imgs.size(); j++) { Element img = imgs.get(j); //img地址 ,地址放的屬性不同,真是見鬼了 String href = img.attr("data-original"); if(href == null || href.equals("")){ href = img.attr("src"); } //屬性不同.... String fileName = img.attr("alt"); if(fileName == null || fileName.equals("")){ fileName = img.attr("title"); } if(!filterName(fileName) || fileName.equals("")){ fileName = String.valueOf(new Date().getTime()); } count++; //取大圖,m--j if(href.contains("/m/")){ String newHref = href.replace("/m/", "/l/"); try { download(title , i + "-" + (j+1) + "-" + fileName, newHref); //System.out.println(newHref); success ++; } catch (Exception e) { //System.err.println(newHref); fail++; LOGGER.error("下載失敗,編號={},專輯={},頁碼={},地址={} ",num,title,i,newHref); } }else{ String newHref = href.substring(0, href.length()-5) + "l" + href.substring(href.length()-4); try { download(title , i + "-" + (j+1) + "-" + fileName, newHref); //System.out.println(newHref); success ++; } catch (Exception e) { //System.err.println(newHref); fail++; LOGGER.error("下載失敗,編號={},專輯={},頁碼={},地址={} ",num,title,i,newHref); } } } } //System.out.println("[統計]編號: "+num+",抓取: "+count+",成功 : " + success + ",失敗: " + fail); LOGGER.info("[統計]編號={},專輯={},抓取={},總計={},成功={},失敗={}",num,title,ii,count,success,fail); } //校驗文件名合法性 private boolean filterName(String name){ String[] filter = {"\\", "/" , ":" , "*" , "?" , "<" , ">" , "|" }; for (String string : filter) { if(name.contains(string)){ return false; } } return true; } //下載 private void download(String dir ,String fileName, String href) throws IOException { URL url = new URL(href); URLConnection conn = url.openConnection(); InputStream inStream = conn.getInputStream(); byte[] buffer = new byte[inStream.available()]; int length; File savefile = new File("F:/picture/" + dir); if (!savefile.exists()) { // 建立分離目錄 savefile.mkdirs(); } FileOutputStream fos = new FileOutputStream("F:/picture/" + dir + "/" + fileName + ".jpg"); while ((length = inStream.read(buffer)) != -1) { fos.write(buffer, 0, length); } fos.close(); inStream.close(); } } //用法: public static void main(String[] args) throws Exception { //http://www.topit.me/album/12598 //專輯編號 TopitMe me = new TopitMe(); me.getAlbum("12598"); }