import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawls the budejie.com video listing and downloads every linked {@code .mp4} file
 * into a local directory.
 *
 * @author cxd
 */
public class WebSpiderDemo1 {

    /** Captures the target of an {@code href} attribute that ends in {@code .mp4}. */
    private static final Pattern VIDEO_URL_PATTERN = Pattern.compile("href=\"(.*?\\.mp4)");

    /** Captures the value of a {@code data-text} attribute, which carries the video title. */
    private static final Pattern TITLE_PATTERN = Pattern.compile("data-text=\"(.*?)\"");

    /** Characters that are not allowed in file names on common file systems. */
    private static final Pattern ILLEGAL_FILE_NAME_CHARS =
            Pattern.compile("[\\\\/:*?\"<>|\\p{Cntrl}]");

    /** Number of listing pages crawled by {@link #getUrlInSource(String)}. */
    private static final int DEFAULT_PAGE_COUNT = 1;

    /**
     * Crawls the first listing page and downloads every video found on it.
     *
     * <p>A failed download is reported on {@code System.err} and does not stop the
     * remaining downloads.
     *
     * @param args ignored
     * @throws Exception if the listing page cannot be read or the destination directory
     *                   cannot be created
     */
    public static void main(String[] args) throws Exception {
        String source = "http://www.budejie.com/video/";
        String destDir = "D:/rob/";

        File targetDir = new File(destDir);
        // mkdirs() returns false when the directory already exists, so re-check afterwards.
        if (!targetDir.isDirectory() && !targetDir.mkdirs() && !targetDir.isDirectory()) {
            throw new IOException("Cannot create destination directory: " + targetDir);
        }

        Map<String, String> urlMap = getUrlInSource(source);
        for (Map.Entry<String, String> entry : urlMap.entrySet()) {
            String title = entry.getKey();   // video title
            String url = entry.getValue();   // video URL
            // Titles come from the web page and may contain characters that are illegal
            // in file names (or path separators), so clean them before building the path.
            File destFile = new File(targetDir, sanitizeFileName(title) + ".mp4");
            try {
                download(url, destFile);
            } catch (IOException e) {
                System.err.println("Failed to download " + url + " to " + destFile + ": " + e);
            }
        }
    }

    /**
     * Downloads the resource at the given URL and writes it to a local file.
     *
     * <p>Both streams are closed even when the transfer fails part-way.
     *
     * @param url      URL of the video
     * @param destFile file the video is written to; an existing file is overwritten
     * @throws IOException if the URL is malformed, cannot be opened, or the file cannot
     *                     be written
     */
    public static void download(String url, File destFile) throws IOException {
        URL videoUrl = new URL(url);
        try (InputStream in = videoUrl.openStream();
             FileOutputStream out = new FileOutputStream(destFile)) {
            byte[] buffer = new byte[1024];
            int len;
            while ((len = in.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
            out.flush();
        }
    }

    /**
     * Collects video titles and URLs from the first listing page.
     *
     * @param source base URL of the listing; the page number is appended to it
     * @return map from video title to video URL
     * @throws IOException if the page cannot be read
     */
    public static Map<String, String> getUrlInSource(String source) throws IOException {
        return getUrlInSource(source, DEFAULT_PAGE_COUNT);
    }

    /**
     * Collects video titles and URLs from listing pages {@code 1..pageCount}.
     *
     * <p>Each page is read line by line. A line containing a {@code data-text} attribute
     * sets the current title; a line containing an {@code href} to an {@code .mp4} file
     * is recorded under the current title. When no title has been seen yet, the file
     * name taken from the URL is used instead. Entries that share a title overwrite
     * each other.
     *
     * @param source    base URL of the listing; the page number is appended to it
     * @param pageCount number of pages to crawl, starting at page 1 (the original author
     *                  notes the site has at most 50)
     * @return map from video title to video URL
     * @throws IOException if a page cannot be read
     */
    public static Map<String, String> getUrlInSource(String source, int pageCount)
            throws IOException {
        Map<String, String> videos = new HashMap<>();
        for (int page = 1; page <= pageCount; page++) {
            URL pageUrl = new URL(source + page);
            // If the site blocks non-browser clients, open an HttpURLConnection instead
            // and set a browser-like "user-agent" request header before reading.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(pageUrl.openStream(), StandardCharsets.UTF_8))) {
                String currentTitle = null;
                String line;
                // readLine() returns null only at end of stream, so this reads the whole page.
                while ((line = reader.readLine()) != null) {
                    if (line.contains("data-text")) {
                        String extracted = getTitle(line.trim());
                        if (extracted != null) {
                            currentTitle = extracted;
                        }
                    }
                    Matcher matcher = VIDEO_URL_PATTERN.matcher(line);
                    if (matcher.find()) {
                        String videoUrl = matcher.group(1);
                        videos.put(resolveTitle(currentTitle, videoUrl), videoUrl);
                    }
                }
            }
        }
        return videos;
    }

    /**
     * Extracts the title from a line that carries a {@code data-text="..."} attribute.
     *
     * @param info the page line
     * @return the attribute value, or {@code null} if the line has no such attribute
     */
    private static String getTitle(String info) {
        Matcher matcher = TITLE_PATTERN.matcher(info);
        return matcher.find() ? matcher.group(1) : null;
    }

    /** Returns the title if it is usable, otherwise the URL's file name without ".mp4". */
    private static String resolveTitle(String title, String videoUrl) {
        if (title != null && !title.trim().isEmpty()) {
            return title;
        }
        String name = videoUrl.substring(videoUrl.lastIndexOf('/') + 1);
        if (name.endsWith(".mp4")) {
            name = name.substring(0, name.length() - ".mp4".length());
        }
        return name;
    }

    /** Replaces characters that are illegal in file names and guarantees a non-empty name. */
    private static String sanitizeFileName(String name) {
        String cleaned = ILLEGAL_FILE_NAME_CHARS.matcher(name).replaceAll("_").trim();
        return cleaned.isEmpty() ? "video" : cleaned;
    }
}