What this achieves: a Pictures folder is created automatically under the project, and images are crawled level by level starting from the site URL. Under Pictures, a folder is created for each level of the site's URL hierarchy to hold the images found at that URL. At the same time, the file name, path, and URL are inserted into the database so they can be indexed later.
Step 1: create the persistence-layer class that stores the file name, path, and URL.
package org.amuxia.demo;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;

public class JDBCHelper {
    private static final String driver = "com.mysql.jdbc.Driver";
    private static final String DBurl = "jdbc:mysql://127.0.0.1:3306/edupic";
    private static final String user = "root";
    private static final String password = "root";
    private PreparedStatement pstmt = null;
    private Connection spiderconn = null;

    /**
     * Insert one record (file name, local path, source URL) into the FilePath table.
     */
    public void insertFilePath(String fileName, String filepath, String url) {
        try {
            Class.forName(driver);
            spiderconn = DriverManager.getConnection(DBurl, user, password);
            String sql = "insert into FilePath (filename,filepath,url) values (?,?,?)";
            pstmt = spiderconn.prepareStatement(sql);
            pstmt.setString(1, fileName);
            pstmt.setString(2, filepath);
            pstmt.setString(3, url);
            pstmt.executeUpdate();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            try {
                if (pstmt != null) {
                    pstmt.close();
                }
                if (spiderconn != null) {
                    spiderconn.close();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}
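The class above assumes a FilePath table already exists in the edupic database. The original post does not show its schema; a minimal sketch that matches the three inserted columns could look like the following (the column sizes and the id column are assumptions):

    -- Hypothetical schema for the table JDBCHelper writes to;
    -- column names match the INSERT statement, sizes are assumptions.
    CREATE TABLE FilePath (
        id       INT AUTO_INCREMENT PRIMARY KEY,
        filename VARCHAR(255),
        filepath VARCHAR(255),
        url      VARCHAR(500)
    );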
Step 2: create the class that parses URLs and does the crawling.
package org.amuxia.demo;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class GetWeb {
    private int webDepth = 5; // crawl depth
    private int intThreadNum = 1; // number of worker threads
    private String strHomePage = ""; // start (home) page URL
    private String myDomain; // domain name
    private String fPath = "CSDN"; // directory that stores what is crawled
    private ArrayList<String> arrUrls = new ArrayList<String>(); // URLs not yet processed
    private ArrayList<String> arrUrl = new ArrayList<String>(); // all URLs, kept for building the index
    private Hashtable<String, Integer> allUrls = new Hashtable<String, Integer>(); // page number of every URL
    private Hashtable<String, Integer> deepUrls = new Hashtable<String, Integer>(); // depth of every URL
    private int intWebIndex = 0; // file index of a page, starting from 0
    private long startTime;
    private int webSuccessed = 0;
    private int webFailed = 0;

    public static void main(String[] args) {
        GetWeb gw = new GetWeb("http://www.csdn.net/");
        gw.getWebByHomePage();
    }

    public GetWeb(String s) {
        this.strHomePage = s;
    }

    public GetWeb(String s, int i) {
        this.strHomePage = s;
        this.webDepth = i;
    }

    public synchronized void addWebSuccessed() {
        webSuccessed++;
    }

    public synchronized void addWebFailed() {
        webFailed++;
    }

    public synchronized String getAUrl() {
        String tmpAUrl = arrUrls.get(0);
        arrUrls.remove(0);
        return tmpAUrl;
    }

    public synchronized String getUrl() {
        String tmpUrl = arrUrl.get(0);
        arrUrl.remove(0);
        return tmpUrl;
    }

    public synchronized Integer getIntWebIndex() {
        intWebIndex++;
        return intWebIndex;
    }

    /**
     * Starting from the home page supplied by the user, crawl every linked page.
     */
    public void getWebByHomePage() {
        startTime = System.currentTimeMillis();
        this.myDomain = getDomain();
        if (myDomain == null) {
            System.out.println("Wrong input!");
            return;
        }
        System.out.println("Homepage = " + strHomePage);
        System.out.println("Domain = " + myDomain);
        arrUrls.add(strHomePage);
        arrUrl.add(strHomePage);
        allUrls.put(strHomePage, 0);
        deepUrls.put(strHomePage, 1);
        File fDir = new File(fPath);
        if (!fDir.exists()) {
            fDir.mkdir();
        }
        System.out.println("Starting crawl");
        String tmp = getAUrl(); // take a new URL
        this.getWebByUrl(tmp, allUrls.get(tmp) + ""); // crawl the page behind that URL
        for (int i = 0; i < intThreadNum; i++) {
            new Thread(new Processer(this)).start();
        }
        while (true) {
            if (arrUrls.isEmpty() && Thread.activeCount() == 1) {
                long finishTime = System.currentTimeMillis();
                long costTime = finishTime - startTime;
                System.out.println("\n\n\n\n\nFinished");
                System.out.println("Start time = " + startTime + "  end time = " + finishTime
                        + "  total crawl time = " + costTime + "ms");
                System.out.println("Total URLs crawled = " + (webSuccessed + webFailed)
                        + "  successful: " + webSuccessed + "  failed: " + webFailed);
                String strIndex = "";
                String tmpUrl = "";
                while (!arrUrl.isEmpty()) {
                    tmpUrl = getUrl();
                    strIndex += "Web depth:" + deepUrls.get(tmpUrl) + " Filepath: " + fPath
                            + "/web" + allUrls.get(tmpUrl) + ".htm" + " url:" + tmpUrl + "\n\n";
                }
                System.out.println(strIndex);
                try {
                    PrintWriter pwIndex = new PrintWriter(new FileOutputStream("fileindex.txt"));
                    pwIndex.println(strIndex);
                    pwIndex.close();
                } catch (Exception e) {
                    System.out.println("Failed to write the index file!");
                }
                break;
            }
        }
    }

    /**
     * Crawl one page that was discovered while parsing.
     *
     * @param strUrl
     * @param fileIndex
     */
    public void getWebByUrl(String strUrl, String fileIndex) {
        try {
            System.out.println("Fetching page: " + strUrl);
            URL url = new URL(strUrl);
            InputStream is = url.openStream();
            // Build a directory name from the URL by replacing characters that are
            // illegal in file names. Note: replace() is used instead of replaceAll()
            // because * ? | are regex metacharacters and would break replaceAll().
            String filename = strUrl.replaceAll("/", "_");
            filename = filename.replace(":", ".")
                    .replace("*", ".")
                    .replace("?", ".")
                    .replace("\"", ".")
                    .replace(">", ".")
                    .replace("<", ".")
                    .replace("|", ".");
            String filePath = fPath + "\\" + filename;
            File file = new File(filePath);
            if (!file.exists()) {
                file.mkdir();
            }
            // Record file name, path and URL in the database.
            JDBCHelper helper = new JDBCHelper();
            helper.insertFilePath(filename, filePath, strUrl);
            // Download every picture referenced by this page.
            GetPicture getp = new GetPicture();
            getp.get(strUrl, filePath);
            // Read the page line by line and extract further links until webDepth is reached.
            BufferedReader bReader = new BufferedReader(new InputStreamReader(is));
            StringBuffer sb = new StringBuffer();
            String rLine = null;
            while ((rLine = bReader.readLine()) != null) {
                if (rLine.length() > 0) {
                    sb.append("\n" + rLine);
                    if (deepUrls.get(strUrl) < webDepth)
                        getUrlByString(rLine, strUrl);
                }
            }
            is.close();
            System.out.println("Fetched page successfully: " + strUrl);
            addWebSuccessed();
        } catch (Exception e) {
            System.out.println("Failed to fetch page, please check that the URL exists: " + strUrl);
            addWebFailed();
        }
    }

    /**
     * Check whether the URL supplied by the user contains a domain address.
     *
     * @return the domain, or null if none was found
     */
    public String getDomain() {
        String reg = "(?<=http\\://[a-zA-Z0-9]{0,100}[.]{0,1})[^.\\s]*?\\.(com|cn|net|org|biz|info|cc|tv|edu)";
        Pattern p = Pattern.compile(reg, Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(strHomePage);
        if (m.find()) {
            return m.group(0);
        }
        return null;
    }

    /**
     * Parse a line of HTML and extract the links it contains.
     *
     * @param inputArgs
     * @param strUrl
     */
    public void getUrlByString(String inputArgs, String strUrl) {
        String tmpStr = inputArgs;
        // Match absolute http:// links inside href attributes that belong to the crawled domain.
        String regUrl = "(?<=(href=)[\"]?[\']?)http://[^\\s\"\'\\?]*(" + myDomain + ")[^\\s\"\'>]*";
        Pattern p = Pattern.compile(regUrl, Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(tmpStr);
        boolean blnp = m.find();
        while (blnp) {
            if (!allUrls.containsKey(m.group(0))) {
                System.out.println("Found a new url, depth:" + (deepUrls.get(strUrl) + 1) + " " + m.group(0));
                arrUrls.add(m.group(0));
                arrUrl.add(m.group(0));
                allUrls.put(m.group(0), getIntWebIndex());
                deepUrls.put(m.group(0), (deepUrls.get(strUrl) + 1));
            }
            tmpStr = tmpStr.substring(m.end(), tmpStr.length());
            m = p.matcher(tmpStr);
            blnp = m.find();
        }
    }

    /**
     * @author amuxia
     * A separate crawler worker thread.
     */
    class Processer implements Runnable {
        GetWeb gw;

        public Processer(GetWeb g) {
            this.gw = g;
        }

        public void run() {
            while (!arrUrls.isEmpty()) {
                String tmp = getAUrl();
                getWebByUrl(tmp, allUrls.get(tmp) + "");
            }
        }
    }
}
As shown in the main method above, this is where you set the URL of the site to crawl.
private String fPath = "CSDN"; — this defines where the crawled pictures are stored. Here they go straight into a CSDN folder under the project; put them anywhere you like, as long as you can find them again.
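The crawl depth can also be changed through the second constructor instead of editing webDepth. A minimal sketch of launching the crawler with a custom start page and depth (the URL here is just a placeholder):

    public class CrawlerLauncher {
        public static void main(String[] args) {
            // Placeholder start page; webDepth defaults to 5 when the
            // single-argument constructor is used.
            GetWeb gw = new GetWeb("http://www.example.com/", 3); // crawl 3 levels deep
            gw.getWebByHomePage();
        }
    }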
Step 3: grab and download the pictures.
package org.amuxia.demo;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class GetPicture {

    public void getHtmlPicture(String httpUrl, String filePath) {
        URL url;
        BufferedInputStream in;
        FileOutputStream file;
        try {
            System.out.println("Downloading picture");
            // Take the picture name from the last path segment of the URL
            String fileName = httpUrl.substring(httpUrl.lastIndexOf("/")).replace("/", "");
            // Open a byte stream on the picture URL and copy it to the target file
            url = new URL(httpUrl);
            in = new BufferedInputStream(url.openStream());
            file = new FileOutputStream(new File(filePath + "\\" + fileName));
            int t;
            while ((t = in.read()) != -1) {
                file.write(t);
            }
            file.close();
            in.close();
            System.out.println("Picture downloaded successfully");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public String getHtmlCode(String httpUrl) throws IOException {
        String content = "";
        URL url = new URL(httpUrl);
        BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()));
        String input;
        // Read the page line by line and append it to content
        while ((input = reader.readLine()) != null) {
            content += input;
        }
        reader.close();
        return content;
    }

    /**
     * Picture crawling method.
     *
     * @param url
     * @throws IOException
     */
    public void get(String url, String filePath) throws IOException {
        // Two regular expressions for picture references: relative paths and absolute http:// URLs
        String searchImgReg = "(?x)(src|SRC|background|BACKGROUND)=('|\")/?(([\\w-]+/)*([\\w-]+\\.(jpg|JPG|png|PNG|gif|GIF)))('|\")";
        String searchImgReg2 = "(?x)(src|SRC|background|BACKGROUND)=('|\")(http://([\\w-]+\\.)+[\\w-]+(:[0-9]+)*(/[\\w-]+)*(/[\\w-]+\\.(jpg|JPG|png|PNG|gif|GIF)))('|\")";
        String content = this.getHtmlCode(url);
        // Relative picture paths are resolved against the page URL
        Pattern pattern = Pattern.compile(searchImgReg);
        Matcher matcher = pattern.matcher(content);
        while (matcher.find()) {
            System.out.println(matcher.group(3));
            this.getHtmlPicture(url + "/" + matcher.group(3), filePath);
        }
        // Absolute picture URLs are downloaded as-is
        pattern = Pattern.compile(searchImgReg2);
        matcher = pattern.matcher(content);
        while (matcher.find()) {
            System.out.println(matcher.group(3));
            this.getHtmlPicture(matcher.group(3), filePath);
        }
    }
}
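If you only want the pictures from a single page and don't need the crawler or the database, GetPicture can also be used on its own. A minimal sketch, where the page URL and target folder are placeholders:

    package org.amuxia.demo;

    import java.io.File;
    import java.io.IOException;

    public class SinglePageDemo {
        public static void main(String[] args) throws IOException {
            String pageUrl = "http://www.example.com/"; // placeholder page URL
            String saveDir = "CSDN\\example";           // placeholder target folder
            new File(saveDir).mkdirs();                 // make sure the folder exists
            new GetPicture().get(pageUrl, saveDir);     // download every picture found on the page
        }
    }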
And that's it!
As you can see, this is basically all there is to it. No extra libraries are needed apart from the MySQL driver JAR (mysql-connector-java). Of course, if you don't need to insert records into the database, that has no effect on crawling the pictures; just drop the first class.
Also, some sites have anti-crawler measures in place, so fetching them may fail.
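One common workaround, though by no means guaranteed to work, is to send a browser-like User-Agent header instead of the default Java one. The crawler above reads pages with url.openStream(); a hedged sketch of fetching through a URLConnection with a custom header and timeouts (the User-Agent string is just an example):

    import java.io.InputStream;
    import java.net.URL;
    import java.net.URLConnection;

    public class FetchWithUserAgent {
        public static InputStream open(String strUrl) throws Exception {
            URL url = new URL(strUrl);
            URLConnection conn = url.openConnection();
            // Pretend to be a regular browser; some sites reject the default Java user agent.
            conn.setRequestProperty("User-Agent",
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)");
            conn.setConnectTimeout(5000); // avoid hanging on slow sites
            conn.setReadTimeout(5000);
            return conn.getInputStream();
        }
    }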
Note: before crawling a site, it is best to check with its owner first; crawling non-public content is an infringement. This is for testing purposes only.