Java開發搜索引擎爬蟲

時間 2019-11-30
原文：原文鏈接
  1 package com.peidon.html; 
  2 
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
 17 
 18 
 19 /** 
 20 * @author sunshine 
 21 * @version 1.0
 22 * @date：2015年8月15日 上午9:01:13 
 23 * @description: Java開發搜索引擎爬蟲
 24 *     jsoup 相似jQuery的強大功能，什麼方便解析操做HTML  DOM 樹
 25 *     關聯jar包  jsoup-1.8.3.jar
 26 */ 
 27 public class HttpSoup {
 28 
 29     public static void main(String[] args) {
 30         
 31         //根據網址和頁面的編碼集  獲取網頁的源代碼
 32         String htmlResource = getHtmlResourceByUrl("http://www.ui.cn/", "UTF-8");
 33         //System.out.println(htmlResource);
 34         
 35         //解析源代碼
 36         Document document = Jsoup.parse(htmlResource);
 37         
 38         //獲取網頁的圖片
 39         //網頁圖片標籤<img src="" alt="" width="" height="" />
 40         Elements elements = document.getElementsByTag("img");
 41         
 42         for(Element element : elements){
 43             String imgSrc = element.attr("src");
 44             //System.out.println(imgSrc);
 45             downImages(imgSrc,"D:\\test\\images\\");
 46             System.out.println("下載成功:"+imgSrc);
 47             //System.out.println(imgSrc.substring(imgSrc.lastIndexOf("/")));
 48         }
 49     }
 50     
 51     /**
 52      * 根據一個圖片的URL地址，經過這個URL批量下載圖片到服務器的磁盤
 53      * @param imageUrl 要下載的服務器地址
 54      * @param filePath 下載完成後保存到服務器的圖片地址
 55      * 
 56      */
 57     public static void downImages(String imageUrl, String filePath){
 58         String fileName = imageUrl.substring(imageUrl.lastIndexOf("/"));
 59         
 60         try {
 61             //建立文件的目錄
 62             File files = new File(filePath);
 63             //判斷文件是否存在
 64             if(!files.exists()){
 65                 files.mkdirs();
 66             }
 67             //獲取圖片文件的下載地址
 68             URL url = new URL(imageUrl);
 69             //鏈接網絡圖片地址
 70             HttpURLConnection uc =(HttpURLConnection) url.openConnection();
 71             //獲取鏈接的輸出流
 72             InputStream is = uc.getInputStream();
 73             
 74             //建立文件
 75             File file = new File(filePath + fileName);
 76             //建立輸出流，寫入文件
 77             FileOutputStream out = new FileOutputStream(file);
 78             int i = 0;
 79             while((i = is.read()) != -1){
 80                 out.write(i);
 81             }
 82             is.close();
 83             out.close();
 84         } catch (Exception e) {
 85             e.printStackTrace();
 86         } 
 87     }
 88     
 89     /**
 90      * 根據網址和頁面的編碼集  獲取網頁的源代碼
 91      * @param url
 92      * @param encoding
 93      * @return
 94      */
 95     public static String getHtmlResourceByUrl(String url, String encoding){
 96         
 97         //聲明一個存儲網頁源代碼的容器
 98         StringBuffer buff = new StringBuffer();
 99         
100         URL urlObj = null;
101         URLConnection uc = null;
102         InputStreamReader in = null;
103         BufferedReader reader = null;
104         try {
105             //創建網絡連接
106             urlObj = new URL(url);
107             
108             //打開網絡鏈鏈接
109             uc = urlObj.openConnection();
110             
111             //創建網絡的輸入流
112             in = new InputStreamReader(uc.getInputStream(),encoding);
113             
114             //緩衝寫入的文件流
115             reader = new BufferedReader(in);
116             
117             String tempLine = null;
118             
119             //循環讀取文件流
120             while((tempLine = reader.readLine()) != null){
121                 buff.append(tempLine + "\n");  //循環追加數據
122             }
123             
124             
125             
126         } catch (Exception e) {
127             
128             e.printStackTrace();
129             System.out.println("Conection timeout ...");
130         } finally {
131             if(in != null){
132                 try {
133                     in.close();
134                 } catch (IOException e) {
135                     e.printStackTrace();
136                 }
137             }
138             
139         }
140         
141         return buff.toString();
142     }
143 }