java 簡單的爬蟲

時間 2019-11-12

標籤 java 簡單爬蟲欄目 Java 简体版

原文原文鏈接

這裏介紹兩種方式

一.直接用 jsoup 在線抓取並解析網頁

1.用 maven 構建一個項目，並加入 jsoup 依賴

<dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.2</version>
        </dependency>

2.新建一個OneSpider類

import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Minimal jsoup example: downloads a page and prints one element of it.
 *
 * @author lili
 */
public class OneSpider {

    /**
     * Fetches {@code url} with jsoup and prints the element whose id is
     * {@code content} to standard output.
     *
     * <p>The request is sent with a desktop-browser User-Agent and a 3000 ms
     * timeout. When the page has no element with that id, the text
     * {@code null} follows the prefix in the output. An {@link IOException}
     * raised while fetching is reported with a stack trace and not rethrown.
     *
     * @param url absolute address of the page to fetch; jsoup requires the
     *            protocol to be http or https
     */
    public static void Get_Url(String url) {
        try {
            // Further request options exist on the same builder, e.g.
            // data(...), cookie(...) and post() instead of get().
            Document doc = Jsoup.connect(url)
                    .userAgent("Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; MALC)")
                    .timeout(3000)
                    .get();

            // Elements can be looked up by id (as here), by class name with
            // getElementsByClass(...), or by CSS selector with select(...).
            Element content = doc.getElementById("content");
            System.out.println("com.mycompany.spiderdemo.Test.Get_Url()" + content);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

3.新建一個主類

public class JavaSpider {

    /**
     * Entry point: fetches one fixed page through
     * {@code OneSpider.Get_Url(String)}, which prints the result.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // The scheme is mandatory: jsoup rejects a bare host name such as
        // "www.baidu.com" as a malformed URL.
        String url = "https://www.baidu.com";

        // OneSpider is the class that declares Get_Url.
        OneSpider.Get_Url(url);
    }
}

4.打印出數據就能夠看到了

二.這種方式更高效

2.1 Save_Html方法是將抓取的網頁變成html文件，保存在本地

2.2 Get_Localhtml方法是解析本地的html

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Two-step crawler example: first save a page to disk, then parse the saved
 * files with jsoup.
 *
 * @author lili
 */
public class TwoSpiderTest {

    /**
     * Downloads {@code url} and stores the raw response bytes in
     * {@code src/temp_html/1.html}.
     *
     * <p>The directory is created when it does not exist yet. The request is
     * sent through a single {@link URLConnection} so that the User-Agent
     * header set on it is actually transmitted. The connection is opened
     * before the output file, so an invalid URL does not leave an empty file
     * behind. Any {@link IOException} is reported with a stack trace and not
     * rethrown.
     *
     * @param url absolute address of the page to download; must include a
     *            protocol such as {@code http://} or {@code https://}
     */
    public static void Save_Html(String url) {
        File dest = new File("src/temp_html/" + "1.html");
        try {
            File parent = dest.getParentFile();
            if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
                throw new IOException("Cannot create directory " + parent);
            }

            URLConnection uc = new URL(url).openConnection();
            uc.addRequestProperty("User-Agent", "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5");

            // try-with-resources closes both streams (flushing the output
            // buffer) even when the copy loop fails half way.
            try (BufferedInputStream bis = new BufferedInputStream(uc.getInputStream());
                 BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(dest))) {
                byte[] bytes = new byte[1024 * 20];
                int length;
                while ((length = bis.read(bytes, 0, bytes.length)) != -1) {
                    bos.write(bytes, 0, length);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses every regular file directly inside {@code path} as UTF-8 HTML
     * and prints the text of each {@code <a>} element found inside the
     * element whose id is {@code content}.
     *
     * <p>A file that cannot be parsed, or that has no such element, is
     * reported on standard output and skipped; the remaining files are still
     * processed. When {@code path} cannot be listed as a directory, a message
     * is printed and the method returns.
     *
     * @param path directory that holds the saved HTML files
     */
    public static void Get_Localhtml(String path) {
        // listFiles() returns null when the path is not a directory or an
        // I/O error occurs, so it must be checked before iterating.
        File[] array = new File(path).listFiles();
        if (array == null) {
            System.out.println("路徑不是可讀取的目錄：" + path);
            return;
        }

        for (File entry : array) {
            try {
                if (!entry.isFile()) {
                    continue;
                }
                System.out.println("正在解析網址：" + entry.getName());

                Document doc = Jsoup.parse(entry, "UTF-8");
                Element content = doc.getElementById("content");
                if (content == null) {
                    System.out.println("網址：" + entry.getName() + "解析出錯：找不到 id=\"content\" 的元素");
                    continue;
                }

                // link.attr("href") would give the target address; only the
                // visible link text is printed here.
                Elements links = content.getElementsByTag("a");
                for (Element link : links) {
                    System.out.println(link.text());
                }
            } catch (IOException | RuntimeException e) {
                // Best effort: one bad file must not stop the whole run.
                System.out.println("網址：" + entry.getName() + "解析出錯");
                e.printStackTrace();
            }
        }
    }
}

2.3 在主類裏面放入url地址就能夠了（url 須帶 http:// 或 https:// 前綴）

public class JavaSpider {

    /**
     * Entry point: saves one fixed page into {@code src/temp_html/} and then
     * parses every file in that directory.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // java.net.URL needs an explicit protocol; a bare host name such as
        // "www.baidu.com" raises MalformedURLException.
        String url = "https://www.baidu.com";
        String path = "src/temp_html/";

        // Store the page locally first ...
        TwoSpiderTest.Save_Html(url);
        // ... then parse the local copy.
        TwoSpiderTest.Get_Localhtml(path);
    }
}