Stop hyping Python crawlers, my mighty Java can do it too | Crawling CSDN and Zhihu articles with Java

Preface

I have been learning Python for a while and have read quite a few Python articles, many of them about web crawlers. It got me wondering: Java can clearly do the same thing, so why does everyone treat crawling as if it were a Python-exclusive feature?

I felt I had to speak up for Java, so I put together a Java crawler over my lunch break to share with you.

Add the dependency

Pull in the crawler library jsoup:

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>
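
Before the full crawler, here is a minimal sketch of how jsoup fetches and parses a page. The URL and selector below are placeholders for illustration, not part of the original code.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class JsoupQuickStart {
    public static void main(String[] args) throws Exception {
        // Fetch the page (the URL here is only an example target)
        Document doc = Jsoup.connect("https://example.com")
                .userAgent("Mozilla/5.0")
                .timeout(5000)
                .get();
        // Select elements with a CSS selector and print their text and link targets
        Elements links = doc.select("a[href]");
        for (Element link : links) {
            System.out.println(link.text() + " -> " + link.attr("href"));
        }
    }
}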

Start crawling

Crawling the article list

/**
 *
 * @param type
 *          search keyword to crawl for
 * @param size
 *          number of articles to crawl this run: must be greater than 0 and no more than 1000
 * @return  the list of crawled articles
 * @throws IOException
 */
public static List<CrawlerArticle> searchCSDNList(String type, int size) throws IOException {
    if (size <= 0) {
        size = 100;
    } else if (size > 1000) {
        size = 1000;
    }
    int num = 1;
    // Wrap each crawled article in a CrawlerArticle and collect them in an ArrayList
    List<CrawlerArticle> resultList = new ArrayList<CrawlerArticle>(size);
    while (true) {
        if (resultList.size() >= size) {
            break;
        }
        // NOTE: the search URL prefix is omitted here; it should point at the CSDN
        // search endpoint with the keyword and page number, along the lines of
        // String url = "https:" + type + "&t=blog&p=" + num;
        String url = "" + type;

        // Open an HTTP Connection to the URL
        Connection conn = Jsoup.connect(url)
                .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0")
                .timeout(1000)
                .method(Connection.Method.GET);
        // Fetch the page's HTML document
        Document doc = conn.get();
        Element body = doc.body();

        Elements articleList = body.getElementsByClass("clearfix");

        for (Element article : articleList) {
            CrawlerArticle articleEntity = new CrawlerArticle();
            // Title
            Elements div_h2_a = article.getElementsByClass("title").select("div h2 a");
            if (div_h2_a != null && div_h2_a.size() > 0) {
                Element linkNode = div_h2_a.get(0);
                // Article URL
                articleEntity.setAddress(linkNode.attr("href"));
                articleEntity.setTitle(linkNode.text());
            } else {
                continue;
            }

            Elements subscribeNums = article.getElementsByClass("is_digg click_heart");
            if (subscribeNums != null && subscribeNums.size() > 0) {
                articleEntity.setSubscribeNum(getNum(subscribeNums));
            }else {
                articleEntity.setSubscribeNum(0);
            }

            Elements descNodes = article.getElementsByClass("summary oneline");
            if (descNodes != null && descNodes.size() > 0) {
                Element descNode = descNodes.get(0);
                articleEntity.setSecondTitle(descNode.text());
            }

            // Read count
            Elements readNums = article.getElementsByClass("read_num");
            if (readNums != null && readNums.size() > 0) {
                articleEntity.setReadNum(getNum(readNums));
            } else {
                continue;
            }

            Elements commonNums = article.getElementsByClass("common_num");
            if (commonNums != null && commonNums.size() > 0) {
                articleEntity.setCommentNum(getNum(commonNums));
            }else {
                articleEntity.setCommentNum(0);
            }
            Elements datetimes = article.getElementsByClass("datetime");
            if (datetimes != null && datetimes.size() > 0) {
                articleEntity.setPublishTime(datetimes.get(0).text());
            } else {
                articleEntity.setPublishTime(MyDateUtils.formatDate(new Date(), "yyyy-MM-dd"));
            }
            articleEntity.setBlogType("CSDN");

            System.out.println("文章原地址:" + articleEntity.getAddress());
            System.out.println("文章閱讀數+++++++++++:" + articleEntity.getReadNum());
            //將閱讀量大於100的url存儲到數據庫
            if (articleEntity.getReadNum() > 100) {
                resultList.add(articleEntity);
            }
            if (resultList.size() >= size) {
                break;
            }
        }
        // Print how many article nodes were found on this page
        System.out.println("Articles found on this page: " + articleList.size());
        num++;
    }
    return resultList;
}
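
The method above leans on a few helpers from the author's project that the post does not show, such as CrawlerArticle, MyDateUtils, and getNum. As a rough sketch, getNum presumably pulls the first integer out of an element's text; the implementation below is an assumption, not the original code.

/**
 * Assumed helper (not shown in the post): extract the number from the text of
 * the first matched element, e.g. "阅读 1024" -> 1024.
 */
private static int getNum(Elements elements) {
    String digits = elements.get(0).text().replaceAll("[^0-9]", "");
    return digits.isEmpty() ? 0 : Integer.parseInt(digits);
}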

Crawling a single article

/**
 *
 * @param url
 *          blog article URL
 * @param ipList
 *          proxy pool list
 */
private static void search(String url, List<String> ipList) {
    Thread thread = new Thread() {
        @Override
        public void run() {
            Connection conn = null;
            Document doc = null;
            int retries = 0;
            // Retry up to 10 times, each attempt through a randomly chosen proxy
            while (retries < 10) {
                int random = new Random().nextInt(ipList.size());
                try {
                    conn = Jsoup.connect(url)
                            .proxy(ipList.get(random), ipAndPort.get(ipList.get(random)))
                            .userAgent("Mozilla/5.0 (Windows NT 6.1; W…) Gecko/20100101 Firefox/60.0")
                            .timeout(1000)
                            .method(Connection.Method.GET);
                    doc = conn.get();
                    break;
                } catch (Exception e) {
                    retries++;
                }
            }
            // Give up if every retry failed
            if (doc == null) {
                return;
            }

            // Work with the page's HTML document
            try {
                String s = doc.outerHtml();
                String title = doc.title();
                System.out.println(title);
                //TODO map the document to an entity class; see the article-list crawler above
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    };
    thread.start();

}
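
For reference, here is a hedged sketch of how search might be invoked, building the proxy list from the keys of the ipAndPort map defined below; the article URL is only a placeholder.

// Example invocation (sketch): crawl one article through a random proxy from the pool
List<String> ipList = new ArrayList<>(ipAndPort.keySet());
// Placeholder URL; substitute a real CSDN article address
search("https://blog.csdn.net/.../article/details/...", ipList);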

/**
 * Self-built proxy IP pool
 */
static Map<String, Integer> ipAndPort = new ConcurrentHashMap<>();
static {
    try {
        InputStream is = CrawlCSDN.class.getClassLoader().getResourceAsStream("ip.txt");
        // Read the file line by line via an IO stream
        BufferedReader br = new BufferedReader(new InputStreamReader(is));
        String line;
        while ((line = br.readLine()) != null) {
            String[] split = line.split(SymbolConstants.COLON_SYMBOL);
            if (split.length == 2) {
                ipAndPort.put(split[0], Integer.valueOf(split[1]));
            }
        }
        br.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
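
The pool is loaded from an ip.txt file on the classpath. Assuming SymbolConstants.COLON_SYMBOL is simply ":", each line is expected to be a host:port pair, for example (sample entries, not real proxies):

127.0.0.1:8888
192.168.1.10:3128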