這段時間一直在學習python,也看了不少python的文章,其中看到不少關於python爬蟲的文章。我就在想,明明java也能夠作到的事情,爲何你們都以爲爬蟲是python的專屬功能同樣?
我以爲有必要爲我大java發個聲,趁午休時間搞了個java爬蟲給你們分享下。
引入爬蟲包jsoup:
<dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.10.2</version> </dependency>
/** * * @param type * 爬取關鍵詞 * @param size * 本次爬取文章條數: 不得小於0,不得大於1000 * @return 爬取的文章列表 * @throws IOException */ public static List<CrawlerArticle> searchCSDNList(String type, int size) throws IOException { if (size <= 0) { size = 100; } else if (size > 1000) { size = 1000; } int num =1; //將爬取出來的文章封裝到Artcle中,並放到ArrayList裏面去 List<CrawlerArticle> resultList = new ArrayList<CrawlerArticle>(size); while (true) { if (resultList.size() >= size) { break; } // String url = "https:" + type + "&t=blog&p=" + num; String url = "" + type ; //獲取url地址的http連接Connection Connection conn = Jsoup.connect(url) .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0") .timeout(1000) .method(Connection.Method.GET); //獲取頁面的html文檔 Document doc = conn.get(); Element body = doc.body(); Elements articleList = body.getElementsByClass("clearfix"); for (Element article : articleList) { CrawlerArticle articleEntity = new CrawlerArticle(); //標題 Elements div_h2_a = article.getElementsByClass("title").select("div h2 a"); if (div_h2_a != null && div_h2_a.size() > 0) { Element linkNode = div_h2_a.get(0); //文章url articleEntity.setAddress(linkNode.attr("href")); articleEntity.setTitle(linkNode.text()); } else { continue; } Elements subscribeNums = article.getElementsByClass("is_digg click_heart"); if (subscribeNums != null && subscribeNums.size() > 0) { articleEntity.setSubscribeNum(getNum(subscribeNums)); }else { articleEntity.setSubscribeNum(0); } Elements descNodes = article.getElementsByClass("summary oneline"); if (descNodes != null && descNodes.size() > 0) { Element descNode = descNodes.get(0); articleEntity.setSecondTitle(descNode.text()); } //閱讀量 Elements readNums = article.getElementsByClass("read_num"); if (readNums != null && readNums.size() > 0) { articleEntity.setReadNum(getNum(readNums)); } else { continue; } Elements commonNums = article.getElementsByClass("common_num"); if (commonNums != null && commonNums.size() > 0) { 
articleEntity.setCommentNum(getNum(commonNums)); }else { articleEntity.setCommentNum(0); } Elements datetimes = article.getElementsByClass("datetime"); if (datetimes != null && datetimes.size() > 0) { articleEntity.setPublishTime(datetimes.get(0).text()); } else { articleEntity.setPublishTime(MyDateUtils.formatDate(new Date(), "yyyy-MM-dd")); } articleEntity.setBlogType("CSDN"); System.out.println("文章原地址:" + articleEntity.getAddress()); System.out.println("文章閱讀數+++++++++++:" + articleEntity.getReadNum()); //將閱讀量大於100的url存儲到數據庫 if (articleEntity.getReadNum() > 100) { resultList.add(articleEntity); } if (resultList.size() >= size) { break; } } //遍歷輸出ArrayList裏面的爬取到的文章 System.out.println("文章總數++++++++++++:" + articleList.size()); num++; } return resultList; }
/** * * @param url * 博客url地址 * @param ipList * 代理池列表 */ private static void search(String url, List<String> ipList) { Thread thread = new Thread() { @Override public void run() { Connection conn = null; Document doc = null; int retries = 0; out: while (true && retries < 10) { int random = new Random().nextInt(ipList.size()); try { conn = Jsoup.connect(url) .proxy(ipList.get(random), ipAndPort.get(ipList.get(random))) .userAgent("Mozilla/5.0 (Windows NT 6.1; W…) Gecko/20100101 Firefox/60.0") .timeout(1000) .method(Connection.Method.GET); doc = conn.get(); break out; } catch (Exception e) { retries++; } } //獲取頁面的html文檔 try { String s = doc.outerHtml(); String title = doc.title(); System.out.println(title); //TODO 具體轉換成實體類可參考上面爬取文章列表 } catch (Exception e) { } } }; thread.start(); } /** * 自建的代理ip池 */ static Map<String, Integer> ipAndPort = new ConcurrentHashMap<>(); static { try { InputStream is = CrawlCSDN.class.getClassLoader().getResourceAsStream("ip.txt"); //以IO流的形式讀取 BufferedReader br = new BufferedReader(new InputStreamReader(is)); String line ; while ((line=br.readLine()) != null) { String[] split = line.split(SymbolConstants.COLON_SYMBOL); if (split.length==2){ ipAndPort.put(split[0],Integer.valueOf(split[1])); } } br.close(); } catch (Exception e) { e.printStackTrace(); } }