Jsoup 網頁抓取工具 demo
1. 項目是 Maven 項目
2. 測試的主函數代碼如下:
package www.tydic.com.test; import java.io.IOException; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class Test1 { public static void main(String[] args) { // TODO Auto-generated method stub final String url = "https://www.oschina.net/project/lang/19/java"; try { /** * 使用瀏覽器模式訪問網址,避免403錯誤 */ Document doc = Jsoup.connect(url) .userAgent( "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.64 Safari/537.31") .get(); Elements container = doc.getElementsByClass("ProjectList"); Document containerDoc = Jsoup.parse(container.toString()); Elements module = containerDoc.getElementsByClass("List"); Document moduledoc = Jsoup.parse(module.toString()); /** * 獲取class爲List的ul中的列表,而後進行遍歷 */ Elements recommElement = moduledoc.select(".recomm"); for (Element elem : recommElement) { Document clearfixliDoc = Jsoup.parse(elem.toString()); //獲取標題 Elements h3_text = clearfixliDoc.select("h3"); //獲取簡介 Elements detail_text = clearfixliDoc.select(".detail"); Document hrefDoc = Jsoup.parse(detail_text.toString()); //獲取更多信息的鏈接 Element moreEle = hrefDoc.select("a").first(); String absHref = moreEle.attr("href"); System.out.println("獲取相應的標題:"+h3_text.text()); System.out.println("獲取詳情簡介:"+detail_text.text()); System.out.println("獲取更多:"+moreEle.text()); System.out.println("獲取更多信息的鏈接:"+"https://www.oschina.net"+absHref); System.out.println("================================="); } } catch (IOException e) { e.printStackTrace(); } } }
主要是獲取開源中國的一些相關信息。
對於分頁內容暫時還沒思路,這個只是一個簡單 demo。