1 import java.io.IOException; 2
3 import org.jsoup.Jsoup; 4 import org.jsoup.nodes.Document; 5 import org.jsoup.nodes.Element; 6 import org.jsoup.select.Elements; 7
8
9 public class WyCrawler { 10 public static void main(String[] args) { 11 try { 12 Document document = Jsoup.connect("http://某網頁").timeout(3000).get(); 13 String selector = "li>div[class=titleBar clearfix]>h3>a"; 14 Elements elements = document.select(selector); 15 for(Element element:elements){ 16 // System.out.println(element.text());
17 String url = element.absUrl("href"); 18 Document document2 = Jsoup.connect(url).get(); 19 Elements elements2 = document2.select("#endText"); 20 for(Element element2:elements2){ 21 System.out.println(element2.text()); 22 } 23 } 24 } catch (IOException e) { 25 e.printStackTrace(); 26 } 27 } 28 }
上面是如何爬取超連接裏的內容，下面的例子比較簡單：
1 import java.io.IOException; 2
3 import org.jsoup.Jsoup; 4 import org.jsoup.nodes.Document; 5 import org.jsoup.nodes.Element; 6 import org.jsoup.select.Elements; 7
8
9 public class Test { 10 public static void main(String[] args) { 11 try { 12 Document document = Jsoup.connect("http://www.某網頁.com/").get(); 13 //獲取內容 14 // String selector = "div[class=panel panel20 post-item post-box]>div[class=item-detail]>div[class=item-content]"; 15 // Elements elements = document.select(selector); 16 // for(Element element:elements){ 17 // System.out.println(element.text()); 18 // } 19
20 //獲取標題 21 // String selector2 = "div[class=panel panel20 post-item post-box]>div[class=item-detail]>h2[class=item-title]"; 22 // Elements elements = document.select(selector2); 23 // for(Element element:elements){ 24 // System.out.println(element.text()); 25 // } 26
27 //綜合寫法,標題內容一塊兒獲取
28 String selector = "div[class=panel panel20 post-item post-box]>div[class=item-detail]"; 29 Elements elements = document.select(selector); 30 for(Element element:elements){ 31 Elements titles = element.select("div[class=item-title]"); 32 Elements content = element.select("h2[class=item-content]"); 33 System.out.println(titles.text()+"\n"+content.text()); 34 } 35
36
37
38 } catch (IOException e) { 39 e.printStackTrace(); 40 } 41 } 42 }