先上效果圖:
準備工作:
/**
 * Opens an HTTP connection to the given address and returns the response
 * body as a single string (line breaks are dropped by readLine()).
 *
 * @param address the URL to fetch
 * @return the response text read so far; empty if the connection or read
 *         failed before any content arrived (errors are logged, not thrown)
 */
public static String Connect(String address) {
    // Pre-initialize so a failure mid-read still returns the partial content
    // instead of throwing NullPointerException from the return statement.
    StringBuffer stringBuffer = new StringBuffer();
    HttpURLConnection conn = null;
    try {
        URL url = new URL(address);
        conn = (HttpURLConnection) url.openConnection();
        conn.setConnectTimeout(5000);
        conn.setReadTimeout(5000);
        conn.setDoInput(true);
        conn.connect();
        // try-with-resources closes the reader (and the wrapped stream) on
        // every path; explicit UTF-8 avoids the platform-default charset.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                stringBuffer.append(line);
            }
        }
    } catch (Exception e) {
        // Original behavior: log and fall through rather than propagate.
        e.printStackTrace();
    } finally {
        if (conn != null) { // original code NPE'd here on a malformed URL
            conn.disconnect();
        }
    }
    return stringBuffer.toString();
}
/**
 * Appends the given text to /Users/shibo/tmp/pengfu.txt, creating the
 * parent directory first if it does not exist.
 *
 * @param allText the content to append (encoded as UTF-8)
 */
private static void writeToFile(String allText) {
    System.out.println("正在寫入。。。");
    File targetFile = new File("/Users/shibo/tmp/pengfu.txt");
    File fileDir = targetFile.getParentFile();
    // getParentFile() can be null for a relative single-segment path.
    if (fileDir != null && !fileDir.exists()) {
        fileDir.mkdirs();
    }
    // FileOutputStream creates the file itself, so the separate
    // createNewFile() call was redundant; try-with-resources guarantees
    // the stream is flushed and closed on every path.
    try (BufferedOutputStream bos =
            new BufferedOutputStream(new FileOutputStream(targetFile, true))) {
        // Explicit UTF-8 so output does not depend on the platform charset.
        bos.write(allText.getBytes("UTF-8"));
    } catch (IOException e) {
        e.printStackTrace();
    }
    System.out.println("寫入完畢。。。");
}
引入 jsoup 的 jar 包(用於解析 DOM):
<dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.11.2</version> </dependency>
開始分析網站:
捧腹網段子
首先找到我們需要的內容(作者、標題和正文)
網站
查看其元素,我這裏查看的是標題標籤:
ui
知道其結構之後,就可以獲取我們想要的內容了:
public static void main(String[] args) { StringBuilder allText = new StringBuilder(); for (int i = 1; i <= 50; i++) { System.out.println("正在爬取第" + i + "頁內容。。。"); // 創建鏈接,獲取網頁內容 String html = ConnectionUtil.Connect("https://www.pengfu.com/xiaohua_" + i + ".html"); // 將內容轉換成dom格式,方便操做 Document doc = Jsoup.parse(html); // 獲取網頁內全部標題節點 Elements titles = doc.select("h1.dp-b"); for (Element titleEle : titles) { Element parent = titleEle.parent(); // 標題內容 String title = titleEle.getElementsByTag("a").text(); // 標題對應的做者 String author = parent.select("p.user_name_list > a").text(); // 標題對應的正文 String content = parent.select("div.content-img").text(); // 將內容格式化 allText.append(title) .append("\r\n做者:").append(author) .append("\r\n").append(content) .append("\r\n").append("\r\n"); } allText.append("-------------第").append(i).append("頁-------------").append("\r\n"); System.out.println("第" + i + "頁內容爬取完畢。。。"); } //將內容寫入磁盤 Test.writeToFile(allText.toString()); }
參考文章:Python 爬蟲入門(一)——爬取糗百