請尊重原創,轉載請註明出處:http://my.oschina.net/u/1789904/blog/386576
核心:htmlparser框架
HtmlParser爬取搜狗百科名人數據:
/** * 從百科搜索中獲取百科地址 * @param url * @param charset * @param timeOut * @return * @throws IOException */ private Map<String, String> parserBaike(String url, String charset, int timeOut) throws IOException { WebHttpClient util=new WebHttpClient(); String content=util.getWebContentByGet(url,charset,timeOut); if(content == null){ return null; } Map<String, String> map = new HashMap<>(); Map<String, String> subMap = new HashMap<>(); try { //開始解析 Node node = null; /********************* 解析名字 **********************/ // 過濾出class爲term的<span>元素 Parser parser = Parser.createParser(content, charset); AndFilter filter = new AndFilter(new TagNameFilter("h1"), new HasAttributeFilter("id","title")); NodeList nodeList = parser.parse(filter); for (int i = 0; i < nodeList.size(); i++) { node = nodeList.elementAt(i); map.put("name", node.toPlainTextString().trim()); } /********************* 解析簡介 **********************/ // 過濾出class爲start-time的<span>元素 Parser parser2 = Parser.createParser(content, charset); AndFilter filter2 = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class","abstract")); NodeList nodeList2 = parser2.parse(filter2); for (int i = 0; i < nodeList2.size(); i++) { node = nodeList2.elementAt(i); String name = node.toPlainTextString().trim(); System.out.println("name:" + name); map.put("intro", name); } // 過濾出id爲J_SingleEndTimeLabel的<span>元素 Parser parser3 = Parser.createParser(content, charset); AndFilter filter3 = new AndFilter(new TagNameFilter("img"),new HasAttributeFilter("class","")); NodeList nodeList3 = parser3.parse(filter3); for (int i = 0; i < nodeList3.size(); i++) { node = nodeList3.elementAt(i); String imgUrl = findHttp(node.toHtml()); System.out.println("imgUrl:" + imgUrl); map.put("logo", imgUrl); } /********************* 解析表格數據 **********************/ // 過濾出class爲box post的<div>元素 Parser parser4 = Parser.createParser(content, charset); //AndFilter andFilter = new AndFilter(new TagNameFilter("table"),new 
HasAttributeFilter("class","abstract_tbl")); AndFilter andFilter = new AndFilter(new TagNameFilter("table"),new HasAttributeFilter("class","abstract_list")); NodeList tableList = parser4.extractAllNodesThatMatch(andFilter); System.out.println("tableList.size:" + tableList.size()); //tableList.size() 有兩個tableList for (int i=0; i<tableList.size(); i++) { TableTag table = (TableTag) tableList.elementAt(i); //取得表中的行集 TableRow[] rows = table.getRows(); //遍歷每行 for (int r=0; r<rows.length; r++) { TableRow tr = rows[r]; //行中的列和標題 TableColumn[] td = tr.getColumns(); TableHeader[] header =tr.getHeaders(); System.out.println("td.length:" + td.length); for (int c=0; c<td.length; c++) { String head = header[c].toPlainTextString(); String col = td[c].toPlainTextString().trim(); if (head.equals("出生地")) { System.out.println("======出生地:" + col); map.put("home", col); } subMap.put(head, col); System.out.println(head + ":" + col); } } } } catch (ParserException e) { e.printStackTrace(); } map.put("list",subMap.toString()); return map; }
// WebHttpClient.java
package org.jun.utils;

import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;

/**
 * Small helper around {@link HttpURLConnection} that fetches a response body as text
 * using GET or form-encoded POST.
 *
 * <p>All request methods return {@code null} when the URL is null/empty, when the
 * response status is not 200, or when reading the status fails.
 *
 * @author xiejunbo
 */
public class WebHttpClient {

    /** Charset used by the short overloads; it maps every byte to one char, so the
     *  caller can re-decode the result with the real charset afterwards. */
    private static final String DEFAULT_CHARSET = "iso-8859-1";

    /** Timeout in milliseconds used by the short overloads. */
    private static final int DEFAULT_TIMEOUT = 5000;

    /** Charset used to percent-encode POST form fields. */
    private static final String FORM_CHARSET = "utf-8";

    public WebHttpClient() {
    }

    /**
     * Performs a GET request and returns the response body.
     *
     * @param urlString target URL; {@code http://} is prepended when no http/https scheme is present
     * @param charset   charset used to decode the response body
     * @param timeout   connect and read timeout in milliseconds
     * @return the body with every line terminated by {@code "\r\n"}, or {@code null}
     *         for a null/empty URL, a non-200 status, or a failure while reading the status
     * @throws IOException if the connection cannot be opened or the body cannot be read
     */
    public String getWebContentByGet(String urlString, final String charset, int timeout)
            throws IOException {
        if (urlString == null || urlString.length() == 0) {
            return null;
        }
        URL url = new URL(withHttpScheme(urlString));
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try {
            conn.setRequestMethod("GET");
            // Send a browser-like User-Agent so the request is less likely to be blocked.
            conn.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");
            // Only ask for text/html (other types such as images, pdf or */* are possible).
            conn.setRequestProperty("Accept", "text/html");
            conn.setConnectTimeout(timeout);
            // Without a read timeout a stalled server would block the caller forever.
            conn.setReadTimeout(timeout);

            if (!isOk(conn)) {
                return null;
            }
            return readBody(conn, charset);
        } finally {
            // Always release the connection, including on early returns and exceptions.
            conn.disconnect();
        }
    }

    /**
     * Performs a GET request with the default charset (iso-8859-1) and a 5000 ms timeout.
     *
     * @param urlString target URL
     * @return see {@link #getWebContentByGet(String, String, int)}
     * @throws IOException see {@link #getWebContentByGet(String, String, int)}
     */
    public String getWebContentByGet(String urlString) throws IOException {
        return getWebContentByGet(urlString, DEFAULT_CHARSET, DEFAULT_TIMEOUT);
    }

    /**
     * Performs an {@code application/x-www-form-urlencoded} POST and returns the response body.
     *
     * @param urlString target URL; {@code http://} is prepended when no http/https scheme is present
     * @param data      raw form data such as {@code "a=1&b=2"}; each name and value is
     *                  percent-encoded as UTF-8, the {@code &} and {@code =} separators are kept;
     *                  {@code null} sends an empty body
     * @param charset   charset used to decode the response body
     * @param timeout   connect and read timeout in milliseconds
     * @return the body with every line terminated by {@code "\r\n"}, or {@code null}
     *         for a null/empty URL, a non-200 status, or a failure while reading the status
     * @throws IOException if the connection cannot be opened, the form data cannot be
     *                     sent, or the body cannot be read
     */
    public String getWebContentByPost(String urlString, String data, final String charset, int timeout)
            throws IOException {
        if (urlString == null || urlString.length() == 0) {
            return null;
        }
        URL url = new URL(withHttpScheme(urlString));
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        try {
            // POST parameters travel in the request body, so output must be enabled.
            connection.setDoOutput(true);
            connection.setDoInput(true);
            connection.setRequestMethod("POST");
            // POST requests must not be served from a cache.
            connection.setUseCaches(false);
            connection.setInstanceFollowRedirects(true);
            connection.setRequestProperty("Content-Type","application/x-www-form-urlencoded");
            // Send a browser-like User-Agent so the request is less likely to be blocked.
            connection.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 8.0; Windows vista)");
            connection.setRequestProperty("Accept", "text/xml");
            connection.setConnectTimeout(timeout);
            // Without a read timeout a stalled server would block the caller forever.
            connection.setReadTimeout(timeout);
            connection.connect();

            try (DataOutputStream out = new DataOutputStream(connection.getOutputStream())) {
                // The encoded form is pure ASCII, so writeBytes loses nothing.
                out.writeBytes(encodeFormData(data));
                out.flush();
            }

            // The status may only be read after the request body has been sent.
            if (!isOk(connection)) {
                return null;
            }
            return readBody(connection, charset);
        } finally {
            // Always release the connection, including on early returns and exceptions.
            connection.disconnect();
        }
    }

    /**
     * Performs a POST with the default charset (iso-8859-1) and a 5000 ms timeout.
     *
     * @param urlString target URL
     * @param data      raw form data such as {@code "a=1&b=2"}
     * @return see {@link #getWebContentByPost(String, String, String, int)}
     * @throws IOException see {@link #getWebContentByPost(String, String, String, int)}
     */
    public String getWebContentByPost(String urlString, String data) throws IOException {
        return getWebContentByPost(urlString, data, DEFAULT_CHARSET, DEFAULT_TIMEOUT);
    }

    /** Returns the URL unchanged when it already has an http/https scheme, else prefixes {@code http://}. */
    private static String withHttpScheme(String urlString) {
        boolean hasScheme = urlString.startsWith("http://") || urlString.startsWith("https://");
        return hasScheme ? urlString : "http://" + urlString;
    }

    /** Reports whether the response status is 200; a failure while reading it is logged and treated as "not ok". */
    private static boolean isOk(HttpURLConnection conn) {
        try {
            return conn.getResponseCode() == HttpURLConnection.HTTP_OK;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    /** Reads the whole response body line by line, terminating every line with {@code "\r\n"}. */
    private static String readBody(HttpURLConnection conn, String charset) throws IOException {
        StringBuilder sb = new StringBuilder();
        try (InputStream input = conn.getInputStream();
             BufferedReader reader = new BufferedReader(new InputStreamReader(input, charset))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append("\r\n");
            }
        }
        return sb.toString();
    }

    /**
     * Percent-encodes the names and values of {@code name=value&name=value} form data
     * while leaving the {@code &} and {@code =} separators intact. Encoding the string
     * as a whole would escape the separators and the server would see one nameless field.
     */
    private static String encodeFormData(String data) throws IOException {
        if (data == null || data.length() == 0) {
            return "";
        }
        StringBuilder encoded = new StringBuilder();
        for (String pair : data.split("&")) {
            if (encoded.length() > 0) {
                encoded.append('&');
            }
            int eq = pair.indexOf('=');
            if (eq < 0) {
                encoded.append(URLEncoder.encode(pair, FORM_CHARSET));
            } else {
                encoded.append(URLEncoder.encode(pair.substring(0, eq), FORM_CHARSET))
                       .append('=')
                       .append(URLEncoder.encode(pair.substring(eq + 1), FORM_CHARSET));
            }
        }
        return encoded.toString();
    }

    /**
     * Demo: posts a login form to a local server and prints the response.
     *
     * <p>The short overloads decode the body as iso-8859-1, so the text is turned back
     * into bytes and decoded again with the server's real charset (UTF-8 here; a GET
     * against a gb2312 site would use "gb2312" the same way).
     */
    public static void main(String[] args) throws IOException {
        WebHttpClient client = new WebHttpClient();
        String s = client.getWebContentByPost("http://localhost:8080/Lottery/login.portal","action=login&loginname=13761083826&password=111111");
        if (s == null) {
            // Non-200 status or failed request: nothing to decode.
            System.err.println("No response body received");
            return;
        }
        s = new String(s.getBytes("iso-8859-1"), "UTF-8");
        System.out.println(s);
    }
}