HtmlParser爬取網頁數據

時間 2019-12-13

原文原文鏈接

請尊重原創,轉載請註明出處:http://my.oschina.net/u/1789904/blog/386576

核心:htmlparser框架(Java)

HtmlParser爬取搜狗百科名人數據:

/**
	 * 從百科搜索中獲取百科地址
	 * @param url
	 * @param charset
	 * @param timeOut
	 * @return
	 * @throws IOException
	 */
	private Map<String, String> parserBaike(String url, String charset, int timeOut) throws IOException {
		WebHttpClient util=new WebHttpClient();
		String content=util.getWebContentByGet(url,charset,timeOut);
		if(content == null){
			return null;
		}
		Map<String, String> map = new HashMap<>();
		Map<String, String> subMap = new HashMap<>();
		try {
			//開始解析
			Node node = null;
			/*********************　解析名字　**********************/
			// 過濾出class爲term的<span>元素  
			Parser parser = Parser.createParser(content, charset);
			AndFilter filter = new AndFilter(new TagNameFilter("h1"), new HasAttributeFilter("id","title"));
			NodeList nodeList = parser.parse(filter);
			
			for (int i = 0; i < nodeList.size(); i++) {
				node = nodeList.elementAt(i);
				map.put("name", node.toPlainTextString().trim());
			}
			
			/*********************　解析簡介　**********************/
			// 過濾出class爲start-time的<span>元素
			Parser parser2 = Parser.createParser(content, charset);
			AndFilter filter2 = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class","abstract"));
			NodeList nodeList2 = parser2.parse(filter2);
			
			for (int i = 0; i < nodeList2.size(); i++) {
				node = nodeList2.elementAt(i);
				String name = node.toPlainTextString().trim();
				System.out.println("name:" + name);
				map.put("intro", name);
			}
			
			// 過濾出id爲J_SingleEndTimeLabel的<span>元素
			Parser parser3 = Parser.createParser(content, charset);
			
			AndFilter filter3 = new AndFilter(new TagNameFilter("img"),new HasAttributeFilter("class",""));
			NodeList nodeList3 = parser3.parse(filter3);
			
			for (int i = 0; i < nodeList3.size(); i++) {
				node = nodeList3.elementAt(i);
				String imgUrl = findHttp(node.toHtml());
				System.out.println("imgUrl:" + imgUrl);
				map.put("logo", imgUrl);
			}
			/*********************　解析表格數據　**********************/
			// 過濾出class爲box post的<div>元素
			Parser parser4 = Parser.createParser(content, charset);
			//AndFilter andFilter = new AndFilter(new TagNameFilter("table"),new HasAttributeFilter("class","abstract_tbl"));
			AndFilter andFilter = new AndFilter(new TagNameFilter("table"),new HasAttributeFilter("class","abstract_list"));
			NodeList tableList = parser4.extractAllNodesThatMatch(andFilter);
	        System.out.println("tableList.size:" + tableList.size());
	        
	        //tableList.size()  有兩個tableList
	       for (int i=0; i<tableList.size(); i++) {  
	              TableTag table = (TableTag) tableList.elementAt(i);  
	              //取得表中的行集  
	              TableRow[] rows = table.getRows();  
	              //遍歷每行             
	              for (int r=0; r<rows.length; r++) {  
	                  TableRow tr = rows[r]; 
	                  //行中的列和標題
	                  TableColumn[] td = tr.getColumns();  
	                  TableHeader[] header =tr.getHeaders();
	                  System.out.println("td.length:" + td.length);
	                  
	                  for (int c=0; c<td.length; c++) {  
	                	 String head = header[c].toPlainTextString();
	                     String col = td[c].toPlainTextString().trim();
	                    
	                     if (head.equals("出生地")) {
	                    	 System.out.println("======出生地：" + col);
	                    	 map.put("home", col);
	                     }
	                     subMap.put(head, col);
	                     System.out.println(head + ":" + col);
	                  }  
	              }  
	       } 
			
		} catch (ParserException e) {
			e.printStackTrace();
		}
		map.put("list",subMap.toString());
		return map;
	}

WebHttpClient.java

package org.jun.utils;

import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
/**
 * Small HTTP helper built on {@link HttpURLConnection} that fetches a response body as text
 * via GET or form-encoded POST.
 *
 * <p>All request methods return {@code null} (rather than throwing) when the URL is
 * {@code null}/empty, when the response status is not 200, or when obtaining the status fails.
 *
 * @author xiejunbo
 */
public class WebHttpClient {

	/** Charset used by the overloads that do not take a charset argument. */
	private static final String DEFAULT_CHARSET = "iso-8859-1";

	/** Timeout in milliseconds used by the overloads that do not take a timeout argument. */
	private static final int DEFAULT_TIMEOUT_MILLIS = 5000;

	public WebHttpClient() {
	}

	/**
	 * Performs a GET request and returns the response body.
	 *
	 * @param urlString target URL; {@code http://} is prepended when no http/https scheme is present
	 * @param charset   charset name used to decode the response body
	 * @param timeout   connect and read timeout in milliseconds
	 * @return the body with every line terminated by {@code "\r\n"}, or {@code null} when
	 *         {@code urlString} is null/empty or the response status is not 200
	 * @throws IOException if the URL is malformed, the connection cannot be opened or the body
	 *                     cannot be read
	 */
	public String getWebContentByGet(String urlString, final String charset,
			int timeout) throws IOException {
		if (urlString == null || urlString.length() == 0) {
			return null;
		}
		HttpURLConnection conn = (HttpURLConnection) new URL(withScheme(urlString)).openConnection();
		try {
			conn.setRequestMethod("GET");
			// Browser-like User-Agent so that the request is less likely to be blocked.
			conn.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");
			conn.setRequestProperty("Accept", "text/html");
			conn.setConnectTimeout(timeout);
			// Without a read timeout a server that accepts the connection but never answers
			// would block the caller indefinitely.
			conn.setReadTimeout(timeout);
			return readOkBody(conn, charset);
		} finally {
			// Always release the connection, including on non-200 responses and exceptions.
			conn.disconnect();
		}
	}

	/**
	 * Performs a GET request decoding the body as iso-8859-1 with a 5000 ms timeout.
	 *
	 * @see #getWebContentByGet(String, String, int)
	 */
	public String getWebContentByGet(String urlString) throws IOException {
		return getWebContentByGet(urlString, DEFAULT_CHARSET, DEFAULT_TIMEOUT_MILLIS);
	}

	/**
	 * Performs a POST request with an {@code application/x-www-form-urlencoded} body and
	 * returns the response body.
	 *
	 * @param urlString target URL; {@code http://} is prepended when no http/https scheme is present
	 * @param data      raw (not yet percent-encoded) form data such as {@code "a=1&b=two words"};
	 *                  names and values are encoded individually, {@code null} sends an empty body
	 * @param charset   charset name used to decode the response body
	 * @param timeout   connect and read timeout in milliseconds
	 * @return the body with every line terminated by {@code "\r\n"}, or {@code null} when
	 *         {@code urlString} is null/empty or the response status is not 200
	 * @throws IOException if the URL is malformed, the connection fails, or the body cannot be
	 *                     written or read
	 */
	public String getWebContentByPost(String urlString,String data, final String charset,
			int timeout)throws IOException{
		if (urlString == null || urlString.length() == 0) {
			return null;
		}
		HttpURLConnection connection = (HttpURLConnection) new URL(withScheme(urlString)).openConnection();
		try {
			// The parameters travel in the request body, so output must be enabled.
			connection.setDoOutput(true);
			connection.setDoInput(true);
			connection.setRequestMethod("POST");
			connection.setUseCaches(false);
			connection.setInstanceFollowRedirects(true);
			connection.setRequestProperty("Content-Type","application/x-www-form-urlencoded");
			// Browser-like User-Agent so that the request is less likely to be blocked.
			connection.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 8.0; Windows vista)");
			// NOTE(review): the GET variant sends "Accept: text/html" while this one sends
			// "text/xml"; kept unchanged - confirm which one is intended.
			connection.setRequestProperty("Accept", "text/xml");
			connection.setConnectTimeout(timeout);
			connection.setReadTimeout(timeout);
			connection.connect();
			try (DataOutputStream out = new DataOutputStream(connection.getOutputStream())) {
				// The encoded form is pure ASCII, so writeBytes (low byte per char) is lossless.
				out.writeBytes(encodeFormData(data));
				out.flush();
			}
			// The status must be read only after the body has been sent.
			return readOkBody(connection, charset);
		} finally {
			connection.disconnect();
		}
	}

	/**
	 * Performs a POST request decoding the body as iso-8859-1 with a 5000 ms timeout.
	 *
	 * @see #getWebContentByPost(String, String, String, int)
	 */
	public String getWebContentByPost(String urlString,String data) throws IOException {
		return getWebContentByPost(urlString, data, DEFAULT_CHARSET, DEFAULT_TIMEOUT_MILLIS);
	}

	/**
	 * Percent-encodes form data pair by pair, leaving the {@code &} and first {@code =}
	 * separators intact.
	 *
	 * <p>Encoding the whole string at once would turn the separators into {@code %26} and
	 * {@code %3D}, and the server would see a single parameter name without a value.
	 *
	 * @param data raw form data, e.g. {@code "a=1&b=two words"}; may be {@code null}
	 * @return the UTF-8 percent-encoded form body, empty for {@code null} or empty input
	 */
	static String encodeFormData(String data) {
		if (data == null || data.isEmpty()) {
			return "";
		}
		StringBuilder encoded = new StringBuilder(data.length() + 16);
		try {
			for (String pair : data.split("&")) {
				if (pair.isEmpty()) {
					continue;
				}
				if (encoded.length() > 0) {
					encoded.append('&');
				}
				int eq = pair.indexOf('=');
				if (eq < 0) {
					encoded.append(URLEncoder.encode(pair, "utf-8"));
				} else {
					encoded.append(URLEncoder.encode(pair.substring(0, eq), "utf-8"))
							.append('=')
							.append(URLEncoder.encode(pair.substring(eq + 1), "utf-8"));
				}
			}
		} catch (IOException e) {
			// URLEncoder only throws UnsupportedEncodingException, and UTF-8 is always available.
			throw new IllegalStateException("UTF-8 encoding is not supported", e);
		}
		return encoded.toString();
	}

	/** Prepends {@code http://} unless the URL already starts with an http/https scheme. */
	private static String withScheme(String urlString) {
		boolean hasScheme = urlString.startsWith("http://") || urlString.startsWith("https://");
		return hasScheme ? urlString : "http://" + urlString;
	}

	/**
	 * Reads the response body of a 200 response; returns {@code null} for any other status or
	 * when the status cannot be obtained. The caller remains responsible for disconnecting.
	 */
	private static String readOkBody(HttpURLConnection conn, String charset) throws IOException {
		try {
			if (conn.getResponseCode() != HttpURLConnection.HTTP_OK) {
				return null;
			}
		} catch (IOException e) {
			// Existing contract: a failure while obtaining the status is reported as null.
			e.printStackTrace();
			return null;
		}
		StringBuilder sb = new StringBuilder();
		try (InputStream input = conn.getInputStream();
				BufferedReader reader = new BufferedReader(new InputStreamReader(input, charset))) {
			String line;
			while ((line = reader.readLine()) != null) {
				sb.append(line).append("\r\n");
			}
		}
		return sb.toString();
	}

	/**
	 * Manual smoke test: posts a login form to a local server and prints the response.
	 *
	 * <p>NOTE(review): the endpoint and credentials below are hard-coded sample values.
	 */
	public static void main(String[] args) throws IOException {
		WebHttpClient client=new WebHttpClient();
		String s = client.getWebContentByPost("http://localhost:8080/Lottery/login.portal","action=login&loginname=13761083826&password=111111");
		if (s == null) {
			// The request methods signal failure with null; avoid a NullPointerException here.
			System.err.println("Request failed or returned a non-200 status");
			return;
		}
		// The body was decoded as iso-8859-1 (lossless for bytes); re-decode it as UTF-8.
		s = new String(s.getBytes("iso-8859-1"), "UTF-8");
		System.out.println(s);
	}

}