httpclient+jsoup實現網頁信息抓取

需求分析:抓取 http://tools.2345.com/rili.htm 中的萬年曆(陽曆、陰曆等等)。

1.首先爲抓取的內容建立一個類。實現封裝。

package com.wan.domain;

public class Almanac {
	private String solar;        /* 陽曆 e.g.2016年 4月11日 星期一 */
	private String lunar;        /* 陰曆 e.g. 猴年 三月初五*/
	private String chineseAra;    /* 天干地支紀年法 e.g.丙申年 壬辰月 癸亥日*/
	private String should;         /* 宜e.g. 求子 祈福 開光 祭祀 安牀*/
	private String avoid;         /* 忌 e.g. 玉堂(黃道)危日,忌出行*/
	public String getSolar() {
		return solar;
	}
	public void setSolar(String solar) {
		this.solar = solar;
	}
	public String getLunar() {
		return lunar;
	}
	public void setLunar(String lunar) {
		this.lunar = lunar;
	}
	public String getChineseAra() {
		return chineseAra;
	}
	public void setChineseAra(String chineseAra) {
		this.chineseAra = chineseAra;
	}
	public String getShould() {
		return should;
	}
	public void setShould(String should) {
		this.should = should;
	}
	public String getAvoid() {
		return avoid;
	}
	public void setAvoid(String avoid) {
		this.avoid = avoid;
	}
	 public Almanac(String solar, String lunar, String chineseAra, String should,String avoid) {
		 this.solar = solar;
		 this.lunar = lunar;
		 this.chineseAra = chineseAra;
		 this.should = should;
		 this.avoid = avoid;
     }
	
}

2.編寫邏輯,實現抓取(需要導入相應的jar包:commons-httpclient-3.0.1.jar、commons-logging.jar、httpcore-4.4.jar、jsoup-1.7.3.jar、org.apache.httpcomponents.httpclient_4.5.3.jar)

package com.wan.controller;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Locale;

import org.apache.http.HttpEntity;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.wan.domain.Almanac;

public class AlmanacUtil {

    /** Page the almanac data is scraped from. */
    private static final String ALMANAC_URL = "http://tools.2345.com/rili.htm";

    /** Pattern used for the solar date, e.g. "2016年04月11日 星期一". */
    private static final String SOLAR_DATE_PATTERN = "yyyy年MM月dd日 EEEE";

    /**
     * Static utility holder; not meant to be instantiated.
     */
    private AlmanacUtil() {
    }

    /**
     * Downloads the almanac page and extracts today's almanac entry from it.
     *
     * @return the populated almanac; never {@code null}
     * @throws IllegalStateException if the page cannot be downloaded, or if it
     *         does not contain the elements this parser expects
     */
    public static Almanac getAlmanac() {
        String html = pickData(ALMANAC_URL);
        return analyzeHTMLByString(html);
    }

    /**
     * Fetches the body of {@code url} with an HTTP GET.
     *
     * @param url address of the page to download
     * @return the response body as text; never {@code null}
     * @throws IllegalStateException if the request fails, the server answers
     *         with a non-2xx status, or the response carries no body
     */
    private static String pickData(String url) {
        CloseableHttpClient httpclient = HttpClients.createDefault();
        try {
            CloseableHttpResponse response = httpclient.execute(new HttpGet(url));
            try {
                int status = response.getStatusLine().getStatusCode();
                if (status < 200 || status >= 300) {
                    throw new IllegalStateException(
                            "Unexpected HTTP status " + status + " for " + url);
                }
                HttpEntity entity = response.getEntity();
                if (entity == null) {
                    throw new IllegalStateException("Empty response body for " + url);
                }
                // NOTE(review): no charset is passed here, so decoding depends on
                // what the response declares - confirm the server sends one.
                return EntityUtils.toString(entity);
            } finally {
                response.close();
            }
        } catch (ClientProtocolException e) {
            throw new IllegalStateException("HTTP protocol error while fetching " + url, e);
        } catch (ParseException e) {
            throw new IllegalStateException("Malformed response while fetching " + url, e);
        } catch (IOException e) {
            throw new IllegalStateException("I/O error while fetching " + url, e);
        } finally {
            // Closing is best-effort: a failure here must not mask the result or
            // the exception produced above, so it is only reported.
            try {
                httpclient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Parses the downloaded page with jsoup and builds the almanac entry.
     *
     * @param html markup of the almanac page
     * @return the almanac assembled from the page plus the system date
     * @throws IllegalStateException if an expected element is missing or its
     *         content is shorter than the fixed offsets used below
     */
    private static Almanac analyzeHTMLByString(String html) {
        Document document = Jsoup.parse(html);

        // Solar date: taken from the system clock, not from the page.
        String solarDate = getSolarDate();

        // Lunar date: characters 1-2 of the first child's markup followed by
        // everything from offset 11 of the element's own markup.
        // Presumably these offsets select the zodiac year and the month/day text;
        // they are tied to the page layout - verify if the page changes.
        Element lunarElement = requireElement(document, "info_nong");
        if (lunarElement.children().size() == 0) {
            throw new IllegalStateException("Element #info_nong has no child elements");
        }
        String yearMarkup = lunarElement.child(0).html();
        String lunarMarkup = lunarElement.html();
        if (yearMarkup.length() < 3 || lunarMarkup.length() < 11) {
            throw new IllegalStateException(
                    "Element #info_nong is shorter than expected: " + lunarMarkup);
        }
        String lunarDate = yearMarkup.substring(1, 3) + lunarMarkup.substring(11);

        // Stem-branch date: the element's plain text.
        String chineseAra = requireElement(document, "info_chang").text();

        // Recommended ("yi") and discouraged ("ji") activities.
        String should = getSuggestion(document, "yi");
        String avoid = getSuggestion(document, "ji");

        return new Almanac(solarDate, lunarDate, chineseAra, should, avoid);
    }

    /**
     * Collects the text of every {@code <a>} inside the element with the given
     * id, each followed by a single space.
     *
     * @param doc parsed page
     * @param id  id of the container element ("yi" or "ji" in this class)
     * @return the link texts joined by spaces, with a trailing space; empty if
     *         the container holds no links
     * @throws IllegalStateException if no element with that id exists
     */
    private static String getSuggestion(Document doc, String id) {
        Elements links = requireElement(doc, id).getElementsByTag("a");
        StringBuilder sb = new StringBuilder();
        for (Element link : links) {
            sb.append(link.text()).append(' ');
        }
        return sb.toString();
    }

    /**
     * Looks up an element by id and fails with a descriptive message when it is
     * absent, instead of letting a later dereference throw a bare
     * NullPointerException.
     *
     * @param doc parsed page
     * @param id  id to look up
     * @return the matching element; never {@code null}
     * @throws IllegalStateException if the page has no element with that id
     */
    private static Element requireElement(Document doc, String id) {
        Element element = doc.getElementById(id);
        if (element == null) {
            throw new IllegalStateException(
                    "Element #" + id + " not found; the page layout may have changed");
        }
        return element;
    }

    /**
     * Formats the current system date as {@code yyyy年MM月dd日 EEEE}.
     * The locale is pinned so the weekday is rendered in Chinese regardless of
     * the JVM's default locale.
     *
     * @return the formatted system date
     */
    private static String getSolarDate() {
        Date now = Calendar.getInstance().getTime();
        // SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
        SimpleDateFormat formatter = new SimpleDateFormat(SOLAR_DATE_PATTERN, Locale.CHINA);
        return formatter.format(now);
    }
}

注:公曆時間沒有通過網頁抓取實現,而是直接獲取系統時間。

3.編寫測試

package com.wan.test;

import com.wan.controller.AlmanacUtil;
import com.wan.domain.Almanac;

/**
 * Manual smoke test: fetches the almanac through {@code AlmanacUtil} and prints
 * each field to standard output on its own line.
 */
public class Test {

	/**
	 * Prints one almanac field, the label immediately followed by its value.
	 *
	 * @param label text printed before the value
	 * @param value field content
	 */
	private static void show(String label, String value) {
		System.out.println(label + value);
	}

	/**
	 * Entry point; command-line arguments are not used.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		Almanac today = AlmanacUtil.getAlmanac();
		show("公曆時間", today.getSolar());
		show("農曆時間", today.getLunar());
		show("天干地支", today.getChineseAra());
		show("宜", today.getShould());
		show("忌", today.getAvoid());
	}

}

 

最後在控制檯輸出:

 

相關文章
相關標籤/搜索