HTMLParser的使用

時間 2019-12-06
原文原文鏈接
下面經過一個簡單的htmlparser的使用舉例，來學習htmlparser的使用。代碼以下：
package com.amigo.htmlparser;

import java.io.*;
import java.net.URL;
import java.net.URLConnection;

import org.htmlparser.filters.*;
import org.htmlparser.*;
import org.htmlparser.nodes.*;
import org.htmlparser.tags.*;
import org.htmlparser.util.*;
import org.htmlparser.visitors.*;

/** *//**
 * 測試HTMLParser的使用.
 * @author <a href="mailto:xiexingxing1121@126.com">AmigoXie</a>
 * Creation date: 2008-1-18 - 上午11:44:22
 */
public class HTMLParserTest {
    /** *//**
     * 入口方法.
     * @param args
     * @throws Exception
     */
    public static void main(String args[]) throws Exception {
        String path = "http://www.blogjava.net/amigoxie";
        URL url = new URL(path);
        URLConnection conn = url.openConnection();
        conn.setDoOutput(true); 
        
        InputStream inputStream = conn.getInputStream();
        InputStreamReader isr = new InputStreamReader(inputStream, "utf8");
        StringBuffer sb = new StringBuffer();
        BufferedReader in = new BufferedReader(isr);
        String inputLine;
        
        while ((inputLine = in.readLine()) != null) {
            sb.append(inputLine);
            sb.append("\n");
        }
        
        String result = sb.toString();

        readByHtml(result);
        readTextAndLinkAndTitle(result);
    }
    
    /** *//**
     * 按頁面方式處理.解析標準的html頁面
     * @param content 網頁的內容
     * @throws Exception
     */
    public static void readByHtml(String content) throws Exception {
        Parser myParser;
        myParser = Parser.createParser(content, "utf8");
        HtmlPage visitor = new HtmlPage(myParser);
        myParser.visitAllNodesWith(visitor);

        String textInPage = visitor.getTitle();
        System.out.println(textInPage);
        NodeList nodelist;
        nodelist = visitor.getBody();
        
        System.out.print(nodelist.asString().trim());
    }

    /** *//**
     * 分別讀純文本和連接.
     * @param result 網頁的內容
     * @throws Exception
     */
    public static void readTextAndLinkAndTitle(String result) throws Exception {
        Parser parser;
        NodeList nodelist;
        parser = Parser.createParser(result, "utf8");
        NodeFilter textFilter = new NodeClassFilter(TextNode.class);
        NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
        NodeFilter titleFilter = new NodeClassFilter(TitleTag.class);
        OrFilter lastFilter = new OrFilter();
        lastFilter.setPredicates(new NodeFilter[] { textFilter, linkFilter, titleFilter });
        nodelist = parser.parse(lastFilter);
        Node[] nodes = nodelist.toNodeArray();
        String line = "";
        
        for (int i = 0; i < nodes.length; i++) {
            Node node = nodes[i];
            if (node instanceof TextNode) {
                TextNode textnode = (TextNode) node;
                line = textnode.getText();
            } else if (node instanceof LinkTag) {
                LinkTag link = (LinkTag) node;
                line = link.getLink();
            } else if (node instanceof TitleTag) {
                TitleTag titlenode = (TitleTag) node;
                line = titlenode.getTitle();
            }
            
            if (isTrimEmpty(line))
                continue;
            System.out.println(line);
        }
    }
    
    /** *//**
     * 去掉左右空格後字符串是否爲空
     */
    public static boolean isTrimEmpty(String astr) {
        if ((null == astr) || (astr.length() == 0)) {
            return true;
        }
        if (isBlank(astr.trim())) {
            return true;
        }
        return false;
    }

    /** *//**
     * 字符串是否爲空:null或者長度爲0.
     */
    public static boolean isBlank(String astr) {
        if ((null == astr) || (astr.length() == 0)) {
            return true;
        } else {
            return false;
        }
    }
}
相關標籤/搜索
每日一句
每一个你不满意的现在，都有一个你没有努力的曾经。