下面經過一個簡單的htmlparser的使用舉例,來學習htmlparser的使用。代碼以下: package com.amigo.htmlparser; import java.io.*; import java.net.URL; import java.net.URLConnection; import org.htmlparser.filters.*; import org.htmlparser.*; import org.htmlparser.nodes.*; import org.htmlparser.tags.*; import org.htmlparser.util.*; import org.htmlparser.visitors.*; /** *//** * 測試HTMLParser的使用. * @author <a href="mailto:xiexingxing1121@126.com">AmigoXie</a> * Creation date: 2008-1-18 - 上午11:44:22 */ public class HTMLParserTest { /** *//** * 入口方法. * @param args * @throws Exception */ public static void main(String args[]) throws Exception { String path = "http://www.blogjava.net/amigoxie"; URL url = new URL(path); URLConnection conn = url.openConnection(); conn.setDoOutput(true); InputStream inputStream = conn.getInputStream(); InputStreamReader isr = new InputStreamReader(inputStream, "utf8"); StringBuffer sb = new StringBuffer(); BufferedReader in = new BufferedReader(isr); String inputLine; while ((inputLine = in.readLine()) != null) { sb.append(inputLine); sb.append("\n"); } String result = sb.toString(); readByHtml(result); readTextAndLinkAndTitle(result); } /** *//** * 按頁面方式處理.解析標準的html頁面 * @param content 網頁的內容 * @throws Exception */ public static void readByHtml(String content) throws Exception { Parser myParser; myParser = Parser.createParser(content, "utf8"); HtmlPage visitor = new HtmlPage(myParser); myParser.visitAllNodesWith(visitor); String textInPage = visitor.getTitle(); System.out.println(textInPage); NodeList nodelist; nodelist = visitor.getBody(); System.out.print(nodelist.asString().trim()); } /** *//** * 分別讀純文本和連接. * @param result 網頁的內容 * @throws Exception */ public static void readTextAndLinkAndTitle(String result) throws Exception { Parser parser; NodeList nodelist; parser = Parser.createParser(result, "utf8"); NodeFilter textFilter = new NodeClassFilter(TextNode.class); NodeFilter linkFilter = new NodeClassFilter(LinkTag.class); NodeFilter titleFilter = new NodeClassFilter(TitleTag.class); OrFilter lastFilter = new OrFilter(); lastFilter.setPredicates(new NodeFilter[] { textFilter, linkFilter, titleFilter }); nodelist = parser.parse(lastFilter); Node[] nodes = nodelist.toNodeArray(); String line = ""; for (int i = 0; i < nodes.length; i++) { Node node = nodes[i]; if (node instanceof TextNode) { TextNode textnode = (TextNode) node; line = textnode.getText(); } else if (node instanceof LinkTag) { LinkTag link = (LinkTag) node; line = link.getLink(); } else if (node instanceof TitleTag) { TitleTag titlenode = (TitleTag) node; line = titlenode.getTitle(); } if (isTrimEmpty(line)) continue; System.out.println(line); } } /** *//** * 去掉左右空格後字符串是否爲空 */ public static boolean isTrimEmpty(String astr) { if ((null == astr) || (astr.length() == 0)) { return true; } if (isBlank(astr.trim())) { return true; } return false; } /** *//** * 字符串是否爲空:null或者長度爲0. */ public static boolean isBlank(String astr) { if ((null == astr) || (astr.length() == 0)) { return true; } else { return false; } } }