首先,FetchWebMsg 是一個用來讀取動態網頁、把網頁源碼寫入字符串緩衝區並以字符串返回的類:
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
/**
 * Downloads the content behind a URL and returns it as a single string.
 *
 * <p>The content is read line by line and the lines are concatenated without
 * their line terminators, so the returned text never contains line breaks.
 *
 * <p>Instances keep no state between calls; every call works on its own
 * connection, reader and buffer.
 *
 * <p>Example:
 * <pre>
 * FetchWebMsg f = new FetchWebMsg();
 * System.out.println(f.fetchWeb("http://www.weather.com.cn/html/weather/101120101.shtml"));
 * </pre>
 */
public class FetchWebMsg
{
    /** Charset used by {@link #fetchWeb(String)} when the caller does not name one. */
    private static final String DEFAULT_CHARSET = "utf-8";

    /**
     * Fetches {@code url} and decodes the response as UTF-8.
     *
     * @param url address to read from
     * @return the content with all line terminators removed
     * @throws Exception if the URL is malformed or the content cannot be read
     */
    public String fetchWeb(String url)
        throws Exception
    {
        return readAll(url, DEFAULT_CHARSET);
    }

    /**
     * Fetches {@code url} and decodes the response with the given charset.
     *
     * @param url address to read from
     * @param charset name of the charset used to decode the response bytes
     * @return the content with all line terminators removed
     * @throws Exception if the URL is malformed, the charset name is not
     *         supported, or the content cannot be read
     */
    public String fetchWeb(String url, String charset) throws Exception
    {
        return readAll(url, charset);
    }

    /** Opens the URL, reads every line and joins the lines without separators. */
    private static String readAll(String url, String charset) throws Exception
    {
        URL newUrl = new URL(url);
        URLConnection urlConnection = newUrl.openConnection();
        StringBuilder content = new StringBuilder();
        // The raw stream is its own resource so that it is closed even when the
        // charset name is rejected before the reader exists; both are closed
        // on every exit path, including read failures.
        try (InputStream in = urlConnection.getInputStream();
             BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset)))
        {
            String line;
            while ((line = reader.readLine()) != null)
            {
                // readLine() strips the terminator and none is re-added on purpose:
                // callers receive the whole document as one line.
                content.append(line);
            }
        }
        return content.toString();
    }
}
以上只是從網頁上動態讀取數據,下面是利用 JDK 自帶的 Pattern 類和 Matcher 類去截取數據:
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Scrapes the book chart page at {@code http://book.douban.com/chart?} and
 * renders the books it finds as a JSON-style array string.
 *
 * <p>Each chart list is expected to hold ten {@code <li>} entries. For every
 * entry the cover image address, the title and a details paragraph are
 * captured; the details paragraph is split on {@code /} into author,
 * publication date and press, in that order.
 */
public class DouBanBookWebMsg
{
    /** Address of the chart page read by {@link #getMsg()}. */
    private static final String CHART_URL = "http://book.douban.com/chart?";

    /** Number of books read from every matched list (three capture groups each). */
    private static final int BOOKS_PER_LIST = 10;

    /*
     * Regex building blocks. Notation used below:
     *   \s   one whitespace character
     *   x*   zero or more repetitions, x+ one or more
     *   .    any character except a line terminator (DOTALL is not enabled)
     *   .*? and .+?   lazy forms that stop at the first possible position
     * Because "." does not cross line terminators, the whole list has to sit
     * on one line of the input for the pattern to match.
     */

    /** Opening tag of a chart list plus whatever precedes its first entry. */
    private static final String LIST_OPEN = "<ul\\s*class=\"chart-dashed-list\">.*?";

    /** One entry; captures image address, title and details paragraph. */
    private static final String BOOK_ITEM =
        "<li.+?>.+?<img\\s+.*?\\s+src=\"(.+?)\"/>.+?<h2.+?>.+?<a.+?>(.+?)</a>.+?</h2>.+?<p.+?>(.+?)</p>";

    /** Text between the details paragraph of one entry and the start of the next. */
    private static final String ITEM_SEPARATOR = ".+?</li>.+?";

    /** Remainder of the last entry and of the list (two extra, unused groups). */
    private static final String LIST_CLOSE = "(.+?)</li>(.+?)</ul>";

    /**
     * Downloads the chart page and extracts its book lists.
     *
     * @return the result of {@link #geMsgByRegx(String, String)} applied to the
     *         downloaded page with the chart pattern
     * @throws Exception if the page cannot be downloaded
     */
    public String getMsg()
        throws Exception
    {
        FetchWebMsg fetchMsg = new FetchWebMsg();
        String webHtmlText = fetchMsg.fetchWeb(CHART_URL);
        return geMsgByRegx(buildChartRegex(), webHtmlText);
    }

    /**
     * Applies {@code regx} to {@code webHtmlText} and renders every match as an
     * array of ten book objects with the keys {@code photoAddr},
     * {@code bookName}, {@code auth}, {@code publicationDate} and {@code press}.
     *
     * <p>Groups {@code 3k+1}, {@code 3k+2} and {@code 3k+3} (k = 0..9) are read
     * as image address, title and details of book k. In the image address
     * {@code spic} is replaced by {@code lpic}. A details part that is missing
     * is emitted as an empty string. Quotes, backslashes and control
     * characters in values are escaped so they cannot break the surrounding
     * string literal.
     *
     * <p>When the pattern matches more than once, one array is appended per
     * match with nothing in between; when it never matches, the result is empty.
     *
     * @param regx pattern with at least thirty capture groups
     * @param webHtmlText text to search
     * @return the rendered arrays, or an empty string if nothing matched
     * @throws IndexOutOfBoundsException if {@code regx} has fewer than thirty groups
     */
    public String geMsgByRegx(String regx, String webHtmlText)
    {
        StringBuilder result = new StringBuilder();
        Matcher matcher = Pattern.compile(regx).matcher(webHtmlText);
        while (matcher.find())
        {
            result.append('[');
            for (int book = 0; book < BOOKS_PER_LIST; book++)
            {
                if (book > 0)
                {
                    result.append(',');
                }
                int firstGroup = book * 3 + 1;
                appendBook(result,
                    matcher.group(firstGroup),
                    matcher.group(firstGroup + 1),
                    matcher.group(firstGroup + 2));
            }
            result.append(']');
        }
        return result.toString();
    }

    /** Concatenates the building blocks into the pattern for one ten-entry list. */
    private static String buildChartRegex()
    {
        StringBuilder regex = new StringBuilder(LIST_OPEN);
        for (int book = 0; book < BOOKS_PER_LIST; book++)
        {
            if (book > 0)
            {
                regex.append(ITEM_SEPARATOR);
            }
            regex.append(BOOK_ITEM);
        }
        return regex.append(LIST_CLOSE).toString();
    }

    /** Appends one book object built from its three captured strings. */
    private static void appendBook(StringBuilder out, String photo, String name, String details)
    {
        // Every field of a book comes from that book's own details paragraph.
        String[] parts = details.trim().split("/");
        out.append("{\"photoAddr\":\"").append(escapeJson(photo.trim().replace("spic", "lpic"))).append("\"");
        out.append(",\"bookName\":\"").append(escapeJson(name.trim())).append("\"");
        out.append(",\"auth\":\"").append(escapeJson(partOrEmpty(parts, 0))).append("\"");
        out.append(",\"publicationDate\":\"").append(escapeJson(partOrEmpty(parts, 1))).append("\"");
        out.append(",\"press\":\"").append(escapeJson(partOrEmpty(parts, 2))).append("\"}");
    }

    /** Returns {@code parts[index]}, or an empty string when that part is absent. */
    private static String partOrEmpty(String[] parts, int index)
    {
        return index < parts.length ? parts[index] : "";
    }

    /** Escapes quotes, backslashes and control characters for use inside a JSON string. */
    private static String escapeJson(String value)
    {
        StringBuilder escaped = new StringBuilder(value.length());
        for (int i = 0; i < value.length(); i++)
        {
            char c = value.charAt(i);
            if (c == '"' || c == '\\')
            {
                escaped.append('\\').append(c);
            }
            else if (c < 0x20)
            {
                // Control characters become a six-character hexadecimal escape.
                escaped.append(String.format("\\u%04x", (int) c));
            }
            else
            {
                escaped.append(c);
            }
        }
        return escaped.toString();
    }
}