java利用正則表達式截取網頁數據

首先,FetchWebMsg是一個用來讀取動態網頁、並把網頁源碼寫入StringBuffer實例裏面的類:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;

/**
 * Downloads the text behind a URL and returns it as a single string.
 *
 * <p>The response is read line by line and the lines are concatenated
 * <em>without</em> their line terminators, so the returned text never contains
 * the line breaks that separated the lines of the response.
 *
 * <p>Instances hold no state between calls; every call opens its own
 * connection and reader.
 *
 * <p>Example:
 * <pre>
 *   String html = new FetchWebMsg().fetchWeb("http://www.weather.com.cn/html/weather/101120101.shtml");
 * </pre>
 */
public class FetchWebMsg
{
  /** Charset used by {@link #fetchWeb(String)} when the caller does not name one. */
  private static final String DEFAULT_CHARSET = "utf-8";

  /**
   * Fetches {@code url} and decodes the response as UTF-8.
   *
   * @param url absolute URL to read
   * @return the response body with all line terminators removed
   * @throws Exception if the URL is malformed or the content cannot be read
   */
  public String fetchWeb(String url)
    throws Exception
  {
    return fetchWeb(url, DEFAULT_CHARSET);
  }

  /**
   * Fetches {@code url} and decodes the response with the given charset.
   *
   * @param url     absolute URL to read
   * @param charset name of the charset used to decode the response bytes
   * @return the response body with all line terminators removed
   * @throws Exception if the URL is malformed, the charset is not supported,
   *                   or the content cannot be read
   */
  public String fetchWeb(String url, String charset) throws Exception
  {
    URL newUrl = new URL(url);
    URLConnection urlConnection = newUrl.openConnection();

    StringBuilder content = new StringBuilder();
    // try-with-resources closes the reader (and the underlying stream) even
    // when readLine() fails part-way through the response.
    try (BufferedReader reader =
             new BufferedReader(new InputStreamReader(urlConnection.getInputStream(), charset)))
    {
      String line;
      while ((line = reader.readLine()) != null)
      {
        // readLine() strips the terminator and none is re-added on purpose:
        // callers receive the page as one unbroken line.
        content.append(line);
      }
    }
    return content.toString();
  }
}

以上只是從網頁上動態讀取數據,下面是利用JDK自帶的Pattern類和Matcher類去截取數據:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class DouBanBookWebMsg
{
  public String getMsg()
    throws Exception
  {
    String url = "http://book.douban.com/chart?";
    String regx = null;

    FetchWebMsg fetchMsg = new FetchWebMsg();
   
    /**
     * <p>description:\是轉義字符,\s是匹配空白符+表示匹配空白符一次或者屢次[^>]匹配除了>意外的字符.*?能夠匹配任意字符串其中.是匹配除了換行意外的任何字符<p>
     * <p>description:*是重複零次或更屢次,?是重複零次或一次,.+?匹配除了換行意外的字符零次或者更屢次,\\s+.*?匹配任意的空白符零次或者屢次</p>
     *
     *
     *
     */

    regx = "<ul\\s*class=\"chart-dashed-list\">.*?<li.+?>.+?<img\\s+.*?\\s+src=\"(.+?)\"/>.+?<h2.+?>.+?<a.+?>(.+?)</a>.+?</h2>.+?<p.+?>(.+?)</p>.+?</li>.+?<li.+?>.+?<img\\s+.*?\\s+src=\"(.+?)\"/>.+?<h2.+?>.+?<a.+?>(.+?)</a>.+?</h2>.+?<p.+?>(.+?)</p>.+?</li>.+?<li.+?>.+?<img\\s+.*?\\s+src=\"(.+?)\"/>.+?<h2.+?>.+?<a.+?>(.+?)</a>.+?</h2>.+?<p.+?>(.+?)</p>.+?</li>.+?<li.+?>.+?<img\\s+.*?\\s+src=\"(.+?)\"/>.+?<h2.+?>.+?<a.+?>(.+?)</a>.+?</h2>.+?<p.+?>(.+?)</p>.+?</li>.+?<li.+?>.+?<img\\s+.*?\\s+src=\"(.+?)\"/>.+?<h2.+?>.+?<a.+?>(.+?)</a>.+?</h2>.+?<p.+?>(.+?)</p>.+?</li>.+?<li.+?>.+?<img\\s+.*?\\s+src=\"(.+?)\"/>.+?<h2.+?>.+?<a.+?>(.+?)</a>.+?</h2>.+?<p.+?>(.+?)</p>.+?</li>.+?<li.+?>.+?<img\\s+.*?\\s+src=\"(.+?)\"/>.+?<h2.+?>.+?<a.+?>(.+?)</a>.+?</h2>.+?<p.+?>(.+?)</p>.+?</li>.+?<li.+?>.+?<img\\s+.*?\\s+src=\"(.+?)\"/>.+?<h2.+?>.+?<a.+?>(.+?)</a>.+?</h2>.+?<p.+?>(.+?)</p>.+?</li>.+?<li.+?>.+?<img\\s+.*?\\s+src=\"(.+?)\"/>.+?<h2.+?>.+?<a.+?>(.+?)</a>.+?</h2>.+?<p.+?>(.+?)</p>.+?</li>.+?<li.+?>.+?<img\\s+.*?\\s+src=\"(.+?)\"/>.+?<h2.+?>.+?<a.+?>(.+?)</a>.+?</h2>.+?<p.+?>(.+?)</p>(.+?)</li>(.+?)</ul>";

    String webHtmlText = fetchMsg.fetchWeb("http://book.douban.com/chart?");

    String result = geMsgByRegx(regx, webHtmlText);

    return result;
  }

  public String geMsgByRegx(String regx, String webHtmlText)
  {
    StringBuffer result = new StringBuffer();
    Pattern p = Pattern.compile(regx);
    Matcher macher = p.matcher(webHtmlText);
    while (macher.find())
    {
      result.append("[{\"photoAddr\":\"" + macher.group(1).trim().replace("spic", "lpic") + "\"");
      result.append(",\"bookName\":\"" + macher.group(2).trim() + "\"");
      String[] authMsg = macher.group(3).trim().split("/");
      result.append(",\"auth\":\"" + authMsg[0] + "\"");
      result.append(",\"publicationDate\":\"" + authMsg[1] + "\"");
      result.append(",\"press\":\"" + authMsg[2] + "\"}");

      result.append(",{\"photoAddr\":\"" + macher.group(4).trim().replace("spic", "lpic") + "\"");
      result.append(",\"bookName\":\"" + macher.group(5).trim() + "\"");
      String[] authMsg1 = macher.group(6).trim().split("/");
      result.append(",\"auth\":\"" + authMsg1[0] + "\"");
      result.append(",\"publicationDate\":\"" + authMsg1[1] + "\"");
      result.append(",\"press\":\"" + authMsg1[2] + "\"}");

      result.append(",{\"photoAddr\":\"" + macher.group(7).trim().replace("spic", "lpic") + "\"");
      result.append(",\"bookName\":\"" + macher.group(8).trim() + "\"");
      String[] authMsg2 = macher.group(9).trim().split("/");
      result.append(",\"auth\":\"" + authMsg2[0] + "\"");
      result.append(",\"publicationDate\":\"" + authMsg1[1] + "\"");
      result.append(",\"press\":\"" + authMsg2[2] + "\"}");

      result.append(",{\"photoAddr\":\"" + macher.group(10).trim().replace("spic", "lpic") + "\"");
      result.append(",\"bookName\":\"" + macher.group(11).trim() + "\"");
      String[] authMsg3 = macher.group(12).trim().split("/");
      result.append(",\"auth\":\"" + authMsg3[0] + "\"");
      result.append(",\"publicationDate\":\"" + authMsg1[1] + "\"");
      result.append(",\"press\":\"" + authMsg3[2] + "\"}");

      result.append(",{\"photoAddr\":\"" + macher.group(13).trim().replace("spic", "lpic") + "\"");
      result.append(",\"bookName\":\"" + macher.group(14).trim() + "\"");
      String[] authMsg4 = macher.group(15).trim().split("/");
      result.append(",\"auth\":\"" + authMsg4[0] + "\"");
      result.append(",\"publicationDate\":\"" + authMsg1[1] + "\"");
      result.append(",\"press\":\"" + authMsg4[2] + "\"}");

      result.append(",{\"photoAddr\":\"" + macher.group(16).trim().replace("spic", "lpic") + "\"");
      result.append(",\"bookName\":\"" + macher.group(17).trim() + "\"");
      String[] authMsg5 = macher.group(18).trim().split("/");
      result.append(",\"auth\":\"" + authMsg5[0] + "\"");
      result.append(",\"publicationDate\":\"" + authMsg1[1] + "\"");
      result.append(",\"press\":\"" + authMsg5[2] + "\"}");

      result.append(",{\"photoAddr\":\"" + macher.group(19).trim().replace("spic", "lpic") + "\"");
      result.append(",\"bookName\":\"" + macher.group(20).trim() + "\"");
      String[] authMsg6 = macher.group(21).trim().split("/");
      result.append(",\"auth\":\"" + authMsg6[0] + "\"");
      result.append(",\"publicationDate\":\"" + authMsg1[1] + "\"");
      result.append(",\"press\":\"" + authMsg6[2] + "\"}");

      result.append(",{\"photoAddr\":\"" + macher.group(22).trim().replace("spic", "lpic") + "\"");
      result.append(",\"bookName\":\"" + macher.group(23).trim() + "\"");
      String[] authMsg7 = macher.group(24).trim().split("/");
      result.append(",\"auth\":\"" + authMsg7[0] + "\"");
      result.append(",\"publicationDate\":\"" + authMsg1[1] + "\"");
      result.append(",\"press\":\"" + authMsg7[2] + "\"}");

      result.append(",{\"photoAddr\":\"" + macher.group(25).trim().replace("spic", "lpic") + "\"");
      result.append(",\"bookName\":\"" + macher.group(26).trim() + "\"");
      String[] authMsg8 = macher.group(27).trim().split("/");
      result.append(",\"auth\":\"" + authMsg8[0] + "\"");
      result.append(",\"publicationDate\":\"" + authMsg1[1] + "\"");
      result.append(",\"press\":\"" + authMsg8[2] + "\"}");

      result.append(",{\"photoAddr\":\"" + macher.group(28).trim().replace("spic", "lpic") + "\"");
      result.append(",\"bookName\":\"" + macher.group(29).trim() + "\"");
      String[] authMsg9 = macher.group(30).trim().split("/");
      result.append(",\"auth\":\"" + authMsg9[0] + "\"");
      result.append(",\"publicationDate\":\"" + authMsg1[1] + "\"");
      result.append(",\"press\":\"" + authMsg9[2] + "\"}]");
    }

    return result.toString();  }}

相關文章
相關標籤/搜索