我的html頁面元素:
/html/body/table[2]/tbody/tr[1]/td/table/tbody/tr[1]/td[2]/font
/html/body/table[2]/tbody/tr[1]/td/table/tbody/tr[2]/td[2]/div/font/span
/html/body/table[2]/tbody/tr[3]/td/font/b
/html/body/table[2]/tbody/tr[5]/td/div/table/tbody/tr[1]/td[1]/div/b/font/span
如下是代碼實現:
import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.HttpStatus; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.util.EntityUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class JsoupParseHtml { public static String getHtmlByUrl(String url){ String html = null; //建立httpClient對象 HttpClient httpClient = new DefaultHttpClient(); //以get方式請求該URL HttpGet httpget = new HttpGet(url); try { //獲得responce對象 HttpResponse responce = httpClient.execute(httpget); //返回碼 int resStatu = responce.getStatusLine().getStatusCode(); //200正常 其餘就不對 if (resStatu==HttpStatus.SC_OK) { //得到相應實體 HttpEntity entity = responce.getEntity(); if (entity!=null) { //得到html源代碼 html = EntityUtils.toString(entity); } } } catch (Exception e) { System.out.println("訪問【"+url+"】出現異常!"); e.printStackTrace(); } finally { httpClient.getConnectionManager().shutdown(); } return html; } static String txtpathstr="d:\\one\\"; public static void main(String[] args) throws Exception { String contents=""; String urlbase="http://localhost:8080/1.htm"; //String urlbase="http://www.qiushibaike.com/8hr/page/8?s=4513032";//1?s=4513032 contents+=gettxtlist(urlbase); //寫入文件 writefile(contents); } public static String gettxtlist(String txturl) throws Exception{ String content=""; Document doc=jsoupconnect(txturl,360000); //Elements els= doc.select("div.content"); Elements els= doc.select("html"); for(Element el:els){ if (el.select("body").size()>1){ continue; } content+=el.text()+"\r\n"; System.out.println(); System.out.println(content); } return content; } public static Document jsoupconnect (String url,int timeout){ Document doc=null; int retry=5; while (null==doc&&retry>0){ retry--; try{ doc= 
Jsoup.connect(url).userAgent("Mozilla/5.0 (Windows NT 6.1; rv:5.0)").timeout(timeout).get(); }catch(Exception e){ e.printStackTrace(); } } return doc; } public static void writefile(String txtstr)throws Exception{ File txtpath=new File(txtpathstr); if (!txtpath.exists()){ txtpath.mkdirs(); } File htxt=new File(txtpathstr+"test.txt"); BufferedOutputStream outBuff = new BufferedOutputStream(new FileOutputStream(htxt)); outBuff.write(txtstr.getBytes()); outBuff.flush(); outBuff.close(); } }
存在問題:只能一次性讀取出來,不能按照要求按table分開;下一版本會解決這個問題。