最近在用Jsoup抓取某網站數據,可有些頁面是ajax請求動態生成的,去羣裏問了一下,大神說模擬ajax請求即可。去網上搜索了一下,發現了這篇文章,拿過來先用着試試。轉帖如下:
網上關於網絡爬蟲實現方式有不少種,可是不少都不支持Ajax,李兄說:模擬纔是王道。確實,若是可以模擬一個沒有界面的瀏覽器,還有什麼不能做到的呢? 關於解析Ajax網站的框架也有很多,我選擇了HtmlUnit,官方網站:http://htmlunit.sourceforge.net/ ,HtmlUnit可以說是一個Java版本的無界面瀏覽器,幾乎無所不能,並且不少東西都封裝得特別完美。這是這幾天來積累下來的心血,記錄一下。
package com.lanyotech.www.wordbank;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.util.List;

import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.ScriptResult;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlOption;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSelect;

/**
 * Drives a headless HtmlUnit browser through the multi-step export wizard at
 * {@link #TARGET_URL} and writes the final response body to {@link #OUTPUT_FILE}.
 *
 * <p>Each wizard step is advanced by executing the same JavaScript the page's
 * buttons would run. The script names ({@code setCubeData}, {@code TransferListAll},
 * {@code wrapperSetCube}, {@code exportData}, ...) are defined by the target site,
 * not by this class. NOTE(review): they are assumed to still exist on the site.
 */
public class WorldBankCrawl {

    private static final String TARGET_URL = "http://databank.worldbank.org/ddp/home.do";

    /** Name of the frame that holds the selection lists on each wizard page. */
    private static final String TREE_FRAME_NAME = "frmTree1";

    /** Script behind the "select all" button of a wizard page. */
    private static final String SELECT_ALL_SCRIPT =
            "javascript:TransferListAll('countrylst','countrylstselected','no');"
            + "SetSelectedCount('countrylstselected','tdcount');";

    /** Script behind the "select" button, which transfers only the chosen options. */
    private static final String SELECT_CHOSEN_SCRIPT =
            "javascript:TransferList('countrylst','countrylstselected','no');"
            + "SetSelectedCount('countrylstselected','tdcount');";

    /** Script behind the "Next" button of a wizard page. */
    private static final String NEXT_SCRIPT = "javascript:wrapperSetCube('/ddp')";

    /** Destination of the downloaded spreadsheet. */
    private static final String OUTPUT_FILE = "d://test.xls";

    /**
     * Runs the whole crawl: open the start page, walk the wizard, download the file.
     *
     * @param args ignored
     * @throws FailingHttpStatusCodeException if the server answers with a failing status code
     * @throws MalformedURLException if {@link #TARGET_URL} is not a valid URL
     * @throws IOException on any network or file I/O failure
     */
    public static void main(String[] args)
            throws FailingHttpStatusCodeException, MalformedURLException, IOException {
        // Simulate a browser.
        WebClient webClient = new WebClient();
        try {
            // Configure the client: run JavaScript, skip CSS, resynchronize Ajax calls,
            // and do not abort on script errors raised by the page.
            webClient.setJavaScriptEnabled(true);
            webClient.setCssEnabled(false);
            webClient.setAjaxController(new NicelyResynchronizingAjaxController());
            webClient.setTimeout(35000);
            webClient.setThrowExceptionOnScriptError(false);

            // Open the start page and select the first entry of the database list.
            HtmlPage rootPage = webClient.getPage(TARGET_URL);
            HtmlSelect databaseSelect = (HtmlSelect) rootPage.getElementById("lstCubes");
            databaseSelect.getOption(0).setSelected(true);

            // Execute the script behind the Next button to reach the second page.
            System.out.println("正在跳轉…");
            ScriptResult countryResult =
                    rootPage.executeJavaScript("javascript:setCubeData(2,-1,4,'/ddp');");
            HtmlPage countrySelect = (HtmlPage) countryResult.getNewPage();

            // Inside the selection frame: select every entry, then go to the next page.
            HtmlPage countryFrame = selectionFrame(countrySelect);
            countryFrame.executeJavaScript(SELECT_ALL_SCRIPT);
            ScriptResult electricityScriptResult = countryFrame.executeJavaScript(NEXT_SCRIPT);
            System.out.println("正在跳轉…");

            // Series page: choose the second option of the list and transfer it.
            HtmlPage electricitySelect = (HtmlPage) electricityScriptResult.getNewPage();
            HtmlPage electricityFrame = selectionFrame(electricitySelect);
            HtmlSelect seriesSelect = (HtmlSelect) electricityFrame.getElementById("countrylst");
            List<HtmlOption> optionList = seriesSelect.getOptions();
            optionList.get(1).setSelected(true);
            electricityFrame.executeJavaScript(SELECT_CHOSEN_SCRIPT);

            // Next: the time-selection page.
            ScriptResult timeScriptResult = electricityFrame.executeJavaScript(NEXT_SCRIPT);
            System.out.println("正在跳轉…");
            HtmlPage timeSelectPage = selectionFrame((HtmlPage) timeScriptResult.getNewPage());

            // Select every time entry, then go to the export page.
            timeSelectPage.executeJavaScript(SELECT_ALL_SCRIPT);
            ScriptResult exportResult = timeSelectPage.executeJavaScript(NEXT_SCRIPT);
            System.out.println("正在跳轉…");
            HtmlPage exportPage = (HtmlPage) exportResult.getNewPage();

            // Execute the script behind the Export button to reach the download page.
            ScriptResult downResult = exportPage.executeJavaScript(
                    "javascript:exportData('/ddp' ,'EXT_BULK' ,'WDI_Time=51||WDI_Series=1||WDI_Ctry=244||' );");
            System.out.println("正在跳轉…");
            HtmlPage downLoadPage = (HtmlPage) downResult.getNewPage();

            // Execute the script behind the Excel icon; the resulting response is the file.
            ScriptResult downLoadResult =
                    downLoadPage.executeJavaScript("javascript:exportData('/ddp','BULKEXCEL');");
            InputStream is = downLoadResult.getNewPage().getWebResponse().getContentAsStream();
            saveToFile(is, OUTPUT_FILE);

            System.out.println("Success!");
        } finally {
            // Release the windows opened by the client, even when a step failed.
            webClient.closeAllWindows();
        }
    }

    /** Returns the page enclosed by the selection frame of the given wizard page. */
    private static HtmlPage selectionFrame(HtmlPage wizardPage) {
        return (HtmlPage) wizardPage.getFrameByName(TREE_FRAME_NAME).getEnclosedPage();
    }

    /**
     * Copies {@code in} to the file at {@code path}, closing both streams on every path.
     * try/finally is used rather than try-with-resources because the file uses no
     * Java 7+ syntax elsewhere.
     */
    private static void saveToFile(InputStream in, String path) throws IOException {
        try {
            OutputStream out = new FileOutputStream(path);
            try {
                byte[] buffer = new byte[1024 * 30];
                int len;
                // read() returns -1 at end of stream.
                while ((len = in.read(buffer)) != -1) {
                    out.write(buffer, 0, len);
                }
            } finally {
                out.close();
            }
        } finally {
            in.close();
        }
    }
}