webMagic 雖然方便,但也有它不適用的地方,比如針對某個單頁面的定向爬蟲,或者頁面存在大量 ajax 請求、跳轉請求全都混淆在 js 裏的情況。
這時可以用 webMagic 結合 phantomjs 來真實模擬頁面請求,即不單單獲取數據,而是將整個頁面完整渲染出來。雖然這樣會使爬蟲速度變慢很多,但不失爲一種快捷方便的解決方法。
PhantomJS 是一個基於 WebKit 的服務器端 JavaScript API。它全面支持 Web 而不需瀏覽器支持,速度快,並原生支持各種 Web 標準:DOM 處理、CSS 選擇器、JSON、Canvas 和 SVG。PhantomJS 可以用於頁面自動化、網絡監測、網頁截屏以及無界面測試等。
淘寶就是這種難以用普通爬蟲方法爬取的網站。直接發送 GET 請求到淘寶,基本獲取不到什麼有效的內容和鏈接。
還好 webMagic 雖然默認使用 httpClient 獲取網頁,但它也將獲取網頁的接口 Downloader 開放了出來。這樣就可以在 Downloader 裏使用 phantomjs 獲取頁面。
phantomjs 使用方法
1. 下載安裝 phantomjs
2. 編寫 js 腳本
// PhantomJS script: loads the page whose address is given on the command line
// and prints the page source as rendered by WebKit (i.e. after the page's own
// scripts have run) to standard output.
//
// Usage: phantomjs s.js <url>
// Exit code: 0 when the page was loaded and printed, 1 otherwise.
var system = require('system'); // exposes the command-line arguments
var page = require('webpage').create();

// Encoding used for everything written to stdout. Choosing a suitable encoding
// matters: with the wrong one the captured page comes out garbled. Other values
// shown in the official examples include "euc-jp", "sjis" and "utf8"; adjust as
// needed for the consumer of this output.
var OUTPUT_ENCODING = 'System';

// system.args[0] is the script name; system.args[1] is the address to load.
var url = system.args[1];

if (!url) {
    console.log('Usage: phantomjs s.js <url>');
    phantom.exit(1);
} else {
    page.open(url, function (status) {
        if (status !== 'success') {
            console.log('Unable to post!');
            phantom.exit(1);
            return;
        }
        phantom.outputEncoding = OUTPUT_ENCODING;
        // Emit only the rendered page content; prefixing it with the encoding
        // name would put stray text in front of the HTML document.
        console.log(page.content);
        phantom.exit(0);
    });
}
3. 測試
package util; import java.io.BufferedReader; import java.io.File; import java.io.FileWriter; import java.io.InputStream; import java.io.InputStreamReader; import java.io.PrintWriter; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.selector.PlainText; public class GetAjaxHtml { public static String getAjaxContent(String url) throws Exception { Runtime rt = Runtime.getRuntime(); Process p = rt .exec("D:/phantomjs-2.1.1-windows/bin/phantomjs.exe D:/s.js " + url); InputStream is = p.getInputStream(); BufferedReader br = new BufferedReader(new InputStreamReader(is)); StringBuffer sbf = new StringBuffer(); String tmp = ""; while ((tmp = br.readLine()) != null) { sbf.append(tmp + "\n"); } return sbf.toString(); } public static Page download(Request request) { Page page = new Page(); try { String url = request.getUrl(); String html = getAjaxContent(url); page.setRawText(html); page.setUrl(new PlainText(url)); page.setRequest(request); return page; } catch (Exception e) { System.out.println("download出錯了!"); return page; } } public static void main(String[] args) throws Exception { long start = System.currentTimeMillis(); String result = getAjaxContent("http://www.taobao.com"); System.out.println(result); // 建立新文件 String path = "D:\\testFile\\taobao.html"; PrintWriter printWriter = null; printWriter = new PrintWriter(new FileWriter(new File(path))); printWriter.write(result); printWriter.close(); long end = System.currentTimeMillis(); System.out.println("===============耗時:" + (end - start) + "==============="); } }
webMagic 結合 phantomjs 的淘寶爬蟲
package taobao; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.processor.PageProcessor; import util.GetAjaxHtml; import util.UuidUtil; import csdnblog.dao.TaobaoDao; import csdnblog.model.Taobao; public class TaobaoPageProcessor implements PageProcessor { private TaobaoDao taobaoDao = new TaobaoDao(); // 抓取網站的相關配置,包括:編碼、抓取間隔、重試次數等 private Site site = Site.me().setRetryTimes(3).setSleepTime(1000); @Override public Site getSite() { return site; } @Override public void process(Page page) { page.addTargetRequests(page.getHtml().links() .regex(".*item\\.taobao\\.com/item\\.htm\\?id=.*") .all()); page.addTargetRequests(page.getHtml().links() .regex("https://s\\.taobao\\.com/list.*") .all()); //若是是詳情頁 if(page.getUrl().regex("https://item\\.taobao\\.com/item\\.htm\\?id=.*").match()) { Taobao taobao = new Taobao(); taobao.setId(UuidUtil.getId()); taobao.setUrl(page.getUrl().toString()); taobao.setMaintitle(page.getHtml().xpath("//h3[@class='tb-main-title']/text()").get()); taobao.setSubtitle(page.getHtml().xpath("//p[@class='tb-subtitle']/text()").get()); taobao.setPrice(page.getHtml().xpath("//strong[@id='J_StrPrice']/em[@class='tb-rmb-num']/text()").get()); taobao.setTaobaoprice(page.getHtml().xpath("//em[@id='J_PromoPriceNum']/text()").get()); taobao.setRatecounter(page.getHtml().xpath("//strong[@id='J_RateCounter']/text()").get()); taobao.setSellcounter(page.getHtml().xpath("//strong[@id='J_SellCounter']/text()").get()); // 把對象存入數據庫 taobaoDao.addTaobao(taobao); // 把對象輸出控制檯 System.out.println(taobao.toString()); } } public static void main(String[] args) { Spider.create(new TaobaoPageProcessor()).setDownloader(new Downloader() { @Override public void setThread(int threadNum) { } @Override public Page download(Request request, Task task) { return 
GetAjaxHtml.download(request); } }).addUrl("https://s.taobao.com/list?q=%E5%A4%B9%E5%85%8B&cat=50344007&style=grid&seller_type=taobao").thread(5).run(); } }
Model
package csdnblog.model; public class Taobao { private String id; private String maintitle; private String subtitle; // url private String url; // 價格 private String price; // 淘寶價 private String taobaoprice; // 累計評價 private String ratecounter; // 交易成功 private String sellcounter; public String getId() { return id; } public void setId(String id) { this.id = id; } public String getMaintitle() { return maintitle; } public void setMaintitle(String maintitle) { this.maintitle = maintitle; } public String getSubtitle() { return subtitle; } public void setSubtitle(String subtitle) { this.subtitle = subtitle; } public String getPrice() { return price; } public void setPrice(String price) { this.price = price; } public String getTaobaoprice() { return taobaoprice; } public void setTaobaoprice(String taobaoprice) { this.taobaoprice = taobaoprice; } public String getRatecounter() { return ratecounter; } public void setRatecounter(String ratecounter) { this.ratecounter = ratecounter; } public String getSellcounter() { return sellcounter; } public void setSellcounter(String sellcounter) { this.sellcounter = sellcounter; } public Taobao(String id, String maintitle, String subtitle, String url, String price, String taobaoprice, String ratecounter, String sellcounter) { super(); this.id = id; this.maintitle = maintitle; this.subtitle = subtitle; this.url = url; this.price = price; this.taobaoprice = taobaoprice; this.ratecounter = ratecounter; this.sellcounter = sellcounter; } public Taobao() { super(); } @Override public String toString() { return "Taobao [id=" + id + ", maintitle=" + maintitle + ", subtitle=" + subtitle + ", url=" + url + ", price=" + price + ", taobaoprice=" + taobaoprice + ", ratecounter=" + ratecounter + ", sellcounter=" + sellcounter + "]"; } public String getUrl() { return url; } public void setUrl(String url) { this.url = url; } }