JAVA爬蟲實踐(實踐四:webMagic和phantomjs和淘寶爬蟲)

webMagic雖然方便,但也有它不適用的地方,比如針對某個單一頁面的定向爬蟲,或者頁面存在大量ajax請求、跳轉請求全都混淆在js裏的情況。

這時可以用webMagic結合phantomjs來真實模擬頁面請求,即不僅僅獲取數據,而是將整個頁面完整渲染出來。雖然這樣會使爬蟲速度變慢很多,但不失爲一種快捷方便的解決方法。

PhantomJS是一個基於 WebKit 的服務器端 JavaScript API。它無需瀏覽器即可全面支持 Web,速度快,並原生支持各種 Web 標準:DOM 處理、CSS 選擇器、JSON、Canvas 和 SVG。PhantomJS 可以用於頁面自動化、網絡監測、網頁截屏以及無界面測試等。

淘寶就是這種難以用普通爬蟲方法爬取的網站。直接發送GET請求到淘寶,基本獲取不到什麼有效的內容和鏈接。

還好webMagic雖然默認使用httpClient獲取網頁,但它也把獲取網頁的組件Downloader開放了出來,這樣就可以在Downloader裏使用phantomjs獲取頁面。

phantomjs使用方法

1.下載安裝phantomjs

2.編寫js腳本

 

// PhantomJS script: opens the page whose address is given on the command line
// and prints the page content, as rendered by WebKit, to standard output.
//
// Usage: phantomjs <this-script> <url>
//
// Exit status: 0 when the page was loaded, 1 when the URL is missing or the
// page could not be opened.

// The 'system' module exposes the command-line arguments passed to the script.
var system = require('system');

// args[0] is the script itself; args[1] is the address of the page to load.
var address = system.args[1];

if (!address) {
    // Nothing to open: report how to call the script and signal failure.
    console.log('Usage: phantomjs ' + system.args[0] + ' <url>');
    phantom.exit(1);
} else {
    var page = require('webpage').create();

    page.open(address, function (status) {
        if (status !== 'success') {
            console.log('Unable to post!');
            phantom.exit(1);
            return;
        }

        // Picking a suitable output encoding matters; with the wrong one the
        // captured page comes out garbled. The official example lists
        // "euc-jp", "sjis", "utf8" and "System" as candidates -- adjust to
        // your needs. Only "System" is used here.
        phantom.outputEncoding = "System";

        // Print the page content after WebKit has finished loading it.
        // NOTE(review): the value of phantom.outputEncoding is printed directly
        // in front of the content; kept as-is because consumers of this output
        // may rely on it -- confirm whether the prefix is intended.
        console.log(phantom.outputEncoding + page.content);
        phantom.exit();
    });
}
View Code

 

3.測試

package util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.selector.PlainText;

/**
 * Fetches page content by running an external PhantomJS process and wraps the
 * result in a WebMagic {@link Page}.
 *
 * <p>The PhantomJS executable and the script it runs default to fixed Windows
 * paths; both can be overridden with the system properties
 * {@code phantomjs.path} and {@code phantomjs.script}.
 */
public class GetAjaxHtml {

    /** PhantomJS executable used when {@code phantomjs.path} is not set. */
    private static final String DEFAULT_PHANTOMJS_PATH =
            "D:/phantomjs-2.1.1-windows/bin/phantomjs.exe";

    /** PhantomJS script used when {@code phantomjs.script} is not set. */
    private static final String DEFAULT_SCRIPT_PATH = "D:/s.js";

    /**
     * Runs PhantomJS against {@code url} and returns everything the process
     * wrote to standard output, with each line terminated by {@code '\n'}.
     *
     * @param url address handed to the PhantomJS script as its single argument
     * @return the captured standard output of the PhantomJS process
     * @throws IllegalArgumentException if {@code url} is {@code null}
     * @throws Exception if the process cannot be started, its output cannot be
     *         read, or the calling thread is interrupted while waiting for it
     */
    public static String getAjaxContent(String url) throws Exception {
        if (url == null) {
            throw new IllegalArgumentException("url must not be null");
        }

        // Pass the command as separate arguments: Runtime.exec(String) splits
        // its command on whitespace, so a URL containing whitespace would be
        // broken into several arguments.
        ProcessBuilder builder = new ProcessBuilder(
                System.getProperty("phantomjs.path", DEFAULT_PHANTOMJS_PATH),
                System.getProperty("phantomjs.script", DEFAULT_SCRIPT_PATH),
                url);
        // Forward the child's stderr to ours so an unread stderr pipe can never
        // fill up and stall the child while we are reading its stdout.
        builder.redirectError(ProcessBuilder.Redirect.INHERIT);

        Process process = builder.start();
        StringBuilder output = new StringBuilder();
        // The reader uses the platform default charset, as before.
        // NOTE(review): presumably this matches the output encoding chosen by
        // the PhantomJS script -- confirm before switching to a fixed charset.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(process.getInputStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                output.append(line).append('\n');
            }
            // stdout is exhausted; wait so the child is reaped before returning.
            process.waitFor();
        } finally {
            // No-op after a normal exit; stops the child if we failed midway.
            process.destroy();
        }
        return output.toString();
    }

    /**
     * Downloads the page addressed by {@code request} through PhantomJS.
     *
     * <p>Failures are reported on the console and never thrown; in that case
     * the returned page has no raw text.
     *
     * @param request the request whose URL should be fetched
     * @return a page carrying the URL, the request and, on success, the
     *         fetched text
     */
    public static Page download(Request request) {
        Page page = new Page();
        try {
            String url = request.getUrl();
            // Fill in URL and request first so that a page returned after a
            // failed fetch still identifies what was being downloaded.
            page.setUrl(new PlainText(url));
            page.setRequest(request);
            page.setRawText(getAjaxContent(url));
            return page;
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                // Keep the interrupt visible to the caller's thread management.
                Thread.currentThread().interrupt();
            }
            System.out.println("download出錯了!");
            // Report what failed and why instead of dropping the cause.
            System.err.println("download failed for "
                    + (request == null ? "null request" : request.getUrl())
                    + ": " + e);
            return page;
        }
    }

    /**
     * Manual check: fetches the Taobao home page, prints it, saves it to a
     * file and prints the elapsed time in milliseconds.
     *
     * @param args unused
     * @throws Exception if fetching the page or writing the file fails
     */
    public static void main(String[] args) throws Exception {
        long start = System.currentTimeMillis();
        String result = getAjaxContent("http://www.taobao.com");
        System.out.println(result);
        // Write the result to a new file; try-with-resources closes the writer
        // even when the write fails.
        String path = "D:\\testFile\\taobao.html";
        try (PrintWriter printWriter = new PrintWriter(new FileWriter(new File(path)))) {
            printWriter.write(result);
        }
        long end = System.currentTimeMillis();
        System.out.println("===============耗時:" + (end - start)
                + "===============");
    }
}
View Code

webMagic結合phantomjs的淘寶爬蟲

package taobao;


import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.processor.PageProcessor;
import util.GetAjaxHtml;
import util.UuidUtil;
import csdnblog.dao.TaobaoDao;
import csdnblog.model.Taobao;

/**
 * WebMagic page processor for Taobao: queues item and listing links found on
 * every page and, for item detail pages, extracts the item fields, stores them
 * through {@link TaobaoDao} and prints them.
 */
public class TaobaoPageProcessor implements PageProcessor {

    /** Links to item detail pages that should be crawled. */
    private static final String ITEM_LINK_REGEX =
            ".*item\\.taobao\\.com/item\\.htm\\?id=.*";

    /** Links to listing pages that should be crawled. */
    private static final String LIST_LINK_REGEX =
            "https://s\\.taobao\\.com/list.*";

    /** URL shape that marks the current page as an item detail page. */
    private static final String DETAIL_URL_REGEX =
            "https://item\\.taobao\\.com/item\\.htm\\?id=.*";

    /** Listing page the crawl starts from. */
    private static final String START_URL =
            "https://s.taobao.com/list?q=%E5%A4%B9%E5%85%8B&cat=50344007&style=grid&seller_type=taobao";

    private TaobaoDao taobaoDao = new TaobaoDao();

    // Crawl configuration for the site: 3 retries, 1000 ms sleep time.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Queues follow-up links and, when the page is an item detail page,
     * extracts and persists the item.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        enqueueLinks(page, ITEM_LINK_REGEX);
        enqueueLinks(page, LIST_LINK_REGEX);

        // Only detail pages carry item data; everything else is done here.
        boolean isDetailPage = page.getUrl().regex(DETAIL_URL_REGEX).match();
        if (!isDetailPage) {
            return;
        }

        Taobao item = new Taobao();
        item.setId(UuidUtil.getId());
        item.setUrl(page.getUrl().toString());
        item.setMaintitle(textAt(page, "//h3[@class='tb-main-title']/text()"));
        item.setSubtitle(textAt(page, "//p[@class='tb-subtitle']/text()"));
        item.setPrice(textAt(page, "//strong[@id='J_StrPrice']/em[@class='tb-rmb-num']/text()"));
        item.setTaobaoprice(textAt(page, "//em[@id='J_PromoPriceNum']/text()"));
        item.setRatecounter(textAt(page, "//strong[@id='J_RateCounter']/text()"));
        item.setSellcounter(textAt(page, "//strong[@id='J_SellCounter']/text()"));

        // Store the item in the database, then echo it to the console.
        taobaoDao.addTaobao(item);
        System.out.println(item.toString());
    }

    /** Adds every link on the page that matches {@code linkRegex} to the crawl queue. */
    private static void enqueueLinks(Page page, String linkRegex) {
        page.addTargetRequests(page.getHtml().links().regex(linkRegex).all());
    }

    /** Returns the result of evaluating {@code xpath} against the page's HTML. */
    private static String textAt(Page page, String xpath) {
        return page.getHtml().xpath(xpath).get();
    }

    /**
     * Starts the crawl from the start listing page with 5 threads, downloading
     * every page through {@link GetAjaxHtml} instead of the default downloader.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Spider spider = Spider.create(new TaobaoPageProcessor());

        Downloader phantomDownloader = new Downloader() {

            @Override
            public void setThread(int threadNum) {
                // Intentionally empty: the thread count is not used here.
            }

            @Override
            public Page download(Request request, Task task) {
                return GetAjaxHtml.download(request);
            }
        };

        spider.setDownloader(phantomDownloader)
                .addUrl(START_URL)
                .thread(5)
                .run();
    }
}
View Code

Model

package csdnblog.model;

/**
 * Mutable data holder for one Taobao item record. All values are kept as
 * strings.
 */
public class Taobao {

    private String id;
    private String maintitle;
    private String subtitle;

    // URL
    private String url;

    // Price
    private String price;

    // Taobao price
    private String taobaoprice;

    // Cumulative review count
    private String ratecounter;

    // Successful transaction count
    private String sellcounter;

    /** Creates an item with every field unset ({@code null}). */
    public Taobao() {
    }

    /**
     * Creates an item with every field supplied.
     *
     * @param id          record identifier
     * @param maintitle   main title
     * @param subtitle    subtitle
     * @param url         URL
     * @param price       price
     * @param taobaoprice Taobao price
     * @param ratecounter cumulative review count
     * @param sellcounter successful transaction count
     */
    public Taobao(String id, String maintitle, String subtitle, String url,
            String price, String taobaoprice, String ratecounter,
            String sellcounter) {
        this.id = id;
        this.maintitle = maintitle;
        this.subtitle = subtitle;
        this.url = url;
        this.price = price;
        this.taobaoprice = taobaoprice;
        this.ratecounter = ratecounter;
        this.sellcounter = sellcounter;
    }

    /** @return the record identifier */
    public String getId() {
        return this.id;
    }

    /** @param id the record identifier */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the main title */
    public String getMaintitle() {
        return this.maintitle;
    }

    /** @param maintitle the main title */
    public void setMaintitle(String maintitle) {
        this.maintitle = maintitle;
    }

    /** @return the subtitle */
    public String getSubtitle() {
        return this.subtitle;
    }

    /** @param subtitle the subtitle */
    public void setSubtitle(String subtitle) {
        this.subtitle = subtitle;
    }

    /** @return the URL */
    public String getUrl() {
        return this.url;
    }

    /** @param url the URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the price */
    public String getPrice() {
        return this.price;
    }

    /** @param price the price */
    public void setPrice(String price) {
        this.price = price;
    }

    /** @return the Taobao price */
    public String getTaobaoprice() {
        return this.taobaoprice;
    }

    /** @param taobaoprice the Taobao price */
    public void setTaobaoprice(String taobaoprice) {
        this.taobaoprice = taobaoprice;
    }

    /** @return the cumulative review count */
    public String getRatecounter() {
        return this.ratecounter;
    }

    /** @param ratecounter the cumulative review count */
    public void setRatecounter(String ratecounter) {
        this.ratecounter = ratecounter;
    }

    /** @return the successful transaction count */
    public String getSellcounter() {
        return this.sellcounter;
    }

    /** @param sellcounter the successful transaction count */
    public void setSellcounter(String sellcounter) {
        this.sellcounter = sellcounter;
    }

    /**
     * Renders all fields as {@code Taobao [id=..., maintitle=..., ...]};
     * unset fields appear as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Taobao [");
        text.append("id=").append(id);
        text.append(", maintitle=").append(maintitle);
        text.append(", subtitle=").append(subtitle);
        text.append(", url=").append(url);
        text.append(", price=").append(price);
        text.append(", taobaoprice=").append(taobaoprice);
        text.append(", ratecounter=").append(ratecounter);
        text.append(", sellcounter=").append(sellcounter);
        return text.append("]").toString();
    }

}
View Code
相關文章
相關標籤/搜索