Nodejs學習筆記(十一)—數據採集器示例(request和cheerio)

寫在之前

  不少人都有做數據採集的需求,用不同的語言、不同的方式都能實現。我之前也用C#寫過,主要是發送各種請求和用正則解析數據比較繁瑣,整體來講沒什麼不好,就是效率要差一些。

  用nodejs寫採集程序還是比較有效率(可能也只是相對C#來講),今天主要用一個示例來講一下如何使用nodejs實現數據採集器,主要使用到request和cheerio。

  request:用於http請求

  https://github.com/request/request

  cheerio:用於提取request返回的html中需要的信息(和jquery用法一致)

  https://github.com/cheeriojs/cheerio

示例

  單獨去說API用法沒什麼意思,也不必記住所有API,下面開始示例

  還是說點閒話:

  nodejs開發工具還是不少,之前我也很推薦sublime,自從微軟推出了Visual Studio Code後就轉用它去做nodejs開發。

  用它開發還是比較舒服的,免配置、啓動快、自動補全、查看定義和引用、搜索快等,有VS的一貫風格,應該會越做越好,所以推薦一下^_^!

  示例要求

  從 http://36kr.com/ 中抓取其中文章的「標題」、「地址」、「發佈時間」、「封面圖片」

  採集器

  1.創建項目文件夾sampleDAU

  2.創建package.json文件 

{
  "name": "Wilson_SampleDAU",
  "version": "0.0.1",
  "private": false,
  "dependencies": {
    "request":"*",
    "cheerio":"*"   
  }
}

  3.在終端中用npm安裝引用      

cd 項目根目錄
npm install

  4.創建app.js編寫採集器代碼

  首先要用瀏覽器打開要採集的URL,使用開發者工具查看HTML結構,然後根據結構寫解析代碼

/*
* 功能:   數據採集
* 建立人: Wilson
* 時間:   2015-07-29
*/
var request = require('request'),
    cheerio = require('cheerio'), 
    URL_36KR = 'http://36kr.com/';            //36氪 

/**
 * Starts the collector: issues one request for the 36kr home page.
 */
function dataCollectorStartup() {
    dataRequest(URL_36KR);
}

/**
 * Fetches a page with HTTP GET and hands the returned HTML to the parser
 * registered for that URL.
 *
 * Transport errors and non-2xx responses are logged and the body is not
 * parsed, so an error or block page is never mistaken for article markup.
 *
 * @param {string} dataUrl - Address of the page to collect.
 */
function dataRequest(dataUrl)
{
    request({
        url: dataUrl,
        method: 'GET'
    }, function(err, res, body) {
        if (err) {
            console.log(dataUrl);
            console.error('[ERROR]Collection' + err);
            return;
        }

        // A request can succeed at transport level and still return an
        // error page; only a 2xx response carries the HTML we want to parse.
        if (!res || res.statusCode < 200 || res.statusCode >= 300) {
            console.log(dataUrl);
            console.error('[ERROR]Collection unexpected HTTP status ' + (res ? res.statusCode : 'unknown'));
            return;
        }

        // Dispatch on the requested URL so further sites can be added as
        // additional cases.
        switch (dataUrl)
        {
            case URL_36KR:
                dataParse36Kr(body);
                break;
        }
    });
}

/**
 * Parses the HTML of the 36kr home page and prints, for every article whose
 * title element has an href, its title, address, publish time and cover.
 *
 * @param {string} body - HTML text returned by the page request.
 */
function dataParse36Kr(body)
{
    console.log('============================================================================================');
    console.log('======================================36kr==================================================');
    console.log('============================================================================================');

    var $ = cheerio.load(body);

    $('article').each(function (index, articleNode) {
        var articleDom = $(articleNode);
        var descDoms = articleDom.find('.desc');

        // Articles without a .desc block are skipped entirely.
        if (descDoms.length === 0) {
            return;
        }

        var titleDom = descDoms.find('.info_flow_news_title');
        var urlVal = titleDom.attr('href');

        // Nothing is printed when the title element has no href.
        if (urlVal === undefined || urlVal === null) {
            return;
        }

        var titleVal = titleDom.text();

        // The publish time is read from the title attribute of .timeago and
        // converted from milliseconds to seconds.
        var timeVal = descDoms.find('.timeago').attr('title');
        var timeDateSecs = new Date(timeVal).getTime() / 1000;

        // The cover address is the data-lazyload attribute of the
        // article's first child element.
        var coverUrl = articleDom.children().first().attr('data-lazyload');

        console.info('--------------------------------');
        console.info('標題:' + titleVal);
        console.info('地址:' + urlVal);
        console.info('時間:' + timeDateSecs);
        console.info('封面:' + coverUrl);
        console.info('--------------------------------');
    });
}

// Run one collection pass as soon as the script is loaded.
dataCollectorStartup();

  測試結果

  這個採集器就完成了,其實就是request一個get請求,請求回調中會返回body即HTML代碼,通過cheerio庫以和jquery一樣的語法操作解析,取出想要的數據! 

加入代理

  做一個採集器DEMO,上面就基本完成了。如果需要長期使用,爲了防止被網站屏蔽,還是需要加入一個代理列表

  作爲示例,我從網上的免費代理中取出一些,製作成proxylist.js,其中提供一個隨機取一條代理的函數

proxylist.js

// Pool of proxies that GetProxy picks from at random.
// Each entry holds the proxy's "ip" and "port", both stored as strings.
var PROXY_LIST = [{"ip":"111.1.55.136","port":"55336"},{"ip":"111.1.54.91","port":"55336"},{"ip":"111.1.56.19","port":"55336"}
                    ,{"ip":"112.114.63.16","port":"55336"},{"ip":"106.58.63.83","port":"55336"},{"ip":"119.188.133.54","port":"55336"}
                    ,{"ip":"106.58.63.84","port":"55336"},{"ip":"183.95.132.171","port":"55336"},{"ip":"11.12.14.9","port":"55336"}
                    ,{"ip":"60.164.223.16","port":"55336"},{"ip":"117.185.13.87","port":"8080"},{"ip":"112.114.63.20","port":"55336"}
                    ,{"ip":"188.134.19.102","port":"3129"},{"ip":"106.58.63.80","port":"55336"},{"ip":"60.164.223.20","port":"55336"}
                    ,{"ip":"106.58.63.78","port":"55336"},{"ip":"112.114.63.23","port":"55336"},{"ip":"112.114.63.30","port":"55336"}
                    ,{"ip":"60.164.223.14","port":"55336"},{"ip":"190.202.82.234","port":"3128"},{"ip":"60.164.223.15","port":"55336"}
                    ,{"ip":"60.164.223.5","port":"55336"},{"ip":"221.204.9.28","port":"55336"},{"ip":"60.164.223.2","port":"55336"}
                    ,{"ip":"139.214.113.84","port":"55336"} ,{"ip":"112.25.49.14","port":"55336"},{"ip":"221.204.9.19","port":"55336"}
                    ,{"ip":"221.204.9.39","port":"55336"},{"ip":"113.207.57.18","port":"55336"} ,{"ip":"112.25.62.15","port":"55336"}
                    ,{"ip":"60.5.255.143","port":"55336"},{"ip":"221.204.9.18","port":"55336"},{"ip":"60.5.255.145","port":"55336"}
                    ,{"ip":"221.204.9.16","port":"55336"},{"ip":"183.232.82.132","port":"55336"},{"ip":"113.207.62.78","port":"55336"}
                    ,{"ip":"60.5.255.144","port":"55336"} ,{"ip":"60.5.255.141","port":"55336"},{"ip":"221.204.9.23","port":"55336"}
                    ,{"ip":"157.122.96.50","port":"55336"},{"ip":"218.61.39.41","port":"55336"} ,{"ip":"221.204.9.26","port":"55336"}
                    ,{"ip":"112.112.43.213","port":"55336"},{"ip":"60.5.255.138","port":"55336"},{"ip":"60.5.255.133","port":"55336"} 
                    ,{"ip":"221.204.9.25","port":"55336"},{"ip":"111.161.35.56","port":"55336"},{"ip":"111.161.35.49","port":"55336"}
                    ,{"ip":"183.129.134.226","port":"8080"} ,{"ip":"58.220.10.86","port":"80"},{"ip":"183.87.117.44","port":"80"}
                    ,{"ip":"211.23.19.130","port":"80"},{"ip":"61.234.249.107","port":"8118"},{"ip":"200.20.168.140","port":"80"}
                    ,{"ip":"111.1.46.176","port":"55336"},{"ip":"120.203.158.149","port":"8118"},{"ip":"70.39.189.6","port":"9090"} 
                    ,{"ip":"210.6.237.191","port":"3128"},{"ip":"122.155.195.26","port":"8080"}];
        

/**
 * Picks one proxy at random from PROXY_LIST.
 *
 * @returns {string} Proxy address in the form 'http://<ip>:<port>'.
 * @throws {Error} If PROXY_LIST contains no entries.
 */
module.exports.GetProxy = function () {
    // Fail with a clear message instead of a TypeError on an undefined entry.
    if (PROXY_LIST.length === 0) {
        throw new Error('PROXY_LIST is empty');
    }

    // Math.floor already yields an integer index in [0, length); wrapping it
    // in parseInt() only round-tripped the number through a string.
    var randomNum = Math.floor(Math.random() * PROXY_LIST.length);
    var proxy = PROXY_LIST[randomNum];

    return 'http://' + proxy.ip + ':' + proxy.port;
};

  對app.js代碼做如下修改

/*
* 功能:   數據採集
* 建立人: Wilson
* 時間:   2015-07-29
*/
var request = require('request'),
    cheerio = require('cheerio'), 
    URL_36KR = 'http://36kr.com/',            //36氪
    Proxy = require('./proxylist.js'); 

...

/* 數據請求 */
function dataRequest(dataUrl)
{
    request({
        url: dataUrl,    
        proxy: Proxy.GetProxy(),
        method: 'GET'
    }, function(err, res, body) { 
...
        }

}

...

// Run one collection pass immediately, then repeat it every 10000 ms.
dataCollectorStartup()
setInterval(dataCollectorStartup, 10000);

  這樣就改造完成,加入了代理,並且加了setInterval進行定時間隔執行!

請求https

  上面示例中採集的是http請求,如果換成https呢?

  新建app2.js,代碼如下

/*
* 功能:   請求HTTPS
* 建立人: Wilson
* 時間:   2015-07-29
*/
var request = require('request'),   
    URL_INTERFACELIFE = 'https://interfacelift.com/wallpaper/downloads/date/wide_16:10/';

/**
 * Starts the collector: issues one request for the InterfaceLIFT page.
 */
function dataCollectorStartup() {
    dataRequest(URL_INTERFACELIFE);
}

/**
 * Requests dataUrl with HTTP GET and prints the response body.
 * On a request error the URL and the error are logged instead.
 *
 * @param {string} dataUrl - Address to request.
 */
function dataRequest(dataUrl)
{
    var requestOptions = { method: 'GET', url: dataUrl };

    function onResponse(err, res, body) {
        if (!err) {
            console.info(body);
            return;
        }

        console.log(dataUrl);
        console.error('[ERROR]Collection' + err);
    }

    request(requestOptions, onResponse);
}

// Run one request as soon as the script is loaded.
dataCollectorStartup();

  執行會發現返回body中什麼也沒有^_^!

  加入一些代碼再看看

/*
* 功能:   請求HTTPS
* 建立人: Wilson
* 時間:   2015-07-29
*/
var request = require('request'),   
    URL_INTERFACELIFE = 'https://interfacelift.com/wallpaper/downloads/date/wide_16:10/';

/* 開啓數據採集器 */

...

/**
 * Requests dataUrl with HTTP GET, sending a custom User-Agent header, and
 * prints the response body. On a request error the URL and the error are
 * logged instead.
 *
 * @param {string} dataUrl - Address to request.
 */
function dataRequest(dataUrl)
{
    var options = {};
    options.url = dataUrl;
    options.method = 'GET';
    options.headers = { 'User-Agent': 'wilson' };

    request(options, function (err, res, body) {
        if (err) {
            console.log(dataUrl);
            console.error('[ERROR]Collection' + err);
        } else {
            console.info(body);
        }
    });
}

...

  再執行,你會發現body中返回了請求的HTML!(結果就不放上來了,自己執行一下!)

  詳細的請看:https://github.com/request/request#custom-http-headers  

寫在之後

  request庫我還是推薦大家可以多看看API,比如Forms部分我在實際項目測試中就用得比較多!

  比如做接口測試:

  1.提交兩個參數(參數1:字符串  參數2:數字)

// Form POST with two fields (the first a string, the second a number).
// The Chinese identifiers are placeholders for the real field names and
// values. The options object must be closed before the callback argument.
request.post({url:'接口URL', form: {參數一名稱:'參數一值', 參數二名稱:參數二值}}, function(err,res,body){

            if(err)
            {
                // Report the failure instead of dropping it silently.
                console.error('[ERROR]Request' + err);
                return;
            }
            console.log(body);
});

  body就是接口返回

  2.提交一個字符串參數,提交一個文件參數(比如上傳頭像等)

// Upload with one string field and one file field.
// fs is needed for createReadStream and was never required in the snippet.
var fs = require('fs');

var r = request.post('接口URL',function(err,res,body){
        if(err)
        {
            // Report the failure instead of dropping it silently.
            console.error('[ERROR]Request' + err);
            return;
        }
        console.log(body);
});
// The fields are appended to the form object obtained from the pending request.
var form = r.form();
form.append('參數一名稱', '參數一值');
form.append('參數二名稱', fs.createReadStream('1.jpg'), {filename: '1.jpg'});

  cheerio庫真沒什麼好講的,會jquery就行,它的api基本都不用看!

 

此系列的源代碼可到http://bijian1013.iteye.com/blog/2425085下載。 

文章來源:https://www.cnblogs.com/zhongweiv/p/node_request_cheerio.html

相關文章
相關標籤/搜索