nodejs 下載網頁及相關資源文件

功能其實很簡單:通過 phantomjs.exe 採集 url 加載的資源,再以子進程的方式啓動 nodejs 下載全部資源;對於 css 資源,還會匹配 css 內容,下載其中 url() 引用的資源。

當然功能還是很簡單的,在響應式設計和異步加載的情況下,還是有不少資源沒能下載,需要根據實際情況處理。

 首先當然是下載 nodejs 和 phantomjs。

下面是 phantomjs.exe 執行的 down.js:

var page = require('webpage').create(),
    system = require('system');
var spawn = require("child_process").spawn

// down.js entry point (runs inside PhantomJS, so ES5 syntax only).
// Opens the page given on the command line, records the URL of every resource
// the page loads, then spawns `node --harmony downHtml.js <urls>` to download
// them. The URLs are handed over as a single comma-separated argument.
if (system.args.length === 1) {
    console.log('Usage: down.js <some URL>');
    phantom.exit(1);
} else {
    var urls = [];
    var seen = {}; // URL -> true, so a resource requested twice is downloaded once
    page.address = system.args[1];
    page.onResourceReceived = function (res) {
        // Each resource is reported several times; 'start' is its first report.
        if (res.stage !== 'start') {
            return;
        }
        // Only http(s) resources can be fetched by the downloader. Others
        // (e.g. data: URIs, which also contain commas) would corrupt the
        // comma-separated argument, so they are dropped here.
        if (!/^https?:\/\//i.test(res.url) || seen[res.url]) {
            return;
        }
        seen[res.url] = true;
        urls.push(res.url);
    };
    page.open(page.address, function (status) {
        if (status !== 'success') {
            console.log('FAIL to load the address');
            phantom.exit(1);
            return;
        }
        console.log('down resource ' + urls.length + ' urls.');
        // A literal comma inside a URL would be mistaken for the separator,
        // so it is percent-encoded before joining.
        var encoded = urls.map(function (u) {
            return u.replace(/,/g, '%2C');
        });
        var child = spawn("node", ["--harmony", "downHtml.js", encoded.join(',')]);
        // Relay the downloader's output to the PhantomJS console.
        child.stdout.on("data", function (data) {
            console.log(data);
        });
        child.stderr.on("data", function (data) {
            console.log(data);
        });
        // Keep PhantomJS alive until the downloader has finished.
        child.on("exit", function (code) {
            phantom.exit();
        });
    });
}

下面是對應的 node 運行的 downHtml.js:

"use strict";
var fs = require('fs');
var http = require('http');
var https = require('https');
var path = require('path');
var r_url = require('url');

// Directories already known to exist (path -> 1); lets repeated calls skip
// the filesystem check.
var dirCache = {};

/**
 * Ensure that a directory exists, creating any missing ancestors first.
 *
 * @param {string} pathStr - Directory path to create.
 * @param {function(Error=)} callback - Called once the attempt has finished.
 *     It receives no argument on success and the error when the directory
 *     could not be created. It runs synchronously on a cache hit.
 */
function makedir (pathStr, callback) {
    if (dirCache[pathStr] === 1) {
        callback();
        return;
    }
    fs.stat(pathStr, function (statErr) {
        if (!statErr) {
            // Already on disk: remember it and report success.
            dirCache[pathStr] = 1;
            callback();
            return;
        }
        var parent = path.dirname(pathStr);
        if (parent === pathStr) {
            // Reached a root that does not exist; recursing would never end.
            callback(statErr);
            return;
        }
        // The parent's outcome is not inspected: if it could not be created,
        // the mkdir below fails and reports the error.
        makedir(parent, function () {
            fs.mkdir(pathStr, function (mkdirErr) {
                // EEXIST means a concurrent call created it first; that is fine.
                if (mkdirErr && mkdirErr.code !== 'EEXIST') {
                    callback(mkdirErr);
                    return;
                }
                dirCache[pathStr] = 1;
                callback();
            });
        });
    });
}

// Finds every url(...) token in a stylesheet, quoted or not, wherever it
// appears in a declaration (e.g. `background: #fff url(a.png)`).
var reg = /url\(\s*(['"]?).*?\1\s*\)/g;
// Splits one url(...) token: m[1] is the optional quote, m[2] the raw reference.
var reg2 = /\(\s*(['"]?)(.*?)\1\s*\)/;
// Resolved resource URL -> 1 once its download has been scheduled.
var isDownMap = {};

/**
 * Fetch a stylesheet and download every resource it references via url(...).
 *
 * Each reference is resolved against the stylesheet's own URL and saved under
 * CWD/<hostname>/<pathname>. References that are not http(s) (data: URIs,
 * bare #fragments) are skipped, and every resource is downloaded at most once.
 * Network and file errors are logged instead of crashing the process.
 *
 * @param {string} URL - Absolute http(s) URL of the stylesheet.
 */
var downImgFromCss = function (URL) {
    // Builds an 'error' listener that reports which step failed.
    var logError = function (what) {
        return function (err) {
            console.error(what + ' failed: ' + err.message);
        };
    };
    // http.get rejects https: URLs, so pick the module matching the protocol.
    var clientFor = function (protocol) {
        return protocol === 'https:' ? https : http;
    };

    clientFor(r_url.parse(URL).protocol).get(URL, function (res) {
        var body = "";
        res.setEncoding('utf8');
        res.on('data', function (chunk) {
            body += chunk;
        });
        res.on('end', function () {
            // match() returns null when the stylesheet has no url(...) at all.
            var tokens = body.match(reg) || [];
            tokens.forEach(function (token) {
                var m = token.match(reg2);
                var ref = m && m[2].trim();
                // Empty references and same-document fragments are not files.
                if (!ref || ref.charAt(0) === '#') {
                    return;
                }
                // The fragment is never sent to the server; dropping it keeps
                // `font.svg#a` and `font.svg#b` from being fetched twice.
                var imgUrl = r_url.resolve(URL, ref).split('#')[0];
                var uo = r_url.parse(imgUrl);
                if (uo.protocol !== 'http:' && uo.protocol !== 'https:') {
                    return; // e.g. data: URIs embedded in the stylesheet
                }
                if (isDownMap[imgUrl]) {
                    return;
                }
                isDownMap[imgUrl] = 1;
                var filepath = CWD + '/' + uo.hostname + uo.pathname;
                makedir(path.dirname(filepath), function () {
                    clientFor(uo.protocol).get(imgUrl, function (imgRes) {
                        var out = fs.createWriteStream(filepath);
                        out.on('error', logError('writing ' + filepath));
                        imgRes.pipe(out);
                    }).on('error', logError('fetching ' + imgUrl));
                });
            });
        });
    }).on('error', logError('fetching ' + URL));
};

// Resource URLs handed over by down.js as one comma-separated CLI argument.
// A missing argument yields an empty list instead of a TypeError.
var URLS = (process.argv[2] || '').split(',').filter(function (entry) {
    return entry !== '';
});
// Download root: every resource is stored under CWD/<hostname>/<pathname>.
var CWD = process.cwd();

if (URLS.length === 0) {
    console.log('Usage: node --harmony downHtml.js <url>[,<url>...]');
}

// Download every resource; stylesheets are additionally scanned for the
// resources they reference.
URLS.forEach(function (URL) {
    var uo = r_url.parse(URL);
    if (uo.protocol !== 'http:' && uo.protocol !== 'https:') {
        console.error('skip unsupported url: ' + URL);
        return;
    }
    // http.get rejects https: URLs, so pick the module matching the protocol.
    var client = uo.protocol === 'https:' ? https : http;

    // A directory-style URL ("/", "/foo/") has no file name of its own;
    // store it as index.html inside that directory.
    var pathname = uo.pathname || '/';
    if (pathname.charAt(pathname.length - 1) === '/') {
        pathname += 'index.html';
    }
    var filepath = CWD + '/' + uo.hostname + pathname;

    makedir(path.dirname(filepath), function () {
        client.get(URL, function (res) {
            var contentType = res.headers["content-type"] || '';
            if (URL.indexOf('.css') != -1 || contentType.indexOf('text/css') != -1) {
                console.log('down images form css file:' + URL + '.');
                downImgFromCss(URL);
            }
            var out = fs.createWriteStream(filepath);
            // Without a listener a single unwritable path would crash the run.
            out.on('error', function (err) {
                console.error('writing ' + filepath + ' failed: ' + err.message);
            });
            res.pipe(out);
        }).on('error', function (err) {
            console.error('fetching ' + URL + ' failed: ' + err.message);
        });
    });
});

 

down.js、downHtml.js 放在同一個文件夾下,通過下列 cmd 命令運行:

D:\phantomjs-2.0.0-windows\bin\phantomjs.exe down.js http://www.youku.com/

相關文章
相關標籤/搜索