還是參考了這篇文章:
http://cnodejs.org/topic/54bdaac4514ea9146862abee
另外,上面這篇文章裡有用 Node.js 抓取網易公開課的一些經驗。
代碼如下,注意其中用到了 http 獲取網頁結果,request 進行 http 請求,cheerio 進行解析,mkdirp 建立目錄,fs 建立文件,iconv-lite 進行格式轉換(此例非必須)。
curl.js:
/**
 * Created by baidu on 16/10/17.
 */
var http = require("http");

/**
 * Fetch `url` with an HTTP GET and hand the raw response body to `callback`.
 *
 * @param {string} url - Address to fetch.
 * @param {function(?Array, Error=)} callback - Invoked exactly once.
 *   On success it receives the array of response chunks in arrival order.
 *   On failure it receives `null` as the first argument (so a plain
 *   truthiness check in the caller detects the failure) and the error as the
 *   second argument.
 */
function download(url, callback) {
    var chunks = [];
    var settled = false;

    // Report a failure once; later events for the same request are ignored.
    function fail(err) {
        if (settled) {
            return;
        }
        settled = true;
        callback(null, err);
    }

    http.get(url, function (res) {
        res.on('data', function (chunk) {
            chunks.push(chunk);
        });
        res.on('end', function () {
            if (settled) {
                return;
            }
            settled = true;
            callback(chunks);
        });
        // The response stream can fail after headers have arrived.
        res.on('error', fail);
    }).on('error', fail);
}

exports.download = download;
saveimage.js:
/**
 * Created by baidu on 16/10/17.
 */
var fs = require('fs');
var request = require('request');

/**
 * Start streaming the resource at `url` into the file `filename`.
 *
 * Both log lines are printed synchronously; the function returns as soon as
 * the pipe has been set up, without waiting for the transfer.
 *
 * @param {string} url - Address of the image to fetch.
 * @param {string} filename - Path of the file to write.
 */
function saveImage(url, filename) {
    console.log('Image=>' + url);

    // Create the request first, then the file stream, then connect them.
    var source = request(url);
    var target = fs.createWriteStream(filename);
    source.pipe(target);

    console.log('Save=>' + filename);
}

exports.saveImage = saveImage;
HelloWorld.js:
/**
 * Created by baidu on 16/10/17.
 *
 * Downloads one listing page, extracts the image URLs found under `a.img`
 * elements and saves each image into `dir`.
 */
console.log("Hello World");

var cheerio = require('cheerio');
var curl = require('./curl');
var iconv = require('iconv-lite');
var mkdirp = require('mkdirp');
var saveimage = require('./saveimage');

//var url = 'http://open.163.com/special/opencourse/englishs1.html';
var url = 'http://loftermeirenzhi.lofter.com/tag/%E4%BA%BA%E5%83%8F?page=';
var dir = './images';

/**
 * Build the local file name for an image URL: up to 10 characters preceding
 * the first ".jpg", plus the ".jpg" extension itself.
 *
 * @param {string} src - Image URL.
 * @returns {?string} The file name, or `null` when `src` contains no ".jpg".
 */
function fileNameFor(src) {
    var extAt = src.indexOf('.jpg');
    if (extAt === -1) {
        return null;
    }
    // Clamp at 0 so a ".jpg" near the start of the URL cannot yield a
    // negative start offset.
    var start = Math.max(extAt - 10, 0);
    return src.substring(start, extAt + 4);
}

// Start the download only after the target directory exists, so that the
// image files can always be created.
mkdirp(dir, function (err) {
    if (err) {
        console.log(err);
        return;
    }

    curl.download(url, function (chunks) {
        // An empty chunk list means nothing was received; treat it like a
        // missing result.
        if (!chunks || chunks.length === 0) {
            console.log('error');
            return;
        }

        // The page bytes are decoded as GBK before being parsed.
        var data = iconv.decode(Buffer.concat(chunks), 'gbk');
        var $ = cheerio.load(data);

        $('a.img').each(function (i, e) {
            var item = $(e).children('img').last().attr('src');
            if (!item) {
                // Link without an <img src>: nothing to download.
                return;
            }
            var name = fileNameFor(item);
            if (name === null) {
                console.log('Skip=>' + item);
                return;
            }
            saveimage.saveImage(item, dir + '/' + name);
        });

        // Printed once every download has been started.
        console.log('done');
    });
});
運行之後,發現下載的圖片文件基本上都是空的。
看了例子,將 saveimage.js 中的 request 部分做了一些修改,如下:
/**
 * Created by baidu on 16/10/17.
 */
var fs = require('fs');
var request = require('request');

/**
 * Download the image at `url` into the file `filename`.
 *
 * A HEAD request is issued first; the actual download starts only from its
 * callback. Failures of the HEAD request, of the download, or of the file
 * write are logged instead of surfacing as unhandled 'error' events.
 *
 * @param {string} url - Address of the image to fetch.
 * @param {string} filename - Path of the file to write.
 */
var saveImage = function (url, filename) {
    console.log('Image=>' + url);

    request.head(url, function (err, res, body) {
        if (err) {
            // The URL could not be reached at all; do not attempt the GET.
            console.log('Failed=>' + url + ' (' + err.message + ')');
            return;
        }

        request(url)
            .on('error', function (downloadErr) {
                console.log('Failed=>' + url + ' (' + downloadErr.message + ')');
            })
            .pipe(fs.createWriteStream(filename))
            // pipe() returns the destination, so this listens on the file stream.
            .on('error', function (writeErr) {
                console.log('Failed=>' + filename + ' (' + writeErr.message + ')');
            });
    });

    // Logged immediately, i.e. before the HEAD callback above has run.
    console.log('Save=>' + filename);
};

exports.saveImage = saveImage;
然後運行,成功,打印:
/usr/local/bin/node /Users/baidu/Documents/Data/Work/Code/Self/nodejs/helloworld/HelloWorld.js Hello World Image=>http://imgsize.ph.126.net/?imgurl=http://img2.ph.126.net/CiL5IULFm0TtZBjxnhcfQQ==/52072870709354180.jpg_110x110x0x90.jpg Save=>./images/0709354180.jpg Image=>http://imglf1.nosdn.127.net/img/SzZqcDg4Rk01VGo5cW81TEorTU5zL2dCbjBLbktBODlCSkFGSXlIdEw5dEFvSDlGaTNjZmJ3PT0.jpg?imageView&thumbnail=500x0&quality=96&stripmeta=0&type=jpg Save=>./images/TNjZmJ3PT0.jpg ...... done
然後項目目錄中生成了 images 目錄,其中有美女圖片:
對於上面這個改動為什麼能起到效果,還不是特別明白。(HEAD 請求通常用來判斷 URL 是否有效。)
加了 head 之後成功,也有可能是因為第一次圖片雖然沒下載成功,但是已經啓動了下載,做了緩存。實驗了一下,在成功一次之後,把 head 調用去掉:
// Experiment: with the HEAD pre-check commented out, only the direct download below runs.
//request.head(url, function(err, res, body) {
request(url).pipe(fs.createWriteStream(filename));
//});
發現還是能夠成功。所以很有可能是圖片加載延遲造成的。
有時間的時候,要看一下怎樣避免圖片下載超時導致下載失敗的問題,有沒有設置超時的地方。
好像在 request 初始化的時候,可以設置:
// jurl and xxx are placeholders; neither is defined in this snippet.
// NOTE(review): timeout is presumably given in milliseconds — confirm against the request docs.
request({ url: jurl, gzip: true, timeout: xxx })
後面再學習 JavaScript Request 以及渲染的一些內容,尤其是 phantomjs 渲染動態網頁的方式。