利用PhantomJS做網頁截圖經濟適用,但其API較少,做其他功能就比較吃力了。例如,其自帶的Web Server Mongoose最高只能同時支持10個請求,期望它能獨立成爲一個服務是不怎麼實際的。因此這裏需要另外一種語言來支撐服務,這裏選用NodeJS來完成。
$ phantomjs
// Minimal PhantomJS screenshot example: open one page and render it to a PNG.
var webpage = require('webpage')
  , page = webpage.create();

// Size of the virtual browser window the page is laid out in.
page.viewportSize = { width: 1024, height: 800 };

// Rectangle of the page that gets rendered: 1024x800 starting at (0, 0).
page.clipRect = { top: 0, left: 0, width: 1024, height: 800 };

// JavaScript off, images on, custom user agent string.
page.settings = {
  javascriptEnabled: false,
  loadImages: true,
  userAgent: 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 (KHTML, like Gecko) PhantomJS/19.0'
};

page.open('http://www.baidu.com', function (status) {
  if (status === 'fail') {
    console.log('open page fail!');
  } else {
    page.render('./snapshot/test.png');
  }

  // release the memory
  page.close();
});
這裏咱們設置了窗口大小爲1024 * 800:
// Window size: 1024 wide, 800 high.
page.viewportSize = {
  width: 1024,
  height: 800
};
截取從(0, 0)爲起點的1024 * 800大小的圖像:
// Captured rectangle: starts at (0, 0) and spans 1024 x 800.
page.clipRect = {
  top: 0,
  left: 0,
  width: 1024,
  height: 800
};
禁止Javascript,容許圖片載入,並將userAgent改成"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 (KHTML, like Gecko) PhantomJS/19.0":
// Page settings: JavaScript disabled, image loading enabled, custom user agent.
page.settings = {
  javascriptEnabled: false,
  loadImages: true,
  userAgent: 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 (KHTML, like Gecko) PhantomJS/19.0'
};
phantomjs snapshot.js http://www.baidu.com
值得注意的是,PhantomJS 1.9.0已經支持WebSocket了,可惜只是hixie-76版本的WebSocket,不過畢竟還是提供了一種由NodeJS主動向PhantomJS通信的方案。
I will answer that question with a question. How do you communicate with a process that doesn't support shared memory, sockets, FIFOs, or standard input?
Well, there's one thing PhantomJS does support, and that's opening webpages. In fact, it's really good at opening web pages. So we communicate with PhantomJS by spinning up an instance of ExpressJS, opening Phantom in a subprocess, and pointing it at a special webpage that turns socket.io messages into alert() calls. Those alert() calls are picked up by Phantom and there you go!
The communication itself happens via James Halliday's fantastic dnode library, which fortunately works well enough when combined with browserify to run straight out of PhantomJS's pidgin Javascript environment.
module.exports = (function () { "use strict" var cluster = require('cluster') , fs = require('fs'); if(!fs.existsSync('./snapshot')) { fs.mkdirSync('./snapshot'); } if (cluster.isMaster) { cluster.fork(); cluster.on('exit', function (worker) { console.log('Worker' + worker.id + ' died :('); process.nextTick(function () { cluster.fork(); }); }) } else { require('./extract.js'); } })();
module.exports = (function () { "use strict" var connect = require('connect') , fs = require('fs') , spawn = require('child_process').spawn , jobMan = require('./lib/jobMan.js') , bridge = require('./lib/bridge.js') , pkg = JSON.parse(fs.readFileSync('./package.json')); var app = connect() .use(connect.logger('dev')) .use('/snapshot', connect.static(__dirname + '/snapshot', { maxAge: pkg.maxAge })) .use(connect.bodyParser()) .use('/bridge', bridge) .use('/api', function (req, res, next) { if (req.method !== "POST" || !req.body.campaignId) return next(); if (!req.body.urls || !req.body.urls.length) return jobMan.watch(req.body.campaignId, req, res, next); var campaignId = req.body.campaignId , imagesPath = './snapshot/' + campaignId + '/' , urls = [] , url , imagePath; function _deal(id, url, imagePath) { // just push into urls list urls.push({ id: id, url: url, imagePath: imagePath }); } for (var i = req.body.urls.length; i--;) { url = req.body.urls[i]; imagePath = imagesPath + i + '.png'; _deal(i, url, imagePath); } jobMan.register(campaignId, urls, req, res, next); var snapshot = spawn('phantomjs', ['snapshot.js', campaignId]); snapshot.stdout.on('data', function (data) { console.log('stdout: ' + data); }); snapshot.stderr.on('data', function (data) { console.log('stderr: ' + data); }); snapshot.on('close', function (code) { console.log('snapshot exited with code ' + code); }); }) .use(connect.static(__dirname + '/html', { maxAge: pkg.maxAge })) .listen(pkg.port, function () { console.log('listen: ' + 'http://localhost:' + pkg.port); }); })();
module.exports = (function () { "use strict" var jobMan = require('./jobMan.js') , fs = require('fs') , pkg = JSON.parse(fs.readFileSync('./package.json')); return function (req, res, next) { if (req.headers.secret !== pkg.secret) return next(); // Snapshot APP can post url information if (req.method === "POST") { var body = JSON.parse(JSON.stringify(req.body)); jobMan.fire(body); res.end(''); // Snapshot APP can get the urls should extract } else { var urls = jobMan.getUrls(req.url.match(/campaignId=([^&]*)(\s|&|$)/)[1]); res.writeHead(200, {'Content-Type': 'application/json'}); res.statuCode = 200; res.end(JSON.stringify({ urls: urls })); } }; })();
若是request method爲POST,則咱們認爲PhantomJS正在給咱們推送job的相關信息。而爲GET時,則認爲其要獲取job的信息。
module.exports = (function () { "use strict" var fs = require('fs') , fetch = require('./fetch.js') , _jobs = {}; function _send(campaignId){ var job = _jobs[campaignId]; if (!job) return; if (job.waiting) { job.waiting = false; clearTimeout(job.timeout); var finished = (job.urlsNum === job.finishNum) , data = { campaignId: campaignId, urls: job.urls, finished: finished }; job.urls = []; var res = job.res; if (finished) { _jobs[campaignId] = null; delete _jobs[campaignId] } res.writeHead(200, {'Content-Type': 'application/json'}); res.statuCode = 200; res.end(JSON.stringify(data)); } } function register(campaignId, urls, req, res, next) { _jobs[campaignId] = { urlsNum: urls.length, finishNum: 0, urls: [], cacheUrls: urls, res: null, waiting: false, timeout: null }; watch(campaignId, req, res, next); } function watch(campaignId, req, res, next) { _jobs[campaignId].res = res; // 20s timeout _jobs[campaignId].timeout = setTimeout(function () { _send(campaignId); }, 20000); } function fire(opts) { var campaignId = opts.campaignId , job = _jobs[campaignId] , fetchObj = fetch(opts.html); if (job) { if (+opts.status && fetchObj.title) { job.urls.push({ id: opts.id, url: opts.url, image: opts.image, title: fetchObj.title, description: fetchObj.description, status: +opts.status }); } else { job.urls.push({ id: opts.id, url: opts.url, status: +opts.status }); } if (!job.waiting) { job.waiting = true; setTimeout(function () { _send(campaignId); }, 500); } job.finishNum ++; } else { console.log('job can not found!'); } } function getUrls(campaignId) { var job = _jobs[campaignId]; if (job) return job.cacheUrls; } return { register: register, watch: watch, fire: fire, getUrls: getUrls }; })();
/**
 * Extracts the page title and the description meta content from raw HTML
 * using regular expressions (no DOM parsing, no entity decoding).
 *
 * @param {string} html raw HTML of a page
 * @returns {{title: (string|boolean), description: (string|boolean)}}
 *   - { title: false, description: false } when `html` is empty/falsy
 *   - otherwise the trimmed <title> text, or 'No Title' when it is missing or
 *     empty, and the content of the first description meta tag that has a
 *     non-empty content attribute, or 'No Description'
 */
var extractPageInfo = (function () {
  "use strict";

  // <title>, optionally with attributes, any letter case, may span lines
  var TITLE_RE = /<title(?:\s[^>]*)?>([\s\S]*?)<\/title>/i
    , META_RE = /<meta\s[^>]*>/gi
    // name=description with double, single or no quotes, any letter case
    , DESCRIPTION_NAME_RE = /\sname\s*=\s*(?:"description"|'description'|description(?=[\s\/>]))/i
    // the leading \s keeps attributes such as data-content from matching
    , CONTENT_RE = /\scontent\s*=\s*(?:"([^"]*)"|'([^']*)')/i;

  return function (html) {
    if (!html) return { title: false, description: false };

    var text = String(html)
      , titleMatch = TITLE_RE.exec(text)
      , metas = text.match(META_RE) || []
      , title = titleMatch ? titleMatch[1].trim() : ''
      , description = ''
      , contentMatch;

    for (var i = 0; i < metas.length && !description; i++) {
      if (!DESCRIPTION_NAME_RE.test(metas[i])) continue;

      // a description meta without a quoted content attribute is skipped
      // rather than dereferencing a null match
      contentMatch = CONTENT_RE.exec(metas[i]);
      if (contentMatch) description = contentMatch[1] || contentMatch[2] || '';
    }

    return {
      title: title || 'No Title',
      description: description || 'No Description'
    };
  };
})();

// Export guarded so the function can also be loaded where no CommonJS
// `module` object exists.
if (typeof module !== 'undefined' && module.exports) {
  module.exports = extractPageInfo;
}
/**
 * PhantomJS script: phantomjs snapshot.js <campaignId>
 *
 * 1. GETs <bridge>?campaignId=<campaignId> and reads the `urls` list from
 *    the JSON answer.
 * 2. Opens every url, renders it to its imagePath and queues one result.
 * 3. POSTs the queued results to <bridge> one at a time.
 * 4. Exits once as many results were posted as urls were received.
 *
 * Port and secret are read from ./package.json.
 */
var webpage = require('webpage')
  , args = require('system').args
  , fs = require('fs')
  , campaignId = args[1]
  , pkg = JSON.parse(fs.read('./package.json'))
  , bridgeUrl = 'http://localhost:' + pkg.port + '/bridge';

/**
 * Opens `url`, renders it to `imagePath` on success and queues the result
 * (status 1 with html/image, or status 0 on failure) for posting.
 */
function snapshot(id, url, imagePath) {
  var page = webpage.create();

  page.viewportSize = { width: 1024, height: 800 };
  page.clipRect = { top: 0, left: 0, width: 1024, height: 800 };
  page.settings = {
    javascriptEnabled: false,
    loadImages: true,
    userAgent: 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 (KHTML, like Gecko) PhantomJS/1.9.0'
  };

  page.open(url, function (status) {
    var data;

    if (status === 'fail') {
      data = [
        'campaignId=', campaignId,
        '&url=', encodeURIComponent(url),
        '&id=', id,
        '&status=', 0
      ].join('');
    } else {
      page.render(imagePath);

      // callback NodeJS
      data = [
        'campaignId=', campaignId,
        '&html=', encodeURIComponent(page.content),
        '&url=', encodeURIComponent(url),
        '&image=', encodeURIComponent(imagePath),
        '&id=', id,
        '&status=', 1
      ].join('');
    }

    // Failures go through the same queue as successes, so they are counted
    // towards the exit condition as well.
    postMan.post(data);

    // release the memory
    page.close();
  });
}

/**
 * Serialises the result posts over a single page object.
 *   datas       queue of url-encoded bodies waiting to be posted
 *   posting     true while the queue is being drained
 *   len         number of urls in the job
 *   currentNum  number of results posted so far
 */
var postMan = {
  postPage: null,
  posting: false,
  datas: [],
  len: 0,
  currentNum: 0,

  /** Fetches the job's url list and starts one snapshot per url. */
  init: function (snapshot) {
    // `this` inside the page callbacks is not postMan, so keep a reference
    var that = this
      , postPage = webpage.create();

    postPage.customHeaders = {
      'secret': pkg.secret
    };

    postPage.open(bridgeUrl + '?campaignId=' + campaignId, function (status) {
      var urls
        , url;

      if (status === 'fail') {
        console.log('can not load the url list from bridge');
        phantom.exit(1);
        return;
      }

      try {
        urls = JSON.parse(postPage.plainText).urls || [];
      } catch (e) {
        console.log('bridge returned an unexpected answer: ' + e.message);
        phantom.exit(1);
        return;
      }

      that.len = urls.length;

      // nothing to do: no result will ever be posted, so exit right here
      if (!that.len) {
        postPage.close();
        phantom.exit();
        return;
      }

      for (var i = that.len; i--;) {
        url = urls[i];
        snapshot(url.id, url.url, url.imagePath);
      }
    });

    this.postPage = postPage;
  },

  /** Queues one result and starts draining the queue if it is idle. */
  post: function (data) {
    this.datas.push(data);

    if (!this.posting) {
      this.posting = true;
      this.fire();
    }
  },

  /** Posts the next queued result; exits after the last one. */
  fire: function () {
    var that = this
      , data;

    if (!this.datas.length) {
      this.posting = false;
      return;
    }

    data = this.datas.shift();

    this.postPage.open(bridgeUrl, 'POST', data, function () {
      that.currentNum++;
      that.fire();

      // kill child process
      if (that.currentNum === that.len) {
        setTimeout(function () {
          that.postPage.close();
          phantom.exit();
        }, 500);
      }
    });
  }
};

postMan.init(snapshot);