Phantomjs 根據Casperjs源碼拓展download方法

最近項目在使用Phantomjs做自動化檢測時,有一個需求,需要下載檢測網站的全部資源,包括css、js和圖片資源,方便人工分析時能夠把整個page還原。可惜,Phantomjs並無直接提供download()這樣的方法。查找資料後發現Casperjs有一個download的方法,能夠把任意url的內容下載爲字符串。因爲Casperjs是根據Phantomjs開發的,所以從Casperjs的源碼上分析,可能會獲得一點啓發。

 

目的:根據Casperjs源碼,拓展Phantomjs,添加download方法

 

1. 先測試Casperjs的download方法[1]

 // Minimal CasperJS script: opens http://www.baidu.com/ and, inside the start callback, downloads http://www.w3school.com.cn/ to w3school.html. webSecurityEnabled is switched off in pageSettings (presumably so the cross-domain request is allowed - confirm).
 1 var casper = require('casper').create({  2  pageSettings : {  3         webSecurityEnabled: false
 4  }  5 });  6 
 7 casper.start('http://www.baidu.com/', function() {  8     this.download('http://www.w3school.com.cn/', 'w3school.html');  9 }); 10 
11 casper.run();

保存爲D:/script.js,在命令行執行(casperjs D:/script.js)。Casperjs須要Phantomjs,請確保已安裝Phantomjs v1.x版本。

 

2. 分析Casperjs源碼

download方法在casper模塊裏,打開源碼包下modules/casper.js,先找到download這個方法體(#592行)

 1 /**  2  * Downloads a resource and saves it on the filesystem.  3  *  4  * @param String url The url of the resource to download  5  * @param String targetPath The destination file path  6  * @param String method The HTTP method to use (default: GET)  7  * @param String data Optional data to pass performing the request  8  * @return Casper  9  */
10 Casper.prototype.download = function download(url, targetPath, method, data) { 11     "use strict"; 12     this.checkStarted();    //defined at line #426 of casper.js; checks whether this Casper instance has been started
13     var cu = require('clientutils').create(utils.mergeObjects({}, this.options)); 14     try { 15         fs.write(targetPath, cu.decode(this.base64encode(url, method, data)), 'wb'); 16         this.emit('downloaded.file', targetPath); 17         this.log(f("Downloaded and saved resource in %s", targetPath)); 18     } catch (e) { 19         this.log(f("Error while downloading %s to %s: %s", url, targetPath, e), "error"); 20  } 21     return this; 22 };

上面源碼中,cu爲'clientutils'模塊的實例,用於decode(),具體功能後面再講述。第#16行中,emit()在events模塊中(與this綁定的語句在源碼#226行),功能爲觸發'downloaded.file'事件以通知監聽者,與下面的this.log()同樣,對download功能沒大影響。所以核心語句在fs.write()中,url的內容在this.base64encode中獲取。

再找base64encode這個方法,在源碼#255行,返回callUtils('getBase64', url, method, data)。callUtils在#283行。

 1 /**  2  * Invokes a client side utils object method within the remote page, with arguments.  3  *  4  * @param {String} method Method name  5  * @return {...args} Arguments  6  * @return {Mixed}  7  * @throws {CasperError} If invokation failed.  8  */
 9 Casper.prototype.callUtils = function callUtils(method) { 10     "use strict"; 11     var args = [].slice.call(arguments, 1); //collect every argument after method into args
12     var result = this.evaluate(function(method, args) { 13         return __utils__.__call(method, args); 14  }, method, args); 15     if (utils.isObject(result) && result.__isCallError) { 16         throw new CasperError(f("callUtils(%s) with args %s thrown an error: %s", 17  method, args, result.message)); 18  } 19     return result; 20 };

此時的method的值爲「getBase64」,估計是一個方法名。這個方法核心語句在this.evaluate(),具體執行爲this.evaluate(fn, "getBase64", [url, method, data])。evaluate()在#689行。

 1 /**  2  * Evaluates an expression in the page context, a bit like what  3  * WebPage#evaluate does, but the passed function can also accept  4  * parameters if a context Object is also passed:  5  *  6  * casper.evaluate(function(username, password) {  7  * document.querySelector('#username').value = username;  8  * document.querySelector('#password').value = password;  9  * document.querySelector('#submit').click(); 10  * }, 'Bazoonga', 'baz00nga'); 11  * 12  * @param Function fn The function to be evaluated within current page DOM 13  * @param Object context Object containing the parameters to inject into the function 14  * @return mixed 15  * @see WebPage#evaluate 16  */
 17  //actual call in this walkthrough: evaluate(fn, 'getBase64', [url, method, data])
 18  //i.e. context='getBase64', arguments.length=3
19 Casper.prototype.evaluate = function evaluate(fn, context) { 20     "use strict"; 21     this.checkStarted(); 22     console.log("context:"+context); 23     
24     if (!utils.isFunction(fn) && !utils.isString(fn)) {
25         throw new CasperError("evaluate() only accepts functions or strings"); 26  } 27     
28     this.injectClientUtils();       //inject clientutils.js (examined in detail further below)
29     
30     if (arguments.length === 1) { 31         return utils.clone(this.page.evaluate(fn)); 32     } else if (arguments.length === 2) { 33         // check for closure signature if it matches context
34         if (utils.isObject(context) && eval(fn).length === Object.keys(context).length) { 35             context = utils.objectValues(context); 36         } else { 37             context = [context]; 38  } 39     } else {        //arguments.length==3, so this is the branch actually taken
40         // phantomjs-style signature
41         context = [].slice.call(arguments).slice(1); 42  } 43     //at this point context = ['getBase64', [url, method, data]]
44     //[fn].concat(context) = [fn, 'getBase64', [url, method, data]]
45     return utils.clone(this.page.evaluate.apply(this.page, [fn].concat(context))); 46 };

以上第#28行注入了clientutils.js,具體實現方法下面再分析。第#17和#18行說明調用本方法時的參數狀況,根據參數個數,實際執行到#39行,詳細說明在#43和#44行的註釋。所以,#45行相當於執行this.page.evaluate(fn, 'getBase64', [url, method, data])。fn在callUtils中定義了,最終效果相當於:

// Equivalent direct call: runs __utils__.__call inside the page with method='getBase64' and args=[url, method, data].
1 this.page.evaluate(function(method, args) { 2     return __utils__.__call(method, args); 3 }, 'getBase64', [url, method, data])

其中,function中的method='getBase64',args=[url, method, data]。因此最後,這句的意義等於在page中注入腳本執行__utils__.__call('getBase64', [url, method, data])。

再回頭看,__utils__對象在以上#28行this.injectClientUtils()中注入的,injectClientUtils在#1256行。

 1 /**  2  * Injects Client-side utilities in current page context.  3  *  4  */
 5 Casper.prototype.injectClientUtils = function injectClientUtils() {  6     "use strict";  7     this.checkStarted();  8     //make sure the utilities are not injected twice
 9     var clientUtilsInjected = this.page.evaluate(function() { 10         return typeof __utils__ === "object"; 11  }); 12     if (true === clientUtilsInjected) { 13         return; 14  } 15     var clientUtilsPath = require('fs').pathJoin(phantom.casperPath, 'modules', 'clientutils.js'); 16     if (true === this.page.injectJs(clientUtilsPath)) { 17         this.log("Successfully injected Casper client-side utilities", "debug"); 18     } else { 19         this.warn("Failed to inject Casper client-side utilities"); 20  } 21     // ClientUtils and Casper shares the same options
22     // These are not the lines I'm the most proud of in my life, but it works.
23     /*global __options*/
24     this.page.evaluate(function() { 25         window.__utils__ = new window.ClientUtils(__options); 26     }.toString().replace('__options', JSON.stringify(this.options))); 27 };

以上代碼很好解釋。先檢查有沒有__utils__對象,若是有說明已經注入clientutils了。若沒有則注入clientutils.js,並新建ClientUtils對象,取名爲__utils__。所以,下一步應該看clientutils.js。

在clientutils.js中,找到__call方法,在#70行。

 1 /**  2  * Calls a method part of the current prototype, with arguments.  3  *  4  * @param {String} method Method name  5  * @param {Array} args arguments  6  * @return {Mixed}  7  */
 // A request to call "__call" itself is ignored (returns undefined). Any exception thrown by the target method is caught, flagged with __isCallError = true and returned instead of being rethrown.
 8 this.__call = function __call(method, args) {  9     if (method === "__call") { 10         return; 11  } 12     try { 13         return this[method].apply(this, args); 14     } catch (err) { 15         err.__isCallError = true; 16         return err; 17  } 18 };

核心在#13行,很好理解,即執行method指定的方法,並返回結果。回顧上面,method爲'getBase64',所以再找到getBase64方法,在#364行,其引用的getBinary()在下一個方法。getBinary()引用this.sendAJAX()。

至此整個下載過程的原理已經很清楚了,就是在page中注入腳本,利用跨域同步AJAX取得指定url的內容,而後再返回給Casperjs。sendAJAX則新建XMLHttpRequest來發出請求,這裏不詳細講解。

 

3. 拓展download模塊

首先模仿clientutils封裝client模塊,保存爲modules/client.js。

 1 /*
 2  * 用於phantomjs引用或注入page  3  */
 4 (function(exports) {  5     "use strict";  6 
 7     exports.create = function create() {  8         return new this.Client();  9  }  10 
 11     exports.Client = function Client() {  12         var BASE64_ENCODE_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";  13         var BASE64_DECODE_CHARS = new Array(  14             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  15             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  16             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,  17             52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,  18             -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  19             15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,  20             -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,  21             41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1
 22  );  23 
 24         /**  25  * Performs an AJAX request.  26  *  27  * @param String url Url.  28  * @param String method HTTP method (default: GET).  29  * @param Object data Request parameters.  30  * @param Boolean async Asynchroneous request? (default: false)  31  * @param Object settings Other settings when perform the ajax request  32  * @return String Response text.  33          */
 34         this.sendAJAX = function sendAJAX(url, method, data, async, settings) {  35             var xhr = new XMLHttpRequest(),  36                 dataString = "",  37                 dataList = [];  38             method = method && method.toUpperCase() || "GET";  39             var contentType = settings && settings.contentType || "application/x-www-form-urlencoded";  40             xhr.open(method, url, !!async);  41             xhr.overrideMimeType("text/plain; charset=x-user-defined");  42             if (method === "POST") {  43                 if (typeof data === "object") {  44                     for (var k in data) {  45                         dataList.push(encodeURIComponent(k) + "=" + encodeURIComponent(data[k].toString()));  46  }  47                     dataString = dataList.join('&');  48                 } else if (typeof data === "string") {  49                     dataString = data;  50  }  51                 xhr.setRequestHeader("Content-Type", contentType);  52  }  53             xhr.send(method === "POST" ? dataString : null);  54             return this.encode(xhr.responseText);  55  };  56 
 57         /**  58  * Base64 encodes a string, even binary ones. Succeeds where  59  * window.btoa() fails.  60  *  61  * @param String str The string content to encode  62  * @return string  63          */
 64         this.encode = function encode(str) {  65             /*jshint maxstatements:30 */
 66             var out = "", i = 0, len = str.length, c1, c2, c3;  67             while (i < len) {  68                 c1 = str.charCodeAt(i++) & 0xff;  69                 if (i === len) {  70                     out += BASE64_ENCODE_CHARS.charAt(c1 >> 2);  71                     out += BASE64_ENCODE_CHARS.charAt((c1 & 0x3) << 4);  72                     out += "==";  73                     break;  74  }  75                 c2 = str.charCodeAt(i++);  76                 if (i === len) {  77                     out += BASE64_ENCODE_CHARS.charAt(c1 >> 2);  78                     out += BASE64_ENCODE_CHARS.charAt(((c1 & 0x3)<< 4) | ((c2 & 0xF0) >> 4));  79                     out += BASE64_ENCODE_CHARS.charAt((c2 & 0xF) << 2);  80                     out += "=";  81                     break;  82  }  83                 c3 = str.charCodeAt(i++);  84                 out += BASE64_ENCODE_CHARS.charAt(c1 >> 2);  85                 out += BASE64_ENCODE_CHARS.charAt(((c1 & 0x3) << 4) | ((c2 & 0xF0) >> 4));  86                 out += BASE64_ENCODE_CHARS.charAt(((c2 & 0xF) << 2) | ((c3 & 0xC0) >> 6));  87                 out += BASE64_ENCODE_CHARS.charAt(c3 & 0x3F);  88  }  89             return out;  90  };  91 
 92         /**  93  * Decodes a base64 encoded string. Succeeds where window.atob() fails.  94  *  95  * @param String str The base64 encoded contents  96  * @return string  97          */
 98         this.decode = function decode(str) {  99             /*jshint maxstatements:30, maxcomplexity:30 */
100             var c1, c2, c3, c4, i = 0, len = str.length, out = ""; 101             while (i < len) { 102                 do { 103                     c1 = BASE64_DECODE_CHARS[str.charCodeAt(i++) & 0xff]; 104                 } while (i < len && c1 === -1); 105                 if (c1 === -1) { 106                     break; 107  } 108                 do { 109                     c2 = BASE64_DECODE_CHARS[str.charCodeAt(i++) & 0xff]; 110                 } while (i < len && c2 === -1); 111                 if (c2 === -1) { 112                     break; 113  } 114                 out += String.fromCharCode((c1 << 2) | ((c2 & 0x30) >> 4)); 115                 do { 116                     c3 = str.charCodeAt(i++) & 0xff; 117                     if (c3 === 61) 118                     return out; 119                     c3 = BASE64_DECODE_CHARS[c3]; 120                 } while (i < len && c3 === -1); 121                 if (c3 === -1) { 122                     break; 123  } 124                 out += String.fromCharCode(((c2 & 0XF) << 4) | ((c3 & 0x3C) >> 2)); 125                 do { 126                     c4 = str.charCodeAt(i++) & 0xff; 127                     if (c4 === 61) { 128                         return out; 129  } 130                     c4 = BASE64_DECODE_CHARS[c4]; 131                 } while (i < len && c4 === -1); 132                 if (c4 === -1) { 133                     break; 134  } 135                 out += String.fromCharCode(((c3 & 0x03) << 6) | c4); 136  } 137             return out; 138  }; 139  }; 140 })(typeof exports === 'object' ? exports : window);

 封裝download模塊,保存爲modules/download.js

 1 /*
 2  * 拓展模塊,添加使用GET/POST下載資源的方法  3  */
 4 exports.create = function create(page) {  5     return new this.Casper(page);  6 }  7 
 8 exports.Casper = function Casper(page) {  9     this.page = page; 10     this.fs = require('fs'); 11     //client.js模塊所在路徑
12     this.clientPath = this.fs.absolute(require('system').args[0]) + '/../modules/client.js'; 13     this.client = require(this.clientPath).create(); 14 
15     this.get = function get(url, targetPath) { 16         this.injectClientJs();  //注入client.js
17         var content = this.page.evaluate(function(url) { 18             return __utils__.sendAJAX(url); 19  }, url); 20         this.fs.write(targetPath, this.client.decode(content), 'wb'); 21  } 22 
23     this.post = function post(url, data, targetPath) { 24         this.injectClientJs();  //注入client.js
25         var content = this.page.evaluate(function(url, data) { 26             return __utils__.sendAJAX(url, 'POST', data); 27  }, url, data); 28         this.fs.write(targetPath, this.client.decode(content), 'wb'); 29  } 30 
31     this.injectClientJs = function injectClientJs() { 32         "use strict"; 33         //避免重複注入
34         var isJsInjected = this.page.evaluate(function() { 35             return typeof __utils__ === 'object'; 36  }); 37         if (true === isJsInjected) { 38             return ; 39  } 40         if (true !== this.page.injectJs(this.clientPath)) { 41             console.log('WARNING: Failed to inject client module!'); 42  } 43         this.page.evaluate(function() { 44             window.__utils__ = new window.Client(); //新建Client對象
45  }); 46  }; 47 };

寫一份測試腳本保存爲script.js。腳本路徑與modules文件夾同級,假設分別爲D:/script.js和D:/modules/。

 // Test script: opens the w3school home page, then downloads one image through the page and saves it as photo.jpg.
var fs = require('fs');
// Switch to the directory of this script so that ./modules/... can be required.
var isChangeDirSuccess = fs.changeWorkingDirectory(fs.absolute(require('system').args[0]) + '/../');
if (!isChangeDirSuccess) {
    console.log('ERROR: Failed to change working directory!');
    phantom.exit(1);
} else {
    // Kept in the else branch so nothing below runs after the exit request above.
    var page = require('webpage').create();
    page.open('http://www.w3school.com.cn/', function(status) {
        var exitCode = 0;
        if (status !== 'success') {
            console.log('ERROR: Failed to open page, status: ' + status);
            exitCode = 1;
        } else {
            try {
                var download = require('./modules/download').create(page);
                download.get('http://www.w3school.com.cn/i/site_photoref.jpg', 'photo.jpg');
                console.log('LOG: Download Completed!');
            } catch (e) {
                // Without this catch an exception would skip phantom.exit().
                console.log('ERROR: Download failed: ' + e);
                exitCode = 1;
            }
        }
        phantom.exit(exitCode);
    });
}

以上代碼,先訪問w3school主頁,再下載site_photoref.jpg圖片,保存在photo.jpg中。

通過測試,download可下載全部類型的資源,包括壓縮文件、APK。可是注意一點,因爲同源策略,當執行跨域請求時(page.open和download的url不在同域下),要把web-security設爲false[2],在命令行啓動時輸入:phantomjs --web-security=false script.js。

 

參考資料及引用:

[1] download方法例子:Casper官網. Casperjs Api.
http://docs.casperjs.org/en/latest/modules/casper.html#download

[2] web-security:Phantomjs官網. 命令行選項.
http://phantomjs.org/api/command-line.html

相關文章
相關標籤/搜索