關於用 PhantomJS 爬取需要登錄的頁面並截圖(頁面包含一些異步請求的數據)

項目有個需求是需要捕獲某個頁面的數據(後端完成),由於之前用過 PhantomJS,所以就毫不猶豫地選擇了它。關於 PhantomJS 的介紹、安裝和簡單使用,百度很容易找到,這裏就不再贅述了。之後就開始大刀闊斧地碼起來了,興致沖沖地利用網上找到的截取某網頁的代碼(見附錄1)測試。

'use strict';

// PhantomJS modules and settings shared by the functions of this script.
var page = require('webpage').create();
var system = require('system');
// Command-line arguments; not read by the code shown in this script.
var args = system.args;
// Page to screenshot.
var page_url = 'http://www.oschina.net/';
// Path the rendered image is written to.
var filename = '../phantomjs_temp/capture0.png';

	
/**
 * Open `url` in the shared `page` and continue with onPageReady once it loads.
 * If the page cannot be opened, PhantomJS is shut down via closePhantom.
 *
 * @param {string} url - address to open
 * @param {string} filename - forwarded to onPageReady
 * @param {Function} [callback] - forwarded to onPageReady
 */
function capture(url,filename,callback){
	// Receives the load status reported by page.open.
	function handleOpen(status){
		if("success" !== status){
			console.log("open page failed");
			closePhantom();
			return;
		}
		console.log("open page succeed");
		onPageReady(url,filename,callback);
	}

	console.log("ready to capture");
	page.open(url,handleOpen);
}

/**
 * Render the loaded page to `filename`, then shut PhantomJS down with a
 * success exit status.
 *
 * @param {string} url - not used here; kept so the call signature is unchanged
 * @param {string} filename - output image path passed to page.render
 * @param {Function} [callback] - not used here; kept so the call signature is unchanged
 */
function onPageReady(url,filename,callback){
	page.render(filename);
	// Exit status 0 lets whoever launched the script tell success from failure.
	closePhantom(0);
}

/**
 * Close the page and terminate PhantomJS.
 *
 * @param {number} [exitCode=1] - process exit status. Defaults to 1 so that
 *   callers passing no argument (the failure paths) keep their behaviour.
 */
function closePhantom(exitCode){
	console.log("page is closing...");
	page.close();
	console.log("phantom is closing...");
	phantom.exit(typeof exitCode === "number" ? exitCode : 1);
}

// Script entry point: capture the configured URL into the configured output file.
capture(page_url,filename);

沒問題。然後發現了問題:我 oschina 明明登錄了,爲什麼這裏是未登錄狀態呢(因爲項目需求截圖的頁面也有權限驗證)?查閱相關資料之後,找到了解決方案。方案1:將用戶登錄的 cookie 加入到 PhantomJS 中(登錄 oschina 後打開控制檯,查看請求裏面的 cookie,有一條是 oscid 的),如下圖(輸入圖片說明)。代碼如下:

'use strict';

// PhantomJS modules and settings shared by the functions of this script.
var page = require('webpage').create();
var system = require('system');
// Command-line arguments; not read by the code shown in this script.
var args = system.args;
// Page to screenshot.
var page_url = 'http://www.oschina.net/';
// Path the rendered image is written to.
var filename = '../phantomjs_temp/capture1.png';

	
/**
 * Load `url` in the shared `page`; on success hand over to onPageReady,
 * otherwise shut PhantomJS down through closePhantom.
 *
 * @param {string} url - address to open
 * @param {string} filename - forwarded to onPageReady
 * @param {Function} [callback] - forwarded to onPageReady
 */
function capture(url,filename,callback){
	console.log("ready to capture");
	page.open(url,function(status){
		var opened = ("success" === status);
		if(!opened){
			console.log("open page failed");
			closePhantom();
			return;
		}
		console.log("open page succeed");
		onPageReady(url,filename,callback);
	});
}

/**
 * Render the loaded page to `filename`, then shut PhantomJS down with a
 * success exit status.
 *
 * @param {string} url - not used here; kept so the call signature is unchanged
 * @param {string} filename - output image path passed to page.render
 * @param {Function} [callback] - not used here; kept so the call signature is unchanged
 */
function onPageReady(url,filename,callback){
	page.render(filename);
	// Exit status 0 lets whoever launched the script tell success from failure.
	closePhantom(0);
}

/**
 * Close the page and terminate PhantomJS.
 *
 * @param {number} [exitCode=1] - process exit status. Defaults to 1 so that
 *   callers passing no argument (the failure paths) keep their behaviour.
 */
function closePhantom(exitCode){
	console.log("page is closing...");
	page.close();
	console.log("phantom is closing...");
	phantom.exit(typeof exitCode === "number" ? exitCode : 1);
}

// Give PhantomJS the "oscid" cookie for www.oschina.net before the page is opened.
// NOTE(review): this is a hard-coded session value; it should not be committed
// to shared code and presumably stops working once the session expires.
phantom.addCookie({
	"name": "oscid",
	"value": "mljV7ERwRhP3eH62HnFisZP1qaXlr2txLKufSq%2FUuhCTXQq%2B1RKVm0vp96Iu7MfX6O9lOOYfQG3DmlglDvlk8YvI0DSaPefEGJtGLkSfdZQ%2F5qN340KTUg0PiaZwDvHaucuWHExhfuavuZfodZNJKtGWRFkZxL6V",
	"domain": "www.oschina.net"
});

// Script entry point: capture the configured URL into the configured output file.
capture(page_url,filename);

運行,bingo,完成。方案2:打開 oschina 的登錄頁面,用 PhantomJS 模擬登錄過程,然後截圖,依然沒問題。

'use strict';

// PhantomJS modules and settings shared by the functions of this script.
var page = require('webpage').create();
var system = require('system');
// Command-line arguments; not read by the code shown in this script.
var args = system.args;
// Page to screenshot after logging in.
var page_url = 'http://www.oschina.net/';
// Login form address opened by login().
var login_url = 'https://www.oschina.net/home/login?goto_page=http%3A%2F%2Fwww.oschina.net%2F';
// Path the rendered image is written to.
var filename = '../phantomjs_temp/capture2.png';

/**
 * Open the login page, fill in and submit the login form inside the page,
 * then wait before continuing with print_cookies.
 *
 * If the login page cannot be opened, PhantomJS is shut down through
 * closePhantom instead of being left running with nothing to do.
 */
function login(){
	page.open(login_url,function(status){
		if("success" !== status){
			console.log("open login page failed");
			closePhantom();
			return;
		}
		// Runs inside the page: fill the form fields and click the login button.
		page.evaluate(function(){
			document.querySelector("#userMail").value = 'your user name';
			document.querySelector("#userPassword").value = 'your password';
			document.querySelector(".btn-login").click();
		});
		// Fixed 15 s wait before continuing; a function reference is passed so
		// that no string is evaluated as code.
		setTimeout(print_cookies,15000);
	});
}

/**
 * Open `url` in the shared `page`. A successful load continues with
 * onPageReady; any other status shuts PhantomJS down via closePhantom.
 *
 * @param {string} url - address to open
 * @param {string} filename - forwarded to onPageReady
 * @param {Function} [callback] - forwarded to onPageReady
 */
function capture(url,filename,callback){
	// Receives the load status reported by page.open.
	var afterOpen = function(status){
		if(status !== "success"){
			console.log("open page failed");
			closePhantom();
			return;
		}
		console.log("open page succeed");
		onPageReady(url,filename,callback);
	};

	console.log("ready to capture");
	page.open(url,afterOpen);
}

/**
 * Log every cookie currently held by the page as JSON, then start the capture
 * of `page_url` into `filename`.
 */
function print_cookies(){
	console.log("running print_cookies");
	// Iterate by index only: for...in would also visit any non-index
	// enumerable property attached to the array.
	(page.cookies || []).forEach(function(cookie){
		console.log(JSON.stringify(cookie));
	});
	capture(page_url,filename);
}


/**
 * Render the loaded page to `filename`, then shut PhantomJS down with a
 * success exit status.
 *
 * @param {string} url - not used here; kept so the call signature is unchanged
 * @param {string} filename - output image path passed to page.render
 * @param {Function} [callback] - not used here; kept so the call signature is unchanged
 */
function onPageReady(url,filename,callback){
	page.render(filename);
	// Exit status 0 lets whoever launched the script tell success from failure.
	closePhantom(0);
}

/**
 * Close the page and terminate PhantomJS.
 *
 * @param {number} [exitCode=1] - process exit status. Defaults to 1 so that
 *   callers passing no argument (the failure paths) keep their behaviour.
 */
function closePhantom(exitCode){
	console.log("page is closing...");
	page.close();
	console.log("phantom is closing...");
	phantom.exit(typeof exitCode === "number" ? exitCode : 1);
}

// Script entry point: log in first; the capture is started later from print_cookies.
login();

這裏已經完成了一大筆工作了,長長地出口氣吧~

可是我項目裏面有另外一個問題,就是異步請求特別多,截圖的時候雖然頁面加載完成了,可是部分異步請求數據還沒返回,沒有渲染到頁面裏,所以截圖會有部分 loading……

找了很多資料,有個拙劣的解決方法,就是在截圖前再 wait 一段時間(自己根據實際情況約定,幾秒到幾分鐘都可以),可是這明顯不合理:時間定得太短,可能還是有上面的問題;定得太長,可能頁面早就等着你截圖了,你還在那傻傻地 wait,多不合適啊。最合適的莫過於頁面所有資源和元素都完成了返回和渲染的時刻,這個時刻怎麼得到呢?

這時候就發現百度好坑,搜索的結果全是重複的,還不能解決這個問題。所以我就把目光放到了 QQ 羣裏,這裏感謝 highchart 中文站長的幫助,他告訴我可以用 document.readyState 是不是等於 'complete' 來判斷,測試一下確實可以啦。然而多次測試發現還是存在巧合:當我在服務器端將異步請求的處理方法增加 sleep 阻塞後,這部分就又回到了 loading 狀態,氣氣氣氣氣……

多次翻閱資料,終於在 stackoverflow 上面找到個類似的問題:http://stackoverflow.com/questions/11340038/phantomjs-not-waiting-for-full-page-load

最後 Dave 的方法解決了我的問題,就是用 page.onResourceRequested 和 page.onResourceReceived:前者是 page 發送請求時執行的 callback,後者是 page 接收到返回時執行的 callback。API:http://phantomjs.org/api/webpage/

每次 requested 的時候增加一個請求,每次 received 的時候減少一個請求,當所有請求都得到反饋了,那麼它們的差值不就是 0 了嗎?

懷着忐忑的心情測試了一下,oh yeah!終於解決了,代碼如下:

// PhantomJS modules and settings shared by the functions of this script.
var page = require('webpage').create();
var system = require('system');
// Command-line arguments; not read by the code shown in this script.
var args = system.args;
// Page to screenshot (placeholder value).
var page_url = 'url***********';
// Output image path with a random file name.
var filename = '../phantomjs_temp/'+Math.random()+'.png';
// Upper bound that checkReadyState compares its poll counter against.
var countTotal = 1000;
// Delay handed to setTimeout between polls, i.e. a value in milliseconds
// despite the name.
var seconds = 1000;
// Ids of requests that have been sent and not yet removed again.
var requestIDArr = [];

/**
 * Open `url` in the shared `page`. After a successful load, start polling
 * with checkReadyState; otherwise shut PhantomJS down via closePhantom.
 *
 * @param {string} url - address to open
 * @param {string} filename - forwarded to checkReadyState
 * @param {Function} [callback] - forwarded to checkReadyState
 */
function capture(url,filename,callback){
	// Receives the load status reported by page.open.
	function handleOpen(status){
		if("success" !== status){
			console.log("open page failed");
			closePhantom();
			return;
		}
		console.log("open page succeed");
		checkReadyState(url,filename,callback);
	}

	console.log("ready to capture");
	page.open(url,handleOpen);
}

/**
 * Poll until no request ids are left in `requestIDArr`, then call onPageReady.
 * Each poll happens `seconds` milliseconds after the previous one. Once the
 * poll counter exceeds `countTotal` while requests are still outstanding,
 * PhantomJS is shut down via closePhantom.
 *
 * @param {string} url - forwarded to onPageReady
 * @param {string} filename - forwarded to onPageReady
 * @param {Function} [callback] - forwarded to onPageReady
 * @param {number} [count] - number of polls already done; treated as 0 when omitted
 */
function checkReadyState(url,filename,callback,count){
	var attempt = count || 0;
	console.log("this is the "+attempt+"time check ready state");
	var timer = setTimeout(function(){
		if(requestIDArr.length === 0){
			onPageReady(url,filename,callback);
			return;
		}
		console.log("still waiting for resoinse id is "+requestIDArr.join(","))
		if(attempt > countTotal){
			clearTimeout(timer);
			console.log("has tryed "+(countTotal*seconds/1000)+" seconds,but still failed get correct data");
			closePhantom();
			return false;
		}
		checkReadyState(url,filename,callback,attempt + 1);
	},seconds);
}

/**
 * Called once no requests are outstanding: size the clip rectangle and the
 * viewport width to the ".main" element, render the page to `filename`, and
 * shut PhantomJS down with a success exit status.
 *
 * If the page has no ".main" element, the existing clip rectangle and
 * viewport are kept and the page is rendered as-is.
 *
 * @param {string} url - not used here; kept so the call signature is unchanged
 * @param {string} filename - output image path passed to page.render
 * @param {Function} [callback] - not used here; kept so the call signature is unchanged
 */
function onPageReady(url,filename,callback){
	// Runs inside the page; returns null when ".main" is missing so the
	// caller can fall back instead of failing on a null element.
	var scroll = page.evaluate(function(){
		var mainDiv = document.querySelector(".main");
		if(!mainDiv){
			return null;
		}
		return {"height":mainDiv.scrollHeight,"width":mainDiv.scrollWidth};
	});
	var size = scroll || {};
	var clip = page.clipRect;
	var viewport = page.viewportSize;

	// Assign whole objects rather than writing individual members.
	// NOTE(review): member writes on the objects returned by these properties
	// are reported not to persist in PhantomJS - confirm against the API docs.
	page.clipRect = {
		top: clip.top,
		left: clip.left,
		width: size.width || clip.width,
		height: size.height || clip.height
	};
	page.viewportSize = {
		width: size.width || viewport.width,
		height: viewport.height
	};

	page.render(filename);

	// Exit status 0 lets whoever launched the script tell success from failure.
	closePhantom(0);
}

/**
 * Close the page and terminate PhantomJS.
 *
 * @param {number} [exitCode=1] - process exit status. Defaults to 1 so that
 *   callers passing no argument (the failure paths) keep their behaviour.
 */
function closePhantom(exitCode){
	console.log("page is closing...");
	page.close();
	console.log("phantom is closing...");
	phantom.exit(typeof exitCode === "number" ? exitCode : 1);
}

// Initial viewport; onPageReady later replaces the width with the measured one.
page.viewportSize = { width: 400, height: 550 };

// Initial region of the page to render; onPageReady later replaces width and
// height with the measured ones, while top and left stay as set here.
page.clipRect = { top: 95, left: 191, width: 1100, height: 2200 };

// Record the id of every request the page sends, so checkReadyState can tell
// whether any are still outstanding.
page.onResourceRequested = function (requestData) {
	var id = requestData.id;
	requestIDArr.push(id);
	console.log("add is ",id);
};
// Mark a request as finished once its response has been received.
// PhantomJS reports a response in stages ('start', then 'end'), so this
// handler can run more than once per request id. Only the final stage counts:
// acting on 'start' would mark the request done before its body has arrived,
// and the later 'end' event would look for an id that is already gone.
// A response without a stage is treated as final.
page.onResourceReceived = function (response) {
	if (!response.stage || response.stage === 'end') {
		spliceRequestID(response.id);
	}
};

/**
 * Remove `id` from `requestIDArr` after a 100 ms delay.
 *
 * If the id is not in the array when the timer fires, the removal is retried,
 * but only a limited number of times: an id that never shows up (for example
 * one already removed by an earlier event for the same request) must not keep
 * a timer loop running forever.
 *
 * @param {number} id - request id to remove
 * @param {number} [retriesLeft=10] - how many more retries are allowed when
 *   the id is not found
 */
function spliceRequestID(id,retriesLeft){
	var remaining = typeof retriesLeft === "number" ? retriesLeft : 10;
	setTimeout(function(){
		var index = requestIDArr.indexOf(id);
		if(index>=0){
			requestIDArr.splice(index,1);
			console.log("delete is ",id);
		}else if(remaining>0){
			spliceRequestID(id,remaining-1);
		}
	},100);
}

// Give PhantomJS the JSESSIONID cookie for the target host and path before the
// page is opened.
// NOTE(review): this is a hard-coded session id; it should not be committed to
// shared code and presumably stops working once the session expires.
phantom.addCookie({
	"name": "JSESSIONID",
	"value": "00AF0CF1FB333A5268A9CD5C8FF0487A",
	"domain": "192.168.12.35",
	"path": "/local_adreport/"
});

// Script entry point: capture the configured URL into the configured output file.
capture(page_url,filename);

至此,整個探究就結束了,可能後面還會遇到其他問題,但是一樣需要耐心解決。

相關文章
相關標籤/搜索