由於工作需要,用 Node.js 寫了個簡單的爬蟲例子。之前沒用過 Node.js,連搭環境加寫代碼大概用了 5 天左右,所以……要多簡陋有多簡陋,放在這裡給以後的自己看~~
整體需求是:給定一個有效的 URL 地址,返回該網頁上所有無效鏈接所佔的百分比(壞鏈率)。
第一個文件:計算壞鏈率,urlSpider.js
/*================================================
 @author MissUU
 Link extraction approach:

 1. Fetch the page content.
 2. Collect every <a ...> opening tag with a regular expression.
 3. Extract each href value and resolve links that are not absolute
    against the URL of the page (javascript: and other scheme-qualified
    links are left untouched).
 4. Keep only the results that look like a common URL.
================================================*/
var http = require('http');
var https = require('https');
var urlModule = require('url');
var async = require('async');
var dbHandle = require('./dbHandle.js');

// A request is considered dead after this many milliseconds without progress.
var REQUEST_TIMEOUT_MS = 10000;

/**
 * Issues a GET request and reports the outcome exactly once.
 *
 * The inactivity timer is armed before the request is created and re-armed
 * whenever the response makes progress, so neither a server that never
 * answers nor one that stalls in the middle of the body can block the caller.
 *
 * @param {string} targetUrl Absolute URL to request.
 * @param {boolean} collectBody When true the body is decoded as utf8 and
 *     handed to `done`; when false the body is read and discarded.
 * @param {function(?Error, Object=, string=)} done Called once with
 *     (err) on failure or (null, response, body) on success.
 */
function fetchWithTimeout(targetUrl, collectBody, done) {
    var finished = false;
    var timer = null;
    var req = null;

    // Single exit point: guarantees `done` never fires twice even when a
    // timeout, an 'error' event and an 'end' event race each other.
    function finish(err, res, body) {
        if (finished) {
            return;
        }
        finished = true;
        clearTimeout(timer);
        done(err, res, body);
    }

    function armTimer() {
        clearTimeout(timer);
        timer = setTimeout(function () {
            console.log('Request Timeout.');
            if (req) {
                req.abort();
            }
            finish(new Error('Request Timeout.'));
        }, REQUEST_TIMEOUT_MS);
    }

    var client = /^https:/i.test(String(targetUrl)) ? https : http;

    armTimer();

    try {
        req = client.get(targetUrl, function (res) {
            var body = '';

            armTimer();
            if (collectBody) {
                res.setEncoding('utf8');
            }

            res.on('data', function (chunk) {
                armTimer();
                if (collectBody) {
                    body += chunk;
                }
            }).on('end', function () {
                finish(null, res, body);
            }).on('aborted', function () {
                finish(new Error('Response aborted.'));
            }).on('error', finish);
        });
    } catch (err) {
        // get() throws synchronously for malformed URLs and for protocols the
        // client does not support (e.g. ftp:). Report it asynchronously so the
        // caller always observes the same callback timing.
        process.nextTick(function () {
            finish(err);
        });
        return;
    }

    req.on('error', finish);
}

/**
 * Processes one site: downloads the page at obj.url, then checks every link
 * found on it and stores the resulting bad-link rate.
 *
 * @param {{id: *, url: string}} obj Row describing the site to check.
 * @param {function()} callback Invoked (without arguments) when the site is
 *     finished, whether or not the page could be downloaded.
 */
var runUrlSpider = function (obj, callback) {
    var urlBadLink = new UrlBadLink();

    fetchWithTimeout(obj.url, true, function (err, res, html) {
        if (err) {
            console.log('problem with request: ' + err.message);
            callback();
            return;
        }

        console.log('*******開始提取有效連接地址******');
        console.log(new Date().toLocaleString());
        console.log(obj.url);
        urlBadLink.host = obj.url;
        urlBadLink.id = obj.id;
        matchURL(html, urlBadLink, function () {
            callback();
        });
    });
};

// Entry point: loads the sites of group 1 and checks them one after another.
var main = function () {
    dbHandle.showUrls(1, function (result) {
        async.eachSeries(result, runUrlSpider, function (err) {
            if (err) {
                console.log(err);
            }
            console.log('******this is the end, haha*******');
        });
    });
};

main();

/*
 * Extracts the links of a page, requests each of them in series, and stores
 * the number of links checked and the number of bad ones on `urlBadLink`
 * before persisting it through dbHandle.updateBadLink.
 *
 * A link is counted as bad when the request fails, times out, cannot be
 * issued at all, or answers with a status outside 200-399.
 *
 * @param {string} content Raw HTML of the page.
 * @param {UrlBadLink} urlBadLink Record for the page; its `host` is used as
 *     the base URL and its `total` / `badCounts` are filled in here.
 * @param {function()} callend Invoked once the record has been handed to
 *     dbHandle.updateBadLink.
 */
function matchURL(content, urlBadLink, callend) {
    var host = urlBadLink.host;
    var anchor = /<a\s[^>]*>/g;
    var matches = content.match(anchor);
    var badLink = 0;
    var checked = 0;

    var HttpGet = function (url, callback) {
        fetchWithTimeout(url, false, function (err, res) {
            if (err) {
                console.log(++checked + ": " + 'problem with request: ' + err.message);
                badLink++;
            } else {
                console.log(++checked + ": " + url + ' response status: ' + res.statusCode);

                if (!(res.statusCode >= 200 && res.statusCode < 400)) {
                    console.log('-----------------------');
                    badLink++;
                }
            }

            callback();
        });
    };

    var urls = filterUrl(matches, host);

    // An empty list (anchors present but none usable) is handled like "no
    // anchors at all"; otherwise total would be 0 and the rate would be 0/0.
    if (urls === null || urls.length === 0) {
        console.log('no links found');
        // Placeholder total so that the stored rate is 0% rather than a
        // division by zero.
        urlBadLink.total = 10;
        urlBadLink.badCounts = 0;
        dbHandle.updateBadLink(urlBadLink);
        callend();
        return;
    }

    async.eachSeries(urls, HttpGet, function () {
        urlBadLink.total = urls.length;
        urlBadLink.badCounts = badLink;
        dbHandle.updateBadLink(urlBadLink);
        callend();
    });
}

/**
 * Extracts the href value from an anchor tag and turns it into an absolute URL.
 *
 * Quoted ("..." or '...') and unquoted attribute values are supported.
 * Values that already carry a scheme (http:, https:, javascript:, mailto:, ...)
 * are returned unchanged; everything else (root-relative, relative and
 * protocol-relative links) is resolved against `host`.
 *
 * Examples:
 *   URLFommat('<a href="/wenhao">', 'http://www.sina.com.cn')   -> 'http://www.sina.com.cn/wenhao'
 *   URLFommat('<a href="http://baidu.com">', 'http://www.sina.com.cn') -> 'http://baidu.com'
 *
 * @param {string} strUrl Text of an <a ...> opening tag.
 * @param {string} host URL of the page the tag was found on; "http://" is
 *     assumed when it has no scheme.
 * @returns {?string} The absolute URL, or null when the tag has no href.
 */
function URLFommat(strUrl, host) {
    var hrefPattern = /href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/i;
    var found = hrefPattern.exec(strUrl);

    if (found === null) {
        return null;
    }

    // Exactly one of the three alternatives captured the value.
    var raw = found[1];
    if (raw === undefined) {
        raw = found[2];
    }
    if (raw === undefined) {
        raw = found[3];
    }
    var url = raw.trim();

    // Already absolute, or a pseudo-link such as javascript:/mailto: that must
    // not be glued onto the host.
    if (/^[a-z][a-z0-9+.\-]*:/i.test(url)) {
        return url;
    }

    var base = /^[a-z][a-z0-9+.\-]*:\/\//i.test(host) ? host : 'http://' + host;
    return urlModule.resolve(base, url);
}
/**
 * Tests whether a string looks like a common URL: an optional http:// or
 * ftp:// scheme, a dotted host name ending in a known TLD, any two-letter
 * TLD or a numeric octet, and an optional path/query part.
 *
 * NOTE(review): https:// URLs are not matched by this pattern, so such links
 * are filtered out. Widening the scheme list is only safe once the code that
 * requests these URLs can handle the additional protocols.
 *
 * @param {?string} strUrl Candidate URL.
 * @returns {boolean} true when the candidate matches, false otherwise
 *     (including for null / undefined).
 */
function IsURL(strUrl) {
    if (strUrl === null || strUrl === undefined) {
        return false;
    }

    var regular = /^\b(((http|ftp):\/\/)?[-a-z0-9]+(\.[-a-z0-9]+)*\.(?:com|edu|gov|int|mil|net|org|biz|info|name|museum|asia|coop|aero|[a-z][a-z]|((25[0-5])|(2[0-4]\d)|(1\d\d)|([1-9]\d)|\d))\b(\/[-a-z0-9_:\@&?=+,.!\/~%\$]*)?)$/i;
    return regular.test(strUrl);
}

/**
 * Record holding the link statistics of one page.
 *
 * @param {*} id Identifier of the site the statistics belong to.
 * @param {string} host URL of the page.
 * @param {number} total Number of links that were checked.
 * @param {number} badCounts Number of links found to be bad.
 */
function UrlBadLink(id, host, total, badCounts) {
    this.id = id;
    this.host = host;
    this.total = total;
    this.badCounts = badCounts;

    // Installed on first construction so the method is available no matter
    // where in the file the constructor is first invoked.
    if (typeof this.getRate != "function") {
        /**
         * @returns {string} Percentage of bad links with two decimals, e.g.
         *     "25.00%". Yields "0.00%" when the ratio is undefined (no links
         *     checked or counters not set) instead of "NaN%".
         */
        UrlBadLink.prototype.getRate = function () {
            var ratio = this.badCounts / this.total;

            if (!isFinite(ratio)) {
                return '0.00%';
            }
            return (Math.round(ratio * 10000) / 100).toFixed(2) + '%';
        };
    }
}

/**
 * Converts raw <a ...> tags into absolute URLs and keeps the valid ones.
 *
 * @param {?Array<string>} arr Anchor tags as returned by String#match, or
 *     null / undefined when the page contained none.
 * @param {string} host URL of the page, used to resolve relative links.
 * @returns {?Array<string>} URLs accepted by IsURL (possibly empty), or null
 *     when `arr` is null / undefined.
 */
function filterUrl(arr, host) {
    if (arr === null || arr === undefined) {
        return null;
    }

    var output = [];
    arr.forEach(function (item) {
        var formatURL = URLFommat(item, host);

        if (IsURL(formatURL)) {
            output.push(formatURL);
        }
    });

    return output;
}
第二個文件:將數據存入數據庫,dbHandle.js
/** * @author MissUU * @des MySql基本操做 * API: https://github.com/felixge/node-mysql */ var mysql = require('mysql'); mysql.createConnection('mysql://root:apple@localhost/test?debug=false'); var pool = mysql.createPool({ host : '10.102.1.00', user : 'root', password : 'root', database : 'test', connectionLimit: 15 }); //讀取urls exports.showUrls = function (groupId, callback){ console.log('this is showUrl()'); pool.getConnection(function(err, conn){ if (err) { console.log("connection error!"); console.log(err); } conn.query('SELECT id,realurl as url FROM t_site WHERE siteGroupId = ?',groupId, function(err, result){ if(err){ console.log(err.message); } conn.release(); if(result.length){ // console.log(result instanceof Array); callback(result); return true; }else{ callback(''); return false; } }); }); }; exports.updateBadLink = function (urlBadLink){ //若不含數據則不插入 if (!!urlBadLink) { pool.getConnection(function(err, conn){ if (err) { console.log("connection error!"); console.log(err); } var updateSql = "UPDATE a_qualityinfo SET brokenRate = '"+ urlBadLink.getRate() +"' WHERE siteId = " + urlBadLink.id; console.log(updateSql); conn.query(updateSql, function(err, result){ if(err){ console.log(err.message); console.log('update fail'); } conn.release(); console.log('update success'); });// conn.query });//pool.getConnection } };
代碼後期還會改動,這裡有幾點需要注意:
一、http.get 有時會一直等待響應,所以一定要判斷一下,超時則認為出錯,要不然程序就卡住了……= =!
二、注意 callback 的使用,要不然很難規範執行順序,用過 Node.js 的都懂……