這裏我是從該博客首頁的文章列表開始爬取的:通過判斷「下一頁」的標籤是否存在,逐頁把這個博客的文
章都爬出來;不是從博客的文章分類開始爬的。雖然兩種方式的代碼相差很少,可是我這個比較簡單;
複雜的我不會 ಥ_ಥ。核心代碼(JavaScript)如下:
// Excerpt: pagination check run after one page's links have been collected.
// Read the href of the "next page" link; when there is no such link the
// value is falsy and the else branch below finishes the crawl.
var nextStep = $('.nextpostslink').attr('href');
console.log('nextStep--------------------------' + nextStep);
if (nextStep) {
    // A next page exists: fetch it recursively, then merge its results.
    // NOTE: despite its name, `err` is a success flag in this code
    // (true = success, false = failure).
    getNext(nextStep, function(err, catList2) {
        if (!err) {
            return callback(false, null);
        }
        callback(true, catList.concat(catList2)); // merge the scraped data
    });
} else {
    callback(true, catList); // no next page: return what was collected
}
var http = require('http');
var cheerio = require('cheerio');
var fs = require('fs');
var async = require('async');

// Upper bound on concurrent sockets for the shared HTTP agent.
var max = 50;
http.globalAgent.maxSockets = (max || 5);

/**
 * Escape a value so it can be embedded in HTML text or a double-quoted
 * attribute without being interpreted as markup.
 *
 * @param {*} text Value to escape; it is converted to a string first.
 * @return {string} The escaped string.
 */
function escapeHtml(text) {
    return String(text)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;');
}

/**
 * Fetch one listing page, collect its post links, and keep following the
 * "next page" link until there is none.
 *
 * @param {string} url Address of the listing page to fetch.
 * @param {Function} callback Called exactly once as callback(ok, list):
 *     ok is true on success and false on failure; list is an array of
 *     {name, href} objects for this page and every following page, or
 *     null on failure.
 */
function getNext(url, callback) {
    http.get(url, function(res) {
        var size = 0;
        var chunks = [];

        res.on('data', function(chunk) {
            size += chunk.length;
            chunks.push(chunk);
        });

        res.on('end', function() {
            var data = Buffer.concat(chunks, size);
            var $ = cheerio.load(data.toString());

            // Links found on THIS page only. A list shared across pages
            // would be merged with itself at every recursion level and
            // duplicate every entry.
            var catList = [];
            $('#content .post-title a').each(function() {
                var self = $(this);
                catList.push({
                    'name': self.text(),
                    'href': self.attr('href')
                });
            });

            var nextStep = $('.nextpostslink').attr('href');
            console.log('nextStep--------------------------' + nextStep);

            if (nextStep) {
                // Fetch the following pages, then append their links to ours.
                getNext(nextStep, function(ok, catList2) {
                    if (!ok) {
                        return callback(false, null);
                    }
                    callback(true, catList.concat(catList2));
                });
            } else {
                // Last page: nothing more to merge.
                callback(true, catList);
            }
        });
    }).on('error', function(e) {
        console.log(e.message);
        // Report the failure so the caller is not left waiting forever.
        callback(false, null);
    });
}

// Crawl the blog starting from its front page and write every post link
// into result.html as a single HTML document.
getNext('http://devgirl.org/', function(ok, list) {
    // Check the status flag first: list is null when the crawl failed.
    if (!ok) return;
    console.log('a 的長度為----------------》' + list.length);

    var liAll = '';
    async.eachSeries(list, function(li, next) {
        // Scraped text and URLs are untrusted, so escape them before
        // embedding them in markup.
        liAll += '<li><a href="' + escapeHtml(li.href) + '">' + escapeHtml(li.name) + '</a></li>';
        next();
    }, function(err) {
        // Emit the document wrapper once, around all collected items.
        var page = '<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8"><title>Document</title></head><body><ul>' +
            liAll +
            '</ul></body></html>';
        fs.writeFileSync('result.html', page);
        console.log('完成');
    });
});