npm install -g cheerio
npm install cheerio --save-dev
spider.js
html
var http = require('http');
var cheerio = require('cheerio');

// This example crawls the chapter list of a single imooc.com course page.
var url = 'http://www.imooc.com/learn/51';

/**
 * Extract chapter and video data from the course page HTML.
 *
 * @param {string} html - Raw HTML of the course page.
 * @returns {Array<{chapterTitle: string, videos: Array<{title: string, id: string}>}>}
 *   One entry per `.chapter` element, each with the videos found under it.
 */
function filterChapters(html) {
  var $ = cheerio.load(html);
  var courseData = [];

  $('.chapter').each(function () {
    var chapter = $(this);

    // Keep only the direct text nodes (nodeType 3) of <strong>, so text that
    // belongs to nested elements is not mixed into the chapter title.
    var chapterTitle = chapter.find('strong').contents().filter(function () {
      return this.nodeType === 3;
    }).text().trim();

    var chapterData = {
      chapterTitle: chapterTitle,
      videos: []
    };

    chapter.find('.video').children('li').each(function () {
      var video = $(this).find('.J-media-item');

      // Skip list items that contain no media link instead of crashing on them.
      if (video.length === 0) {
        return;
      }

      // The text of nested tags is concatenated together; split it on line
      // breaks and pick out the pieces that are needed.
      var parts = video.text().trim().split('\n');
      // The second line may be absent; fall back to an empty string.
      var videoTitle = parts[0].trim() + ' ' + (parts[1] || '').trim();

      // The id is whatever follows "video/" in the link target; use an empty
      // id when the link is missing or has a different shape.
      var href = video.attr('href') || '';
      var idPart = href.split('video/')[1];
      var id = idPart ? idPart.trim() : '';

      chapterData.videos.push({
        title: videoTitle,
        id: id
      });
    });

    courseData.push(chapterData);
  });

  return courseData;
}

/**
 * Print the crawl result to stdout: each chapter title followed by its videos.
 *
 * @param {Array<{chapterTitle: string, videos: Array<{title: string, id: string}>}>} courseData
 *   Data in the shape returned by filterChapters.
 */
function printCourseInfo(courseData) {
  courseData.forEach(function (item) {
    console.log(item.chapterTitle + '\n' + '\n');

    item.videos.forEach(function (video) {
      console.log(' 【' + video.id + '】 ' + video.title.trim() + '\n');
    });
  });
}

// Fetch the page HTML, then parse it and print the result.
http.get(url, function (res) {
  if (res.statusCode !== 200) {
    console.log('出現錯誤!', 'HTTP ' + res.statusCode);
    // Drain the response so the socket is released.
    res.resume();
    return;
  }

  // Decode as UTF-8 at the stream level so a multi-byte character that is
  // split across two chunks is not corrupted by string concatenation.
  res.setEncoding('utf8');

  var html = '';

  res.on('data', function (chunk) {
    html += chunk;
  });

  res.on('end', function () {
    printCourseInfo(filterChapters(html));
  });
}).on('error', function (err) {
  console.log('出現錯誤!', err.message);
});
node spider.js
,便可看到爬蟲結果