這一章主要利用node的http模塊製作一個網頁小爬蟲來爬取網頁信息,其中對於後端html節點的獲取採用了cheerio模塊,這
/**
 * Created by Administrator on 2016/9/16.
 *
 * Small crawler: downloads a course page over HTTP, extracts the chapter and
 * video list from the HTML with cheerio, and prints it to the console.
 */
var http = require('http');
// cheerio is a third-party module: install it first (npm install cheerio)
// and then require it here.
var cheerio = require('cheerio');

var url = 'http://www.imooc.com/learn/348';

/**
 * Extracts chapter/video data from the course page markup.
 *
 * @param {string} html - HTML source of the course page.
 * @returns {Array<{chapterTitle: string, videos: Array<{title: string, id: (string|undefined)}>}>}
 *     One entry per `.chapter` element, in document order. A video's `id` is
 *     the part of its link's href after 'video/', or undefined when the link
 *     or its href is missing.
 */
function filterChapters(html) {
    var $ = cheerio.load(html);
    var courseData = [];

    $('.chapter').each(function () {
        var chapter = $(this);
        var chapterData = {
            chapterTitle: chapter.find('strong').text(),
            videos: []
        };

        chapter.find('.video').children('li').each(function () {
            var video = $(this).find('.J-media-item');
            // attr() returns undefined when the <li> has no matching link or
            // the link has no href; guard so one odd item cannot crash the run.
            var href = video.attr('href');

            chapterData.videos.push({
                title: video.text(),
                id: href ? href.split('video/')[1] : undefined
            });
        });

        courseData.push(chapterData);
    });

    return courseData;
}

/**
 * Prints each chapter title followed by the ids of its videos.
 *
 * @param {Array<{chapterTitle: string, videos: Array<{title: string, id: (string|undefined)}>}>} courseData
 *     Data in the shape returned by filterChapters.
 */
function printCourseInfo(courseData) {
    courseData.forEach(function (item) {
        console.log(item.chapterTitle);

        item.videos.forEach(function (video) {
            console.log(video.id);
        });
    });
}

http.get(url, function (res) {
    if (res.statusCode !== 200) {
        console.error('獲取課程出錯!', 'HTTP ' + res.statusCode);
        // Drain the response so the underlying socket is released.
        res.resume();
        return;
    }

    var html = '';

    // Decode as UTF-8 in the stream so a multi-byte character split across
    // two chunks is not corrupted by per-chunk string conversion.
    res.setEncoding('utf8');

    // res emits a 'data' event for every chunk of the body that arrives.
    res.on('data', function (data) {
        html += data;
    });

    // 'end' fires once the whole body has been received.
    res.on('end', function () {
        var courseData = filterChapters(html);
        printCourseInfo(courseData);
    });
}).on('error', function (err) {
    // Report the underlying cause instead of discarding it.
    console.error('獲取課程出錯!', err.message);
});
個模塊能夠在後端獲取html頁面的元素
,獲取方法類似於jquery。
代碼如下: