剛入門node的cheerio模塊,寫了個特簡單的壁紙爬蟲玩玩
話不多說,先上成品圖
這裏用到了 http://www.netbian.com/fengjing/
也能夠爬妹紙圖片哦。
這裏能夠看到,圖片區域放置於一個class爲list的區域內,圖片的img在a標籤中,咱們的目標就是獲取圖片的url和alt
首先引入須要用到的模塊:
const cheerio = require("cheerio"); const fs = require("fs"); const request = require("request"); const iconv = require("iconv-lite");
這四個模塊我就不多說了,至於iconv的作用,在我另外一篇文章裏有介紹
咱們先來實現獲取圖片信息,這裏用到了request模塊
// Fetch the listing page and collect the attributes (src, alt, ...) of each
// matched <img> element into the `urls` array.
request.get({ encoding: null, url: imgUrl }, function (err, res, body) {
  if (err) {
    console.log(err);
    return;
  }
  if (res.statusCode !== 200) {
    // Without this branch a non-200 response would just log `null`.
    console.log(`請求失敗,狀態碼:${res.statusCode}`);
    return;
  }

  // `encoding: null` makes request return the raw bytes, which are then
  // decoded as gb2312 before being parsed.
  const html = iconv.decode(body, "gb2312");
  // Declared locally so no implicit global `$` is created.
  const $ = cheerio.load(html);
  const lis = $(".list ul li a img");

  for (let i = 0; i < lis.length; i++) {
    // NOTE(review): the element at index 2 is deliberately skipped —
    // presumably it is not a wallpaper entry; confirm against the page.
    if (lis[i].attribs && i !== 2) {
      urls.push(lis[i].attribs);
    }
  }
});
這裏先用request獲取網頁的res.body信息,用cheerio.load轉換爲一個cheerio可操作的對象,後面的操作和jQuery是一樣的
這樣,咱們獲取的圖片信息就存儲在了urls數組中。
下面加上下載圖片的代碼
// Fetch the listing page, collect the attributes of each matched <img> into
// `urls`, then download every collected image into ../壁紙/.
request.get({ encoding: null, url: imgUrl }, function (err, res, body) {
  if (err) {
    console.log(err);
    return;
  }
  if (res.statusCode !== 200) {
    // Without this branch a non-200 response would just log `null`.
    console.log(`請求失敗,狀態碼:${res.statusCode}`);
    return;
  }

  // `encoding: null` makes request return the raw bytes, which are then
  // decoded as gb2312 before being parsed.
  const html = iconv.decode(body, "gb2312");
  // Declared locally so no implicit global `$` is created.
  const $ = cheerio.load(html);
  const lis = $(".list ul li a img");

  for (let i = 0; i < lis.length; i++) {
    // NOTE(review): the element at index 2 is deliberately skipped —
    // presumably it is not a wallpaper entry; confirm against the page.
    if (lis[i].attribs && i !== 2) {
      urls.push(lis[i].attribs);
    }
  }

  console.log(`開始下載...`);
  const startTime = new Date();

  // The downloads are asynchronous, so the summary may only be printed once
  // every one of them has finished or failed.
  const reportDone = () => {
    const endTime = new Date();
    console.log(`下載完成,共${urls.length}張圖片,耗時${endTime-startTime}ms`);
  };

  let pending = urls.length;
  if (pending === 0) {
    reportDone();
    return;
  }

  // createWriteStream fails if the target directory does not exist.
  fs.mkdirSync("../壁紙", { recursive: true });

  for (const img of urls) {
    const file = fs.createWriteStream(`../壁紙/${img.alt}.jpg`);
    let settled = false;
    // Count each download exactly once, whether it succeeded or failed.
    const settle = (error) => {
      if (settled) {
        return;
      }
      settled = true;
      if (error) {
        console.log(error);
      }
      pending -= 1;
      if (pending === 0) {
        reportDone();
      }
    };

    file.on("finish", () => settle());
    file.on("error", settle);
    request(img.src)
      .on("error", (error) => {
        settle(error);
        // Close the file so the descriptor is not leaked on a failed request.
        file.end();
      })
      .pipe(file);
  }
});
用函數封裝起來,就是完整版啦
const cheerio = require("cheerio");
const fs = require("fs");
const request = require("request");
const iconv = require("iconv-lite");

// Base URL of the listing category. It is never mutated, so getImg can be
// called more than once without corrupting the address.
const imgUrl = "http://www.netbian.com/fengjing/";
// Directory the downloaded images are written to.
const outputDir = "../壁紙";
// Attributes of every image discovered so far, across all getImg calls.
const urls = [];

/**
 * Turn an image's alt text into a usable file name.
 * Characters that are path separators or otherwise commonly rejected in file
 * names are replaced with "_"; a missing or empty alt uses the fallback.
 *
 * @param {string|undefined} alt - alt attribute of the image
 * @param {string} fallback - name to use when alt is missing or empty
 * @returns {string} file name without extension
 */
function toFileName(alt, fallback) {
  const raw = typeof alt === "string" ? alt : "";
  const cleaned = raw.replace(/[\\/:*?"<>|]/g, "_").trim();
  return cleaned === "" ? fallback : cleaned;
}

/**
 * Stream one remote file to disk and call `done` exactly once when the
 * transfer has finished or failed. Failures are logged, not thrown.
 *
 * @param {string} src - URL to download
 * @param {string} dest - path of the file to write
 * @param {Function} done - called with no arguments when settled
 */
function downloadImage(src, dest, done) {
  const file = fs.createWriteStream(dest);
  let settled = false;
  const settle = (error) => {
    if (settled) {
      return;
    }
    settled = true;
    if (error) {
      console.log(error);
    }
    done();
  };

  file.on("finish", () => settle());
  file.on("error", settle);
  request(src)
    .on("error", (error) => {
      settle(error);
      // Close the file so the descriptor is not leaked on a failed request.
      file.end();
    })
    .pipe(file);
}

/**
 * Fetch one listing page, record the images found on it in `urls`, and
 * download them into the output directory.
 *
 * @param {number} page - 1-based page number; must be a positive integer
 */
function getImg(page) {
  // Validate before announcing anything; this also rejects NaN and fractions.
  if (!Number.isInteger(page) || page < 1) {
    console.log("請輸入正確的頁數");
    return;
  }
  console.log(`正在獲取第${page}頁的圖片`);

  // Page 1 is index.htm; later pages are index_<n>.htm.
  const path = page === 1 ? "index.htm" : `index_${page}.htm`;
  const pageUrl = imgUrl + path;

  request.get({ encoding: null, url: pageUrl }, function (err, res, body) {
    if (err) {
      console.log(err);
      return;
    }
    if (res.statusCode !== 200) {
      // Without this branch a non-200 response would just log `null`.
      console.log(`請求失敗,狀態碼:${res.statusCode}`);
      return;
    }

    // `encoding: null` makes request return the raw bytes, which are then
    // decoded as gb2312 before being parsed.
    const html = iconv.decode(body, "gb2312");
    // Declared locally so no implicit global `$` is created.
    const $ = cheerio.load(html);
    const lis = $(".list ul li a img");

    // Images found on this page only, so earlier pages are not re-downloaded.
    const pageImages = [];
    for (let i = 0; i < lis.length; i++) {
      const attribs = lis[i].attribs;
      // NOTE(review): the element at index 2 is deliberately skipped —
      // presumably it is not a wallpaper entry; confirm against the page.
      if (attribs && typeof attribs.src === "string" && i !== 2) {
        pageImages.push(attribs);
        urls.push(attribs);
      }
    }

    console.log(`開始下載...`);
    const startTime = new Date();

    // The downloads are asynchronous, so the summary may only be printed
    // once every one of them has finished or failed.
    const reportDone = () => {
      const endTime = new Date();
      console.log(`下載完成,共${pageImages.length}張圖片,耗時${endTime-startTime}ms`);
    };

    let pending = pageImages.length;
    if (pending === 0) {
      reportDone();
      return;
    }

    // createWriteStream fails if the target directory does not exist.
    fs.mkdirSync(outputDir, { recursive: true });

    pageImages.forEach((img, index) => {
      const name = toFileName(img.alt, `wallpaper_${page}_${index}`);
      downloadImage(img.src, `${outputDir}/${name}.jpg`, () => {
        pending -= 1;
        if (pending === 0) {
          reportDone();
        }
      });
    });
  });
}

getImg(4);
這個入門級的爬蟲功能還比較簡單,我還會慢慢完善。最近在嘗試一些複雜的爬蟲,但願能和你們多多交流,共同進步