#!/usr/local/bin/node
/**
 * @author: vanishcode
 * @desc: Small general-purpose image crawler. It opens a listing page in
 *        headless Chromium (puppeteer), downloads every image linked from
 *        that page into ./meizi, then follows the "previous comment page"
 *        link and repeats until the picture counter reaches its limit or
 *        there is no further page.
 */
'use strict';

const puppeteer = require('puppeteer');
const axios = require('axios');
const fs = require('fs');
const path = require('path');
const { pipeline } = require('stream');
const { promisify } = require('util');

// Promise-returning pipeline: unlike .pipe(), it rejects when either the
// HTTP stream or the file stream fails, and it destroys both streams.
const pipelineAsync = promisify(pipeline);

/** Directory (relative to the working directory) that receives the images. */
const OUTPUT_DIR = './meizi';
/** Crawling stops once `currentNumber` reaches this value (files 1..199). */
const STOP_AT_NUMBER = 200;
/** Anchors whose href points at a full-size image. */
const IMAGE_LINK_SELECTOR = 'a.view_img_link';
/** Anchor that leads to the next page to crawl. */
const NEXT_PAGE_SELECTOR =
  '#comments > div:nth-child(4) > div > a.previous-comment-page';
/** Pause between two listing pages, in milliseconds. */
const PAGE_DELAY_MS = 3000;

/** Number used for the next saved file; shared across all pages. */
let currentNumber = 1;

/**
 * Logs an unrecoverable crawl error and marks the process as failed.
 *
 * @param {Error} err - the error that aborted the crawl
 */
function reportFailure(err) {
  console.error('Crawl failed:', err);
  process.exitCode = 1;
}

/**
 * Streams one image to `OUTPUT_DIR/<number><ext>`.
 *
 * The extension is taken from the URL path (so query strings and 4-letter
 * extensions such as ".jpeg" work); ".jpg" is used when the path has none.
 *
 * @param {string} imageUrl - absolute URL of the image
 * @param {number} number - sequence number used as the file name
 * @returns {Promise<void>} resolves once the file is fully written
 * @throws rejects on an invalid URL, an HTTP failure, or a write failure
 */
async function downloadImage(imageUrl, number) {
  const extension = path.extname(new URL(imageUrl).pathname) || '.jpg';
  const target = path.join(OUTPUT_DIR, `${number}${extension}`);
  const response = await axios.get(imageUrl, { responseType: 'stream' });
  await pipelineAsync(response.data, fs.createWriteStream(target));
}

/**
 * Crawls one listing page and schedules the crawl of the following page.
 *
 * A browser is launched for this page only and is always closed again,
 * even when navigation or scraping throws. Individual download failures
 * are logged and skipped (their file number stays unused); they do not
 * abort the crawl.
 *
 * @param {string} url - address of the listing page to crawl
 * @returns {Promise<void>} resolves when this page is done; the next page,
 *   if any, is started later from a timer
 * @throws rejects when the browser cannot be launched or the page cannot
 *   be loaded or evaluated
 */
async function run(url) {
  console.log('Start to crawl girl\'s pivtures...');
  fs.mkdirSync(OUTPUT_DIR, { recursive: true });

  const browser = await puppeteer.launch();
  let nextPage = null;
  try {
    const page = await browser.newPage();
    await page.goto(url);

    const imgURL = await page.evaluate(
      (selector) => Array.from(document.querySelectorAll(selector), (a) => a.href),
      IMAGE_LINK_SELECTOR
    );

    // Reserve file numbers synchronously, before any download completes,
    // so the limit is exact and concurrent downloads never share a name.
    const remaining = Math.max(STOP_AT_NUMBER - currentNumber, 0);
    const downloads = imgURL.slice(0, remaining).map((imageUrl) => {
      const number = currentNumber;
      currentNumber += 1;
      return downloadImage(imageUrl, number).catch((err) => {
        console.error(`Failed to download ${imageUrl}: ${err.message}`);
      });
    });
    await Promise.all(downloads);

    if (currentNumber < STOP_AT_NUMBER) {
      // The last page has no such link; report that as null, not a throw.
      nextPage = await page.evaluate((selector) => {
        const link = document.querySelector(selector);
        return link ? link.href : null;
      }, NEXT_PAGE_SELECTOR);
    }
  } finally {
    await browser.close();
  }

  if (currentNumber >= STOP_AT_NUMBER) {
    console.log('All pictures downloaded complete!');
    return;
  }
  if (!nextPage) {
    console.log('No further page found, stopping.');
    return;
  }

  console.log('OK!');
  setTimeout(() => {
    run(nextPage).catch(reportFailure);
  }, PAGE_DELAY_MS);
}

run('http://jandan.net/ooxx').catch(reportFailure);
較爲通用的小爬蟲,其實妹子圖網站都是同一種結構,需要的信息只有圖片的選擇器以及下一頁的地址。
本項目使用了 puppeteer,確實十分強大,個人感覺 headless 差不多是 Chromium 暴露接口後的一種產物。在 macOS 上其實有一點小瑕疵:啓動的時候圖標會顯示出來,然後馬上就消失了,自己可以用定時器看一下。
代碼是比較水的,重要的是圖,是吧……