一個用於爬取www.nvshens.com上妹子圖片的爬蟲。若有侵權,立刻關閉
html
一張張下實在太麻煩了
node
0. node -v >= 7.6 1. git clone https://github.com/laihaibo/beauty-spider.git 2. npm i 3. npm run start (爬取相冊圖片連接,並保存爲json) 4. npm run calc (獲取爬取的相冊數和文件數) 5. npm run download (下載圖片文件)
圖片下載完以後會發現變成了盜鏈圖片。因而觀察瀏覽器正常瀏覽行爲。在請求頭中設置referer
, accept
和user-agent
。解決該問題
request.get(url).set({ 'Referer': 'https://www.google.com', 'Accept': 'image/webp,image/*,*/*;q=0.8', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3091.0 Safari/537.36' }).end((err, res) => {})
圖片下載700個文件時,常常斷線。應該是網站的反爬蟲機制起了做用,暫時沒法解決。從新下載時理應跳過已經下載的文件。因而在保存圖片時會先判斷圖片是否存在。
let isExit = fs.existsSync(path); if (!isExit) { saveOne(...args) }
let data = JSON.parse(fs.readFileSync(path)); let count = data.reduce((prev, cur) => prev + cur.imgList.length, 0); console.log(`共${data.length}個相冊,共${count}張圖片`);
引入所需的庫
const fs = require("fs"); const mkdirp = require('mkdirp'); const cheerio = require('cheerio'); const request = require('superagent'); require('superagent-charset')(request);
頁面分析,配置config文件
分析相冊地址,以韓國
這個標籤爲例,首頁爲https://www.nvshens.com/gallery/hanguo/
, 第二頁爲https://www.nvshens.com/gallery/hanguo/2.html
npm
const config = { current: 'hanguo', allTags: { rougan: `https://www.nvshens.com/gallery/rougan/`, hanguo: 'https://www.nvshens.com/gallery/hanguo/' } }
封裝獲取指定url的html內容函數
//該網站編碼爲utf-8 const getHtml = url => { return new Promise((resolve, reject) => { request.get(url).charset('utf-8').end((err, res) => { err ? reject(err) : resolve(cheerio.load(res.text)); }) }) }
獲取本分類下全部相冊的標籤數組
/** * @param {string} startUrl 標籤首頁的url地址 */ const getAlbums = (startUrl) => { return new Promise((resolve, reject) => { let albums = []; // 用於保存該標籤的全部相冊信息 let getQuery = async startUrl => { try { let $ = await getHtml(startUrl); let pages = $('#listdiv .pagesYY a').length; // 獲取頁數 for (let i = 1; i <= pages; i++) { let pageUrl = `${startUrl + i}.html` // 設置每頁的url let $ = await getHtml(pageUrl); // 動態設置pages的值 let compare = $('#listdiv .pagesYY a').map(function (i, el) { return parseInt($(this).text(), 0); }).get().filter(x => x > 0); pages = conmpare.length < 2 ? pages : compare.reduce((prev, cur) => Math.max(prev, cur)); $('.galleryli_title a').each(function () { albums.push({ title: $(this).text(), url: `https://www.nvshens.com${$(this).attr("href")}`, imgList: [], id: parseInt($(this).attr("href").split('/')[2], 10) }) }) } resolve(albums); // 返回相冊信息 } catch (error) { console.log(error); } } getQuery(startUrl); }) }
獲取全部相冊的圖片信息
/** * @param {string} startUrl 該相冊首頁的url地址 */ const getImgList = (startUrl) => { return new Promise((resolve, reject) => { let albums = []; // 存儲本相冊的全部圖片信息 let getQuery = async startUrl => { try { let $ = await getHtml(startUrl); let pages = $('#pages a').length; for (let i = 1; i <= pages; i++) { let pageUrl = `${startUrl + i}.html` let $ = await getHtml(pageUrl); $('#hgallery img').each(function () { let url = $(this).attr('src'); //圖片地址 let fileName = url.split('/').pop(); //文件名 let id = parseInt(fileName.split('.')[0], 10); //id albums.push({ url, fileName, id }) }) } resolve(albums); // 返回本相冊的全部圖片信息 } catch (error) { console.log(error); } } getQuery(startUrl); }) }
保存相冊信息
/** * @param {string} path 保存數據的路徑 * @param {array} albums 相冊信息數組 */ const saveData = (path, albums) => { fs.writeFile(path, JSON.stringify(albums, null, ' '), function (err) { err ? console.log(err) : console.log('Data saved'); }); }
保存圖片
/** 12. @param {string} title 圖片所在文件夾名 13. @param {string} url 圖片url 14. @param {string} fileName 圖片名 15. @param {array} imgList 單個相冊的圖片信息 */ // 保存一張圖片 const saveOne = (title, url, fileName) => { return new Promise((resolve, reject) => { let path = `./img/${currentImgType}/${title}/${fileName}`; request.get(url).end((err, res) => { if (err) { console.log(`Error: ${err} in getting ${url}`) } fs.writeFile(path, res.body, function (err) { if (err) console.log(`Error: ${err} in downloading ${url}`) }); resolve(); }) }) } //保存一個相冊下的多張圖片 const saveImg = ({title,imgList}) => { // 建立文件夾 mkdirp(`./img/${currentImgType}/${title}`, function (err) { if (err) { console.log(`Error: ${err} in makedir ${title}`); } }); let getQuery = async() => { try { for (let {url,fileName} of imgList) { await saveOne(title, url, fileName); } } catch (error) { console.log(error); } } // 打印下載一個相冊所需時間 console.time(`download ${title}...`) getQuery(); console.timeEnd(`download ${title}...`) }
執行爬蟲
const doSpider = async() => { try { // 獲取相冊信息 let albums = await getAlbums(allTags[current]); // 獲取每張圖片信息 for (let album of albums) { let imgList = await getImgList(album.url); album.imgList = imgList; } // 保存json let jsonPath = `./data`; mkdirp(jsonPath, function (err) { if (err) { console.log(`Error: ${err} in makedir of Json`); } }); saveData(`${jsonPath}/${currentImgType}.json`, albums); // 保存圖片 for (let value of albums) { saveImg(value) } } catch (error) { console.log(error); } }
有些坑若是不踩過一遍是不會吐血的,好比cheerio的操做和fs的操做
just do it
本文有參考nieheyong
的HanhandeSpider和其餘的爬蟲文章,獲得不少啓發