bilibili壁紙站-node爬蟲

時間 2019-11-07

原文鏈接

前言

以前初學node的時候，有用爬蟲爬過一些磁力鏈接
詳情見羞羞的node爬蟲
可是沒有併發，沒有代理，那時也對異步不是很瞭解
因此此次又寫了個爬蟲，爬取bilibili壁紙站的全部壁紙
而且爬取開心代理的100條ip，並將有用的ip存進json文件中

用到的模塊

async （控制併發）
cheerio （解析DOM）
superagent （http庫）
superagent-proxy （使用代理）
fs （讀寫文件）

其中cheerio, superagent的具體用法見我以前的羞羞的node爬蟲
不過以前初學，代碼寫得很難看就對了

爬取代理ip

代理ip是幹嗎的

咱們訪問互聯網資源時，都是用咱們本身的ip（身份證）去訪問的
而爬蟲得頻繁地去獲取互聯網資源
所以若是你在某個時間點頻繁地訪問某網站的某資源
形成該網站的服務器壓力
就有可能被網站管理者禁ip，從而訪問不了該網站
代理ip就是僞造身份去訪問

怎麼檢驗ip的可用性

這裏面就使用到了 superagent 的一個拓展 superagent-proxy
而後用其去訪問http://ip.chinaz.com/getip.aspx
若 3s 內能返回值，則證實該 ip 可用

const superagent = require('superagent')
require('superagent-proxy')(superagent);

// Proxy to probe, written as http://<ip>:<port>. The address below is only a sample value.
let testIp = 'http://61.178.238.122:63000';

// Probe the proxy once: request the IP-echo page through it with a 3 s timeout
// and report exactly one outcome (no response / request error / success).
(function() {
  superagent.get('http://ip.chinaz.com/getip.aspx').proxy(testIp).timeout(3000)
  .end((err, res) => {
    // No response object at all
    if(res === undefined) {
      console.log('掛了');
      return
    }
    // A response arrived but the request is still flagged as failed;
    // stop here so the success line below is not printed for a failed request.
    if(err) {
      console.log('報錯啦')
      return
    }
    console.log('成功： ' + res.text)
  })
}())

爬取ip並存儲

首先咱們先看下咱們要爬取的開心代理的DOM

咱們要爬取的ip地址放在tr 標籤的第一個td上
而且點擊第二頁時，連接變爲http://www.kxdaili.com/dailiip/1/2.html#ip
連接上的數字表示的是頁數，也就是說咱們只要改變連接上數字的值
就能夠獲取到其餘頁的html

代碼以下：

const superagent = require('superagent')
const cheerio = require('cheerio')
const fs = require('fs')
const apiFunc = require('../common/apiFunc')  // 封裝的一些讀寫api

// Base address of the kxdaili proxy-list site that getIp crawls for proxy IPs
const website = 'http://www.kxdaili.com'
// Listing path prefix; getIp appends the page number and '.html' to it for each request
let url = website + '/dailiip/1/'

/**
 * Main entry point: crawl listing pages 1-10, run every scraped proxy through
 * apiFunc.filterIp, and append the ones that pass to the list stored in ./ip.js.
 *
 * The returned promise settles only after the filtering AND the write have
 * finished, so callers can await completion and observe failures.
 * (Previously the Promise.all chain was left floating.)
 *
 * NOTE(review): assumes ./ip.js contains a JSON array and that filterIp resolves
 * to `undefined` for unusable proxies - confirm against ../common/apiFunc.
 *
 * @returns {Promise<void>}
 */
let getIp = async function() {
  // One pending filter promise per scraped proxy
  let tasks = []

  // Proxies already stored in ip.js
  let ips = await apiFunc.readFile('./ip.js')
  ips = JSON.parse(ips)

  // Pages are fetched one after another; the proxy checks themselves run concurrently
  for(let page = 1; page <= 10; page++) {
    let res = await superagent.get(url + page +'.html')
    let $ = cheerio.load(res.text)
    let tr = $('tbody>tr')

    for(let i = 0; i < tr.length; i++) {
      // First cell is used as the host, second cell as the port
      let td = $(tr[i]).children('td')
      let proxy = 'http://' + $(td[0]).text() + ':' + $(td[1]).text()
      tasks.push(apiFunc.filterIp(proxy))
    }
  }

  // Wait for every check, then drop the entries that came back undefined
  let arr = await Promise.all(tasks)
  let usefulIp = arr.filter((item) => item !== undefined)

  ips = JSON.stringify(ips.concat(usefulIp))
  console.log(ips)
  // await is harmless if writeFile is synchronous and required if it returns a promise
  await apiFunc.writeFile('./ip.js', ips)
}

// Run the crawl once when this file is loaded. A rejection is logged and turned
// into a failing exit code instead of being left as an unhandled rejection.
getIp().catch((err) => {
  console.log(err)
  process.exitCode = 1
})

// Also expose the function so other scripts can trigger a crawl themselves
module.exports = getIp

爬取bilibili壁紙站

咱們先進入bilibili壁紙站

發現有一個點擊加載更多的按鈕
若是有對前端有了解的話，咱們應該知道這是經過 ajax 請求來異步獲取數據
所以咱們打開開發者工具的Network

果真在 XHR 這一欄發現了一個api
裏面返回的是存儲了當前頁面全部壁紙縮略圖信息的json文件
僅依靠這個json文件，咱們即可以爬取全部壁紙的縮略圖
可咱們要的可是高清大圖啊

因而咱們隨意點擊一張縮略圖

發現它的url的參數（il_id, width, height）都來自咱們以前獲取的json內的數據
也就是說咱們能夠拼接該連接來獲取到該高清圖片的連接，再利用cheerio來解析DOM獲取圖片地址就ok了
！！！
！！！
！！！
然而，哈哈哈哈哈哈哈哈哈哈哈哈
當咱們獲取到該網頁的html後，發現該<img>標籤內的src是空的
也就是說該<img>也是js賦值，因此下意識又去看了NetWork的XHR
果真發現了另外一個api

而高清圖片的url就是該api返回的json數據中的il_file

所以咱們只須要拼接該api連接，再用superagent請求就能夠獲取到高清圖片的url

理下思路

1. 獲取縮略圖api返回的包含高清圖片數據的json
2. 將1的json數據拼接到高清圖片api連接上，並將全部api連接存入數組
3. 併發獲取2數組中的api，獲取全部的圖片url，並將url存入數組
4. 併發下載數組中的圖片url，存進本地文件夾

結果在爬取bilibili壁紙站時，是不須要解析DOM的，也就是不須要使用cheerio模塊啦

代碼以下：

const superagent = require('superagent')
require('superagent-proxy')(superagent);
const fs = require('fs')
const cheerio = require('cheerio')
const async = require('async')

// Wallpaper-list endpoint (action=getOptions, page 1) whose JSON body getPicJson fetches and parses
let jsonUrl = 'http://h.bilibili.com/wallpaperApi?action=getOptions&page=1'
// Proxy address passed to .proxy() on every superagent request in this script (hard-coded sample)
let proxy = "http://218.201.98.196:3128"

/**
 * Fetch `jsonUrl` through `proxy` and parse the response text as JSON.
 *
 * The promise always settles: it rejects on a request error, a missing
 * response, a non-200 status, or a body that is not valid JSON.
 * (Previously those cases left the promise pending forever, or threw from
 * inside the callback.)
 *
 * @returns {Promise<*>} the parsed JSON body
 */
let getPicJson = function () {
  return new Promise((resolve, reject) => {
    superagent
      .get(jsonUrl)
      .proxy(proxy)
      .end((err, res) => {
        if (err) {
          console.log('代理出錯啦')
          reject(err)
          return
        }
        if (res === undefined || res.statusCode !== 200) {
          const status = res === undefined ? 'no response' : res.statusCode
          reject(new Error('unexpected response from ' + jsonUrl + ': ' + status))
          return
        }
        // JSON.parse throws on malformed text; route that into the rejection
        try {
          resolve(JSON.parse(res.text))
        } catch (parseErr) {
          reject(parseErr)
        }
      })
  })
}

/**
 * Turn the list returned by getPicJson into request descriptors for the
 * HD-detail API.
 *
 * For every entry except the first, reads `detail[0]` and produces
 * `{ title, url }`, where `url` is the getDetail endpoint for that `il_id`.
 *
 * NOTE(review): entry 0 is skipped, as before - presumably it is not a
 * wallpaper record; confirm against a real API response.
 *
 * @returns {Promise<Array<{title: *, url: string}>>}
 */
let dealHd = async function () {
  const thumbs = await getPicJson()

  return thumbs.slice(1).map((entry) => {
    const { il_id, title } = entry.detail[0]
    return {
      title,
      url: `http://h.bilibili.com/wallpaperApi?action=getDetail&il_id=${il_id}`
    }
  })
}

/**
 * Resolve the HD image URL of every entry produced by dealHd, using an
 * async.queue with a concurrency of 5.
 *
 * Each task requests the detail API through `proxy` and, on success, records
 * `{ title, url, format }`, where `url` is `body[0].detail[0].il_file` and
 * `format` is the file extension taken from that URL. Failed requests and
 * responses without an `il_file` are logged and skipped.
 *
 * Every task, whatever its outcome, releases its slot after the random delay
 * that it logs, so the logged concurrency figure stays accurate and the
 * announced throttle is actually applied.
 *
 * NOTE(review): `q.drain = fn` is the async v2 style; newer releases of the
 * library may expect `q.drain(fn)` - confirm against the installed version.
 *
 * @returns {Promise<Array<{title: *, url: string, format: string}>>}
 */
let dealPicJson = async function () {

  console.log('獲取高清圖片url，開始執行....')
  let concurrencyCount = 0;
  let result = []
  let hdJson = await dealHd()
  return new Promise((resolve) => {

    let q = async.queue((hDJson, callback) => {
      // Random 0-999 ms pause before the slot is handed back
      const delay = parseInt((Math.random() * 30000000) % 1000, 10);
      concurrencyCount++;
      console.log('如今的併發數是', concurrencyCount, '，正在獲取的是', hDJson.title, '延遲', delay, '毫秒');

      // Single exit for the task: keeps the counter balanced on every path
      const finish = () => {
        setTimeout(() => {
          concurrencyCount--
          callback(null)
        }, delay)
      }

      superagent.get(hDJson.url).proxy(proxy).end((err, res) => {
        if (err) {
          console.log(err);
          finish()
          return
        }

        // Walk the expected shape defensively; a throw in here would be uncaught
        const first = res && res.body && res.body[0]
        const detail = first && first.detail && first.detail[0]
        const fileUrl = detail && detail.il_file
        if (typeof fileUrl !== 'string' || fileUrl === '') {
          console.log('no il_file in detail response for', hDJson.title)
          finish()
          return
        }

        // Extension after the last dot (query/hash ignored); if there is none,
        // fall back to the last three characters as before
        const extMatch = fileUrl.match(/\.([A-Za-z0-9]+)(?:[?#].*)?$/)
        result.push({
          title: hDJson.title,
          url: fileUrl,
          format: extMatch ? extMatch[1] : fileUrl.slice(-3)
        })
        finish()
      })
    }, 5)

    // Fires once every queued task has called back
    q.drain = function () {
      resolve(result)
    }

    q.push(hdJson)
  })
}


/**
 * Download every image returned by dealPicJson into ./picture, using an
 * async.queue with a concurrency of 5.
 *
 * Files are named `<n>-<title>.<format>`, where n counts successful
 * responses. Characters that are illegal in file names or that would change
 * the target directory are replaced with '_'. Failed requests and failed
 * writes are logged and skipped.
 *
 * The returned promise resolves once the queue reports that it has drained,
 * so callers can await the end of the whole download.
 *
 * NOTE(review): `q.drain = fn` is the async v2 style; newer releases of the
 * library may expect `q.drain(fn)` - confirm against the installed version.
 *
 * @returns {Promise<void>}
 */
let downloadImg = async function () {
  console.log('開始下載圖片...');

  // Every write would fail if the target directory were missing
  if (!fs.existsSync('./picture')) {
    fs.mkdirSync('./picture');
  }

  let downloadCount = 0;
  let concurrencyCount = 0;
  let q = async.queue(function (image, callback) {
    // Random 0-999 ms pause before the slot is handed back
    const delay = parseInt((Math.random() * 30000000) % 1000, 10);
    concurrencyCount++;
    console.log('如今的併發數是', concurrencyCount, '，正在抓取的是', image.title, '延遲', delay, '毫秒');

    // Single exit for the task: keeps the counter balanced on every path
    const finish = () => {
      setTimeout(() => {
        concurrencyCount--;
        callback(null);
      }, delay);
    };

    superagent.get(image.url).proxy(proxy).end(function (err, res) {
      if (err) {
        console.log(err);
        finish();
        return;
      }
      downloadCount++;
      // The title comes from the remote API; neutralise path separators and
      // other characters that are not valid in file names
      const fileName = `${downloadCount}-${image.title}.${image.format}`.replace(/[\\/:*?"<>|]/g, '_');
      fs.writeFile(`./picture/${fileName}`, res.body, function (err) {
        if (err) {
          console.log(err);
        } else {
          console.log("圖片下載成功");
        }
        finish();
      });
    });
  }, 5);

  let imgList = await dealPicJson();
  return new Promise((resolve) => {
    // Called once every queued task has finished
    q.drain = function () {
      console.log('All img download');
      resolve();
    };
    q.push(imgList); // enqueue every image at once
  });
}

// Start the whole pipeline. A rejection is logged and turned into a failing
// exit code instead of being left as an unhandled rejection.
downloadImg().catch((err) => {
  console.log(err)
  process.exitCode = 1
})

async控制併發

控制併發我一般是用async.mapLimit，由於最先接觸
不過看到一篇文章介紹了async.queue，我就試了下
區別在於， mapLimit會返回全部併發任務結束後的結果數組
而queue是沒有的，所以要本身定個變量來存放每個併發任務返回的結果
具體api用法見： async經常使用api

運行結果

後記

github代碼： bilibili壁紙站爬蟲
裏面有一些必要註釋
有4個能夠跑的js

./aboutIp/getIp.js （用來抓並存有用的代理ip）
./aboutIp/ipTest.js （測試ip可不可用）
app-thumbnails.js （用來爬壁紙的縮略圖）
app-hd.js （用來爬壁紙的高清圖）

雖然懂得很淺，但能漸漸感覺到爬蟲的魅力了。

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。