nodejs爬蟲實踐-入門版

需求場景:
當你想分析微信生態內什麼產品最受歡迎的時候
當你想參考他們的點子與設計的時候……

你需要數據,你需要爬蟲。

工具庫

分析目標頁面 Dom 結構

獲取 JSON 格式數據

// Fetch the listing page; the response body is the raw HTML
const res = await axios.get(url)
const html = res.data
// Parse the HTML and collect one record per element matched by config.dom
const $ = cheerio.load(html)
const items = []
$(config.dom).each((_, node) => {
  const $anchor = $(node)
  // Every "/" in the title is replaced with "-"
  const title = $anchor.attr('title').replace(/\//g, '-')
  const href = $anchor.attr('href')
  const desc = $anchor.find('.desc').text()
  const count = $anchor.find('.opened i').text()
  items.push({ title, href, desc, count })
})
// Print the number of records together with the records themselves
console.log({ total: items.length, items })
複製代碼

結果如下

{
  total: 70,
  items: [
    {
      title: '微報名',
      href: 'https://weixiao.qq.com/store/details/10007',
      desc: '在線收集報名信息,讓校園活動更便捷',
      count: '43968'
    },
    {
      title: '微上牆',
      href: 'https://weixiao.qq.com/store/details/10008',
      desc: '線下活動中,經過大屏幕同步展現現場觀衆發送的微信消息',
      count: '34967'
    },
    ...
  ]
}
複製代碼

項目所屬圖片資源獲取

注意圖片地址有多種形式(站內相對路徑、完整的 https 地址、其他域名的 http 地址):

// /img/home/apps/preview/apply_01.jpg
// https://weixiao.qq.com/img/home/apps/preview/apply_01.jpg
// http://p.qpic.cn/weixiao/0/1481391605/640
複製代碼

下載單張圖片資源

/**
 * Download a single image and save it to disk.
 *
 * @param {string} imageSrc - Image URL. A value that does not start with
 *   http:// or https:// gets "https:" prepended.
 * @param {string} fileName - Path of the file to write.
 * @returns {Promise<void>} Resolves once the file has been fully written;
 *   rejects when the request, the response stream or the file write fails.
 */
const downloadImage = async (imageSrc, fileName) => {
  const url = /^http(s?):\/\//.test(imageSrc) ? imageSrc : `https:${imageSrc}`
  const res = await axios({
    url,
    method: 'get',
    responseType: 'stream'
  })
  // pipe() returns immediately, so wait for the write stream to finish;
  // otherwise callers would continue while the file is still incomplete.
  await new Promise((resolve, reject) => {
    const file = fs.createWriteStream(fileName)
    file.on('finish', resolve)
    file.on('error', reject)
    res.data.on('error', err => {
      // pipe() does not close the destination when the source fails
      file.destroy()
      reject(err)
    })
    res.data.pipe(file)
  })
}
複製代碼

下載單個相冊圖片資源

/**
 * Download every preview image of one album into its own sub-folder.
 *
 * @param {{href: string, title: string}} item - Album entry: `href` is the
 *   detail page to fetch, `title` names the sub-folder and the log lines.
 * @returns {Promise<void>} Resolves after the images have been downloaded
 *   one after another.
 */
const downloadPhoto = async item => {
  const url = item.href
  const res = await axios.get(url)
  const html = res.data
  // Image descriptors ({ src, type }) collected in page order
  const images = []
  // Parse the detail page
  const $ = cheerio.load(html)
  $('.preview .preview-img').each((idx, element) => {
    const $img = $(element)
    // Files are always saved with a "png" extension
    const type = 'png'
    let src = $img.attr('src')
    // Sources without an http(s) scheme are prefixed with the site origin
    if (!/^http(s?):\/\//.test(src)) {
      src = `https://weixiao.qq.com${src}`
    }
    images.push({
      src,
      type
    })
  })
  console.log({
    total: images.length,
    items: images
  })
  // Create the album sub-folder
  const folderPath = path.resolve(__dirname, `${baseDir}/${item.title}`)
  mkdirSync(folderPath)
  for (let i = 0; i < images.length; i++) {
    // Kept distinct from the album `item` so the progress log below can
    // still read the album title.
    const image = images[i]
    await downloadImage(image.src, `${folderPath}/${i + 1}.${image.type}`)
    console.log(`已下載:${item.title}-${i + 1}`)
  }
}
複製代碼

執行代碼如下

逐個資源下載,可以優化爲使用 Promise.all 統一下載。

逐個下載

const axios = require('axios')
const cheerio = require('cheerio')
const fs = require('fs')
const path = require('path')

// Scrape target: site origin, listing route and selector of the entries to extract
const config = {
  url: 'https://weixiao.qq.com', // target origin
  route: 'store/labels?tag=0&order=2', // path and query of the listing page
  dom: '.app-list a.app-item' // selector of the elements to process
}

// Download root, given relative to this script's directory, and its resolved absolute path
const baseDir = `./qq-app-download`
const savePath = path.resolve(__dirname, baseDir)

/**
 * Create a directory unless it already exists, and log which case applied.
 *
 * @param {string} dirPathStr - Path of the directory to create.
 */
const mkdirSync = dirPathStr => {
  const alreadyThere = fs.existsSync(dirPathStr)
  if (alreadyThere) {
    console.log(`文件夾已存在:${dirPathStr}`)
    return
  }
  fs.mkdirSync(dirPathStr)
  console.log(`文件夾已生成:${dirPathStr}`)
}

// Create the download root up front; album folders are later created inside it
mkdirSync(savePath)

/**
 * Download one image and write it to a file.
 *
 * @param {string} imageSrc - Image URL. A value that does not start with
 *   http:// or https:// gets "https:" prepended.
 * @param {string} fileName - Destination file path.
 * @returns {Promise<void>} Resolves when the file has been fully written;
 *   rejects when the request, the response stream or the file write fails.
 */
const downloadImage = async (imageSrc, fileName) => {
  const url = /^http(s?):\/\//.test(imageSrc) ? imageSrc : `https:${imageSrc}`
  const res = await axios({
    method: 'get',
    url,
    responseType: 'stream'
  })
  // pipe() only starts the transfer. Waiting for "finish" makes the awaited
  // promise mean "file is on disk", so "download finished" logs are truthful.
  await new Promise((resolve, reject) => {
    const file = fs.createWriteStream(fileName)
    file.on('finish', resolve)
    file.on('error', reject)
    res.data.on('error', err => {
      // pipe() does not close the destination when the source fails
      file.destroy()
      reject(err)
    })
    res.data.pipe(file)
  })
}

/**
 * Download all preview images of one album, one after another, into a
 * sub-folder named after the album title.
 *
 * @param {{href: string, title: string}} item - Album entry: `href` is the
 *   detail page to fetch, `title` names the sub-folder and the log lines.
 * @returns {Promise<void>}
 */
const downloadPhoto = async item => {
  const { href, title } = item
  const { data: html } = await axios.get(href)
  // Parse the detail page and collect its preview images
  const $ = cheerio.load(html)
  const images = []
  $('.preview .preview-img').each((_, node) => {
    const rawSrc = $(node).attr('src')
    // Sources without an http(s) scheme are prefixed with the site origin
    const src = /^http(s?):\/\//.test(rawSrc)
      ? rawSrc
      : `https://weixiao.qq.com${rawSrc}`
    // Files are always saved with a "png" extension
    images.push({ src, type: 'png' })
  })
  // Sub-folder that holds this album's images
  const folderPath = path.resolve(__dirname, `${baseDir}/${title}`)
  mkdirSync(folderPath)
  let seq = 0
  for (const image of images) {
    seq += 1
    await downloadImage(image.src, `${folderPath}/${seq}.${image.type}`)
    console.log(`[${title}] - ${seq} 下載完成`)
  }
}

/**
 * Download the albums of the listing page one after another.
 *
 * @param {Array<{href: string, title: string}>} items - Album entries, in
 *   the order they should be downloaded.
 * @param {number} [limit=3] - Maximum number of albums to download. The
 *   default keeps the three-album sample run; pass `Infinity` for all.
 * @returns {Promise<void>}
 */
const downloadImgList = async (items, limit = 3) => {
  // Clamp to the list length so a page with fewer albums than `limit`
  // does not hand `undefined` to downloadPhoto.
  const selected = items.slice(0, Math.max(0, limit))
  for (const item of selected) {
    // Download one album
    await downloadPhoto(item)
    console.log(`相冊 [${item.title}] 下載完成`)
  }
}

/**
 * Entry point: fetch the listing page, extract the album entries, download
 * them via downloadImgList and print the elapsed time in seconds.
 *
 * @returns {Promise<void>}
 */
const main = async () => {
  const startedAt = Date.now()
  const { data: html } = await axios.get(`${config.url}/${config.route}`)
  // Parse the listing page and build one record per matched element
  const $ = cheerio.load(html)
  const items = []
  $(config.dom).each((_, node) => {
    const $anchor = $(node)
    // The title later becomes a folder name, so "/" is replaced with "-"
    const title = $anchor.attr('title').replace(/\//g, '-')
    const href = $anchor.attr('href')
    const desc = $anchor.find('.desc').text()
    const count = $anchor.find('.opened i').text()
    items.push({ title, href, desc, count })
  })
  console.log({ total: items.length, items })
  await downloadImgList(items)
  const elapsedSeconds = (Date.now() - startedAt) / 1000
  console.log(`總耗時=>${elapsedSeconds}s`)
}

// Start the crawl. A failure is reported and turned into a non-zero exit
// code instead of being left as an unhandled promise rejection.
main().catch(err => {
  console.error(err)
  process.exitCode = 1
})
複製代碼

統一下載

const axios = require('axios')
const cheerio = require('cheerio')
const fs = require('fs')
const path = require('path')

// Scrape target: site origin, listing route and selector of the entries to extract
const config = {
  url: 'https://weixiao.qq.com', // target origin
  route: 'store/labels?tag=0&order=2', // path and query of the listing page
  dom: '.app-list a.app-item' // selector of the elements to process
}

// Download root, given relative to this script's directory, and its resolved absolute path
const baseDir = `./qq-app-download`
const savePath = path.resolve(__dirname, baseDir)

/**
 * Create a directory unless it already exists, and log which case applied.
 *
 * @param {string} dirPathStr - Path of the directory to create.
 */
const mkdirSync = dirPathStr => {
  const alreadyThere = fs.existsSync(dirPathStr)
  if (alreadyThere) {
    console.log(`文件夾已 存在 :${dirPathStr}`)
    return
  }
  fs.mkdirSync(dirPathStr)
  console.log(`文件夾已 生成 :${dirPathStr}`)
}

// Create the download root up front; album folders are later created inside it
mkdirSync(savePath)

/**
 * Wait until every task in `arr` has fulfilled.
 *
 * @param {Iterable<Promise<unknown>|unknown>} arr - Tasks to wait for.
 * @returns {Promise<void>} Resolves with no value once all tasks fulfilled;
 *   rejects with the reason of the first task that rejects.
 */
const taskPromiseAll = async arr => {
  // Promise.all already returns a promise, so it is awaited directly rather
  // than being wrapped in a hand-built one; its results are discarded.
  await Promise.all(arr)
}

/**
 * Download one image and write it to a file.
 *
 * @param {string} imageSrc - Image URL. A value that does not start with
 *   http:// or https:// gets "https:" prepended.
 * @param {string} fileName - Destination file path.
 * @returns {Promise<void>} Resolves when the file has been fully written;
 *   rejects when the request, the response stream or the file write fails.
 */
const downloadImage = async (imageSrc, fileName) => {
  const url = /^http(s?):\/\//.test(imageSrc) ? imageSrc : `https:${imageSrc}`
  const res = await axios({
    url,
    method: 'get',
    responseType: 'stream'
  })
  // Callers combine these promises with Promise.all, so the promise must
  // settle only after the data is on disk, not right after pipe() is called.
  await new Promise((resolve, reject) => {
    const file = fs.createWriteStream(fileName)
    file.on('finish', resolve)
    file.on('error', reject)
    res.data.on('error', err => {
      // pipe() does not close the destination when the source fails
      file.destroy()
      reject(err)
    })
    res.data.pipe(file)
  })
}

/**
 * Download all preview images of one album into a sub-folder named after
 * the album title, starting every image download before waiting on them.
 *
 * @param {{href: string, title: string}} item - Album entry: `href` is the
 *   detail page to fetch, `title` names the sub-folder.
 * @returns {Promise<void>}
 */
const downloadPhoto = async item => {
  const { data: html } = await axios.get(item.href)
  // Parse the detail page and collect its preview images
  const $ = cheerio.load(html)
  const images = []
  $('.preview .preview-img').each((_, node) => {
    const rawSrc = $(node).attr('src')
    // Sources without an http(s) scheme are prefixed with the site origin
    const src = /^http(s?):\/\//.test(rawSrc)
      ? rawSrc
      : `https://weixiao.qq.com${rawSrc}`
    // Files are always saved with a "png" extension
    images.push({ src, type: 'png' })
  })
  // Sub-folder that holds this album's images
  const folderPath = path.resolve(__dirname, `${baseDir}/${item.title}`)
  mkdirSync(folderPath)
  // Start every download, then wait for the whole batch
  const downloads = []
  images.forEach((image, position) => {
    downloads.push(
      downloadImage(image.src, `${folderPath}/${position + 1}.${image.type}`)
    )
  })
  await taskPromiseAll(downloads)
}

/**
 * Download every album of the listing page, starting all album downloads
 * before waiting for them to complete.
 *
 * @param {Array<{href: string, title: string}>} items - Album entries.
 * @returns {Promise<void>}
 */
const downloadImgList = async items => {
  const pending = []
  for (const album of items) {
    pending.push(downloadPhoto(album))
  }
  await taskPromiseAll(pending)
}

/**
 * Entry point: fetch the listing page, extract the album entries, download
 * them via downloadImgList and print the elapsed time in seconds.
 *
 * @returns {Promise<void>}
 */
const main = async () => {
  const startedAt = Date.now()
  const { data: html } = await axios.get(`${config.url}/${config.route}`)
  // Parse the listing page and build one record per matched element
  const $ = cheerio.load(html)
  const items = []
  $(config.dom).each((_, node) => {
    const $anchor = $(node)
    // The title later becomes a folder name, so "/" is replaced with "-"
    const title = $anchor.attr('title').replace(/\//g, '-')
    const href = $anchor.attr('href')
    const desc = $anchor.find('.desc').text()
    const count = $anchor.find('.opened i').text()
    items.push({ title, href, desc, count })
  })
  console.log({ total: items.length, items })
  await downloadImgList(items)
  const elapsedSeconds = (Date.now() - startedAt) / 1000
  console.log(`耗時=>${elapsedSeconds}`)
}

// Start the crawl. A failure is reported and turned into a non-zero exit
// code instead of being left as an unhandled promise rejection.
main().catch(err => {
  console.error(err)
  process.exitCode = 1
})
複製代碼

優化

資源統一下載

優化後運行對比如下(下載 3 個相冊)

| 按文件下載 | 按相冊下載 | 統一下載 |
| --- | --- | --- |
| 11.5s | 8.5s | 3.8s |
相關文章
相關標籤/搜索