Node JS爬蟲：爬取瀑布流網頁高清圖

時間 2020-05-08

原文鏈接

以靜態內容爲主的網頁，往往用get方法就能獲取頁面全部內容；動態網頁（即異步請求數據的網頁）則需要等瀏覽器加載完成後再進行抓取。本文介紹了如何連續爬取瀑布流網頁。

在知乎提到python就必有一大幫人提起爬蟲，咱Node JS爬蟲也是非常簡單的，和python相比僅僅是「異步」和「多線程」的性能對比而已。我對python了解不多，故對此不作評價。

phantomjs是一個「無頭」（headless，即沒有圖形界面）的WebKit瀏覽器，具體安裝方法查看phantomjs.org。phantomjs提供命令行工具，運行腳本需使用命令phantomjs xxx.js。使用phantomjs-node這個庫（npm包名爲phantom）可以在Node Js中把玩phantomjs，這樣就可以使用pm2進行進程守護和負載均衡了。

目標

爬取200張以上的1920*1080分辨率的動漫壁紙，網頁是百度瀑布流圖片。

方式

瀑布流是根據頁面滾動位置來判斷是否繼續往下加載，故要利用phantomjs滾動頁面來獲取更多圖片鏈接。單個圖片詳細頁面剛進入時顯示的是壓縮過的圖片，這是百度優化訪問速度的措施，等待幾秒圖片src就會替換成大圖的鏈接。所以，進入圖片詳細頁時應延遲幾秒再獲取圖片src，具體延遲幾秒視你的網速而定。

步驟

獲取鏈接

首先利用phantom打開網頁

const phantom = require('phantom');

/*
 * Start PhantomJS, open the target page and set the viewport to 1920x1080.
 *
 * The semicolon after the `require` call above is mandatory: this statement
 * starts with `(`, so without it the parser would join both lines and try to
 * call the module object with the result of the IIFE.
 *
 * `url` is expected to be defined by the surrounding script.
 */
(async function() {
    const instance = await phantom.create();
    const page = await instance.createPage();
    const status = await page.open(url);
    if (status !== 'success') {
        // Do not leave the PhantomJS child process running when nothing was loaded.
        await instance.exit();
        throw new Error(`Failed to open ${url}: ${status}`);
    }
    await page.property('viewportSize', {
        width: 1920,
        height: 1080
    });
}()).catch((err) => {
    // Report failures explicitly instead of leaving an unhandled rejection.
    console.error(err);
});

獲取鏈接數量，不足200則滾動網頁

/**
 * Pause helper used to wait for the page to load before scrolling.
 *
 * @param {number} second - How long to wait, in seconds.
 * @returns {Promise<void>} Promise that resolves once the time has elapsed.
 */
function delay(second) {
    const milliseconds = second * 1000;
    return new Promise(function (done) {
        setTimeout(() => done(), milliseconds);
    });
}

/**
 * Scroll the page down step by step until it contains at least 200 `.imgbox`
 * elements, or until the scroll budget is used up.
 *
 * @param {number} i - Zero-based scroll step; the page is scrolled to `1000 * i` px.
 * @param {number} [maxScrolls=100] - Upper bound for the step index, so a page
 *     that never reaches 200 boxes cannot make the function recurse forever.
 * @returns {Promise<Function>} The cheerio root loaded from the last page snapshot.
 */
async function pageScroll(i, maxScrolls = 100) {
    // Wait before each scroll so the content loaded by the previous one can arrive.
    await delay(5);
    await page.property('scrollPosition', {
        left: 0,
        top: 1000 * i
    });
    const content = await page.property('content');
    const $ = cheerio.load(content);
    const found = $('.imgbox').length;
    console.log(found);
    if (found < 200 && i + 1 < maxScrolls) {
        return pageScroll(i + 1, maxScrolls);
    }
    return $;
}
// Keep the parsed document: the following steps query it through `$`.
let $ = await pageScroll(0);

提取圖片鏈接

// Collect the absolute detail-page URL of every image box on the page.
let urlList = [];
$('.imgbox').each(function() {
    const href = $(this).find('a').attr('href');
    // A box without a link would otherwise produce a bogus "...undefined" URL.
    if (href) {
        urlList.push(`https://image.baidu.com${href}`);
    }
});

保存圖片

定義保存圖片的函數

const request = require('request')
const fs = require('fs')

/**
 * Download an image and store it as `./image/<timestamp>.<ext>`.
 *
 * The download is streamed; the function returns before it has finished.
 * Failures are logged instead of crashing the process.
 *
 * @param {string} url - Absolute URL of the image to download.
 * @returns {void}
 */
function save(url) {
    if (typeof url !== 'string' || url === '') {
        console.error('save: no image url given, nothing downloaded');
        return;
    }
    // Look for the extension in the path only, so that neither the dots of the
    // host name nor a query string / fragment are mistaken for it.
    const pathOnly = url.replace(/^[a-z][a-z0-9+.-]*:\/\/[^/?#]*/i, '').split(/[?#]/)[0];
    const match = /\.([A-Za-z0-9]+)$/.exec(pathOnly);
    // Fall back to "jpg" when the URL carries no usable extension.
    const ext = match ? match[1] : 'jpg';
    // createWriteStream does not create missing directories.
    if (!fs.existsSync('./image')) {
        fs.mkdirSync('./image');
    }
    const target = `./image/${new Date().getTime()}.${ext}`;
    request(url)
        .on('error', (err) => console.error(`save: download failed for ${url}`, err))
        .pipe(fs.createWriteStream(target))
        .on('error', (err) => console.error(`save: cannot write ${target}`, err));
}

遍歷urlList，建議用遞歸遍歷；用forEach這類回調式循環遍歷時，await delay不會阻塞下一次迭代，延遲不起作用

/**
 * Visit the detail pages listed in `urlList` one after another, starting at
 * index `i`, and download the image shown on each of them.
 *
 * @param {number} i - Index into `urlList` of the first page to process.
 * @returns {Promise<void>} Resolves after the last page has been handled.
 */
async function imgSave(i) {
    // Stop before reading past the end of the list (also covers an empty list).
    if (i >= urlList.length) {
        return;
    }
    // The result must not be stored in a local named `page`: that declaration
    // would shadow the outer `page` and make `page.open` throw a ReferenceError.
    await page.open(urlList[i]);
    // The wait has to be awaited, otherwise the src is read before the
    // compressed preview has been replaced by the full-size image.
    await delay(1);
    const content = await page.property('content');
    const $detail = cheerio.load(content);
    const src = $detail('#currentImg').attr('src');
    if (src) {
        save(src);
    } else {
        console.error(`imgSave: no #currentImg src found on ${urlList[i]}`);
    }
    await imgSave(i + 1);
}
await imgSave(0);

最後爬取結果如圖，都是高分辨率的，部分圖片做了防爬處理

完整代碼

const phantom = require('phantom')
const cheerio = require('cheerio')
const request = require('request')
const fs = require('fs')
/**
 * Create a promise that resolves after the given number of seconds.
 *
 * @param {number} second - Waiting time in seconds.
 * @returns {Promise<void>} Promise resolved (without a value) when the time is up.
 */
function delay(second) {
    const waitMs = second * 1000;
    const executor = (finish) => {
        setTimeout(() => {
            finish();
        }, waitMs);
    };
    return new Promise(executor);
}
// Baidu image search results page to crawl; the URL-encoded `word` parameter
// is the Chinese search phrase for "anime wallpaper".
let url = 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E5%8A%A8%E6%BC%AB+%E5%A3%81%E7%BA%B8&oq=%E5%8A%A8%E6%BC%AB+%E5%A3%81%E7%BA%B8&rsp=-1'
/**
 * Download an image and store it as `./image/<timestamp>.<ext>`.
 *
 * The download is streamed; the function returns before it has finished.
 * Failures are logged instead of crashing the process.
 *
 * @param {string} url - Absolute URL of the image to download.
 * @returns {void}
 */
function save(url) {
    if (typeof url !== 'string' || url === '') {
        console.error('save: no image url given, nothing downloaded');
        return;
    }
    // Look for the extension in the path only, so that neither the dots of the
    // host name nor a query string / fragment are mistaken for it.
    const pathOnly = url.replace(/^[a-z][a-z0-9+.-]*:\/\/[^/?#]*/i, '').split(/[?#]/)[0];
    const match = /\.([A-Za-z0-9]+)$/.exec(pathOnly);
    // Fall back to "jpg" when the URL carries no usable extension.
    const ext = match ? match[1] : 'jpg';
    // createWriteStream does not create missing directories.
    if (!fs.existsSync('./image')) {
        fs.mkdirSync('./image');
    }
    const target = `./image/${new Date().getTime()}.${ext}`;
    request(url)
        .on('error', (err) => console.error(`save: download failed for ${url}`, err))
        .pipe(fs.createWriteStream(target))
        .on('error', (err) => console.error(`save: cannot write ${target}`, err));
}
/*
 * Entry point of the crawler:
 *   1. open the search results page in PhantomJS,
 *   2. scroll until at least 200 `.imgbox` elements are present (bounded),
 *   3. collect the detail-page link of every box,
 *   4. open each detail page and download the image it shows.
 * The PhantomJS instance is shut down even when one of the steps fails.
 */
(async function() {
    const instance = await phantom.create();
    try {
        const page = await instance.createPage();
        const status = await page.open(url);
        if (status !== 'success') {
            throw new Error(`Failed to open ${url}: ${status}`);
        }
        await page.property('viewportSize', {
            width: 1920,
            height: 1080
        });

        // Scrolls to step `i` and keeps going until 200 boxes are present or
        // `maxScrolls` steps were tried; resolves with the parsed document.
        const pageScroll = async (i, maxScrolls = 100) => {
            await delay(1);
            await page.property('scrollPosition', {
                left: 0,
                top: 1000 * i
            });
            const content = await page.property('content');
            const parsed = cheerio.load(content);
            if (parsed('.imgbox').length < 200 && i + 1 < maxScrolls) {
                return pageScroll(i + 1, maxScrolls);
            }
            return parsed;
        };
        const $ = await pageScroll(0);

        const urlList = [];
        $('.imgbox').each(function() {
            const href = $(this).find('a').attr('href');
            // Skip boxes without a link instead of building "...undefined" URLs.
            if (href) {
                urlList.push(`https://image.baidu.com${href}`);
            }
        });

        // Processes the detail pages one at a time, starting at index `i`.
        const imgSave = async (i) => {
            // Guard first, so the index never runs past the end of the list.
            if (i >= urlList.length) {
                return;
            }
            await page.open(urlList[i]);
            // Wait a moment before the src of the image is read.
            await delay(1);
            const content = await page.property('content');
            const detail = cheerio.load(content);
            const src = detail('#currentImg').attr('src');
            if (src) {
                save(src);
            } else {
                console.error(`No #currentImg src found on ${urlList[i]}`);
            }
            await imgSave(i + 1);
        };
        await imgSave(0);
    } finally {
        // Always stop the PhantomJS child process, also after an error.
        await instance.exit();
    }
}()).catch((err) => {
    console.error(err);
    process.exitCode = 1;
});

個人博客：www.bougieblog.cn，歡迎前來尬聊。