本文簡單介紹一下如何用puppeteer抓取頁面數據。
npm install puppeteer --save-dev
npm install typescript --save-dev
javascript
import { launch } from 'puppeteer';

/**
 * Opens the Maoyan board page in headless Chrome and logs the page title.
 *
 * @returns {Promise<void>} Resolves once the title has been logged and the
 *   browser has been closed.
 */
async function maoyan_board_run() {
  const browser = await launch({
    ignoreHTTPSErrors: true,
    headless: true,
    executablePath: 'D:\\wangxiao\\chrome-win\\chrome-win\\chrome.exe',
    args: ['--start-maximized'],
  });

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1980, height: 1080 });
    await page.goto('https://maoyan.com/board', { waitUntil: 'load' });
    console.log(await page.title());
  } finally {
    // Close the browser even when navigation fails, so no Chrome process is
    // left running in the background.
    await browser.close();
  }
}

// Surface failures instead of leaving an unhandled promise rejection.
maoyan_board_run().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
運行後,會打印出當前頁面的title。下面分析一下這段代碼做了什麼。
咱們先分析一下這個頁面。首先可以發現,熱門排行榜中的名次、電影名、主演、上映時間都是一列一列排布的,那麼是不是只要能獲取其中一項,其餘的也都能用同樣的方法獲取到?
// CSS selector for the rank markers: <i> elements whose class attribute contains "board-index".
const movie_bank = 'i[class*=board-index]';
根據頁面元素分析,要獲得標籤內的值($$eval的用法前面已經講過,這裏不再贅述):
// Collect the innerHTML of every element matched by the movie_bank selector.
const banks = await page.$$eval(movie_bank, (nodes) =>
  nodes.map((node) => node.innerHTML),
);
其餘內容的獲取方法依葫蘆畫瓢,完整代碼如下:
// NOTE: `launch` must be in scope, e.g. `import { launch } from 'puppeteer';`.

// Hot word-of-mouth board - rank
const movie_bank = 'i[class*=board-index]';
// Hot word-of-mouth board - movie name
const movie_name = '.movie-item-info .name a';
// Hot word-of-mouth board - leading actors
const movie_star = '.movie-item-info .star';
// Hot word-of-mouth board - release time
const movie_releasetime = '.movie-item-info .releasetime';
// Hot word-of-mouth board - poster image (not read by maoyan_board_run)
const board_lists_images = '.board-wrapper dd .image-link .board-img';

/**
 * Scrapes the Maoyan board page and logs one record per ranked movie.
 *
 * Each record has the shape `{ bank, name, star, releasetime }`, where the
 * values at a given index come from the same position in each selector's
 * match list.
 *
 * @returns {Promise<void>} Resolves once the records have been logged and the
 *   browser has been closed.
 */
async function maoyan_board_run() {
  const browser = await launch({
    ignoreHTTPSErrors: true,
    headless: true,
    executablePath: 'D:\\wangxiao\\chrome-win\\chrome-win\\chrome.exe',
    args: ['--start-maximized'],
  });

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1980, height: 1080 });
    await page.goto('https://maoyan.com/board', { waitUntil: 'load' });

    const banks = await page.$$eval(movie_bank, (nodes) =>
      nodes.map((node) => node.innerHTML),
    );
    // The movie name is read from the link's `title` attribute.
    const names = await page.$$eval(movie_name, (nodes) =>
      nodes.map((node) => node.getAttribute('title')),
    );
    // Strip all whitespace (including newlines) from the markup.
    const stars = await page.$$eval(movie_star, (nodes) =>
      nodes.map((node) => node.innerHTML.replace(/\s/g, '')),
    );
    const releasetimes = await page.$$eval(movie_releasetime, (nodes) =>
      nodes.map((node) => node.innerHTML),
    );

    // The rank list drives the record count; the other lists are zipped in by
    // index.
    const data = banks.map((bank, i) => ({
      bank,
      name: names[i],
      star: stars[i],
      releasetime: releasetimes[i],
    }));

    console.log(data);
  } finally {
    // Close the browser even when scraping fails, so no Chrome process is
    // left running in the background.
    await browser.close();
  }
}

// Surface failures instead of leaving an unhandled promise rejection.
maoyan_board_run().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});