在《Puppeteer的入門教程和實踐》一文中,介紹了Puppeteer的幾種使用方式,分別是網頁截圖、生成頁面的PDF、分析當前頁的腳本、寫爬蟲等。本文主要介紹如何利用Puppeteer實現抓取淘寶特定商品信息的過程。
廢話不多說,直接上代碼。
// taobaoCrepper.js
// Opens one Tmall product page with Puppeteer and collects its title, price,
// option labels, thumbnail/detail image URLs and buyer review texts.
const puppeteer = require('puppeteer');
const chalk = require('chalk');

// Link of the product to scrape.
const url = 'https://detail.tmall.com/item.htm?spm=a1z10.1-b-s.w19370147-18846483048.2.5ccf70bfVlqaMZ&id=568960237310&sku_properties=5919063:1173069333&scene=taobao_shop';

// Fixed pause (ms) used after closing the dialog and after opening the
// reviews tab, matching the 15000 ms delays of the original script.
const SETTLE_DELAY_MS = 15000;

/**
 * Resolves after the given number of milliseconds.
 * Replaces `page.waitFor(ms)`, which is no longer available in current
 * Puppeteer releases.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Reads one DOM property from every element matching `selector`.
 *
 * @param {object} page - Puppeteer page to query.
 * @param {string} selector - CSS selector evaluated inside the page.
 * @param {string} prop - Element property to read (e.g. 'innerHTML', 'src').
 * @returns {Promise<Array>} One value per matched element; empty when nothing matches.
 */
const collectProperty = (page, selector, prop) =>
  page.$$eval(selector, (nodes, key) => nodes.map((node) => node[key]), prop);

/**
 * Scrapes the product page at `url`.
 *
 * Scraping errors are logged in red and do not reject; whatever was
 * collected before the failure is still returned. The browser is always
 * closed before returning so the Node process can exit.
 *
 * @returns {Promise<object>} Object with `title`, `colors`, `price`, `suit`,
 *   `skus`, `detailImgs` and `evaList`.
 */
const main = async function () {
  const browser = await puppeteer.launch({
    headless: false,
    devtools: true,
  });

  const obj = {
    title: '',
    colors: [],
    price: 0,
    suit: [],
    skus: [],
    detailImgs: [],
    evaList: [],
  };

  try {
    const page = await browser.newPage();
    await page.goto(url, {
      waitUntil: 'networkidle0',
    });
    console.log(chalk.green('進入寶貝詳情頁'));

    // The modal dialog only shows up on some visits; close it when present.
    const modalArray = await page.$$('#sufei-dialog-close');
    if (modalArray.length) {
      await page.click('#sufei-dialog-close');
    }
    await sleep(SETTLE_DELAY_MS);

    obj.title = await page.$eval('.tb-detail-hd > h1 >a', (ele) => ele.text);
    obj.price = await page.$eval('.tm-price', (ele) => ele.innerHTML);
    obj.colors = await collectProperty(page, 'li.tb-txt a > span', 'innerHTML');
    obj.suit = await collectProperty(page, 'li.tb-selected > a > span', 'innerHTML');
    obj.skus = await collectProperty(page, 'ul#J_UlThumb >li> a > img', 'src');
    obj.detailImgs = await collectProperty(page, 'div.ke-post > p > img', 'src');

    // Switch to the reviews tab, then pause before reading the review list.
    await page.click('a[href="#J_Reviews"] ');
    await sleep(SETTLE_DELAY_MS);

    obj.evaList = await page.evaluate(() => {
      // These logs go to the page's own console (visible in devtools).
      console.log('進入評價分析抓取方法');
      // Scoped under div.tm-rate-content; per the accompanying article a bare
      // div.tm-rate-fulltxt would also pick up seller replies.
      const evaList = [...document.querySelectorAll('div.tm-rate-content>div.tm-rate-fulltxt')];
      console.log(evaList);
      return evaList.map((a) => a.textContent);
    });
  } catch (err) {
    console.log(chalk.red(err));
  } finally {
    // Without this the launched browser keeps the Node process alive.
    await browser.close();
  }

  return obj;
};

main()
  .then((result) => {
    // Print the scraped data; the original built it and then dropped it.
    console.log(JSON.stringify(result, null, 2));
  })
  .catch((err) => {
    // Reached only when launching or closing the browser itself fails.
    console.error(chalk.red(err));
    process.exitCode = 1;
  });
(1)跳轉到淘寶商品信息頁面時,有時會彈出模態窗口,有時不會,需要做一個判斷。
(2)跳轉到商品評價列表後要設置延時,等DOM加載完畢後再去獲取對應內容。
(3)在抓取評價列表詳情時,選擇器不能只寫成「div.tm-rate-fulltxt」,不然會把賣家回覆的內容也抓取下來。
(4)本文只是把特定內容用JS爬取出來,並不涉及寫入數據庫的操作,待我學完再來更新呀555。