本腳本的作用是抓取掌閱書城裏男頻、女頻各分類的已完結書籍信息,按好評排序,每個分類只抓前三頁。
這個頁面沒有任何反爬措施,適合作爲簡單例子。

```javascript
// Scrapes completed-book listings from the iReader book store (ireader.com).
// For every category of the two channels below it loads the first three
// result pages (sorted by score), extracts the book cards, and writes all
// records as one JSON array to d:/tmp/ireader_hot.json.
const fs = require("fs");
const puppeteer = require('puppeteer');

// Listing URL template; $pid, $cid and $page are substituted per request.
const url = "http://www.ireader.com/index.php?ca=booksort.index&pca=booksort.index&pid=$pid&order=score&status=3&cid=$cid&page=$page";
const pids = [10, 68]; // channel IDs: male channel, female channel
// Category IDs per channel; cids[i] belongs to pids[i].
const cids = [[11, 27, 19, 22, 16, 39, 42, 50, 54, 57, 60], [69, 74, 82, 86, 89, 90, 91, 723]];

const PAGES_PER_CATEGORY = 3; // only the first three pages of each category are fetched

/**
 * Extracts the book cards from the currently loaded listing page.
 *
 * This function is passed to `page.evaluate`, so it runs inside the browser
 * page and must not reference anything from the Node.js scope. It relies on a
 * global `$` provided by the page itself (presumably jQuery — confirm).
 *
 * @returns {Array<{id: ?string, title: string, author: string, desc: string}>}
 *   One record per `.bookMation` element; `id` is null when the card's link
 *   carries no `bid=` parameter.
 */
const extractBooks = () => {
  return Array.from($('.bookMation')).map((card) => {
    const link = $('h3 a', card);
    // Pull the numeric book ID out of the link's query string. A card without
    // a usable link yields a null id instead of aborting the whole crawl.
    const bidMatch = (link.attr('href') || '').match(/bid=(\d+)/);
    return {
      id: bidMatch ? bidMatch[1] : null,
      title: link.text(),
      // The author paragraph also contains the "try reading" label; strip it.
      author: $('p.tryread', card).text().replace('試讀', '').trim(),
      desc: $('p.introduce', card).text(),
    };
  });
};

(async () => {
  // Launch Chrome. While debugging, add `headless: false` to the options to
  // watch the browser; switch back to headless afterwards for speed.
  const browser = await puppeteer.launch({
    // Drop --enable-automation from Chrome's default launch arguments.
    ignoreDefaultArgs: ["--enable-automation"],
  });

  try {
    const page = await browser.newPage();
    const result = [];

    for (const [channelIndex, pid] of pids.entries()) {
      for (const cid of cids[channelIndex]) {
        for (let pg = 1; pg <= PAGES_PER_CATEGORY; pg++) {
          const pageUrl = url
            .replace("$cid", String(cid))
            .replace("$pid", String(pid))
            .replace("$page", String(pg));
          await page.goto(pageUrl);

          const books = await page.evaluate(extractBooks);
          // Tag every record with the category and channel it was found under.
          for (const book of books) {
            book.cid = cid;
            book.pid = pid;
            result.push(book);
          }
          console.log(`page ${pg} done`);
        }
        console.log(`cid ${cid} done`);
      }
      console.log(`pid ${pid} done`);
    }

    fs.writeFileSync("d:/tmp/ireader_hot.json", JSON.stringify(result), {encoding: "utf-8"});
    console.log("all done");
  } finally {
    // Always shut the browser down, even when a page load or extraction fails,
    // so no Chromium process is left behind.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure explicitly and signal it through the exit code.
  console.error(err);
  process.exitCode = 1;
});