I have been learning Node.js crawling recently and picked up the request module, so I wanted to build a crawler project of my own. After a fair amount of research I settled on Indeed as the target site: crawl Indeed's job postings and build my own job search engine on top of them. It is already online, and although the features are still fairly basic, here is the link anyway, job search engine, as proof that the crawler is actually useful. Below is a walkthrough of how the whole crawler works.
As everyone knows, a crawler needs entry pages; starting from them it keeps following links until the whole site has been crawled. This very first step is where I ran into trouble. Normally the home page and the list pages serve as entry points, but Indeed restricts its list pages: you cannot page through a complete list, only roughly the first 100 pages. That did not stop me, though. I found that Indeed has a Browse Jobs page, and from it you can reach every job list by location and by category. Here is the parsing code for that page.
start: async (page) => {
  const host = URL.parse(page.url).hostname;
  const tasks = [];
  try {
    const $ = cheerio.load(iconv.decode(page.con, 'utf-8'), { decodeEntities: false });
    $('#states > tbody > tr > td > a').each((i, ele) => {
      const url = URL.resolve(page.url, $(ele).attr('href'));
      tasks.push({ _id: md5(url), type: 'city', host, url, done: 0, name: $(ele).text() });
    });
    $('#categories > tbody > tr > td > a').each((i, ele) => {
      const url = URL.resolve(page.url, $(ele).attr('href'));
      tasks.push({ _id: md5(url), type: 'category', host, url, done: 0, name: $(ele).text() });
    });
    const res = await global.com.task.insertMany(tasks, { ordered: false }).catch(() => {});
    res && console.log(`${host}-start insert ${res.insertedCount} from ${tasks.length} tasks`);
    return 1;
  } catch (err) {
    console.error(`${host}-start parse ${page.url} ${err}`);
    return 0;
  }
}
The HTML is parsed with cheerio, and the browse-by-location and browse-by-category links are inserted into the database.
A quick note on the crawler's architecture: the database is MongoDB. Every page waiting to be crawled is stored as one record with fields such as _id, url, done, type and host, where _id is generated with md5(url) to avoid duplicates. Each type has a corresponding HTML parsing method, and these parsing methods hold most of the business logic; the code above is one example.
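The snippets in this post all go through global.com.* collection handles, which are never shown being created. A minimal bootstrap under that assumption might look like the sketch below (it uses the official mongodb driver; the connection string and the database name "indeed" are placeholders, not the project's actual values):

// Sketch of the assumed database bootstrap: the parsers reference
// collections via global.com.*, so something like this has to run first.
const { MongoClient } = require('mongodb');

const init = async () => {
  const client = await MongoClient.connect('mongodb://localhost:27017'); // placeholder address
  const db = client.db('indeed');                                        // placeholder name
  global.com = {
    task: db.collection('task'),         // pages waiting to be crawled
    city: db.collection('city'),
    category: db.collection('category'),
    company: db.collection('company'),
    job: db.collection('job')            // the crawled job postings
  };
};

// Because _id is md5(url), re-inserting a URL that already exists raises a
// duplicate-key error, which insertMany(..., { ordered: false }) simply skips.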
Downloading HTML is done with the request module, wrapped in a thin layer that turns its callback into a promise so it can be called with async/await. The code is as follows.
const req = require('request');
const request = req.defaults({
  headers: {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
  },
  timeout: 30000,
  encoding: null
});

const fetch = (url) => new Promise((resolve) => {
  console.log(`down ${url} started`);
  request(encodeURI(url), (err, res, body) => {
    if (res && res.statusCode === 200) {
      console.log(`down ${url} 200`);
      resolve(body);
    } else {
      console.error(`down ${url} ${res && res.statusCode} ${err}`);
      if (res && res.statusCode) {
        resolve(res.statusCode);
      } else {
        // ESOCKETTIMEOUT and other timeout errors resolve to 600
        resolve(600);
      }
    }
  });
});
There is some simple anti-anti-crawling handling: the User-Agent is set to a common desktop browser string and the timeout to 30 seconds. The encoding: null option makes request return the raw buffer rather than a decoded string, so whether a page is encoded as gbk or utf-8 you only need to specify the right charset when parsing the HTML; if you hard-coded encoding: 'utf-8' here instead, gbk pages would come back garbled.
request is callback-based by default. With the promise wrapper, a successful request resolves to the page body as a buffer, a failed one resolves to its error status code, and a timeout resolves to 600. Anyone familiar with Node.js should find this easy to follow.
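To show how the wrapper is used in practice, here is a quick usage sketch; it assumes the fetch function above is exported from a local module called ./fetch, and the URL is only a placeholder:

const iconv = require('iconv-lite');
const fetch = require('./fetch'); // the promise-wrapped request helper shown above (assumed filename)

(async () => {
  const res = await fetch('https://www.indeed.com/find-jobs.jsp'); // placeholder URL
  if (Buffer.isBuffer(res)) {
    // Success: res is the raw body buffer; decode with whatever charset the page uses.
    const html = iconv.decode(res, 'utf-8'); // use 'gbk' for gbk-encoded pages
    console.log(`got ${html.length} characters`);
  } else {
    // Failure: res is an HTTP status code, or 600 for a timeout.
    console.error(`down failed with ${res}`);
  }
})();

With downloading covered, the complete parser module with all of the parsing methods follows.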
const URL = require('url');
const md5 = require('md5');
const cheerio = require('cheerio');
const iconv = require('iconv-lite');

const json = (data) => {
  let res;
  try {
    res = JSON.parse(data);
  } catch (err) {
    console.error(err);
  }
  return res;
};

const rules = [
  /\/jobs\?q=.*&sort=date&start=\d+/,
  /\/jobs\?q=&l=.*&sort=date&start=\d+/
];

const fns = {
  start: async (page) => {
    const host = URL.parse(page.url).hostname;
    const tasks = [];
    try {
      const $ = cheerio.load(iconv.decode(page.con, 'utf-8'), { decodeEntities: false });
      $('#states > tbody > tr > td > a').each((i, ele) => {
        const url = URL.resolve(page.url, $(ele).attr('href'));
        tasks.push({ _id: md5(url), type: 'city', host, url, done: 0, name: $(ele).text() });
      });
      $('#categories > tbody > tr > td > a').each((i, ele) => {
        const url = URL.resolve(page.url, $(ele).attr('href'));
        tasks.push({ _id: md5(url), type: 'category', host, url, done: 0, name: $(ele).text() });
      });
      const res = await global.com.task.insertMany(tasks, { ordered: false }).catch(() => {});
      res && console.log(`${host}-start insert ${res.insertedCount} from ${tasks.length} tasks`);
      return 1;
    } catch (err) {
      console.error(`${host}-start parse ${page.url} ${err}`);
      return 0;
    }
  },
  city: async (page) => {
    const host = URL.parse(page.url).hostname;
    const tasks = [];
    const cities = [];
    try {
      const $ = cheerio.load(iconv.decode(page.con, 'utf-8'), { decodeEntities: false });
      $('#cities > tbody > tr > td > p.city > a').each((i, ele) => {
        // https://www.indeed.com/l-Charlotte,-NC-jobs.html
        let tmp = $(ele).attr('href').match(/l-(?<loc>.*)-jobs.html/u);
        if (!tmp) {
          tmp = $(ele).attr('href').match(/l=(?<loc>.*)/u);
        }
        const { loc } = tmp.groups;
        const url = `https://www.indeed.com/jobs?l=${decodeURIComponent(loc)}&sort=date`;
        tasks.push({ _id: md5(url), type: 'search', host, url, done: 0 });
        cities.push({ _id: `${$(ele).text()}_${page.name}`, parent: page.name, name: $(ele).text(), url });
      });
      let res = await global.com.city.insertMany(cities, { ordered: false }).catch(() => {});
      res && console.log(`${host}-city insert ${res.insertedCount} from ${cities.length} cities`);
      res = await global.com.task.insertMany(tasks, { ordered: false }).catch(() => {});
      res && console.log(`${host}-city insert ${res.insertedCount} from ${tasks.length} tasks`);
      return 1;
    } catch (err) {
      console.error(`${host}-city parse ${page.url} ${err}`);
      return 0;
    }
  },
  category: async (page) => {
    const host = URL.parse(page.url).hostname;
    const tasks = [];
    const categories = [];
    try {
      const $ = cheerio.load(iconv.decode(page.con, 'utf-8'), { decodeEntities: false });
      $('#titles > tbody > tr > td > p.job > a').each((i, ele) => {
        const { query } = $(ele).attr('href').match(/q-(?<query>.*)-jobs.html/u).groups;
        const url = `https://www.indeed.com/jobs?q=${decodeURIComponent(query)}&sort=date`;
        tasks.push({ _id: md5(url), type: 'search', host, url, done: 0 });
        categories.push({ _id: `${$(ele).text()}_${page.name}`, parent: page.name, name: $(ele).text(), url });
      });
      let res = await global.com.category.insertMany(categories, { ordered: false }).catch(() => {});
      res && console.log(`${host}-category insert ${res.insertedCount} from ${categories.length} categories`);
      res = await global.com.task.insertMany(tasks, { ordered: false }).catch(() => {});
      res && console.log(`${host}-category insert ${res.insertedCount} from ${tasks.length} tasks`);
      return 1;
    } catch (err) {
      console.error(`${host}-category parse ${page.url} ${err}`);
      return 0;
    }
  },
  search: async (page) => {
    const host = URL.parse(page.url).hostname;
    const tasks = [];
    const durls = [];
    try {
      const con = iconv.decode(page.con, 'utf-8');
      const $ = cheerio.load(con, { decodeEntities: false });
      const list = con.match(/jobmap\[\d+\]= {.*}/g);
      const jobmap = [];
      if (list) {
        // eslint-disable-next-line no-eval
        list.map((item) => eval(item));
      }
      for (const item of jobmap) {
        const cmplink = URL.resolve(page.url, item.cmplnk);
        const { query } = URL.parse(cmplink, true);
        let name;
        if (query.q) {
          // eslint-disable-next-line prefer-destructuring
          name = query.q.split(' #')[0].split('#')[0];
        } else {
          const tmp = cmplink.match(/q-(?<text>.*)-jobs.html/u);
          if (!tmp) {
            // eslint-disable-next-line no-continue
            continue;
          }
          const { text } = tmp.groups;
          // eslint-disable-next-line prefer-destructuring
          name = text.replace(/-/g, ' ').split(' #')[0];
        }
        const surl = `https://www.indeed.com/cmp/_cs/cmpauto?q=${name}&n=10&returnlogourls=1&returncmppageurls=1&caret=8`;
        const burl = `https://www.indeed.com/viewjob?jk=${item.jk}&from=vjs&vjs=1`;
        const durl = `https://www.indeed.com/rpc/jobdescs?jks=${item.jk}`;
        tasks.push({ _id: md5(surl), type: 'suggest', host, url: surl, done: 0 });
        tasks.push({ _id: md5(burl), type: 'brief', host, url: burl, done: 0 });
        durls.push({ _id: md5(durl), type: 'detail', host, url: durl, done: 0 });
      }
      $('a[href]').each((i, ele) => {
        const tmp = URL.resolve(page.url, $(ele).attr('href'));
        const [url] = tmp.split('#');
        const { path, hostname } = URL.parse(url);
        for (const rule of rules) {
          if (rule.test(path)) {
            if (hostname == host) {
              // tasks.push({ _id: md5(url), type: 'list', host, url: decodeURI(url), done: 0 });
            }
            break;
          }
        }
      });
      let res = await global.com.task.insertMany(tasks, { ordered: false }).catch(() => {});
      res && console.log(`${host}-search insert ${res.insertedCount} from ${tasks.length} tasks`);
      res = await global.com.task.insertMany(durls, { ordered: false }).catch(() => {});
      res && console.log(`${host}-search insert ${res.insertedCount} from ${durls.length} tasks`);
      return 1;
    } catch (err) {
      console.error(`${host}-search parse ${page.url} ${err}`);
      return 0;
    }
  },
  suggest: async (page) => {
    const host = URL.parse(page.url).hostname;
    const tasks = [];
    const companies = [];
    try {
      const con = page.con.toString('utf-8');
      const data = json(con);
      for (const item of data) {
        const id = item.overviewUrl.replace('/cmp/', '');
        const cmpurl = `https://www.indeed.com/cmp/${id}`;
        const joburl = `https://www.indeed.com/cmp/${id}/jobs?clearPrefilter=1`;
        tasks.push({ _id: md5(cmpurl), type: 'company', host, url: cmpurl, done: 0 });
        tasks.push({ _id: md5(joburl), type: 'jobs', host, url: joburl, done: 0 });
        companies.push({ _id: id, name: item.name, url: cmpurl });
      }
      let res = await global.com.company.insertMany(companies, { ordered: false }).catch(() => {});
      res && console.log(`${host}-suggest insert ${res.insertedCount} from ${companies.length} companies`);
      res = await global.com.task.insertMany(tasks, { ordered: false }).catch(() => {});
      res && console.log(`${host}-suggest insert ${res.insertedCount} from ${tasks.length} tasks`);
      return 1;
    } catch (err) {
      console.error(`${host}-suggest parse ${page.url} ${err}`);
      return 0;
    }
  },
  // list: () => {},
  jobs: async (page) => {
    const host = URL.parse(page.url).hostname;
    const tasks = [];
    const durls = [];
    try {
      const con = iconv.decode(page.con, 'utf-8');
      const tmp = con.match(/window._initialData=(?<text>.*);<\/script><script>window._sentryData/u);
      let data;
      if (tmp) {
        const { text } = tmp.groups;
        data = json(text);
        if (data.jobList && data.jobList.pagination && data.jobList.pagination.paginationLinks) {
          for (const item of data.jobList.pagination.paginationLinks) {
            // eslint-disable-next-line max-depth
            if (item.href) {
              item.href = item.href.replace(/\u002F/g, '/');
              const url = URL.resolve(page.url, decodeURI(item.href));
              tasks.push({ _id: md5(url), type: 'jobs', host, url: decodeURI(url), done: 0 });
            }
          }
        }
        if (data.jobList && data.jobList.jobs) {
          for (const job of data.jobList.jobs) {
            const burl = `https://www.indeed.com/viewjob?jk=${job.jobKey}&from=vjs&vjs=1`;
            const durl = `https://www.indeed.com/rpc/jobdescs?jks=${job.jobKey}`;
            tasks.push({ _id: md5(burl), type: 'brief', host, url: burl, done: 0 });
            durls.push({ _id: md5(durl), type: 'detail', host, url: durl, done: 0 });
          }
        }
      } else {
        console.log(`${host}-jobs ${page.url} has no _initialData`);
      }
      let res = await global.com.task.insertMany(tasks, { ordered: false }).catch(() => {});
      res && console.log(`${host}-search insert ${res.insertedCount} from ${tasks.length} tasks`);
      res = await global.com.task.insertMany(durls, { ordered: false }).catch(() => {});
      res && console.log(`${host}-search insert ${res.insertedCount} from ${durls.length} tasks`);
      return 1;
    } catch (err) {
      console.error(`${host}-jobs parse ${page.url} ${err}`);
      return 0;
    }
  },
  brief: async (page) => {
    const host = URL.parse(page.url).hostname;
    try {
      const con = page.con.toString('utf-8');
      const data = json(con);
      data.done = 0;
      data.views = 0;
      data.host = host;
      // format publish date
      if (data.vfvm && data.vfvm.jobAgeRelative) {
        const str = data.vfvm.jobAgeRelative;
        const tmp = str.split(' ');
        const [first, second] = tmp;
        if (first == 'Just' || first == 'Today') {
          data.publishDate = Date.now();
        } else {
          const num = first.replace(/\+/, '');
          if (second == 'hours') {
            const date = new Date();
            const time = date.getTime();
            // eslint-disable-next-line no-mixed-operators
            date.setTime(time - num * 60 * 60 * 1000);
            data.publishDate = date.getTime();
          } else if (second == 'days') {
            const date = new Date();
            const time = date.getTime();
            // eslint-disable-next-line no-mixed-operators
            date.setTime(time - num * 24 * 60 * 60 * 1000);
            data.publishDate = date.getTime();
          } else {
            data.publishDate = Date.now();
          }
        }
      }
      await global.com.job.updateOne({ _id: data.jobKey }, { $set: data }, { upsert: true }).catch(() => { });
      const tasks = [];
      const url = `https://www.indeed.com/jobs?l=${data.jobLocationModel.jobLocation}&sort=date`;
      tasks.push({ _id: md5(url), type: 'search', host, url, done: 0 });
      const res = await global.com.task.insertMany(tasks, { ordered: false }).catch(() => {});
      res && console.log(`${host}-brief insert ${res.insertedCount} from ${tasks.length} tasks`);
      return 1;
    } catch (err) {
      console.error(`${host}-brief parse ${page.url} ${err}`);
      return 0;
    }
  },
  detail: async (page) => {
    const host = URL.parse(page.url).hostname;
    try {
      const con = page.con.toString('utf-8');
      const data = json(con);
      const [jobKey] = Object.keys(data);
      await global.com.job.updateOne({ _id: jobKey }, { $set: { content: data[jobKey], done: 1 } }).catch(() => { });
      return 1;
    } catch (err) {
      console.error(`${host}-detail parse ${page.url} ${err}`);
      return 0;
    }
  },
  run: (page) => {
    if (page.type == 'list') {
      page.type = 'search';
    }
    const fn = fns[page.type];
    if (fn) {
      return fn(page);
    }
    console.error(`${page.url} parser not found`);
    return 0;
  }
};

module.exports = fns;
Every parsing method inserts some new links, and every new link record carries a type field that determines which parsing method will handle it, which is how the whole site ends up being parsed. For example, the start method inserts records with type city and category; pages of type city are handled by the city method, which in turn inserts links of type search, and the cycle continues until the brief and detail methods finally collect each job's summary and full description. The scheduling loop that drives all of this is sketched below.
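The post does not show that scheduling side, but given the done flag and the run dispatcher above, the driving loop is roughly a sketch like this (the module filenames, and the decision to mark failed fetches as done, are assumptions of the sketch):

const fns = require('./parser');  // the parsing methods above (assumed filename)
const fetch = require('./fetch'); // the promise-wrapped request helper (assumed filename)

// Minimal worker loop: take an uncrawled task, download it, dispatch it to
// the parser matching its type, then mark it done so it is not fetched again.
const work = async () => {
  for (;;) {
    const page = await global.com.task.findOne({ done: 0 });
    if (!page) break;                  // nothing left to crawl
    const con = await fetch(page.url); // buffer on success, status code / 600 on failure
    if (Buffer.isBuffer(con)) {
      page.con = con;
      await fns.run(page);             // dispatches on page.type
    }
    await global.com.task.updateOne({ _id: page._id }, { $set: { done: 1 } });
  }
};

work().catch(console.error);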
In fact, these HTML parsing methods are the heart of the crawler; once you have them, you can extract whatever structured content you want.
This part is simple. With the structured data gathered above, create a schema (mapping) in Elasticsearch and write a small program that periodically adds the job data to the ES index. Since job descriptions are fairly large, I did not add the content field to the index; it takes too much memory and the server could not afford it. >_<
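As a rough illustration of that sync step, a sketch like the following would do; the @elastic/elasticsearch 7.x client, the jobs index name, and the document field names are all assumptions, not the project's actual code:

const { Client } = require('@elastic/elasticsearch'); // 7.x client assumed

const es = new Client({ node: 'http://localhost:9200' }); // assumed ES address

// Periodically bulk-index jobs whose detail has been fetched (done: 1),
// deliberately leaving the large content field out of the index.
const sync = async () => {
  const jobs = await global.com.job.find({ done: 1 }).limit(1000).toArray();
  if (!jobs.length) return;
  const body = jobs.flatMap((job) => [
    { index: { _index: 'jobs', _id: job._id } },
    {
      // field names are illustrative; the real ones come from Indeed's JSON
      title: job.jobTitle,
      company: job.jobCompanyName,
      location: job.jobLocationModel && job.jobLocationModel.jobLocation,
      publishDate: job.publishDate
      // content intentionally omitted to keep the index small
    }
  ]);
  await es.bulk({ body });
};

// setInterval(sync, 10 * 60 * 1000); // e.g. every ten minutes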
To wrap up, here is the link once more for you to try out: job search engine.