上個月寫了一篇《個人大前端之旅》,裏面介紹了我對大前端時代到來的一點個人觀點。簡單來講,我更喜歡把自己的將來規劃成一專多能的工程師,畢竟技多不壓身,在深入研究本職領域的前提下多涉獵一下其他的領域,對自己的成長總是有益處的。
先歸納一下本文的主要內容:前端
先說結論(房產類網站可通用):node
簡單抽取一下具體的爬取步驟,以自如(北京地區)爲例:mysql
經過主頁的佈局,能夠看到房產類的網站基本上都是上方是地標(好比:東城-崇文門),下面是該地標附近的房產信息。因此經過分析這塊的網頁結構就能夠抓到全部的地標信息。git
以自如網站爲例,比如我們想看安定門的租房信息,直接在首頁的搜索框中輸入「安定門」,然後點擊搜索按鈕。
根據上一小節的方法論,開始動手寫代碼。這裏以自如爲例(自如的信息比鏈家難爬,可是原理都是通用的)。github
打開自如首頁,打開 Chrome 的開發者工具,開始分析網頁元素。ajax
// Walk the location filter on the home page: each row is a district, and the
// links inside a row are the landmarks (sub-districts) that belong to it.
let allParentLocation = $('ul.clearfix.filterList', 'dl.clearfix.zIndex6');
const districtRows = allParentLocation.children();
// Index 0 is skipped at both levels (presumably a header / "all" entry — confirm against the page markup).
for (let row = 1; row < districtRows.length; row++) {
    const parentLocation = districtRows.eq(row);
    const parentLocationText = parentLocation.children().eq(0).text(); // district name, e.g. Dongcheng, Xicheng...
    const landmarkLinks = $(parentLocation.children().eq(1)).find('a');
    for (let col = 1; col < landmarkLinks.length; col++) {
        const childrenLocationText = landmarkLinks.eq(col).text(); // landmark (sub-district) name
        // TODO: childrenLocationText above is the landmark text to crawl with
    }
}
複製代碼
如 2.1.2 所述,自如二級頁面的地址基本上由 baseUrl + 地標 + page 組成,所以我們可以完善一下 3.1 中的代碼。下面我們封裝一個函數,用來解析地標並且生成所有二級頁地址的數組。注:這個函數返回的是一個 Promise,後面會用 async 函數來組織所有 Promise。
/**
 * Parse the location filter of the listing home page and build the URL of
 * every second-level (search result) page that should be crawled.
 *
 * For each landmark link found, one URL per page number 1..49 is generated in
 * the form `${basePath}qwd=<encoded landmark>&p=<page>`.
 *
 * @param {string} data HTML source of the home page.
 * @returns {Promise<string[]>} Resolves with the list of target URLs; rejects
 *     if parsing throws.
 */
function parseLocationAndInitTargetPath(data) {
    // The Promise wrapper is kept so callers can keep awaiting the result and
    // so that a synchronous parsing error still surfaces as a rejection.
    return new Promise(function (resolve) {
        const $ = cheerio.load(data);
        const targetPaths = [];
        const districts = $('ul.clearfix.filterList', 'dl.clearfix.zIndex6').children();
        // Index 0 is skipped at both levels, as in the original code
        // (presumably a header / "all" entry — confirm against the page markup).
        for (let i = 1; i < districts.length; i++) {
            const landmarkLinks = $(districts.eq(i).children().eq(1)).find('a');
            for (let j = 1; j < landmarkLinks.length; j++) {
                // The landmark is a query-string VALUE, so reserved characters
                // such as '&', '=' or '#' must be escaped as well.
                const keyword = encodeURIComponent(landmarkLinks.eq(j).text());
                // Pages 1 through 49 (the upper bound 50 is exclusive).
                for (let page = 1; page < 50; page++) {
                    targetPaths.push(`${basePath}qwd=${keyword}&p=${page}`);
                }
            }
        }
        resolve(targetPaths);
    });
}
複製代碼
先觀察一下二級頁的佈局,例如我們想把圖片、標題、tags、價格這幾個信息抓取下來。
/**
 * Build the price string of one listing from its `.priceDetail` spans.
 *
 * The first and last spans contribute their text (currency sign / billing
 * period); every span in between is decoded via its `style` attribute with
 * `ziRoomPriceUtil.style2Price`.
 *
 * @param {Function} $ cheerio instance bound to the page.
 * @param {Object} house cheerio selection of a single listing element.
 * @param {string} allText text recognised from the page's price image.
 * @returns {string} The assembled price, or a "price not captured" marker.
 */
function readHousePrice($, house, allText) {
    // The recognised text must be exactly 10 characters long to be usable
    // (presumably the digits 0-9 in sprite order — confirm).
    if (allText.length !== 10) {
        return '未抓取到價格信息' + allText;
    }
    const spans = $('span', $('.priceDetail', house));
    const lastIndex = spans.length - 1;
    let price = '';
    for (let k = 0; k < spans.length; k++) {
        const span = spans.eq(k);
        if (k === 0 || k === lastIndex) {
            price += ' ' + span.text();
        } else {
            price += ziRoomPriceUtil.style2Price(span.attr('style'), allText);
        }
    }
    return price;
}

/**
 * Extract every listing under `#houseList` from one result page.
 *
 * @param {string} data HTML source of the result page.
 * @param {string} allText text recognised from the page's price image.
 * @returns {Array<{imgSrc: string, title: string, detail: string, label: string, price: string}>}
 */
function extractHouseItems(data, allText) {
    const $ = cheerio.load(data);
    const houses = $('#houseList').children();
    const items = [];
    for (let i = 0; i < houses.length; i++) {
        const house = houses.eq(i);
        const textBlock = $('.txt', house);
        const links = $('a', textBlock);
        // Every tag is prefixed with a space, so the label keeps its leading space.
        let label = '';
        $('span', textBlock).each(function (index, elem) {
            label = label + ' ' + $(elem).text();
        });
        items.push({
            imgSrc: $('img', house).attr('src'),
            title: links.eq(0).text(),
            detail: links.eq(1).text(),
            label: label,
            price: readHousePrice($, house, allText),
        });
    }
    return items;
}

/**
 * Crawl every target page, one after another, and parse its listings.
 *
 * A page that cannot be downloaded, OCR-ed or parsed is logged and skipped so
 * that a single failure does not discard the results of the other pages.
 *
 * @param {string[]} targetPaths URLs of the result pages to crawl.
 * @returns {Promise<Array<Array<Object>>>} One array of listing objects per
 *     successfully crawled page, in crawl order.
 */
async function parseItemData(targetPaths) {
    const pages = [];
    for (const path of targetPaths) {
        try {
            const data = await getHtmlSource(path);
            const allText = await ziRoomPriceUtil.getTextFromImage(data);
            pages.push(extractHouseItems(data, allText));
        } catch (err) {
            console.log('抓取失敗--->>> ' + path, err);
        }
    }
    return pages;
}
複製代碼
注意:上面有幾個點需要解釋一下
// Ziroom crawler script  http://www.ziroom.com/
let schedule = require('node-schedule');
let superagent = require('superagent');
let cheerio = require('cheerio');
let charset = require('superagent-charset'); // per the author: works around garbled (mis-decoded) text
charset(superagent);
let ziRoomPriceUtil = require('../utils/ZiRoomPriceUtil');
var phantom = require("phantom");
// Module-level scratch variables (phantom instance / page handles, going by their names);
// _outObj is not referenced anywhere in the visible code.
var _ph, _page, _outObj;
// Base URL of the listing pages; query parameters are appended directly after the '?'.
let basePath = 'http://www.ziroom.com/z/nl/z3.html?';
/**
 * Load a page with phantom and return its rendered HTML source.
 *
 * A fresh phantom instance is created for every call and is always shut down
 * again, whether or not loading succeeds. The status returned by `page.open`
 * is not inspected.
 *
 * @param {string} path URL of the page to load.
 * @returns {Promise<string>} Resolves with the page's `content` property;
 *     rejects with the underlying error if any phantom step fails (the promise
 *     previously never settled in that case, which hung awaiting callers).
 */
async function getHtmlSource(path) {
    // Handles are kept local so that overlapping calls cannot clobber each other.
    const instance = await phantom.create();
    try {
        const page = await instance.createPage();
        try {
            await page.open(path);
            return await page.property('content');
        } finally {
            await page.close();
        }
    } finally {
        // Always terminate the phantom process, otherwise it leaks on errors.
        await instance.exit();
    }
}
/**
 * Parse the location filter of the listing home page and build the URL of
 * every second-level (search result) page that should be crawled.
 *
 * For each landmark link found, one URL per page number 1..49 is generated in
 * the form `${basePath}qwd=<encoded landmark>&p=<page>`.
 *
 * @param {string} data HTML source of the home page.
 * @returns {Promise<string[]>} Resolves with the list of target URLs; rejects
 *     if parsing throws.
 */
function parseLocationAndInitTargetPath(data) {
    // The Promise wrapper is kept so callers can keep awaiting the result and
    // so that a synchronous parsing error still surfaces as a rejection.
    return new Promise(function (resolve) {
        const $ = cheerio.load(data);
        const targetPaths = [];
        const districts = $('ul.clearfix.filterList', 'dl.clearfix.zIndex6').children();
        // Index 0 is skipped at both levels, as in the original code
        // (presumably a header / "all" entry — confirm against the page markup).
        for (let i = 1; i < districts.length; i++) {
            const landmarkLinks = $(districts.eq(i).children().eq(1)).find('a');
            for (let j = 1; j < landmarkLinks.length; j++) {
                // The landmark is a query-string VALUE, so reserved characters
                // such as '&', '=' or '#' must be escaped as well.
                const keyword = encodeURIComponent(landmarkLinks.eq(j).text());
                // Pages 1 through 49 (the upper bound 50 is exclusive).
                for (let page = 1; page < 50; page++) {
                    targetPaths.push(`${basePath}qwd=${keyword}&p=${page}`);
                }
            }
        }
        resolve(targetPaths);
    });
}
/**
 * Build the price string of one listing from its `.priceDetail` spans.
 *
 * The first and last spans contribute their text (currency sign / billing
 * period); every span in between is decoded via its `style` attribute with
 * `ziRoomPriceUtil.style2Price`.
 *
 * @param {Function} $ cheerio instance bound to the page.
 * @param {Object} house cheerio selection of a single listing element.
 * @param {string} allText text recognised from the page's price image.
 * @returns {string} The assembled price, or a "price not captured" marker.
 */
function readHousePrice($, house, allText) {
    // The recognised text must be exactly 10 characters long to be usable
    // (presumably the digits 0-9 in sprite order — confirm).
    if (allText.length !== 10) {
        return '未抓取到價格信息' + allText;
    }
    const spans = $('span', $('.priceDetail', house));
    const lastIndex = spans.length - 1;
    let price = '';
    for (let k = 0; k < spans.length; k++) {
        const span = spans.eq(k);
        if (k === 0 || k === lastIndex) {
            price += ' ' + span.text();
        } else {
            price += ziRoomPriceUtil.style2Price(span.attr('style'), allText);
        }
    }
    return price;
}

/**
 * Extract every listing under `#houseList` from one result page.
 *
 * @param {string} data HTML source of the result page.
 * @param {string} allText text recognised from the page's price image.
 * @returns {Array<{imgSrc: string, title: string, detail: string, label: string, price: string}>}
 */
function extractHouseItems(data, allText) {
    const $ = cheerio.load(data);
    const houses = $('#houseList').children();
    const items = [];
    for (let i = 0; i < houses.length; i++) {
        const house = houses.eq(i);
        const textBlock = $('.txt', house);
        const links = $('a', textBlock);
        // Every tag is prefixed with a space, so the label keeps its leading space.
        let label = '';
        $('span', textBlock).each(function (index, elem) {
            label = label + ' ' + $(elem).text();
        });
        items.push({
            imgSrc: $('img', house).attr('src'),
            title: links.eq(0).text(),
            detail: links.eq(1).text(),
            label: label,
            price: readHousePrice($, house, allText),
        });
    }
    return items;
}

/**
 * Crawl every target page, one after another, and parse its listings.
 *
 * A page that cannot be downloaded, OCR-ed or parsed is logged and skipped so
 * that a single failure does not discard the results of the other pages.
 *
 * @param {string[]} targetPaths URLs of the result pages to crawl.
 * @returns {Promise<Array<Array<Object>>>} One array of listing objects per
 *     successfully crawled page, in crawl order.
 */
async function parseItemData(targetPaths) {
    const pages = [];
    for (const path of targetPaths) {
        try {
            const data = await getHtmlSource(path);
            const allText = await ziRoomPriceUtil.getTextFromImage(data);
            pages.push(extractHouseItems(data, allText));
        } catch (err) {
            console.log('抓取失敗--->>> ' + path, err);
        }
    }
    return pages;
}
/**
 * Run the whole crawl pipeline: fetch the home page, derive the list of
 * result-page URLs from it, then crawl and parse each of those pages.
 *
 * @returns {Promise<*>} Whatever `parseItemData` resolves with.
 */
async function init() {
    const homePageHtml = await getHtmlSource(basePath);
    const listingUrls = await parseLocationAndInitTargetPath(homePageHtml);
    const listings = await parseItemData(listingUrls);
    return listings;
}
/**
 * Kick off a crawl and log its start, its total duration in seconds on
 * success, or the error on failure. The crawl result itself is not used here.
 */
function startSplider() {
    console.log('自如爬蟲已啓動...');
    const startedAt = Date.now();

    const reportDone = () => {
        const elapsedSeconds = (Date.now() - startedAt) / 1000;
        console.log(`自如爬蟲執行完畢 共消耗時間${elapsedSeconds}秒`);
    };
    const reportError = (error) => {
        console.log(error);
    };

    init().then(reportDone, reportError);
}
// Start crawling as soon as this script is loaded.
startSplider();
// Alternative left disabled by the author: export the entry point instead of
// running it on load.
// module.exports = {
// startSplider,
// };
複製代碼
let md5=require("md5")
let baiduAiUtil = require('./BaiduAiUtil');
/**
 * Translate the inline style of one price-digit element into the character it
 * stands for.
 *
 * The first positive integer found in `style` is divided by 30 and the
 * (floored) quotient is used as an index into `allText`. A style without any
 * positive integer maps to index 0.
 * (The 30 is presumably the pixel width of one character in the price sprite — confirm.)
 *
 * @param {string} style value of the element's `style` attribute.
 * @param {string} allText text recognised from the price sprite image.
 * @returns {string} The single character at the computed index, or '' when
 *     the index is out of range or an argument is not a string (for example
 *     when the element has no `style` attribute).
 */
function style2Price(style, allText) {
    if (typeof style !== 'string' || typeof allText !== 'string') {
        return '';
    }
    const offsetMatch = style.match(/[1-9]\d*/);
    // Convert explicitly instead of relying on array-to-number coercion.
    const offset = offsetMatch === null ? 0 : Number(offsetMatch[0]);
    return allText.charAt(Math.floor(offset / 30));
}
/**
 * Find the price sprite image referenced in a page's source and return the
 * text recognised in it by the OCR helper.
 *
 * @param {string} pageSrouce HTML source of a listing page.
 * @returns {Promise<string>} Resolves with the `words` of the first OCR
 *     result line. Rejects when no price image URL is present in the source,
 *     when the OCR request fails, or when the OCR result contains no text.
 */
async function getTextFromImage(pageSrouce) {
    // Dots are escaped and the file name is matched lazily without quotes or
    // brackets, so the match stops at the first ".png" instead of running on to
    // a later one in whitespace-free (e.g. minified) markup.
    const imagePattern = /static8\.ziroom\.com\/phoenix\/pc\/images\/price\/[^\s"'<>()]+?\.png/;
    const matched = typeof pageSrouce === 'string' ? pageSrouce.match(imagePattern) : null;
    if (matched === null) {
        throw new Error('price image URL not found in page source');
    }

    const result = await baiduAiUtil.identifyImageByUrl(`http://${matched[0]}`);
    const firstLine = result && result.words_result && result.words_result[0];
    if (!firstLine || typeof firstLine.words !== 'string') {
        throw new Error('OCR returned no text for the price image');
    }
    return firstLine.words;
}
// Public API of this price utility module.
module.exports = {
style2Price,
getTextFromImage
}
複製代碼
let fs = require('fs');
let AipOcrClient = require("baidu-aip-sdk").ocr;
// Configure APPID / AK / SK (replace the placeholders with your own credentials)
let APP_ID = "需替換你的 APPID";
let API_KEY = "需替換你的 AK";
let SECRET_KEY = "需替換你的 SK";
// Create the client object; keeping a single instance for all service calls is recommended
let client = new AipOcrClient(APP_ID, API_KEY, SECRET_KEY);
/**
 * Recognise the text in a local image file.
 *
 * The file is read asynchronously, base64-encoded and passed to the OCR
 * client's `generalBasic` call.
 *
 * @param {string} imagePath path of the local image file.
 * @returns {Promise} Resolves with the OCR client's result; rejects if the
 *     file cannot be read or the OCR call fails. Read errors are reported
 *     through the returned promise rather than thrown synchronously.
 */
async function identifyImageByFile(imagePath) {
    const imageBuffer = await fs.promises.readFile(imagePath);
    return client.generalBasic(imageBuffer.toString('base64'));
}
/**
 * Recognise the text in an image that is reachable by URL.
 *
 * @param url address of the remote image
 * @returns {Promise} the pending result of the OCR client's `generalBasicUrl` call
 */
function identifyImageByUrl(url) {
    const recognition = client.generalBasicUrl(url);
    return recognition;
}
// Public API of this OCR helper module.
module.exports = {
identifyImageByUrl,
identifyImageByFile
}
複製代碼
注:這是我存到 MySQL 中的爬取結果,由於 Node 連接 MySQL 不是本文重點,所以沒貼代碼。你可以選擇把 startSplider 函數獲取到的結果放到文件裏、MongoDB 或者其他地方。
這段時間寫了不少各大網站的爬蟲代碼,發現不少工作量是重複的。比如:租房類的網站大部分都是 先爬地標再爬二級頁 這種套路。本着 「以可配置爲榮 以硬編碼爲恥」 的程序員價值觀,後期會考慮把爬蟲模塊做成可配置的。這裏跟大家分享一個開源庫: 牛咖 。
contact way | value |
---|---|
weixinjie1993@gmail.com | |
W2006292 | |
github | github.com/weixinjie |
blog | juejin.im/user/57673c… |