前記:
想寫一個電影天堂的爬蟲,由於很長時間沒碰 node 了,就想用 node 去寫一下。結果遇到了一系列的問題,這些問題歸根到底都是異步流程控制的問題。之前就一直會接觸到「回調地獄」、「Promise 爲何會出現」諸如此類的話題,如今終於深刻體會到了!
開始的代碼是:
const cheerio = require('cheerio'); const http = require('http'); const iconv = require('iconv-lite'); let baseUrl = "http://www.ygdy8.net/html/gndy/dyzz/list_23_"; let Host = "http://www.ygdy8.net/"; let titleHref = []; const totalPage = 1; //指定爬多少頁數據 let res = []; //獲取頁面電影數據 function getTitleHref(url,page) { let startUrl = url+page+".html"; http.get(startUrl,function(res) { let chunks = []; res.on('data',function(chunk){ chunks.push(chunk); }); res.on('end',function(){ let title = []; let html = iconv.decode(Buffer.concat(chunks),'gb2312'); let $ = cheerio.load(html, {decodeEntities: false}); // console.log($); $('.co_content8 .ulink').each(function(i,d) { let $d = $(d); titleHref.push({ href: $d.attr('href') }); }); console.log(titleHref); }); if(page <= totalPage) { getTitleHref(url,++page); }else { console.log(page); getLink(titleHref); } }); } //獲取種子連接 function getLink(titleHref) { console.log('進入getLink'); titleHref.forEach(function(v,k) { console.log('~~~~~~~~~~~~~~~~~~~~'); let infoUrl = Host + v.href; console.log(infoUrl); // try { http.get(infoUrl,function(res) { console.log('進入getlink http'); let chunks = []; res.on('data',function(chunk) { chunks.push(chunk); }); res.on('end', function(){ let html = iconv.decode(Buffer.concat(chunks),'gb2312'); let $ = cheerio.load(html, {decodeEntities: false}); let reg = /.*譯 名/; let info = ''; let bt = ''; let textInfo = $('.co_content8 #Zoom p').eq(0).text(); info = textInfo.match(reg)[0]; bt = $('#Zoom td').children('a').attr('href'); res.push({ Info:info, Bt:bt }); console.log(res); }) //怎麼捕獲錯誤!!! //res.on('error',function(){ // console.log('error'); //}) }) // }catch(e) { // console.log(e); // } }); }; getTitleHref(baseUrl,1)
所以寫 node 代碼切記:大多數操作都是異步的。上面的代碼就出了一個問題:
上面的代碼不能保證 `if(page <= totalPage) { ... }` 這段判斷在 res 的 'end' 事件回調之後執行:'end' 回調還在異步隊列裏沒有執行,程序就已經進入了後面的 if。所以就算最後進入了 getLink,也會出現 titleHref.forEach 進不去的情況,因爲此時 titleHref 還是空的。
當時遇到這個問題,如果不考慮用異步流程控制來解決的話,一個解決方案是:在 each 函數裏,每獲取到一個 titleHref 就調用一次 getLink,把 titleHref 定義成局部變量,getLink 的調用放在 each 裏面,這樣就保證 titleHref 不會是空的了。然後代碼如下:
const cheerio = require('cheerio'); const http = require('http'); const iconv = require('iconv-lite'); let baseUrl = "http://www.ygdy8.net/html/gndy/dyzz/list_23_"; let Host = "http://www.ygdy8.net/"; const totalPage = 2; //指定爬多少頁數據 let ans = []; //獲取頁面電影數據 function getTitleHref(url,page) { let startUrl = url+page+".html"; http.get(startUrl,function(res) { const { statusCode } = res; let chunks = []; res.on('data',function(chunk){ chunks.push(chunk); }); res.on('end',function(){ let title = []; let html = iconv.decode(Buffer.concat(chunks),'gb2312'); let $ = cheerio.load(html, {decodeEntities: false}); // console.log($); $('.co_content8 .ulink').each(function(i,d) { let $d = $(d); let titleHref = []; titleHref.push({ href: $d.attr('href') }); getLink(titleHref); }); // console.log(ans); }); }); } // /* //獲取種子連接 function getLink(titleHref) { console.log('進入getLink'); console.log(titleHref); if(titleHref) { titleHref.forEach(function(v,k) { console.log('~~~~~~~~~~~~~~~~~~~~'); let infoUrl = Host + v.href; // console.log(infoUrl); http.get(infoUrl,function(res) { const { statusCode } = res; const contentType = res.headers['content-type']; let error; if (statusCode !== 200) { error = new Error('請求失敗。\n' + `狀態碼: ${statusCode}`); } if (error) { console.error(error.message); // 消耗響應數據以釋放內存 res.resume(); return; } console.log('進入getlink http'); let chunks = []; res.on('data',function(chunk) { chunks.push(chunk); }); res.on('end', function(){ try { let html = iconv.decode(Buffer.concat(chunks),'gb2312'); let $ = cheerio.load(html, {decodeEntities: false}); let bt = ''; bt = $('#Zoom td').children('a').attr('href'); // console.log(bt); // console.log(typeof bt) ans.push(bt); // cb(ans); }catch (e) { console.error('bt',e.message); } }) }).on('error', (e) => { console.error(`錯誤: ${e.message}`); }); }); } }; // */ for(let i = 1; i <= totalPage; i++) { getTitleHref(baseUrl,i); console.log(ans); };
可是這樣的代碼你還會發現一個問題:我們最後用來保存 bt 鏈接的 ans,打印出來還是空的。這同樣是異步的問題——如果我們要把 ans 存入數據庫,或者在別處需要 ans 的數據,我們並不知道這些數據什麼時候才全部返回。
所以最終我們還是要用到 ES6 提出的 Promise 和 ES2017 提出的 async/await。
修改之後代碼如下:
const cheerio = require('cheerio')
const http = require('http')
const iconv = require('iconv-lite')

const baseUrl = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_'
const Host = 'http://www.ygdy8.net/'
const totalPage = 2 // number of list pages to crawl
let ans = []

/**
 * Download a page over HTTP and decode its body from gb2312.
 *
 * The returned promise always settles: it rejects on a non-200 status, on a
 * request/response stream error and on a decode failure. (Previously a non-200
 * response left the promise pending forever, which stalled every Promise.all
 * above it.)
 *
 * @param {string} url - Absolute URL to request.
 * @returns {Promise<string>} The decoded HTML.
 */
function fetchHtml(url) {
  return new Promise((resolve, reject) => {
    http
      .get(url, function(res) {
        const { statusCode } = res
        if (statusCode !== 200) {
          // Consume the response data to free memory
          res.resume()
          reject(new Error(`請求失敗。\n狀態碼: ${statusCode}`))
          return
        }
        const chunks = []
        res.on('data', function(chunk) {
          chunks.push(chunk)
        })
        res.on('end', function() {
          try {
            resolve(iconv.decode(Buffer.concat(chunks), 'gb2312'))
          } catch (e) {
            reject(e)
          }
        })
        res.on('error', reject)
      })
      .on('error', reject)
  })
}

/**
 * Fetch one list page, collect every movie detail link on it and resolve with
 * the download links of all those detail pages.
 *
 * @param {string} url - List URL prefix; the page number and ".html" are appended.
 * @param {number} page - Page number to fetch.
 * @returns {Promise<Array<string|undefined>>} One entry per detail link, in page
 *   order; rejects if the list page or any detail page fails.
 */
async function getTitleHref(url, page) {
  const startUrl = url + page + '.html'
  const html = await fetchHtml(startUrl)
  const $ = cheerio.load(html, { decodeEntities: false })
  const titleHref = []
  $('.co_content8 .ulink').each(function(i, d) {
    titleHref.push({
      href: $(d).attr('href')
    })
  })
  // Hand over the whole page's links at once; getLink aggregates them.
  return getLink(titleHref)
}

/**
 * Fetch every detail page in parallel and extract its download link.
 *
 * @param {Array<{href: string}>} titleHref - Detail links collected from a list page.
 * @returns {Promise<Array<string|undefined>|undefined>} The href of the link under
 *   '#Zoom td' for each entry (undefined when a page has none); resolves with
 *   undefined when titleHref is falsy. Rejects as soon as any page fails.
 */
function getLink(titleHref) {
  console.log('進入getLink')
  console.log(titleHref)
  if (titleHref) {
    return Promise.all(
      titleHref.map(async function(v) {
        console.log('~~~~~~~~~~~~~~~~~~~~')
        // Resolve against Host so a root-relative href does not yield "//".
        const infoUrl = new URL(v.href, Host).href
        const html = await fetchHtml(infoUrl)
        const $ = cheerio.load(html, { decodeEntities: false })
        return $('#Zoom td')
          .children('a')
          .attr('href')
      })
    )
  } else {
    return Promise.resolve()
  }
}

/**
 * Crawl all list pages in parallel, flatten the per-page results into `ans`
 * and print them.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // new Array(n) is sparse and map() skips empty slots, hence the fill().
  const results = await Promise.all(
    new Array(totalPage).fill().map((_, i) => getTitleHref(baseUrl, i + 1))
  )
  ans = ans.concat(...results)
  console.log('get data:', ans)
}

// Report a failed crawl instead of leaving an unhandled rejection.
main().catch(e => {
  console.error(`錯誤: ${e.message}`)
  process.exitCode = 1
})
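每一個函數都封裝成 Promise,最後在主函數中用 await 等待所有 Promise 完成,拿到最後的結果 results。(注意:1. new Array(n) 出來的是稀疏數組(全是 empty 空位),而 map 會跳過空位,所以要先 fill() 一下,把每一項填充成 undefined;2. getTitleHref 裏傳給 getLink 的 titleHref 已經不是一個只有一條數據的數組了,而是一個頁面 each 執行完成後的彙總,所以 getLink 函數內部會用到 Promise.all;
3. getLink 最後 else 分支裏的 `return Promise.resolve()` 則聊勝於無,即便直接 return null 也會正確地觸發外層的 resolve,這麼寫只是提高一些可讀性罷了。)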
Promise 和 async/await 的整理可以看我的這篇博客:《Promise和async/await用法整理》
代碼:GitHub 傳送門
此次告訴我實踐很重要!要把所學和書中所看運用到業務和代碼邏輯中!