我也來寫個小爬蟲 ^_^

今天下班抽了點時間看了下印象筆記,整理了一個禮拜 node 的 api 筆記,然後去慕課網看了 Scott 老師講的 node 系列視頻教程。於是自己寫了一個小小的爬蟲,爬的是自己寫的博客章節,裏面的一些 es6 語法和 api 我就不一一細說,大家可以去看文檔 http://nodeapi.ucdok.com/#/api/ 。好了,話不多說,直接上代碼:
html

'use strict';
// Block scope wrapping the whole crawler, so the `let`/`const`
// declarations below do not leak into the global scope.
{
    // Node core module used to fetch the blog index page.
    const http = require(`http`);
    
    // cheerio: server-side, jQuery-like HTML parser (third-party).
    const cheerio = require(`cheerio`);

    // Node core module used to write the scraped result to disk.
    const fs = require(`fs`);

    // The blog index page to crawl.
    let url = `http://www.cnblogs.com/tween`;

    // Fetch the blog index page; on success parse it with getContent()
    // and dump the result to blog.txt via creatTxt().
    http.get(url, (res) => {

        // Decode the stream as UTF-8 on the response itself. The original
        // `content += data` stringified each raw Buffer chunk separately,
        // which corrupts any multi-byte character that happens to be split
        // across a chunk boundary.
        res.setEncoding(`utf8`);

        let content = ``;

        res.on(`data`, (data) => {

            content += data;

        }).on(`end`, () => {

            // Full page received: parse, then persist.
            let html = getContent(content);

            creatTxt(html);

        });

    }).on(`error`, (err) => console.log(`獲取數據失敗`, err));

    // Flatten the scraped structure ([{ time, blog: [{ title, read }] }])
    // into plain text and write it to blog.txt in the working directory.
    let creatTxt = content => {

        let txt = ``;

        for (let day of content) {
            // Day heading first, then every post of that day.
            txt += day.time;
            for (let post of day.blog) {
                for (let key in post) {
                    txt += post[key];
                }
                txt += `\n`;
            }
        }

        fs.writeFile(`blog.txt`, txt, 'utf-8', (err) => {
            if (err) {
                console.log(err);
            } else {
                console.log(`寫入成功`);
            }
        });

    };
    // Parse the blog index HTML and return an array of
    // { time, blog: [{ title, read }] } — one entry per ".day" section.
    let getContent = content => {

        let $ = cheerio.load(content);

        let blogs = $(`.day`);

        let arr = [];

        blogs.each( (index, item) => {

            let _this = $(item);

            // Day heading text, e.g. "2016年1月1日".
            let time = _this.find(`.dayTitle`).text();

            let indexBlog = [];

            _this.find(`.postTitle`).each((index, item) => {

                let title = $(item).text().trim();

                // Matching ".postDesc" line; presumably contains
                // "閱讀(123) 評論(4)" — verify against the live page.
                let list = _this.find(`.postDesc`).eq(index).text();

                // String.match() returns null when there is no match, so the
                // original `list.match(...)[0]` threw a TypeError whenever the
                // counts were absent. Guard so a layout change degrades to
                // "(0)" instead of crashing. (The old `.trim()` on "(123)"
                // was a no-op and is dropped.)
                let counts = list.match(/\(\d+\)/g) || [];

                let read = counts[0] || `(0)`;

                let comment = counts[1] || `(0)`;

                indexBlog[index] = {
                    title:`\t${title}\n`,
                    read:`\t閱讀:${read} 評論:${comment}\n`,
                };
            });
            arr[index] = {
                time:`${index+1} 、${time.trim()}\n`,
                blog:indexBlog
            };

        });
        return arr;
    };
}

運行後會在同目錄下建立一個 blog.txt,裏面的內容就是爬到的數據。

相關文章
相關標籤/搜索