使用nodejs爬取和訊網高管增減持數據

  爲了抓取和訊網高管增減持的數據,首先得分析一下數據的來源:

  網址: http://stockdata.stock.hexun.com/ggzjc/history.shtml

  使用chrome開發者工具,可以發現在切換到第二頁時,瀏覽器向下述地址發起了網絡訪問請求:

  http://stockdata.stock.hexun.com/ggzjc/data/ChangeHistory.aspx?count=30&page=2&callback=hxbase_json5

  分析一下上述鏈接: count表示一頁返回的結果數目,page表示頁碼,callback表示回調函數的名稱.

  以下是發起上述URL對應的網絡請求返回的數據:

  很明顯,這是一段javascript代碼,不是json數據,無法使用python直接解析.爲了加快項目進度,減少耦合,可以使用nodejs一步完成,不用將這個數據爬取分爲抓取和解析兩個步驟.

  爲了加快爬取速度,我們設置每發起一次請求返回1000條數據;在給定頁碼範圍的情況下,就可以生成由全部鏈接構成的數組:

/**
 * Build the ChangeHistory request URLs for an inclusive range of page numbers.
 *
 * Every URL asks for 1000 records per page and names hxbase_json5 as the
 * JSONP callback.
 *
 * @param {number} start  first page number to request (the site's paging starts at 1)
 * @param {number} end    last page number to request, inclusive
 * @returns {string[]} one URL per page, in ascending page order; empty when start > end
 */
function get_url_array(start, end) {
    var url_prefix = "http://stockdata.stock.hexun.com/ggzjc/data/ChangeHistory.aspx?count=1000&page="
    var url_suffix = "&callback=hxbase_json5"
    var array = []
    // The page number in the URL is the loop value itself: formatting `page + 1`
    // would shift the whole range by one and never request page `start`.
    for (var page = start; page <= end; page++) {
        array.push(url_prefix + page + url_suffix)
    }
    return array
}

  對於給定鏈接,獲取該鏈接的數據並將其轉換爲javascript對象,取出其中有價值的數據list,對應函數:

/**
 * Synchronously download one ChangeHistory page and return its records.
 *
 * The response body is decoded as gb2312 and evaluated as JavaScript; the
 * request URLs name hxbase_json5 as the JSONP callback, so a function with
 * that name must be in scope when the script is evaluated.
 *
 * @param {string} url  request URL, e.g. one produced by get_url_array
 * @returns {*} the `list` property of the value the evaluated script yields
 * @throws propagates errors from the request (res.getBody() — presumably on a
 *         failed/error response, see the sync-request docs), from decoding,
 *         and from evaluating a malformed script
 */
function get_data_from_url(url) {
    var request = require('sync-request');
    var iconv = require("iconv-lite")
    // Pool of user-agent strings; one is picked at random for each request.
    var user_agent_list =[
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56',
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 ",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 ",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 ",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 ",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 ",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 ",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 ",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 ",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 ",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 ",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 ",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 ",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 ",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 ",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
    ]
    var pos =randomIntRange(0,user_agent_list.length-1)
    // Send a browser-like user-agent, Host and Referer with the request.
    var res = request('GET', url, {
        'headers': {
            'user-agent':user_agent_list[pos],
            'Host': 'stockdata.stock.hexun.com',
            'Referer': 'http://stockdata.stock.hexun.com/ggzjc/history.shtml'
        },
        retry : true,
        retryDelay: 10000,
        maxRetries: 5,
        timeout:200000
    });
    var buf = res.getBody()
    // The body is decoded with the gb2312 encoding.
    var data_str = iconv.decode(buf, 'gb2312')
    // One record ends its string with a stray backslash, which escapes the
    // closing quote and makes the whole script unparsable; drop that backslash.
    // NOTE(review): the payload is decoded as gb2312, so the name presumably
    // arrives in Simplified characters, while the literal in this file is
    // Traditional — both spellings are matched. Confirm against a live response.
    data_str = data_str.replace(/(上海市浦東新區公共交通投資發|上海市浦东新区公共交通投资发)\\/g, "$1")
    // SECURITY: eval() executes whatever the remote server returned. It is kept
    // because the response is a script rather than JSON, but it should only
    // ever be pointed at this trusted endpoint.
    var data_list = eval(data_str)
    return data_list.list
}

  這裏面有一個小坑,在大概處理第14個鏈接的時候,服務器返回的數據並不是正確的javascript腳本,在此處有錯誤:

//wrong !
changePeopleTitle: '上海市浦東新區公共交通投資發\'
//right
changePeopleTitle: '上海市浦東新區公共交通投資發'

  就因爲多了一個轉義符號,導致整個語句有問題,不能正確利用eval函數進行轉換.這背後肯定是某位mm手殘的結果.因此需要對這個bug特殊處理,對應上述代碼中的replace語句(原文標紅部分).這個抓取程序需要正確設置user-agent,爲了防止被卡,我設置了user-agent池,利用random函數隨機選取user-agent.

/**
 * Pick a random integer between low and high, both ends included.
 *
 * @param {number} low   smallest value that can be returned
 * @param {number} high  largest value that can be returned
 * @returns {number} a random integer in the range [low, high]
 */
function randomIntRange (low, high) {
    // Number of distinct integers in the inclusive range.
    var span = high - low + 1;
    var scaled = Math.random() * span;
    return Math.floor(scaled + low);
}

  爲了解析javascript語句,需要設置和請求對應的回調函數,如下:

/**
 * Callback named in the request URLs (callback=hxbase_json5); it hands its
 * argument back to the code that evaluated the response script.
 *
 * SECURITY: a string argument is executed with eval(); only pass trusted input.
 *
 * @param {*} str  value supplied by the evaluated script, or a script string
 * @returns {*} the argument itself when it is not a string, otherwise the
 *              result of evaluating the string
 */
function hxbase_json5(str) {
    // eval() returns any non-string argument untouched, so only real strings
    // need to go through it.
    if (typeof str !== 'string') {
        return str
    }
    var data = eval(str)
    return data
}

  解析javascript對象並將其存入數據庫的操作定義在函數save_data_to_mysql() 中,其中利用了sequelize的orm模型來簡化實現.

/**
 * Recreate the shareholding-change table, download every page and store each
 * record through a Sequelize model.
 *
 * Pages are requested with get_url_array(1, 58) and fetched one after another
 * with a short pause between requests; every record is cleaned up ('&nbsp;'
 * placeholders become null, numeric fields are parsed, HTML wrappers are
 * stripped) before it is saved.
 *
 * @returns {Promise} settles once the table has been synced and every save has
 *          finished; rejects with the first error raised while fetching,
 *          converting or saving
 */
function save_data_to_mysql() {

    var Sequelize = require('sequelize')
    var sleep = require("sleep")
    var sequelize = new Sequelize(
        'dbname',
        'root',
        'passwd',
        {
            'dialect': 'mysql',
            'host': '127.0.0.1',
            'port': 3306,
            define: {
                charset: 'utf8',
                // do not define timestamp columns
                timestamps: false
            }
        }
    )
    // Model for executive shareholding increases/decreases.
    var Ggzjc = sequelize.define(
        'table_name', {
            'stock_code': {// stock code
                'type': Sequelize.STRING,
                'allowNull': false,
                'unique': false
            },
            'stock_name': {// stock name
                'type': Sequelize.STRING,
                'allowNull': false,
                'unique': false
            },
            'changeDate': {// date of the change
                'type': Sequelize.DATEONLY,
                'allowNull': true
            },
            'noticeDate': {// announcement date
                'type': Sequelize.DATEONLY,
                'allowNull': true
            },
            'changeNum': {// number of shares changed, in units of 10,000 shares
                'type': Sequelize.DOUBLE,
                'allowNull': true
            },
            'averagePrice': {// average price
                'type': Sequelize.DOUBLE,
                'allowNull': true
            },
            'price': {// amount
                'type': Sequelize.DOUBLE,
                'allowNull': true
            },
            'shareHoldingNum': {// number of shares held after the change
                'type': Sequelize.DOUBLE,
                'allowNull': true
            },
            'changeRatio': {// change ratio for the person whose holding changed
                'type': Sequelize.DOUBLE,
                'allowNull': true
            },
            'circulationCapitalRatio': {// share of the circulating share capital
                'type': Sequelize.DOUBLE,
                'allowNull': true
            },
            'changeWay': {// how the change was made
                'type': Sequelize.STRING,
                'allowNull': true
            },
            'changePeople': {// person whose shareholding changed
                'type': Sequelize.STRING,
                'allowNull': true
            },
            'changePeopleTitle': {// related director / executive
                'type': Sequelize.STRING,
                'allowNull': true
            },
            'duties': {// position held
                'type': Sequelize.STRING,
                'allowNull': true
            },
            'relation': {// relationship
                'type': Sequelize.STRING,
                'allowNull': true
            },
            'industry': {// industry
                'type': Sequelize.STRING,
                'allowNull': true
            }
        }
    )

    // NOTE(review): per the Sequelize docs, force: true drops an existing table
    // before creating it, so each run starts from an empty table — confirm
    // that this is intended.
    return Ggzjc.sync({force: true}).then(function () {
        var url_array = get_url_array(1, 58)
        var pending_saves = []
        for (var i = 0; i < url_array.length; i++) {
            var data = get_data_from_url(url_array[i])
            // Pause between requests (200000 passed to sleep.usleep).
            sleep.usleep(200000)
            // Progress output. Node.js has no global print(); console.log is
            // the equivalent.
            console.log(i + 1)
            console.log('complete!')
            for (var j = 0; j < data.length; j++) {
                var row = data[j]
                // stockName is expected to look like "name(code)".
                var str_array = row.stockName.split("(")
                var stock_name = str_array[0]
                var stock_code = str_array[1].split(")")[0]
                var one = Ggzjc.build({
                    'stock_code': stock_code,
                    'stock_name': stock_name,
                    // '20' is prepended to the date strings taken from the feed.
                    'changeDate': '20' + row.changeDate,
                    'noticeDate': '20' + row.noticeDate,
                    'changeNum': html_to_float(row.changeNum),
                    'averagePrice': nbsp_to_float(row.averagePrice),
                    'price': html_to_float(row.price),
                    'shareHoldingNum': nbsp_to_float(row.shareHoldingNum),
                    'changeRatio': nbsp_to_float(row.changeRatio),
                    'circulationCapitalRatio': nbsp_to_float(row.circulationCapitalRatio),
                    'changeWay': nbsp_to_null(row.changeWay),
                    'changePeople': nbsp_to_null(row.changePeople),
                    'changePeopleTitle': nbsp_to_null(row.changePeopleTitle),
                    'duties': html_to_text(row.duties),
                    'relation': html_to_text(row.relation),
                    'industry': nbsp_to_null(row.industry)
                })
                // Keep the promise so a failed insert is reported, not lost.
                pending_saves.push(one.save())
            }
        }
        return Promise.all(pending_saves)
    })
}

// Map the '&nbsp;' placeholder to null; every other value is returned as is.
function nbsp_to_null(value) {
    return value === '&nbsp;' ? null : value
}

// Map the '&nbsp;' placeholder to null, otherwise parse the value as a float.
function nbsp_to_float(value) {
    return value === '&nbsp;' ? null : parseFloat(value)
}

// Extract the text wrapped in an HTML tag and parse it as a float; null when
// there is no wrapped text.
function html_to_float(html) {
    var text = get_content_from_html(html)
    if (text == null || text === '') {
        return null
    }
    return parseFloat(text)
}

// Extract the text wrapped in an HTML tag; null when there is none or it is empty.
function html_to_text(html) {
    var text = get_content_from_html(html)
    return text === '' ? null : text
}

  需要解析並獲取html標籤<tag>content</tag>中的content,利用正則表達式取出><中間的文本就可以了.

/**
 * Extract the text of the first `>content<` span in an HTML fragment, i.e. the
 * content of `<tag>content</tag>`.
 *
 * @param {*} str  HTML fragment; non-string values are converted with String()
 * @returns {string|null} the characters between the first '>' and the nearest
 *          following '<' (at least one character), or null when the input is
 *          null/undefined or contains no such span
 */
function get_content_from_html(str) {
    // A missing field carries no content; report "no match" instead of
    // throwing a TypeError on .match().
    if (str == null) {
        return null
    }
    // Lazy match, so only the text up to the nearest '<' is captured.
    var res = String(str).match(/>([\s\S]+?)</)
    if (res == null) {
        return null
    }
    return res[1]
}

 done!

附註:

  藉助python execjs和pandas,我實現了以更加優美的姿勢爬取上述內容,代碼詳見我的github:

  https://github.com/zhoudayang/get_hexun

相關文章
相關標籤/搜索