爲了抓取和訊網高管增減持的數據,首先得分析一下數據的來源:
網址: http://stockdata.stock.hexun.com/ggzjc/history.shtml
使用chrome開發者工具,能夠發現在切換到第二頁時,瀏覽器向下述地址發起了網絡訪問請求:
http://stockdata.stock.hexun.com/ggzjc/data/ChangeHistory.aspx?count=30&page=2&callback=hxbase_json5
分析一下上述鏈接, count表示一頁返回的結果數目,page代表頁碼數,callback表示回調函數的名稱.
以下是發起上述URL對應的網絡請求返回的數據:
很明顯,這是一段javascript代碼,不是json數據,無法使用python進行直接解析.爲了加快項目進度,減少耦合,可以使用nodejs一步完成,不用將這個數據爬取分爲抓取和解析兩個步驟.
爲了加快爬取速度,我們設置每發起一次請求,返回1000條數據,在給定頁碼範圍的情況下,就可以生成由所有鏈接構成的數組:
/**
 * Build the request URLs for a range of result pages of the hexun
 * ChangeHistory endpoint (count=1000 per request, JSONP callback
 * `hxbase_json5`).
 *
 * The previous version substituted `i + 1` for the `page` parameter, so a
 * call such as `get_url_array(1, 58)` produced pages 2..59 and never
 * requested page 1. `start` and `end` are now used as the page numbers
 * themselves.
 *
 * @param {number} start - First page number to include.
 * @param {number} end - Last page number to include (inclusive).
 * @returns {string[]} One URL per page in ascending page order; an empty
 *   array when `start > end`.
 */
function get_url_array(start, end) {
  var url_template = "http://stockdata.stock.hexun.com/ggzjc/data/ChangeHistory.aspx?count=1000&page=%d&callback=hxbase_json5";
  var array = [];
  for (var page = start; page <= end; page++) {
    // The template has exactly one %d placeholder: the page number.
    array.push(url_template.replace("%d", String(page)));
  }
  return array;
}
對於給定鏈接,獲取該鏈接的數據並將其轉換爲javascript對象,取出其中有價值的數據list,對應函數:
/**
 * Fetch one ChangeHistory URL synchronously, decode the body as gb2312,
 * evaluate the returned JSONP script and return its `list` property.
 *
 * The response text has the form of a call to `hxbase_json5(...)`, so the
 * `eval` below must stay a direct eval: it resolves `hxbase_json5` from the
 * enclosing module scope.
 *
 * @param {string} url - Request URL, e.g. one produced by get_url_array().
 * @returns {*} The `list` property of the evaluated response.
 * @throws Whatever sync-request, iconv-lite or eval throw (network error,
 *   error status from getBody(), malformed script).
 */
function get_data_from_url(url) {
  var request = require('sync-request');
  // Pool of user-agent strings; one is picked at random per request so the
  // crawler is less likely to be blocked.
  var user_agent_list = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56',
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 ",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 ",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 ",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 ",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 ",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 ",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 ",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 ",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 ",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 ",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 ",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 ",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 ",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 ",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
  ];
  var pos = randomIntRange(0, user_agent_list.length - 1);
  // Send the randomly chosen user-agent along with Host/Referer headers.
  var res = request('GET', url, {
    'headers': {
      'user-agent': user_agent_list[pos],
      'Host': 'stockdata.stock.hexun.com',
      'Referer': 'http://stockdata.stock.hexun.com/ggzjc/history.shtml'
    },
    retry: true,
    retryDelay: 10000,
    maxRetries: 5,
    timeout: 200000
  });
  var buf = res.getBody();
  var iconv = require("iconv-lite");
  // Decode the raw bytes as gb2312.
  var data_str = iconv.decode(buf, 'gb2312');
  // One record in the feed ends this title with a stray backslash, which
  // escapes the closing quote and makes the whole script unparsable; strip it.
  data_str = data_str.replace(/上海市浦東新區公共交通投資發\\/g, "上海市浦東新區公共交通投資發");
  // NOTE(review): eval runs whatever script the server returned. It is kept
  // because the payload is JavaScript rather than JSON, but it means the
  // remote host can execute arbitrary code in this process.
  var data_list = eval(data_str);
  return data_list.list;
}
這裏面有一個小坑,在大概處理第14個鏈接的時候,服務器返回的數據並不是正確的javascript腳本,在此處有錯誤:
// wrong!
changePeopleTitle: '上海市浦東新區公共交通投資發\'
// right
changePeopleTitle: '上海市浦東新區公共交通投資發'
就由於多了一個轉義符號,導致整個語句有問題,不能正確利用eval函數進行轉換.這背後肯定是某位mm手殘的結果.因此需要對這個bug特殊處理,對應上述代碼中對data_str調用replace的那一行(原文標紅部分).這個抓取程序需要正確設置user-agent,爲了防止被卡,我設置了user-agent池,利用random函數隨機選取user-agent.
/**
 * Return a pseudo-random integer in the range [low, high].
 *
 * For integer bounds both ends are inclusive: Math.random() is in [0, 1),
 * so the scaled value lies in [low, high + 1) before flooring.
 *
 * @param {number} low - Lower bound.
 * @param {number} high - Upper bound.
 * @returns {number} Integer between low and high.
 */
function randomIntRange(low, high) {
  return Math.floor(Math.random() * (high - low + 1) + low);
}
爲了解析javascript語句,需要設置和請求對應的回調函數,如下:
/**
 * JSONP callback matching the `callback=hxbase_json5` query parameter.
 *
 * When the server script is evaluated, this function is called with the
 * payload value itself, which is simply handed back. A string argument is
 * still evaluated as JavaScript, as before.
 *
 * @param {*} str - Payload passed by the evaluated script, or a string of
 *   JavaScript source.
 * @returns {*} The payload, or the result of evaluating the string.
 */
function hxbase_json5(str) {
  // eval() returns non-string arguments unchanged, so skip it for them.
  if (typeof str !== "string") {
    return str;
  }
  // NOTE(review): eval executes arbitrary code; only pass trusted strings.
  return eval(str);
}
解析javascript對象並將其存入數據庫的操作定義在函數save_data_to_mysql() 中,其中利用了sequelize的orm模型來簡化實現.
/**
 * Crawl every URL from get_url_array(1, 58), normalise each record and
 * store it in MySQL through a Sequelize model.
 *
 * Changes from the previous version:
 *  - `print(...)` is not a Node.js global and no definition is visible in
 *    this file, so progress output now goes through console.log.
 *  - The promises returned by save() are collected and the overall promise
 *    is returned, so callers can wait for completion and see failures
 *    (previously nothing was returned; ignoring the result still works).
 *  - Blank markers are compared with `===`, so a numeric 0 is no longer
 *    turned into null by loose equality with ' '.
 *
 * @returns {Promise} Settles after the table sync and all row saves.
 */
function save_data_to_mysql() {
  var Sequelize = require('sequelize');
  var sleep = require("sleep");

  // The code treats a single-space string as "no value" for plain fields.
  function blank_to_null(value) {
    return value === ' ' ? null : value;
  }

  // Same blank check, but parse anything else as a float.
  function float_or_null(value) {
    return value === ' ' ? null : parseFloat(value);
  }

  // Float wrapped in an HTML tag; null when no tag content is found.
  function html_float_or_null(html) {
    var text = get_content_from_html(html);
    if (text == null) {
      return null;
    }
    return text === '' ? null : parseFloat(text);
  }

  // Text wrapped in an HTML tag; null when missing or empty.
  function html_text_or_null(html) {
    var text = get_content_from_html(html);
    return text === '' ? null : text;
  }

  var sequelize = new Sequelize(
    'dbname',
    'root',
    'passwd',
    {
      'dialect': 'mysql',
      'host': '127.0.0.1',
      'port': 3306,
      define: {
        charset: 'utf8',
        timestamps: false // do not add timestamp columns
      }
    }
  );

  // Executive shareholding increases / decreases.
  var Ggzjc = sequelize.define(
    'table_name',
    {
      'stock_code': { // stock code
        'type': Sequelize.STRING,
        'allowNull': false,
        'unique': false
      },
      'stock_name': { // stock name
        'type': Sequelize.STRING,
        'allowNull': false,
        'unique': false
      },
      'changeDate': { // date of the change
        'type': Sequelize.DATEONLY,
        'allowNull': true
      },
      'noticeDate': { // announcement date
        'type': Sequelize.DATEONLY,
        'allowNull': true
      },
      'changeNum': { // quantity changed, in units of 10,000 shares
        'type': Sequelize.DOUBLE,
        'allowNull': true
      },
      'averagePrice': { // average price
        'type': Sequelize.DOUBLE,
        'allowNull': true
      },
      'price': { // amount
        'type': Sequelize.DOUBLE,
        'allowNull': true
      },
      'shareHoldingNum': { // shares held after the change
        'type': Sequelize.DOUBLE,
        'allowNull': true
      },
      'changeRatio': { // change ratio for the person making the change
        'type': Sequelize.DOUBLE,
        'allowNull': true
      },
      'circulationCapitalRatio': { // share of circulating capital
        'type': Sequelize.DOUBLE,
        'allowNull': true
      },
      'changeWay': { // how the change was made
        'type': Sequelize.STRING,
        'allowNull': true
      },
      'changePeople': { // person whose holding changed
        'type': Sequelize.STRING,
        'allowNull': true
      },
      'changePeopleTitle': { // related director / executive
        'type': Sequelize.STRING,
        'allowNull': true
      },
      'duties': { // position held
        'type': Sequelize.STRING,
        'allowNull': true
      },
      'relation': { // relationship
        'type': Sequelize.STRING,
        'allowNull': true
      },
      'industry': { // industry
        'type': Sequelize.STRING,
        'allowNull': true
      }
    }
  );

  // force: true recreates the table, so each run starts from an empty table.
  return Ggzjc.sync({force: true}).then(function () {
    var pending_saves = [];
    var url_array = get_url_array(1, 58);
    for (var i = 0; i < url_array.length; i++) {
      var data = get_data_from_url(url_array[i]);
      // Pause 200000 microseconds between requests.
      sleep.usleep(200000);
      console.log(i + 1);
      console.log('complete!');
      for (var j = 0; j < data.length; j++) {
        var row = data[j];
        // stockName is split as "name(code)": text before "(" is the name,
        // text between "(" and ")" is the code.
        var name_parts = row.stockName.split("(");
        var stock_name = name_parts[0];
        var stock_code = name_parts[1].split(")")[0];

        var one = Ggzjc.build({
          'stock_code': stock_code,
          'stock_name': stock_name,
          // '20' is prepended to the feed's date strings
          // (presumably two-digit years - TODO confirm).
          'changeDate': '20' + row.changeDate,
          'noticeDate': '20' + row.noticeDate,
          'changeNum': html_float_or_null(row.changeNum),
          'averagePrice': float_or_null(row.averagePrice),
          'price': html_float_or_null(row.price),
          'shareHoldingNum': float_or_null(row.shareHoldingNum),
          'changeRatio': float_or_null(row.changeRatio),
          'circulationCapitalRatio': float_or_null(row.circulationCapitalRatio),
          'changeWay': blank_to_null(row.changeWay),
          'changePeople': blank_to_null(row.changePeople),
          'changePeopleTitle': blank_to_null(row.changePeopleTitle),
          'duties': html_text_or_null(row.duties),
          'relation': html_text_or_null(row.relation),
          'industry': blank_to_null(row.industry)
        });
        pending_saves.push(one.save());
      }
    }
    // Wait for every insert so failures surface on the returned promise.
    return Promise.all(pending_saves);
  });
}
需要解析並獲取html標籤<tag>content</tag>中的content,利用正則表達式取出><中間的文本就可以了.
/**
 * Extract the text between the first ">" and the next "<" that follows at
 * least one character, e.g. the content of `<tag>content</tag>`.
 *
 * @param {*} str - HTML fragment. Anything that is not a string (including
 *   null and undefined) yields null instead of throwing.
 * @returns {string|null} The text found, or null when there is no
 *   non-empty `>...<` span.
 */
function get_content_from_html(str) {
  if (typeof str !== "string") {
    return null;
  }
  // Lazy match so only the shortest span after the first ">" is captured.
  var found = str.match(/>([\s\S]+?)</);
  return found === null ? null : found[1];
}
done!
附註:
藉助python execjs和pandas,我實現了以更加優美的姿勢爬取上述內容,代碼詳見我的github:
https://github.com/zhoudayang/get_hexun