Golang 網絡爬蟲框架gocolly/colly 三

Golang 網絡爬蟲框架gocolly/colly

熟悉了《Golang 網絡爬蟲框架gocolly/colly 一》和《Golang 網絡爬蟲框架gocolly/colly 二》以後,就能夠在網絡上爬取大部分數據了。本文接下來將爬取中證指數有限公司提供的行業市盈率(http://www.csindex.com.cn/zh-CN/downloads/industry-price-earnings-ratio)。

 

定義數據結構體:

// ZhjhHyShyl is one row of the CSRC (China Securities Regulatory Commission)
// industry price-earnings ratio table.
//
// The ratio fields are pointers so that an absent value can be nil, which
// encoding/json writes as null rather than 0.
type ZhjhHyShyl struct {
	Hydm string        `json:"行業代碼"` // industry code
	Hymc string        `json:"行業名稱"` // industry name
	Zxsj *float64      `json:"最新數據"` // latest value
	Gpjs int           `json:"股票家數"` // number of stocks
	Ksjs int           `json:"虧損家數"` // number of loss-making companies
	Jygy *float64      `json:"近一個月"` // past one month
	Jsgy *float64      `json:"近三個月"` // past three months
	Jlgy *float64      `json:"近六個月"` // past six months
	Jyn  *float64      `json:"近一年"`  // past year
	Zhy  []*ZhjhHyShyl `json:"細分行業"` // sub-industries
}

  

 

 

接下來爲gocolly調用作準備,將用戶代理設置爲Chrome瀏覽器,該值能夠經過Fiddler工具查看。

 

	// Set the collector's UserAgent field to a desktop-browser User-Agent string.
	c.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299"

 

  

 

 

還能夠利用Fiddler設置更多的Request Header,將爬蟲工具假裝成瀏覽器。

 

接下來按F12調用瀏覽器調試器查看目標數據的元素,拷貝jQuery選擇器,而後改爲相對路徑。

 

 

 

完成全部的數據抓取代碼:

 

package main

 

import (

    "encoding/json"

    "fmt"

    "log"

    "strconv"

    "strings"

 

    "github.com/PuerkitoBio/goquery"

 

    "github.com/gocolly/colly"

)

 

// ZhjhHyShyl is one row of the CSRC (China Securities Regulatory Commission)
// industry price-earnings ratio table.
//
// The ratio fields are pointers so that an absent value can be nil, which
// encoding/json writes as null rather than 0.
type ZhjhHyShyl struct {
	Hydm string        `json:"行業代碼"` // industry code
	Hymc string        `json:"行業名稱"` // industry name
	Zxsj *float64      `json:"最新數據"` // latest value
	Gpjs int           `json:"股票家數"` // number of stocks
	Ksjs int           `json:"虧損家數"` // number of loss-making companies
	Jygy *float64      `json:"近一個月"` // past one month
	Jsgy *float64      `json:"近三個月"` // past three months
	Jlgy *float64      `json:"近六個月"` // past six months
	Jyn  *float64      `json:"近一年"`  // past year
	Zhy  []*ZhjhHyShyl `json:"細分行業"` // sub-industries
}

 

func main() {

 

    var err error

    c := colly.NewCollector()

    c.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299"

 

    zjhHyShyl := make([]*ZhjhHyShyl, 0)

    c.OnRequest(func(r *colly.Request) {

        fmt.Printf("%+v\r\n%+v\r\n", *r, *(r.Headers))

    })

 

    c.OnHTML("td>table.list-div-table>tbody>tr", func(e *colly.HTMLElement) {

 

        hyShy := ZhjhHyShyl{

            Hydm: e.ChildText("td:first-child"),

            Hymc: e.ChildText("td:nth-child(2)"),

        }

        zxsj, err := strconv.ParseFloat(e.ChildText("td:nth-child(3)"), 64)

        if err == nil {

            hyShy.Zxsj = &zxsj

        }

 

        gpjs, err := strconv.ParseInt(e.ChildText("td:nth-child(4)"), 10, 32)

        if err == nil {

            hyShy.Gpjs = int(gpjs)

        }

        ksjs, err := strconv.ParseInt(e.ChildText("td:nth-child(5)"), 10, 32)

        if err == nil {

            hyShy.Ksjs = int(ksjs)

        }

 

        jygy, err := strconv.ParseFloat(e.ChildText("td:nth-child(6)"), 64)

        if err == nil {

            hyShy.Jygy = &jygy

        }

        jsgy, err := strconv.ParseFloat(e.ChildText("td:nth-child(7)"), 64)

        if err == nil {

            hyShy.Jsgy = &jsgy

        }

        jlgy, err := strconv.ParseFloat(e.ChildText("td:nth-child(8)"), 64)

        if err == nil {

            hyShy.Jlgy = &jlgy

        }

        jyn, err := strconv.ParseFloat(e.ChildText("td:nth-child(9)"), 64)

        if err == nil {

            hyShy.Jyn = &jyn

        }

        zjhHyShyl = append(zjhHyShyl, &hyShy)

 

        hyShy.Zhy = make([]*ZhjhHyShyl, 0)

        e.DOM.Parent().Parent().Next().Find("table.list-div-table>tbody>tr").Each(func(_ int, s *goquery.Selection) {

            zhy := ZhjhHyShyl{

                Hydm: strings.Trim(s.Find("td:nth-child(1)").Text(), "\r\n\t "),

                Hymc: strings.Trim(s.Find("td:nth-child(2)").Text(), "\r\n\t "),

            }

 

            zxsj, err := strconv.ParseFloat(strings.Trim(s.Find("td:nth-child(3)").Text(), "\r\n\t "), 64)

            if err == nil {

                zhy.Zxsj = &zxsj

            }

 

            gpjs, err := strconv.ParseInt(strings.Trim(s.Find("td:nth-child(4)").Text(), "\r\n\t "), 10, 32)

            if err == nil {

                zhy.Gpjs = int(gpjs)

            }

            ksjs, err := strconv.ParseInt(strings.Trim(s.Find("td:nth-child(5)").Text(), "\r\n\t "), 10, 32)

            if err == nil {

                zhy.Ksjs = int(ksjs)

            }

 

            jygy, err := strconv.ParseFloat(strings.Trim(s.Find("td:nth-child(6)").Text(), "\r\n\t "), 64)

            if err == nil {

                zhy.Jygy = &jygy

            }

            jsgy, err := strconv.ParseFloat(strings.Trim(s.Find("td:nth-child(7)").Text(), "\r\n\t "), 64)

            if err == nil {

                zhy.Jsgy = &jsgy

            }

            jlgy, err := strconv.ParseFloat(strings.Trim(s.Find("td:nth-child(8)").Text(), "\r\n\t "), 64)

            if err == nil {

                zhy.Jlgy = &jlgy

            }

            jyn, err := strconv.ParseFloat(strings.Trim(s.Find("td:nth-child(9)").Text(), "\r\n\t "), 64)

            if err == nil {

                zhy.Jyn = &jyn

            }

 

            hyShy.Zhy = append(hyShy.Zhy, &zhy)

 

        })

    })

 

    c.OnScraped(func(_ *colly.Response) {

 

        bData, _ := json.MarshalIndent(zjhHyShyl, "", "\t")

        fmt.Println(string(bData))

 

    })

 

    err = c.Visit("http://www.csindex.com.cn/zh-CN/downloads/industry-price-earnings-ratio?date=2017-12-27&type=zjh1")

    if err != nil {

        log.Fatal(err)

    }

 

}

 

  

 

運行後的部分結果:

        

{

                "行業代碼": "D",

                "行業名稱": "電力、熱力、燃氣及水的生產和供應業",

                "最新數據": 20.12,

                "股票家數": 107,

                "虧損家數": 5,

                "近一個月": 19.51,

                "近三個月": 19.7,

                "近六個月": 19.87,

                "近一年": 18.9,

                "細分行業": [

                        {

                                "行業代碼": "44",

                                "行業名稱": "電力、熱力生產和供應業",

                                "最新數據": 18.75,

                                "股票家數": 70,

                                "虧損家數": 3,

                                "近一個月": 18.28,

                                "近三個月": 18.43,

                                "近六個月": 18.55,

                                "近一年": 17.44,

                                "細分行業": null

                        },

                        {

                                "行業代碼": "45",

                                "行業名稱": "燃氣生產和供應業",

                                "最新數據": 28.4,

                                "股票家數": 22,

                                "虧損家數": 2,

                                "近一個月": 25.71,

                                "近三個月": 25.33,

                                "近六個月": 25.38,

                                "近一年": 27.24,

                                "細分行業": null

                        },

                        {

                                "行業代碼": "46",

                                "行業名稱": "水的生產和供應業",

                                "最新數據": 27.78,

                                "股票家數": 15,

                                "虧損家數": 0,

                                "近一個月": 27.88,

                                "近三個月": 29.33,

                                "近六個月": 30.56,

                                "近一年": 29.64,

                                "細分行業": null

                        }

                ]

        },

 

轉載請註明出處:http://www.cnblogs.com/majianguo/p/8150060.html

相關文章
相關標籤/搜索