It's been a while since I used Go for a project. With some free time this morning, I spent a bit of it crawling Douban's top 250 movies with Go. I didn't use the colly framework here; instead I designed a simple pipeline myself. Marking it down.
Define two channels: one holds the raw web page content, and the other holds the useful content extracted from it.
Multiple goroutines fetch page content concurrently and push it into the web-page channel. Other goroutines read from that channel and, for each page read, spawn a goroutine that extracts the useful fields into the result channel. Finally the results are persisted to a local file (file writes are not thread-safe, so I don't use multiple goroutines for that step). A stripped-down sketch of this shape follows below.
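Stripped of the scraping details, the pipeline looks roughly like this (a minimal sketch with placeholder fetch and extract steps, not the real crawler):

package main

import "fmt"

func main() {
	pages := make(chan string)   // raw page content
	results := make(chan string) // extracted fields

	// fan out: one fetch goroutine per URL (placeholder fetch)
	urls := []string{"page-1", "page-2", "page-3"}
	for _, u := range urls {
		go func(u string) { pages <- "html of " + u }(u)
	}

	// dispatcher: one extractor goroutine per fetched page (placeholder extract)
	go func() {
		for body := range pages {
			go func(body string) { results <- "fields from " + body }(body)
		}
	}()

	// single writer: the only goroutine touching the output
	for i := 0; i < len(urls); i++ {
		fmt.Println(<-results)
	}
}

In the real program each page yields up to 25 films, so the writer can't simply count one result per URL; it stops on an idle timeout instead (see the full code below).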
The full code is below. To run it and see the results, install the third-party package with go get github.com/PuerkitoBio/goquery, or pull the code directly from the link here; after setting GOPATH it can be run.
package main

import (
	"encoding/json"
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"regexp"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)

/*
Multiple goroutines run the crawl concurrently; channels carry the fetched
page content and the extracted results, so only a fetch function and an
extract function need to be designed.
*/

// get_web_content fetches one page and pushes its body into chan_web.
func get_web_content(url string, chan_web chan string) {
	resp, err := http.Get(url)
	if err != nil {
		fmt.Println("http get error", err)
		return
	}
	defer resp.Body.Close() // release the connection once the body is read
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("read error", err)
		return
	}
	chan_web <- string(body)
}

// extract_valid_content parses one page and pushes each film as JSON into chan_r.
func extract_valid_content(body string, chan_r chan []byte) {
	dom, err := goquery.NewDocumentFromReader(strings.NewReader(body))
	if err != nil {
		fmt.Println(err)
		return
	}
	dom.Find("ol.grid_view div.item").Each(func(i int, selection *goquery.Selection) {
		// extract one film entry into a map
		result := make(map[string]string)
		name := selection.Find("div.info span.title").First().Text()
		doctor_str := selection.Find("div.info div.bd p").First().Text()
		// the page text is simplified Chinese: "导演: ... 主演: ..."
		r := regexp.MustCompile(`导演:(?s:(.*?))(主演|主|&|\.\.\.)`)
		doctor_match := r.FindAllStringSubmatch(doctor_str, -1)
		if len(doctor_match) == 0 {
			return // skip entries the pattern does not match
		}
		doctor := doctor_match[0][1] // the director's name
		rating_num := selection.Find("div.star span.rating_num").First().Text()
		evaluation_str := selection.Find("div.star span").Last().Text()
		r = regexp.MustCompile(`(?s:(.*?))人评价`)
		evaluation_match := r.FindAllStringSubmatch(evaluation_str, -1)
		if len(evaluation_match) == 0 {
			return
		}
		evaluation := evaluation_match[0][1]
		ranking := selection.Find("div.pic em").First().Text()
		result["name"] = name
		result["doctor"] = doctor
		result["rating_num"] = rating_num
		result["evaluation"] = evaluation
		result["ranking"] = ranking
		json_str, err := json.Marshal(result)
		if err != nil {
			fmt.Println(err)
			return
		}
		chan_r <- json_str
	})
}

func main() {
	var (
		OutputFile = "./film_crawl.txt"
	)
	base_url := "https://movie.douban.com/top250?start=%d&filter="
	chan_web_content := make(chan string)
	defer close(chan_web_content)
	chan_r := make(chan []byte)
	defer close(chan_r)
	// fan out: one fetch goroutine per page of 25 films
	for i := 0; i < 10; i++ {
		url := fmt.Sprintf(base_url, i*25)
		go get_web_content(url, chan_web_content)
	}
	// hand each fetched page to its own extractor goroutine
	go func() {
		for {
			web_content, ok := <-chan_web_content
			if !ok {
				break
			}
			go extract_valid_content(web_content, chan_r)
		}
	}()
	flag := false
	to := time.NewTimer(time.Second * 5)
	file, err := os.OpenFile(OutputFile, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0644)
	if err != nil {
		fmt.Println("Failed to open the file", err.Error())
		return
	}
	defer file.Close()
	// single writer (the main goroutine): append results until none arrives for 5s
	for {
		if flag {
			break
		}
		to.Reset(time.Second * 5)
		select {
		case res := <-chan_r:
			fmt.Printf("%s\n", res)
			file.Write(res)
			file.WriteString("\n")
		case <-to.C:
			flag = true // no result for 5 seconds: assume the crawl is done
		}
	}
	fmt.Println("end")
}
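The 5-second idle timer is a heuristic stop condition: a slow response can end the writer early, and a fast crawl still pays a final 5-second wait. A more deterministic variant (a sketch under the assumption that each worker signals completion, not the original code) counts the workers with sync.WaitGroup and closes the result channel when they all finish, letting the writer simply range over it:

package main

import (
	"fmt"
	"sync"
)

func main() {
	results := make(chan string)
	var wg sync.WaitGroup

	// one worker per page, mirroring the 10 fetch goroutines above
	for i := 0; i < 10; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			results <- fmt.Sprintf("result %d", i) // placeholder for a page's extracted films
		}(i)
	}

	// close the channel once every worker has finished,
	// so the writer loop below ends without a timer
	go func() {
		wg.Wait()
		close(results)
	}()

	for res := range results { // exits when results is closed
		fmt.Println(res)
	}
}

In the crawler itself, wg.Done() would go at the end of extract_valid_content, after all of a page's films have been sent to chan_r.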