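// Horizontal crawl: walk the listing pages by page number. Vertical crawl: follow each joke link on a listing page and scrape that page's HTML content.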
package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"regexp"
	"strconv"
	"strings"
	"sync"
	"time"
)

// jokeSeparator is written after every joke in a saved page file.
const jokeSeparator = "-----------------------------\n"

// The patterns are compiled once at package scope instead of on every call.
var (
	// jokeLinkRe captures the href of each joke link on a listing page.
	jokeLinkRe = regexp.MustCompile(`<h1 class="f18"><a href="(?s:(.*?))"`)
	// jokeTitleRe captures the text of the <title> element of a joke page.
	jokeTitleRe = regexp.MustCompile(`<title>(?s:(.*?))</title>`)
	// jokeContentRe captures the body of the joke's content <div>.
	jokeContentRe = regexp.MustCompile(`<div class="con-txt">(?s:(.*?))</div>`)
)

// httpClient is shared by all requests. The timeout keeps one stalled server
// from blocking a crawler goroutine forever.
var httpClient = &http.Client{Timeout: 30 * time.Second}

// HttpGet fetches url and returns the whole response body as a string.
//
// It returns an error if the request fails, if the server answers with a
// status other than 200 OK, or if reading the body fails. On error the
// returned string is always empty.
func HttpGet(url string) (result string, err error) {
	resp, err := httpClient.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("get %s: unexpected status %s", url, resp.Status)
	}

	// Stream into a builder: linear time, and read errors are never dropped.
	var body strings.Builder
	if _, err := io.Copy(&body, resp.Body); err != nil {
		return "", fmt.Errorf("get %s: reading body: %w", url, err)
	}
	return body.String(), nil
}

// SaveJoke2File writes the jokes of listing page idx to a text file named
// after the page number in the current working directory.
//
// fileTitle[i] and fileContent[i] describe the same joke. If the slices differ
// in length, only the pairs present in both are written. Failures are reported
// on standard output; nothing is returned.
func SaveJoke2File(idx int, fileTitle, fileContent []string) {
	path := "第" + strconv.Itoa(idx) + "頁.txt"
	f, err := os.Create(path)
	if err != nil {
		fmt.Println("err:", err)
		return
	}
	defer f.Close()

	// Guard against mismatched slices so indexing can never go out of range.
	n := len(fileTitle)
	if len(fileContent) < n {
		n = len(fileContent)
	}

	var out strings.Builder
	for i := 0; i < n; i++ {
		out.WriteString(fileTitle[i] + "\n" + fileContent[i] + "\n")
		out.WriteString(jokeSeparator)
	}
	if _, err := f.WriteString(out.String()); err != nil {
		fmt.Println("err:", err)
	}
}

// Spiderpage crawls one listing page: it downloads the page, follows every
// joke link found on it, and saves the collected titles and contents with
// SaveJoke2File.
//
// On success idx is sent on page. If the listing page cannot be downloaded,
// the error is printed and nothing is sent, so callers must not assume exactly
// one value per call. Jokes whose page fails to download are skipped.
func Spiderpage(idx int, page chan int) {
	url := "https://m.pengfue.com/xiaohua_" + strconv.Itoa(idx) + ".html"
	result, err := HttpGet(url)
	if err != nil {
		fmt.Println("httpget err", err)
		return
	}

	links := jokeLinkRe.FindAllStringSubmatch(result, -1)
	fileTitle := make([]string, 0, len(links))
	fileContent := make([]string, 0, len(links))
	for _, link := range links {
		title, content, err := SpiderJokePage(link[1])
		if err != nil {
			fmt.Println("err:", err)
			continue
		}
		fileTitle = append(fileTitle, title)
		fileContent = append(fileContent, content)
	}

	SaveJoke2File(idx, fileTitle, fileContent)
	page <- idx
}

// toWork crawls the listing pages start..end (inclusive) concurrently, one
// goroutine per page, and prints a line for each page that completes.
//
// It returns once every goroutine has finished, whether its page succeeded or
// failed.
func toWork(start, end int) {
	fmt.Printf("正在爬取%d到%d頁。。。\n", start, end)

	page := make(chan int)
	var wg sync.WaitGroup
	for i := start; i <= end; i++ {
		wg.Add(1)
		go func(idx int) {
			defer wg.Done()
			Spiderpage(idx, page)
		}(i)
	}

	// Spiderpage sends nothing for a failed page, so counting receives would
	// block forever after the first failure. Close the channel once all
	// workers are done and drain it instead.
	go func() {
		wg.Wait()
		close(page)
	}()

	for idx := range page {
		fmt.Printf("第%d個頁面爬取完成\n", idx)
	}
}

// SpiderJokePage downloads a single joke page and returns its title and
// content with all whitespace removed. err is non-nil only when the download
// fails; a page without a matching title or content yields empty strings.
func SpiderJokePage(url string) (title, content string, err error) {
	result, err := HttpGet(url)
	if err != nil {
		return "", "", err
	}
	title, content = parseJokePage(result)
	return title, content, nil
}

// parseJokePage extracts the title and the joke text from a joke page's HTML.
// Each pattern may occur more than once on the page; only the first
// occurrence is used. A missing element yields an empty string.
func parseJokePage(html string) (title, content string) {
	if m := jokeTitleRe.FindStringSubmatch(html); m != nil {
		title = stripWhitespace(m[1])
	}
	if m := jokeContentRe.FindStringSubmatch(html); m != nil {
		content = stripWhitespace(m[1])
	}
	return title, content
}

// stripWhitespace removes every Unicode whitespace character from s.
//
// NOTE(review): the original code chained several strings.Replace calls whose
// arguments render as a plain space and "\n". They may once have been distinct
// whitespace characters, so all whitespace is stripped to cover every variant.
func stripWhitespace(s string) string {
	return strings.Join(strings.Fields(s), "")
}

// main asks for the first and last listing page and crawls that range.
func main() {
	var start, end int

	fmt.Print("請輸入起始頁。。。")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("err:", err)
		return
	}

	fmt.Print("請輸入終止頁。。。")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("err:", err)
		return
	}

	toWork(start, end)
}