import (
	"fmt"
	"io"
	"net/http"
	"os"
	"regexp"
	"strconv"
	"strings"
)
func main() { var start, end int fmt.Print("請輸入起始頁(>=1) :") fmt.Scan(&start) fmt.Print("請輸入結束頁(>=起始頁) :") fmt.Scan(&end) //開始工做 DoWork(start, end) }
使用 channel 實現併發。
func DoWork(start, end int) { fmt.Printf("正在爬取頁數範圍是第%d頁到第%d頁。\n", start, end) page := make(chan int) for i := start; i <= end; i++ { // 爬取主頁面 go SpiderPage(i, page) } for i := start; i <= end; i++ { fmt.Printf("第%d頁已經爬取完成。\n", <-page) } }
func SpiderPage(i int, page chan int) { // 明確爬取的url // https://www.xiaohua.com/duanzi?page=1 下一頁+1 url := "https://www.xiaohua.com/duanzi?page=" + strconv.Itoa(i) fmt.Printf("正在爬取第%d個網頁:%s\n", i, url) // 開始爬取網頁的內容 result, err := HttpGet(url) if err != nil { fmt.Println("HttpGet err = ", err) return } // fmt.Println("r = ", result) //取 // <p class="fonts"> // <a href=" 一個段子的url "> // 解釋表達式 re := regexp.MustCompile(`<p class="fonts">(?s:(.*?))">`) if re == nil { fmt.Println("regexp.MustCompile err") return } // 取關鍵信息 joyUrls := re.FindAllStringSubmatch(result, -1) // fmt.Println("joyUrls = ", joyUrls) fileTitle := make([]string, 0) fileContent := make([]string, 0) // 取網址 // 第一個返回下標,第二個返回內容 for _, data := range joyUrls { // 一個笑話的鏈接 url := data[1] url = strings.Replace(url, "<a href=\"", "", -1) url = strings.Replace(url, "\r", "", -1) url = strings.Replace(url, "\n", "", -1) url = strings.Replace(url, " ", "", -1) url = "https://www.xiaohua.com" + url // fmt.Printf("test url = #%v#\n", url) //開始爬取每個笑話,每個段子 title, content, err := SpiderOneJoy(url) if err != nil { fmt.Println("SpiderOneJoy err = ", err) continue } // fmt.Printf("title = #%v#\n", title) // fmt.Printf("content = #%v#\n", content) //追加標題和內容 fileTitle = append(fileTitle, title) fileContent = append(fileContent, content) } // fmt.Println("fileTitle = ", fileTitle) // fmt.Println("fileContent = ", fileContent) // 把內容寫入到文件 StoreJoyToFile(i, fileTitle, fileContent) // 寫內容,寫num page <- i }
使用 Get 方法,獲取網頁 body 的全部內容。
// HttpGet issues an HTTP GET request for url and returns the complete
// response body as a string.
//
// A non-nil error is returned when the request itself fails or when reading
// the body fails part-way through; in both cases result is empty, so callers
// never receive a silently truncated page. The HTTP status code is not
// inspected.
func HttpGet(url string) (result string, err error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// io.Copy reads until io.EOF, tolerates reads that return (0, nil), and
	// reports any other read error. The builder avoids re-copying the
	// accumulated text on every chunk.
	var body strings.Builder
	if _, err = io.Copy(&body, resp.Body); err != nil {
		return "", err
	}
	return body.String(), nil
}
//開始爬取每個笑話 func SpiderOneJoy(url string) (title, content string, err error) { // 開始爬取頁面內容 result, err1 := HttpGet(url) if err1 != nil { err = err1 return } //取關鍵信息 // 取做者 // <div class="one-cont-title clearfix"> // <div class="one-cont-time"><span></span></div> // <div class="one-cont-font clearfix"> // <a href="/user/1761"> // <em> // <img id="imgIco" data-default="portrait" class="lazy js_img" alt="" src="https://img.xiaohua.com/User/0/1/1761.jpg" /> // </em> // <i> // 標題 // </i> re1 := regexp.MustCompile(`<div class="one-cont-title clearfix">(?s:(.*?))</i>`) if re1 == nil { err = fmt.Errorf("%s", "regexp.MustCompile re1 err") return } tmpTitle := re1.FindAllStringSubmatch(result, -1) for _, data := range tmpTitle { title = data[1] title = strings.Replace(title, " ", "", -1) //特地給末尾加個 a,用於二次取標題 title = title + "a" re1Tmp := regexp.MustCompile(`<i>(?s:(.*?))a`) if re1Tmp == nil { err = fmt.Errorf("%s", "regexp.MustCompile re1Tmp err") } tmpTitle2 := re1Tmp.FindAllStringSubmatch(title, -1) for _, dataTmp := range tmpTitle2 { title = dataTmp[1] title = strings.Replace(title, "\r\n", "", -1) // fmt.Printf("tmpTitle = #%v#\n", title) break } break } // 取內容 <p class="fonts">內容</p> re2 := regexp.MustCompile(`<p class="fonts">(?s:(.*?))</p>`) tmpContent := re2.FindAllStringSubmatch(result, -1) for _, data := range tmpContent { content = data[1] content = strings.Replace(content, "<br>", "", -1) // fmt.Printf("tmpContent = #%v#\n", data[1]) break } return }
// StoreJoyToFile writes the jokes collected from listing page i to the file
// "<i>.txt" in the current working directory, creating or truncating it.
//
// fileTitle[k] and fileContent[k] describe the same joke. Each joke is
// written as a numbered title line, a content line, and a separator line. If
// the two slices differ in length, only the pairs present in both are
// written. Failures to create, write or close the file are printed and end
// the write early; nothing is returned to the caller.
func StoreJoyToFile(i int, fileTitle, fileContent []string) {
	fileName := strconv.Itoa(i) + ".txt"
	f, err := os.Create(fileName)
	if err != nil {
		fmt.Println("os.Create err = ", err)
		return
	}
	// Close last, and report a failed close since it can mean lost data.
	defer func() {
		if cerr := f.Close(); cerr != nil {
			fmt.Println("f.Close err = ", cerr)
		}
	}()

	// Guard against mismatched slices so indexing can never go out of range.
	n := len(fileTitle)
	if len(fileContent) < n {
		n = len(fileContent)
	}

	for idx := 0; idx < n; idx++ {
		entry := strconv.Itoa(idx+1) + ")" + fileTitle[idx] + "\n" +
			fileContent[idx] + "\n" +
			"\n===========================================================\n"
		if _, err := f.WriteString(entry); err != nil {
			fmt.Println("f.WriteString err = ", err)
			return
		}
	}
}
以上。