Day 6 of Crawler Learning

Crawling a large amount of data from 我愛競賽網

First, grab the category link for each type of competition:

import requests
from bs4 import BeautifulSoup

def get_type_url(url):
    # Fetch the index page and walk every category link in the nav menu
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    types = soup.select("#mn_P1_menu li a")
    for type_link in types:
        print(type_link.get_text())
        get_num(type_link.get("href"))

Then, get the total page count from each category link:

def get_num(url):
    # Work out how many pages a category has, then crawl each one
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    num = soup.select(".pg span")
    # Some categories have no pagination bar (a single page), so branch on that
    if num:
        # The label's third space-separated token is the total page count
        total_pages = int(num[0].get_text().split(" ")[2])
        for w in range(1, total_pages + 1):
            print("Page " + str(w))
            urls = url + "index.php?page={}".format(w)
            get_message_url(urls)
    else:
        get_message_url(url)
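
The `split(" ")[2]` parse is tied to the exact text of the pagination label, so it is worth sanity-checking. Assuming a Discuz-style label such as " / 34 頁" (an assumption about the site's markup, not something confirmed above), the tokens work out like this:

label = " / 34 頁"                  # hypothetical pagination label text
print(label.split(" "))             # ['', '/', '34', '頁']
print(int(label.split(" ")[2]))     # 34, the total page count used by get_num()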

Finally, get each competition's information from every page:

def get_message_url(url):
    # Scrape the title, view count, post date, and link of each contest entry
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    titles = soup.select(".xld .xs2_tit a")
    views = soup.select("span.chakan")
    post_times = soup.select("div.list_info")
    for title, view, post_time in zip(titles, views, post_times):
        data = {
            "title": title.get_text(),
            "views": view.get_text().strip(),
            "post_time": post_time.get_text().strip().split(" ")[0],
            "link": title.get("href")
        }
        print(data)
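
Putting the three steps together, a minimal entry point could look like the sketch below. The start URL is an assumption about 我愛競賽網's address rather than something stated above, so substitute the page you actually want to crawl:

if __name__ == "__main__":
    # Assumed base URL for 我愛競賽網; replace with the real index page
    get_type_url("http://www.52jingsai.com/")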