Honestly this isn't technical enough to deserve a blog post, but it's been so long since I last blogged that I've nearly forgotten Markdown syntax (embarrassing). I'll write up the infosec-competition projects I worked on earlier in a future post.
The code is fairly short, so I'll just walk through it with inline comments:
```python
import requests
import re
import os
import threading


class Caoliu:
    def __init__(self):
        '''
        Define one shared request header up front so it doesn't have to be
        repeated later, and create the folder that will hold the torrents.
        '''
        self.header_data = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': '',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'www.t66y.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        }
        if "torrent_dir" not in os.listdir(os.getcwd()):
            os.makedirs("torrent_dir")

    def download_page(self, url):
        '''
        Handle the third-level page (the rmdown download page) and
        fetch the torrent file behind the link.
        '''
        header_data2 = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'rmdown.com',
            'Referer': 'http://www.viidii.info/?http://rmdown______com/link______php?' + url.split("?")[1],
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
        }
        try:
            download_text = requests.get(url, headers=header_data2).text
            # Clicking "download" submits a form whose parameters come from
            # hidden inputs on the page, so extract those values first.
            p_ref = re.compile("name=\"ref\" value=\"(.+?)\"")
            p_reff = re.compile("NAME=\"reff\" value=\"(.+?)\"")
            ref = p_ref.findall(download_text)[0]
            reff = p_reff.findall(download_text)[0]
            r = requests.get("http://www.rmdown.com/download.php?ref=" + ref + "&reff=" + reff + "&submit=download")
            # Save the torrent into the torrent_dir folder.
            with open(os.path.join("torrent_dir", ref + ".torrent"), "wb") as f:
                f.write(r.content)
        except:
            print("download page " + url + " failed")

    def index_page(self, fid=2, offset=1):
        '''
        Handle the first-level page (the thread-listing page).
        '''
        p = re.compile("<h3><a href=\"(.+?)\"")
        try:
            tmp_url = "http://www.t66y.com/thread0806.php?fid=" + str(fid) + "&search=&page=" + str(offset)
            r = requests.get(tmp_url)
            for i in p.findall(r.text):
                self.detail_page(i)
        except:
            print("index page " + str(offset) + " get failed")

    def detail_page(self, url):
        '''
        Handle a single thread and pull out the green rmdown links in it
        (the posts that only give netdisk links are just being mean).
        '''
        p1 = re.compile("(http://rmdown.com/link.php.+?)<")
        p2 = re.compile("(http://www.rmdown.com/link.php.+?)<")
        base_url = "http://www.t66y.com/"
        try:
            r = requests.get(url=base_url + url, headers=self.header_data)
            url_set = set()  # deduplicate links that appear more than once
            for i in p1.findall(r.text):
                url_set.add(i)
            for i in p2.findall(r.text):
                url_set.add(i)
            url_list = list(url_set)
            for i in url_list:
                self.download_page(i)
        except:
            print("detail page " + url + " get failed")

    def start(self, type, page_start=1, page_end=10, max_thread_num=10):
        '''
        Entry point. The `type` parameter selects the board:

        board                      | type
        -------------------------- | -----------------
        Asian, uncensored          | yazhouwuma
        Asian, censored            | yazhouyouma
        Western originals          | oumeiyuanchuang
        Anime originals            | dongmanyuanchuang
        Domestic originals         | guochanyuanchuang
        Chinese-subtitle originals | zhongziyuanchuang

        page_start and page_end are the first and last listing pages to crawl.
        max_thread_num is the maximum number of threads the program may use.
        '''
        if type == "yazhouwuma":
            fid = 2
        elif type == "yazhouyouma":
            fid = 15
        elif type == "oumeiyuanchuang":
            fid = 4
        elif type == "dongmanyuanchuang":
            fid = 5
        elif type == "guochanyuanchuang":
            fid = 25
        elif type == "zhongziyuanchuang":
            fid = 26
        else:
            raise ValueError("type wrong!")
        # Never claim more threads than there are pages to crawl.
        max_thread_num = min(page_end - page_start + 1, max_thread_num)
        thread_list = []
        for i in range(page_start, page_end + 1):
            thread_list.append(threading.Thread(target=self.index_page, args=(fid, i,)))
        for t in thread_list:
            t.start()
        # Busy-wait until the number of live threads (main thread included)
        # drops below max_thread_num, then return.
        while True:
            if len(threading.enumerate()) < max_thread_num:
                break


if __name__ == "__main__":
    c = Caoliu()
    c.start(type="dongmanyuanchuang", page_start=1, page_end=20, max_thread_num=50)
```
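One thing worth noting about `start`: it launches one thread per listing page up front, and the closing loop merely waits until the number of live threads reported by `threading.enumerate()` drops below `max_thread_num`, so the parameter is not a hard concurrency limit. If you want an actual bound, a thread pool is simpler. Below is a minimal sketch (my own variation, not part of the project; the function name `crawl_with_pool` is made up) that drives the same `index_page` method through `concurrent.futures`:

```python
# Sketch only: bound concurrency with a thread pool instead of one thread per page.
from concurrent.futures import ThreadPoolExecutor

def crawl_with_pool(spider, fid, page_start, page_end, max_thread_num=10):
    # At most max_thread_num index_page calls run at the same time.
    with ThreadPoolExecutor(max_workers=max_thread_num) as pool:
        for page in range(page_start, page_end + 1):
            pool.submit(spider.index_page, fid, page)
    # Leaving the with-block waits for every submitted page to finish.

# Example: crawl_with_pool(Caoliu(), fid=5, page_start=1, page_end=20)
```

The upside over the original busy-wait is that the call would not return while pages are still being crawled.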
The downloaded torrents end up in the torrent_dir folder:
For detailed usage, the README on GitHub is more thorough.
The project's GitHub repo: https://github.com/chuxiuhong/spider1024