100行實現草榴種子爬蟲

原本這個技術含量不足以寫進博客的,不過想一想很久不寫博客都快把markdown語法忘了(汗顏),以前做的信安比賽的項目將來會寫一篇總結。

代碼比較短,直接就着代碼加註釋講吧:

import requests
import re
import os
import threading

class Caoliu:
    """Crawler that saves torrent files linked from t66y.com forum threads.

    The crawl walks three page levels:

    1. ``index_page``  - a board listing page; yields thread URLs.
    2. ``detail_page`` - one thread; yields rmdown.com link-page URLs.
    3. ``download_page`` - one rmdown.com link page; reads the hidden form
       values and fetches the torrent file itself.

    ``start`` fans the board listing pages out over a bounded number of
    worker threads.
    """

    # Directory (relative to the current working directory) that receives
    # the downloaded .torrent files.
    TORRENT_DIR = "torrent_dir"

    # Seconds allowed for any single HTTP request, so that a stalled server
    # cannot block a worker thread forever.
    REQUEST_TIMEOUT = 30

    # Maps the ``type`` argument of ``start`` to the forum board id (``fid``).
    BOARD_IDS = {
        "yazhouwuma": 2,
        "yazhouyouma": 15,
        "oumeiyuanchuang": 4,
        "dongmanyuanchuang": 5,
        "guochanyuanchuang": 25,
        "zhongziyuanchuang": 26,
    }

    # Thread links on a board listing page.
    _THREAD_LINK_RE = re.compile(r'<h3><a href="(.+?)"')
    # rmdown.com link pages inside a thread, with or without the "www." host.
    _TORRENT_LINK_RE = re.compile(
        r"(http://(?:www\.)?rmdown\.com/link\.php.+?)<")
    # Hidden form fields that the download request must echo back.
    _REF_RE = re.compile(r'name="ref" value="(.+?)"')
    _REFF_RE = re.compile(r'NAME="reff" value="(.+?)"')

    def __init__(self):
        """Prepare the shared request headers and the output directory."""
        # Headers reused for every thread-page request.
        self.header_data = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': '',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'www.t66y.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        }
        # exist_ok avoids the check-then-create race of listing the
        # directory first.
        os.makedirs(self.TORRENT_DIR, exist_ok=True)

    def download_page(self, url):
        """Download the torrent behind one rmdown.com link page.

        Args:
            url: URL of the rmdown.com ``link.php`` page.

        Failures (network errors, missing form fields, HTTP error status,
        file errors) are reported on stdout and otherwise ignored, so one
        bad link never stops the crawl.
        """
        # Everything after the first "?"; empty when the URL has no query,
        # instead of raising before the error handling below is reached.
        query = url.partition("?")[2]
        link_headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'rmdown.com',
            'Referer': 'http://www.viidii.info/?http://rmdown______com/link______php?' + query,
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
        }
        try:
            page_html = requests.get(
                url, headers=link_headers, timeout=self.REQUEST_TIMEOUT).text
            # Clicking "download" submits a form whose parameters are the
            # values of hidden inputs on the page; extract them first.
            ref = self._REF_RE.findall(page_html)[0]
            reff = self._REFF_RE.findall(page_html)[0]
            # params= percent-encodes the values the way a browser form
            # submission would; raw concatenation corrupts "+", "&" and "=".
            response = requests.get(
                "http://www.rmdown.com/download.php",
                params={"ref": ref, "reff": reff, "submit": "download"},
                timeout=self.REQUEST_TIMEOUT,
            )
            # Do not save an HTTP error page as if it were a torrent.
            response.raise_for_status()
            # ``ref`` comes from the remote page: strip anything that could
            # act as a path separator before using it as a file name.
            safe_name = re.sub(r"[^\w.-]", "_", ref)
            torrent_path = os.path.join(self.TORRENT_DIR, safe_name + ".torrent")
            with open(torrent_path, "wb") as f:
                f.write(response.content)
        except Exception as exc:
            print("download page " + url + " failed: " + repr(exc))

    def index_page(self, fid=2, offset=1):
        """Crawl one board listing page and process every thread on it.

        Args:
            fid: Forum board id (see ``BOARD_IDS``).
            offset: 1-based page number within the board.

        A failure to fetch the listing page is reported on stdout and
        otherwise ignored.
        """
        try:
            listing_url = ("http://www.t66y.com/thread0806.php?fid=" + str(fid)
                           + "&search=&page=" + str(offset))
            listing = requests.get(listing_url, timeout=self.REQUEST_TIMEOUT)
            for thread_url in self._THREAD_LINK_RE.findall(listing.text):
                self.detail_page(thread_url)
        except Exception as exc:
            print("index page " + str(offset) + " get failed: " + repr(exc))

    def detail_page(self, url):
        """Fetch one thread and download every rmdown.com link found in it.

        Args:
            url: Thread URL relative to ``http://www.t66y.com/``.

        A failure to fetch the thread is reported on stdout and otherwise
        ignored.
        """
        base_url = "http://www.t66y.com/"
        try:
            thread = requests.get(
                url=base_url + url,
                headers=self.header_data,
                timeout=self.REQUEST_TIMEOUT,
            )
            # dict.fromkeys drops duplicate links while keeping page order.
            for link in dict.fromkeys(self._TORRENT_LINK_RE.findall(thread.text)):
                self.download_page(link)
        except Exception as exc:
            print("detail page " + url + " get failed: " + repr(exc))

    def start(self, type, page_start=1, page_end=10, max_thread_num=10):
        """Crawl a range of listing pages of one board using worker threads.

        Args:
            type: Board selector, one of:

                ==================  ========================
                type                board
                ==================  ========================
                yazhouwuma          Asian, uncensored
                yazhouyouma         Asian, censored
                oumeiyuanchuang     Western originals
                dongmanyuanchuang   Anime originals
                guochanyuanchuang   Domestic originals
                zhongziyuanchuang   Chinese-subtitled originals
                ==================  ========================

            page_start: First listing page to crawl (inclusive).
            page_end: Last listing page to crawl (inclusive).
            max_thread_num: Maximum number of worker threads running at the
                same time; values below 1 are treated as 1.

        Raises:
            ValueError: If ``type`` is not one of the selectors above.

        Returns only after every page has been processed.
        """
        try:
            fid = self.BOARD_IDS[type]
        except (KeyError, TypeError):
            raise ValueError("type wrong!") from None

        page_count = page_end - page_start + 1
        # Never more workers than pages, and never fewer than one slot so
        # that a zero or negative limit cannot stall the crawl.
        worker_limit = max(1, min(page_count, max_thread_num))
        slots = threading.BoundedSemaphore(worker_limit)

        def crawl(page):
            try:
                self.index_page(fid, page)
            finally:
                slots.release()

        workers = []
        for page in range(page_start, page_end + 1):
            # Blocks until a worker slot is free; no busy-waiting, and other
            # threads in the process do not affect the limit.
            slots.acquire()
            worker = threading.Thread(target=crawl, args=(page,))
            worker.start()
            workers.append(worker)
        for worker in workers:
            worker.join()
if __name__ == "__main__":
    # Script entry point: crawl listing pages 1-20 of the
    # "dongmanyuanchuang" board with a thread limit of 50.
    crawler = Caoliu()
    crawler.start(
        type="dongmanyuanchuang",
        page_start=1,
        page_end=20,
        max_thread_num=50,
    )

下載後如下:
[image]

具體的使用方法,github上的readme更詳盡
項目的github地址: https://github.com/chuxiuhong/spider1024

相關文章
相關標籤/搜索