#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-03-22 22:06:55
# Project: pdf_spider
"""pyspider project that walks club.topsage.com and reports attachment links."""

import re

# pyspider convention: the wildcard import supplies BaseHandler, every and
# config, which are used below.
from pyspider.libs.base_handler import *

# Link classifiers.  They are used with ``.match`` and have no trailing ``$``,
# so they only anchor at the start of the URL.
_FORUM_URL = re.compile(r"http://club\.topsage\.com/forum-.+\.html", re.U)
_THREAD_URL = re.compile(r"http://club\.topsage\.com/thread-.+\.html", re.U)

# A link counts as an attachment when it ends in a known file extension or
# points at the forum's attachment endpoint.
_ATTACHMENT_FILE_URL = re.compile(
    r"^(http|ftp|https)://.+\.(zip|rar|tar|pdf|doc|docx|excel|ppt|pptx)$",
    re.U,
)
_ATTACHMENT_ENDPOINT_URL = re.compile(
    r"^http://club\.topsage\.com/forum\.php\?mod=attachment.+",
    re.U,
)

# Cookies sent with every request.  The name ``Cookie`` stays a module-level
# global, exactly as the former ``global Cookie`` statement made it.
# NOTE(review): this looks like a captured browser session, including an auth
# token; consider loading it from configuration instead of source control.
Cookie = {
    "tsclub_bb90_saltkey": "xozcC32l",
    "tsclub_bb90_lastvisit": "1428457605",
    "tsclub_bb90_visitedfid": "326",
    "tsclub_bb90_ulastactivity": "1428579196%7C0",
    "tsclub_bb90_auth": "f9f8KcrDaj3q9aY9OxESFgE2Cz%2BArVk0gZ5jv%2BQohyhctLjeopEZrXU%2FEbsF6pk%2B754%2Fsi5DnB0W%2BmsmLwMvtC3xkWLt",
    "tsclub_bb90_lastcheckfeed": "5470207%7C1428579196",
    "tsclub_bb90_lip": "122.13.84.73%2C1428579196",
    "tsclub_bb90_nofavfid": "1",
    "pgv_pvi": "8694210858",
    "pgv_info": "ssi=s5025153920",
    "Hm_lvt_ee0d63d2db0dfbf9e0d399bccbd5fce7": "1428461128,1428578830",
    "Hm_lpvt_ee0d63d2db0dfbf9e0d399bccbd5fce7": "1428581442",
    "tsclub_bb90_lastact": "1428581519%09misc.php%09patch",
    "tjpctrl": "1428583242081",
}


class Handler(BaseHandler):
    """Crawl club.topsage.com forums and threads looking for attachments.

    Flow: ``on_start`` fetches the home page, ``index_page`` and
    ``forum_page`` follow forum and thread links, and ``detail_page`` returns
    the first attachment link found on a thread page.
    """

    # Browser-like request headers sent with every fetch.
    # NOTE(review): Accept-Encoding and Accept-Language separate their values
    # with ';' where HTTP lists use ','.  They are left untouched because
    # correcting them could change which encoding the server answers with.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip;deflate;sdch",
        "Accept-Language": "zh-CN,zh;en-US;q=0.8",
        "Cache-Control": "no-cache",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.4368.102 Safari/537.36",
        "Host": "club.topsage.com",
        "Pragma": "no-cache",
        # Was spelled "Refer", which is not an HTTP header name.
        "Referer": "http://club.topsage.com",
        "Connection": "keep-alive",
    }

    # Defaults applied to every self.crawl() call of this project.
    crawl_config = {
        "headers": headers,
        "timeout": 1000,
        "cookies": Cookie,
    }

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the forum home page."""
        self.crawl('http://club.topsage.com/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Follow every forum and thread link found on the home page."""
        for each in response.doc('a[href^="http"]').items():
            self._follow_forum_or_thread(each.attr.href)

    @config(age=10 * 24 * 60 * 60, priority=2)
    def forum_page(self, response):
        """Follow thread, sub-forum and pagination links of a forum page."""
        for each in response.doc('a[href^="http://club.topsage.com"]').items():
            self._follow_forum_or_thread(each.attr.href)

        # Next-page links.  The selector does not require an href attribute,
        # so anchors without one are skipped instead of being handed to
        # self.crawl() as None.
        for each in response.doc('html > body > div > div > div > div > a').items():
            href = each.attr.href
            if href:
                self.crawl(href, callback=self.forum_page)

    @config(priority=2)
    def detail_page(self, response):
        """Return the first attachment link on a thread page.

        Returns a dict with ``download_url`` and ``file_name`` for the first
        matching anchor, or ``None`` when the page has no attachment link.
        """
        print('detail_page >> response url is ' + response.url)
        for each in response.doc('table tr > td > a').items():
            href = each.attr.href
            if self.is_url_matched(href):
                print('attachment url is ' + href)
                return {
                    "download_url": href,
                    "file_name": each.text(),
                }
        return None

    def is_url_matched(self, url):
        """Tell whether ``url`` looks like a downloadable attachment.

        A missing or empty ``url`` (an anchor without ``href``) is reported
        as ``False`` rather than raising ``TypeError`` inside the regex.
        """
        if not url:
            return False
        return bool(
            _ATTACHMENT_FILE_URL.match(url)
            or _ATTACHMENT_ENDPOINT_URL.match(url)
        )

    def _follow_forum_or_thread(self, href):
        """Schedule ``href`` with the callback matching its URL shape."""
        if not href:
            return
        if _FORUM_URL.match(href):
            self.crawl(href, callback=self.forum_page)
        elif _THREAD_URL.match(href):
            self.crawl(href, callback=self.detail_page)