示例一php
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Created on 2014-10-25 14:31:24

import base64
import json
import re

from libs.pprint import pprint
from libs.base_handler import *


class Handler(BaseHandler):
    """Sample handler: crawl a Douban group and forward image posts to Duoshuo.

    Flow: ``on_start`` fetches the group's discussion list, ``index_page``
    follows every topic link, ``detail_page`` extracts the topic fields, and
    ``on_result`` re-posts every result that contains images to the Duoshuo
    import API.
    """

    crawl_config = {
    }
    # String prepended to every Douban URL before crawling; empty means the
    # URLs are fetched as-is.
    proxy = ""

    @every(0, 30)
    def on_start(self):
        """Queue the group's discussion list (re-fetched on every run).

        NOTE(review): ``@every(0, 30)`` is presumably (minutes=0, seconds=30)
        per pyspider's ``every`` signature -- confirm against the installed
        version.
        """
        self.crawl(
            self.proxy + 'http://www.douban.com/group/haixiuzu/discussion',
            force_update=True,
            callback=self.index_page,
        )

    @config(age=10)
    def index_page(self, response):
        """Queue a ``detail_page`` crawl for every topic link in the list."""
        for each in response.doc('tr > .title > a').items():
            self.crawl(self.proxy + each.attr.href, callback=self.detail_page)

    @config(age=30*24*60*60)
    def detail_page(self, response):
        """Extract url, title, author, author link and image URLs of a topic.

        The assertion fails the task when the fetch ended up on the Douban
        home page instead of the topic itself.
        """
        assert response.url != "https://www.douban.com/"
        return {
            "url": response.url,
            "title": response.doc("#content h1").text(),
            "author": response.doc(".topic-content .from a").text(),
            "author_url": response.doc("DIV.topic-doc>H3>SPAN.from>A").attr.href,
            "imgs": [x.attr.src for x in response.doc('.topic-doc img').items()],
        }

    def on_result(self, result):
        """Post a topic result that contains images to the Duoshuo import API.

        Results that are empty or carry no images are ignored.

        Raises:
            ValueError: if the current response URL contains no ``topic/<id>``
                segment, so no post key can be derived.
        """
        if not result or not result['imgs']:
            return

        # Raw string: "\d" in a plain literal is an invalid escape sequence.
        match = re.search(r"topic/(\d+)", self.response.url)
        if match is None:
            raise ValueError(
                "no topic id found in url: %r" % (self.response.url,))
        post_id = match.group(1)

        # ``str.encode("base64")`` exists on Python 2 only. b64encode yields
        # the same payload without line breaks on both Python 2 and 3.
        message = base64.b64encode(
            json.dumps(result).encode("utf-8")).decode("ascii")

        # NOTE(review): the API secret below is hard-coded in source; consider
        # loading it from configuration instead of committing it.
        self.crawl("https://api.duoshuo.com/posts/import.json#" + post_id,
                   method="POST",
                   data={
                       "short_name": "database",
                       "secret": "8e5a5be8873ad7e9a59147c3cfd10e73",
                       "posts[0][post_key]": post_id,
                       "posts[0][thread_key]": "haixiuzu",
                       "posts[0][message]": message,
                   },
                   callback=self.post_to_duoshuo)

    def post_to_duoshuo(self):
        """Callback for the Duoshuo import request; the reply is ignored."""
        pass
示例二html
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2016-08-30 19:11:28
# Project: prieto

import re

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """Crawl the printable view of forum threads by enumerating thread ids.

    ``on_start`` queues ``STEP_COUNT`` placeholder ``data:`` tasks; each one
    expands in ``gen_url`` into ``THREADS_PER_STEP`` consecutive thread ids,
    whose printable pages are handled by ``index_page``.
    """

    crawl_config = {
    }

    # Number of placeholder tasks queued by on_start.
    STEP_COUNT = 10000
    # Number of consecutive thread ids generated per placeholder task.
    THREADS_PER_STEP = 700

    @every(minutes=24 * 60)
    def on_start(self):
        """Queue one ``data:`` task per step, carrying the step index in ``save``."""
        for i in range(self.STEP_COUNT):
            self.crawl('data:,step%d' % i, callback=self.gen_url, save=i)

    @config(priority=0)
    def gen_url(self, respond):
        """Queue the printable-thread URLs for the id range of this step.

        ``respond.save`` is the step index passed by ``on_start``; step ``s``
        covers ids ``s * THREADS_PER_STEP`` up to, but excluding,
        ``(s + 1) * THREADS_PER_STEP``.
        """
        first_tid = respond.save * self.THREADS_PER_STEP
        for i in range(first_tid, first_tid + self.THREADS_PER_STEP):
            self.crawl("http://bbs.fobshanghai.com/viewthread.php?action=printable&tid=%d" % i,
                       callback=self.index_page)

    @config(priority=1)
    def index_page(self, respond):
        """Return the thread id, URL and HTML of one printable thread page.

        A page whose ``<head>`` content starts with ``<meta`` is treated as
        the "thread does not exist" page and gets a fixed placeholder instead
        of its HTML.
        """
        # NOTE: splitting the page into individual posts is disabled; the
        # whole document HTML is stored. The disabled code cut the body at
        #   <br/><br/><br/><br/><hr noshade="noshade" size="2" width="100%" color="#698cc3"/>
        # and split the part before it into posts on
        #   <hr noshade="noshade" size="2" width="100%" color="#808080"/>
        # taking the thread author from the first post (a regular expression
        # would be a better fit for that) and the rest as replies.
        tid = respond.url.split('=')[-1]

        # html() returns None when no <head> element was matched; fall back
        # to an empty string so the prefix test cannot raise AttributeError.
        head_html = respond.doc('head').html() or ''
        if head_html.startswith('<meta'):
            return {
                "tid": tid,
                "url": respond.url,
                "html": 'The specified thread does not exist.',
            }

        return {
            "tid": tid,
            "url": respond.url,
            "html": respond.doc.html(),
        }