#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2016-05-19 00:21:31
# Project: v2ex

from pyspider.libs.base_handler import *
#import re


class Handler(BaseHandler):
    """Crawl V2EX topics.

    The crawl walks front page -> tab pages -> node ("/go/") pages ->
    topic ("/t/") pages, and emits one result dict per topic containing
    its URL, title, body HTML and replies.

    Every request passes ``validate_cert=False``, i.e. the TLS certificate
    of the site is not verified.
    """

    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the V2EX front page."""
        self.crawl('https://www.v2ex.com/', callback=self.index_page,
                   validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Follow every ``?tab=`` link found on the front page."""
        for each in response.doc('a[href^="https://www.v2ex.com/?tab="]').items():
            self.crawl(each.attr.href, callback=self.tab_page,
                       validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def tab_page(self, response):
        """Follow every node (``/go/``) link found on a tab page."""
        for each in response.doc('a[href^="https://www.v2ex.com/go/"]').items():
            self.crawl(each.attr.href, callback=self.board_page,
                       validate_cert=False)

    @config(priority=2)
    def board_page(self, response):
        """Queue the topics listed on a node page, then its other pages."""
        for each in response.doc('a[href^="https://www.v2ex.com/t/"]').items():
            url = each.attr.href
            # Topic links may carry a "#replyN" fragment; drop it so the
            # topic is requested under its plain URL.
            if '#reply' in url:
                url = url.partition('#')[0]
            self.crawl(url, callback=self.detail_page, validate_cert=False)

        # Automatic pagination: revisit this handler for each page link.
        for each in response.doc('a.page_normal').items():
            self.crawl(each.attr.href, callback=self.board_page,
                       validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        """Extract one topic.

        Returns:
            dict with keys ``url``, ``title``, ``content`` (topic body HTML
            with double quotes backslash-escaped, or ``''`` when the topic
            has no body) and ``reply_content`` (list of
            ``(member_text, reply_text)`` tuples).
        """
        title = response.doc('h1').text()

        # PyQuery's .html() returns None when nothing matches (e.g. a
        # title-only topic); fall back to an empty string instead of
        # raising AttributeError on .replace().
        content_html = response.doc('div.topic_content').html()
        if content_html is None:
            content = ''
        else:
            content = content_html.replace('"', '\\"')

        # Pair member links with reply bodies positionally.
        # NOTE(review): pyspider documents that response.doc makes links
        # absolute, so the relative '/member/' prefix may never match, and
        # positional pairing assumes one member link per reply -- verify
        # against a live page.
        reply_content = [
            (member.text(), reply.text())
            for member, reply in zip(
                response.doc('a[href^="/member/"]').items(),
                response.doc('div.reply_content').items(),
            )
        ]

        # self.add_question(title, content)  # insert into the database
        return {
            "url": response.url,
            "title": title,
            "content": content,
            "reply_content": reply_content,
        }
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-01-04 10:42:01
# Project: tutorial_douban_movie

import re
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """
    This is a sample script for:
    pyspider crawler tutorial (1): HTML and CSS selectors
    http://blog.binux.me/2015/01/pyspider-tutorial-level-1-html-and-css-selector/
    """

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the Douban movie tag index."""
        self.crawl('http://movie.douban.com/tag/', callback=self.index_page)

    @config(age=24 * 60 * 60)
    def index_page(self, response):
        """Follow every absolute link whose URL contains 'tag'."""
        for link in response.doc('a[href^="http"]').items():
            href = link.attr.href
            if 'tag' not in href:
                continue
            self.crawl(href, callback=self.list_page)

    @config(age=10*24*60*60, priority=2)
    def list_page(self, response):
        """Queue each movie on a tag listing page, then the pager links."""
        article = 'HTML>BODY>DIV#wrapper>DIV#content>DIV.grid-16-8.clearfix>DIV.article'
        movie_links = response.doc(article + '>DIV>TABLE TR.item>TD>DIV.pl2>A')
        for movie in movie_links.items():
            self.crawl(movie.attr.href, callback=self.detail_page, priority=9)

        # Pagination
        pager_links = response.doc(article + '>DIV.paginator>A')
        for page in pager_links.items():
            self.crawl(page.attr.href, callback=self.list_page)

    @config(priority=3)
    def detail_page(self, response):
        """Return the URL, title, rating and director names of one movie."""
        doc = response.doc
        title = doc('HTML>BODY>DIV#wrapper>DIV#content>H1>SPAN').text()
        rating = doc('#interest_sectl > div.rating_wrap.clearbox > div.rating_self.clearfix > strong').text()

        directors = []
        for director in doc('a[rel="v:directedBy"]').items():
            directors.append(director.text())

        return {
            "url": response.url,
            "title": title,
            "rating": rating,
            "導演": directors,
        }