#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Created on 2014-10-31 13:05:52

import re

from libs.base_handler import *

# Compiled once at import time instead of on every link in index_page.
# Matches ScienceDirect article URLs, e.g. .../science/article/pii/S0261517714001234
_ARTICLE_URL_RE = re.compile(r'http://www.sciencedirect.com/science/article/pii/\w+$')


class Handler(BaseHandler):
    """pyspider handler that crawls a ScienceDirect journal.

    Starting from the journal's issue index, it follows article links
    (detail_page) and the "next volume/issue" navigation button
    (index_page again), extracting title/authors/abstract/keywords from
    each article page.
    """

    crawl_config = {
        "headers": {
            "User-Agent": "BaiDu_Spider",
        },
        "timeout": 300,
        "connect_timeout": 100,
    }

    def on_start(self):
        """Seed the crawl with the journal's issue-index page."""
        self.crawl('http://www.sciencedirect.com/science/journal/02615177',
                   timeout=300, connect_timeout=100, age=0,
                   callback=self.index_page)

    @config(fetch_type="js")
    def index_page(self, response):
        """Queue every article link on the page, then follow pagination.

        Only hrefs matching the ScienceDirect article-URL pattern are
        treated as articles; the volume/issue navigation button (if
        present) re-enters index_page for the next issue.
        """
        for each in response.doc('a').items():
            url = each.attr.href
            # each.attr.href is None for anchors without an href.
            if url is not None and _ARTICLE_URL_RE.match(url):
                self.crawl(url, callback=self.detail_page,
                           timeout=300, connect_timeout=100)
        next_url = response.doc(
            '#volumeIssueData ul.navigation li a.ActionButton').attr.href
        # Guard: on the last issue the navigation button is absent and
        # .attr.href returns None — don't submit a None URL to the queue.
        if next_url is not None:
            self.crawl(next_url, callback=self.index_page,
                       timeout=300, connect_timeout=100)

    @config(fetch_type="js")
    def detail_page(self, response):
        """Extract the article's metadata from its detail page.

        Returns a dict with url, title, authors (list), abstract, and
        keywords (list). Add more fields here by extending the dict with
        additional CSS-path selectors.
        """
        return {
            "url": response.url,
            "title": response.doc('h1.article-title span').text(),
            "authors": [x.text() for x in response.doc(
                '.author.size-m.workspace-trigger span.content span.text').items()],
            "abstract": response.doc('.Abstracts div div p').text(),
            "keywords": [x.text() for x in response.doc('.keyword span').items()],
        }