Using pyspider to crawl Tourism Management: the title, authors, abstract, keywords, and any other information you want from every article

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Created on 2014-10-31 13:05:52

import re
from pyspider.libs.base_handler import *

class Handler(BaseHandler):
    '''
    Crawl every Tourism Management article on ScienceDirect and extract
    the title, authors, abstract, and keywords of each one.
    '''
    crawl_config = {
        "headers": {
            "User-Agent": "BaiDu_Spider",
        },
        "timeout":300,
        "connect_timeout":100
    }
    
    def on_start(self):
        # Entry point: start from the journal's issue/volume listing page.
        self.crawl('http://www.sciencedirect.com/science/journal/02615177',
                   timeout=300, connect_timeout=100, age=0, callback=self.index_page)

    @config(fetch_type="js")
    def index_page(self, response):
        for each in response.doc('a').items():
            url=each.attr.href
            #print(url)
            if url!=None:
                if re.match('http://www.sciencedirect.com/science/article/pii/\w+$', url):
                    self.crawl(url,callback=self.detail_page,timeout=300,connect_timeout=100)
        self.crawl(response.doc('#volumeIssueData ul.navigation li a.ActionButton').attr.href, callback=self.index_page,timeout=300,connect_timeout=100)     
   
    
    @config(fetch_type="js")
    def detail_page(self, response):
       #self.index_page(response)      
 

        return {
                "url": response.url,
                "title": response.doc('h1.article-title span').text(),
                "authors": [x.text() for x in response.doc('.author.size-m.workspace-trigger span.content span.text').items()],
                "abstract": response.doc('.Abstracts div div p').text(),
                "keywords": [x.text() for x in response.doc('.keyword span').items()],          #這裏能夠根據你想獲得的信息本身根據 csspath添加
                }
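
Since response.doc in pyspider is a PyQuery object, the CSS paths used in detail_page can be checked offline before running the spider. Below is a minimal sketch, assuming a local copy of an article page saved as article.html (a hypothetical file name):

from pyquery import PyQuery as pq

# Hypothetical saved copy of a ScienceDirect article page, used only to test the selectors.
doc = pq(filename='article.html')

print(doc('h1.article-title span').text())
print([x.text() for x in doc('.author.size-m.workspace-trigger span.content span.text').items()])
print(doc('.Abstracts div div p').text())
print([x.text() for x in doc('.keyword span').items()])

If one of the print statements comes back empty, adjust the corresponding CSS path before adding it to the return dictionary.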