關於scrapy中如何區分是接着發起請求還是開始保存文件

一.區分

根據yield生成器產生的對象是request對象還是item對象來區分。

二.item

1.配置item對象

在items.py文件中設置類。

class MyscrapyItem(scrapy.Item):
    """Item holding one scraped product: its title, price and promotion status."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    price = scrapy.Field()
    prostatus = scrapy.Field()

2.在爬蟲程序中導入該類並編寫相應的函數

from myscrapy.items import MyscrapyItem
def get_info(self, response):
    """Parse a product-listing page: build and yield one MyscrapyItem per product.

    Yielding an item (instead of a Request) tells Scrapy to hand it to the
    item pipelines rather than schedule another download.
    """
    for product in response.css('.product'):
        item = MyscrapyItem()  # instantiate one item per product card
        # CSS selectors; extract_first() returns None when nothing matches.
        item['title'] = product.css('.productTitle a::attr(title)').extract_first()
        item['price'] = product.css('.productPrice em::attr(title)').extract_first()
        item['prostatus'] = product.css('.productStatus em::text').extract_first()
        yield item

三.在獲得item對象後,scrapy會自動執行pipelines.py文件中的內容

1.settings文件進行註冊

ITEM_PIPELINES = {
   'myscrapy.pipelines.MyscrapyPipeline': 300,   # lower value = higher priority
   # 'myscrapy.pipelines.MyscrapyPipeline1': 500,
}
# registered the same way as middlewares

2.配置MyscrapyPipeline方法

#其中有兩個方法非常常用
#def open_spider(self): 爬蟲開始運行時執行,通常用來連接數據庫
#def close_spider(self): 爬蟲運行結束時執行,通常用來關閉數據庫

#簡單拿MongoDB舉例
from pymongo import MongoClient

class MyscrapyPipeline(object):
    """Persist scraped items into a MongoDB collection.

    Connection parameters are read from the project settings via
    ``from_crawler`` and the client is opened/closed with the spider's
    lifecycle.
    """

    # Names of the settings this pipeline reads, in __init__ argument order.
    _SETTING_KEYS = ('HOST', 'PORT', 'USER', 'PWD', 'DB', 'TABLE')

    def __init__(self, HOST, PORT, USER, PWD, DB, TABLE):
        """Store the connection parameters; the client itself is created later."""
        self.HOST = HOST
        self.PORT = PORT
        self.USER = USER
        self.PWD = PWD
        self.DB = DB
        self.TABLE = TABLE

    @classmethod
    def from_crawler(cls, crawler):
        """Alternate constructor Scrapy calls before __init__.

        crawler.settings exposes every name defined in the settings file.
        """
        return cls(*(crawler.settings.get(key) for key in cls._SETTING_KEYS))

    def open_spider(self, spider):
        """Called once when the spider starts: open the MongoDB connection."""
        self.client = MongoClient(host=self.HOST, port=self.PORT,
                                  username=self.USER, password=self.PWD)
        print('鏈接數據庫成功')

    def close_spider(self, spider):
        """Called once when the spider finishes: release the connection."""
        self.client.close()
        print('關閉數據庫')

    def process_item(self, item, spider):
        """Insert one item into DB.TABLE, then pass it to the next pipeline."""
        self.client[self.DB][self.TABLE].insert_one(dict(item))
        return item
相關文章
相關標籤/搜索