Whether the object that `yield` produces is a Request object or an Item object determines how Scrapy handles it: a Request is scheduled for further crawling, while an Item is handed to the item pipelines.
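As a minimal sketch of both cases (the spider name, start URL, and the page-link selector here are hypothetical; only `get_info` corresponds to the callback shown later), a callback can yield Requests while another yields items:

```python
import scrapy

class ProductSpider(scrapy.Spider):
    # hypothetical spider used only to illustrate the two kinds of yield
    name = 'product'
    start_urls = ['https://example.com/list']

    def parse(self, response):
        # yielding a Request schedules another page; its callback will run on the response
        for href in response.css('.page a::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.get_info)

    def get_info(self, response):
        # yielding items here sends them to the item pipelines (full version below)
        ...
```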
Define the item class in the items.py file:

```python
import scrapy

class MyscrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    price = scrapy.Field()
    prostatus = scrapy.Field()
```
In the spider, instantiate an item, fill in its fields with the extracted values, and yield it:

```python
from myscrapy.items import MyscrapyItem

def get_info(self, response):
    elements_list = response.css('.product')
    for element in elements_list:
        title = element.css('.productTitle a::attr(title)').extract_first()    # CSS selectors
        price = element.css('.productPrice em::attr(title)').extract_first()
        prostatus = element.css('.productStatus em::text').extract_first()
        item = MyscrapyItem()            # instantiate an item object
        item['title'] = title            # fill in the configured fields
        item['price'] = price
        item['prostatus'] = prostatus
        yield item
```
Enable the pipeline in the settings file:

```python
ITEM_PIPELINES = {
    'myscrapy.pipelines.MyscrapyPipeline': 300,      # smaller number = higher priority
    # 'myscrapy.pipelines.MyscrapyPipeline1': 500,
}
# Same idea as the middleware settings
```
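The pipeline below reads its connection parameters from the settings file via `crawler.settings.get()`. A sketch of the corresponding entries (the values here are placeholders; only the names must match what the pipeline looks up):

```python
# settings.py -- names must match the keys read in from_crawler
HOST = '127.0.0.1'     # placeholder MongoDB host
PORT = 27017
USER = 'root'          # placeholder credentials
PWD = '123456'
DB = 'myscrapy'        # placeholder database name
TABLE = 'products'     # placeholder collection name
```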
Then write the pipeline itself (in pipelines.py):

```python
# Two of its methods are very commonly used:
# def open_spider(self, spider): runs when the spider starts; usually used to connect to the database
# def close_spider(self, spider): runs when the spider finishes; usually used to close the database connection
# A simple MongoDB example:
from pymongo import MongoClient

class MyscrapyPipeline(object):
    def __init__(self, HOST, PORT, USER, PWD, DB, TABLE):
        self.HOST = HOST
        self.PORT = PORT
        self.USER = USER
        self.PWD = PWD
        self.DB = DB
        self.TABLE = TABLE

    # called before __init__; Scrapy uses it to build the pipeline instance
    @classmethod
    def from_crawler(cls, crawler):
        HOST = crawler.settings.get('HOST')      # crawler.settings exposes every name in the settings file
        PORT = crawler.settings.get('PORT')
        USER = crawler.settings.get('USER')
        PWD = crawler.settings.get('PWD')
        DB = crawler.settings.get('DB')
        TABLE = crawler.settings.get('TABLE')
        return cls(HOST, PORT, USER, PWD, DB, TABLE)

    def open_spider(self, spider):
        self.client = MongoClient(host=self.HOST, port=self.PORT,
                                  username=self.USER, password=self.PWD)
        print('Connected to the database')

    def close_spider(self, spider):
        self.client.close()
        print('Database connection closed')

    def process_item(self, item, spider):
        self.client[self.DB][self.TABLE].insert_one(dict(item))
        return item
```
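If the commented-out `MyscrapyPipeline1` were enabled with priority 500, it would receive each item after the MongoDB pipeline returns it, because 500 > 300. A minimal sketch of such a second pipeline, assuming it simply appends every item to a JSON-lines file (this class is hypothetical, not part of the original project):

```python
import json

class MyscrapyPipeline1(object):
    # hypothetical second pipeline: runs after MyscrapyPipeline in the chain
    def open_spider(self, spider):
        self.file = open('items.jl', 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        # items arrive here only because the previous pipeline returned them
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item
```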