首先在 settings 文件中進行配置:
# --- MongoDB connection settings ---
MONGODB_URI = 'mongodb://127.0.0.1:27017'
MONGODB_SERVER = "127.0.0.1"
MONGODB_PORT = 27017
MONGODB_DB = "abckg"              # database name
MONGODB_COLLECTION = "content"    # collection (table)

# --- Redis connection settings ---
REDIS_URL = 'redis://127.0.0.1:6379'
REDIS_SERVER = "127.0.0.1"
REDIS_PORT = 6379
REDIS_DB = 0                      # index of the Redis logical database to connect to
MY_REDIS = 'myspider:urls'        # the "urls" list under the "myspider" namespace in Redis
然後就可以在管道中使用了。
首先是 mongodb 數據庫:
import pymongo

import fu.settings as settings


class FuPipeline(object):
    """Scrapy item pipeline that stores every scraped item in MongoDB.

    Connection parameters (URI, port, database and collection names) come
    from the project settings module.
    """

    def __init__(self):
        # Authenticate via MongoClient keyword arguments instead of the
        # old db.authenticate() call, which was removed in PyMongo 4.
        # NOTE(review): credentials are hard-coded here — they belong in
        # the settings module (or the URI) rather than in source code.
        self.conn = pymongo.MongoClient(
            settings.MONGODB_URI,
            settings.MONGODB_PORT,
            username='aaa',
            password='12345',
            authSource='admin',
        )
        self.db = self.conn[settings.MONGODB_DB]              # select database
        self.MG_table = self.db[settings.MONGODB_COLLECTION]  # select collection

    def process_item(self, item, spider):
        """Insert the item as a single MongoDB document and pass it on.

        Collection.insert() is deprecated (removed in PyMongo 4);
        insert_one() is the supported single-document API.
        """
        self.MG_table.insert_one(dict(item))
        return item
然後就是 redis 數據庫:
import logging

import redis

import zhu.settings as settings
from scrapy.utils.project import get_project_settings
from scrapy.exceptions import DropItem


class ZhuPipeline(object):
    """Scrapy item pipeline that de-duplicates item URLs through Redis.

    New URLs are pushed onto the Redis list named by settings.MY_REDIS;
    items whose URL has been seen before are dropped.
    """

    def __init__(self):
        self.redis_table = settings.MY_REDIS  # Redis list that collects the URLs
        # Redis connection parameters come from the project settings module.
        self.redis_db = redis.Redis(
            host=settings.REDIS_SERVER,
            port=settings.REDIS_PORT,
            db=settings.REDIS_DB,
        )

    def process_item(self, item, spider):
        """Drop the item if its URL was already stored, otherwise record it."""
        url = item['url']
        if self.redis_db.exists(url):
            # URL already recorded — drop the duplicate item.
            logging.debug("redis: url already exists")
            raise DropItem('%s already exists in redis' % url)
        # URL not seen yet: push it onto the list for consumers ...
        logging.debug("redis: url does not exist yet")
        self.redis_db.lpush(self.redis_table, url)
        # ... and mark it as seen with its own key. BUG FIX: the original
        # code only ever lpush'ed onto the list, but exists() checks for a
        # *key* named after the URL — that key was never created, so the
        # duplicate check could never fire and every duplicate was stored.
        self.redis_db.set(url, 1)
        return item