import pymongo
import click

# Basic database connection settings.
db_configs = {
    'type': 'mongo',
    'host': '127.0.0.1',
    'port': '27017',
    "user": "",
    "password": "",
    'db_name': 'spider'
}


class Mongo():
    """Thin helper around a MongoDB database used to maintain seed collections.

    Connection parameters are read from the module-level ``db_configs`` dict.
    """

    def __init__(self):
        """Create the client and select the configured database.

        Credentials are only sent when both a user name and a password are
        configured; otherwise an unauthenticated connection is used.
        """
        self.db_name = db_configs.get("db_name")
        self.host = db_configs.get("host")
        self.port = db_configs.get("port")
        self.username = db_configs.get("user")
        # The config key is "password"; "passwd" is still accepted as a
        # fallback because earlier revisions of this code looked it up
        # under that name.
        self.password = db_configs.get("password") or db_configs.get("passwd")

        client_options = {"connect": False, "maxPoolSize": 10}
        if self.username and self.password:
            # Database.authenticate() was removed in PyMongo 4; credentials
            # are passed to the client instead, authenticating against the
            # same database the old call targeted.
            client_options.update(
                username=self.username,
                password=self.password,
                authSource=self.db_name,
            )
        self.client = pymongo.MongoClient(
            f'mongodb://{self.host}:{self.port}', **client_options
        )
        self.db = self.client[self.db_name]

    def reset_status(self, col="dianping_seed_data"):
        """Set ``status`` back to 0 for every document whose status is 1 or 3.

        :param col: name of the collection to update.
        """
        self.db[col].update_many(
            {'status': {'$in': [1, 3]}},
            {'$set': {"status": 0}},
        )

    def reset_all_status(self, col="dianping_seed_data"):
        """Set ``status`` to 0 for every document in the collection.

        :param col: name of the collection to update.
        """
        self.db[col].update_many({}, {'$set': {"status": 0}})

    def add_index(self, col="dianping_seed_data"):
        """Create an ascending index on the ``status`` field.

        :param col: name of the collection to index.
        """
        # status codes: 0 = initial, 1 = download started, 2 = download finished.
        # The index must not be unique: many documents share the same status,
        # so a unique index would fail to build on any real collection.
        self.db[col].create_index([('status', pymongo.ASCENDING)])

    def get_index(self, col="dianping_seed_data"):
        """Print every index defined on the collection.

        :param col: name of the collection to inspect.
        """
        for index in self.db[col].list_indexes():
            print(index)

    def find_duplicate(self, col="dianping_seed_data"):
        """Group documents by ``url`` and store the duplicated groups in ``result``.

        The pipeline ends with ``{'$out': 'result'}``, which writes the
        aggregation output to the ``result`` collection (replacing its
        previous contents). Each stored document carries the list of ``_id``
        values and the set of ``status`` values of the documents sharing one
        url, plus their count.

        :param col: name of the collection to scan.
        :return: a cursor over the documents written to ``result``.
        """
        self.db[col].aggregate([
            {'$group': {
                '_id': {'url': "$url"},
                # collect the _id of every document in the group
                '_id_list': {'$addToSet': "$_id"},
                # collect the distinct status values of the group
                'status': {'$addToSet': "$status"},
                'count': {'$sum': 1}
            }},
            # keep only urls that occur more than once, i.e. real duplicates
            {'$match': {'count': {'$gt': 1}}},
            {'$out': 'result'}
        ], allowDiskUse=True)

        # A pipeline ending in $out returns an empty cursor, so the stored
        # groups have to be read back from the output collection.
        for item in self.db.result.find():
            print(item)
        return self.db.result.find()

    def delete_dup(self, col="dianping_seed_data"):
        """Delete duplicates listed in ``result``, keeping one document per group.

        Expects ``find_duplicate`` to have populated the ``result``
        collection. After all groups are processed the ``result`` collection
        is dropped. Database errors are reported on stdout, not raised.

        :param col: name of the collection to delete duplicates from.
        """
        try:
            for group in self.db.result.find():
                # Keep the first id of each group and remove the rest; the
                # order produced by $addToSet is unspecified, so which
                # document survives is arbitrary.
                surplus_ids = (group.get("_id_list") or [])[1:]
                if surplus_ids:
                    self.db[col].delete_many({'_id': {'$in': surplus_ids}})
            self.db.result.drop()
        except pymongo.errors.PyMongoError as e:
            print("刪除的時候出現問題", e.args)


@click.command()
@click.option('--s', type=str, default="two", help="狀態:all表示所有重置爲0,two:表示重置狀態爲一、3的重置爲0")
@click.option('--i', type=str, default="a", help="a:增長索引 g:獲取索引")
@click.option('--d', type=str, default="f", help="d:刪除 f:查詢並生成聚合以後的結果")
def run(s, i, d):
    """Command-line entry point for the maintenance operations.

    :param s: ``all`` resets every status to 0; ``two`` resets only
        statuses 1 and 3 to 0.
    :param i: ``a`` creates the status index; ``g`` prints existing indexes.
    :param d: ``d`` deletes duplicates listed in ``result``; ``f`` runs the
        aggregation that finds duplicates and writes them to ``result``.
    """
    m = Mongo()

    if s:
        print("獲取參數爲:", s)
        if s == "all":
            print("全部數據狀態重置爲0:", s)
            m.reset_all_status()
        elif s == "two":
            print("部分數據狀態重置爲0:", s)
            # The partial reset was announced but never performed before.
            m.reset_status()

    if i == "a":
        m.add_index()
    elif i == "g":
        m.get_index()

    if d == "d":
        m.delete_dup()
    elif d == "f":
        # This branch previously tested `i` instead of `d`, so it could
        # never run.
        m.find_duplicate()


if __name__ == '__main__':
    # NOTE(review): the script entry point bypasses the click command `run`
    # and deletes duplicates directly — confirm whether `run()` should be
    # invoked here instead.
    m = Mongo()
    m.delete_dup()