上篇文章發了 salt-minion 的監控代碼 http://6252961.blog.51cto.com/6242961/1710977 。監控跑完並列出全部有問題的客戶端列表以後,如果手動一個個去修復會很費事,而且服務器太多,所以寫了這個自動修復的代碼,解放雙手。
代碼邏輯:
一、首先從數據庫讀取minion端有問題的服務器,如果數量超過100,則停止自動修復,否則繼續(這一步沒有在代碼中實現,不過也很簡單,只需要判斷一下列表長度即可)
二、檢測服務器的ping,如果ping通,則繼續,否則保存錯誤信息,停止自動修復
三、檢測服務器的ssh登錄狀態,如果可以登錄並且命令'date'執行成功,則繼續,否則保存錯誤信息,停止自動修復
四、檢查服務器的nfs掛載狀態,如果掛載異常,先卸載nfs,再繼續執行(因為服務器有幾千臺,經常會出現服務器掛載的nfs的ip不通的問題,造成yum在執行過程中卡死,無法完成任務也無法退出任務,具體原因沒有細究);如果nfs掛載正常,則繼續下一步;如果卸載失敗,則停止修復
五、對服務器yum進行修復,就是初始化yum的過程,初始化完之後執行 yum list | grep salt,如果執行成功,則繼續,否則保存錯誤信息,停止自動修復
六、卸載服務器原有salt-minion客戶端,卸載之後檢查有沒有卸載成功,如果成功,則繼續,否則保存錯誤信息,停止自動修復
七、重新安裝最新salt-minion客戶端,檢查有沒有安裝成功,如果成功,則繼續,否則保存錯誤信息,停止自動修復
八、啓動salt-minion客戶端,檢查啓動狀態,如果成功,則繼續,否則保存錯誤信息,停止自動修復
九、登錄master端執行簡單命令,確認master與修復後的minion通訊是否成功,如果成功,則更新數據庫中對應的最新信息;如果報錯,則把對應的報錯信息更新到最新記錄中
注:
很多地方都是透過公司通道機(函數 run_cmd)取得 json 格式的返回數據,例如:
{"RETURN":"{\"sub_task_id\":\"******\",\"ip\":\"10.75.4.43\",\"user\":\"****\",\"result\":\"10.75.19.1**\\n\"}"}
代碼:
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""Automatic repair of broken salt-minion clients (Python 2 script).

This part of the file holds the configuration constants and the low-level
helpers (SSH, channel-machine HTTP API, MySQL) plus the individual repair
steps that the orchestration code further down combines.
"""
__author__ = 'mujibin'

# import python lib
import datetime
import json
import logging
import os
import random
import re
import string
import sys
import time
import urllib
import urllib2
from multiprocessing import *

import MySQLdb
import paramiko

# add path (must happen before the project-local imports below)
sys.path.append("/data1/salt/mysqlapi/salt/")

# import salt repair functions
from salt_minion_list import *
from init_server import *
from check_salt import *
#from check_salt_bak import *
from salt_repair_ssh import *

# Python 2 only: make utf8 the implicit str<->unicode codec.
reload(sys)
sys.setdefaultencoding('utf8')

H3303 = '*****.cn'
H3304m = '******.cn'
P3303 = 3303
P3304 = 3304
dp_admin = 'dp_admin'
HOST_PORT = '3303'
HOST_USER = 'mysqlha'
HOST_PASSED = '********'
db = 'test'
port = '*******'
c_date = time.strftime("%Y%m%d", time.localtime())
c_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

# File logging is currently disabled; the original setup is kept for reference.
# log_path = "/data1/dbatemp/salt/logs"
# is_path = os.path.exists(log_path)
# if not is_path:
#     os.makedirs(log_path)
# log_name = "salt_reparie.log"
# logger = logging.getLogger()
# handler = logging.FileHandler(os.path.join(log_path, log_name))
# formater = logging.Formatter("%(asctime)s %(levelname)s [%(funcName)s :%(lineno)d] %(message)s")
# handler.setFormatter(formater)
# logger.addHandler(handler)
# logger.setLevel(logging.NOTSET)
# #logger.setLevel(logging.INFO)
# #logger.setLevel(logging.DEBUG)
# #logger.setLevel(logging.ERROR)

##########################################################
salt_yes = datetime.date.today()
##########################################################
# ssh api argument
method = "sync"
output = "json"
ignore_error = "true"
timeout = "28"
##########################################################
slat_minion_check_CONSTANT = "salt-minion"
##########################################################
SALT = "salt"
VERSION = "5.3"

# Port every SSH helper in this file connects to.
SSH_PORT = 26387


###########################################################
# master dns transform to ip
###########################################################
def getIp(domain):
    """Resolve *domain* and return the first address reported by getaddrinfo."""
    import socket
    return socket.getaddrinfo(domain, 'http')[0][4][0]


MASTERDNS = "******.cn"
# NOTE: this performs a DNS lookup at import time.
MASTERIP = getIp(MASTERDNS)


##########################################################
def _new_ssh_client():
    """Return an unconnected paramiko client that auto-accepts unknown host keys."""
    client = paramiko.SSHClient()
    client.load_system_host_keys()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    return client


def ssh_connect_bak(host):
    """Connect to *host* as root using the key in /root/.ssh/id_rsa (2s timeout).

    Returns the connected client; connection errors propagate to the caller.
    """
    client = _new_ssh_client()
    privatekeyfile = os.path.expanduser('/root/.ssh/id_rsa')
    mykey = paramiko.RSAKey.from_private_key_file(privatekeyfile)
    try:
        client.connect(host.strip(), SSH_PORT, username='root', timeout=2, pkey=mykey)
    except Exception:
        # Do not leak the half-open client when the connection fails.
        client.close()
        raise
    return client


def ssh_connect(host):
    """Connect to *host* as root with the default credentials (10s timeout).

    Returns the connected client; connection errors propagate to the caller.
    """
    client = _new_ssh_client()
    try:
        client.connect(host.strip(), SSH_PORT, username='root', timeout=10)
    except Exception:
        # Do not leak the half-open client when the connection fails.
        client.close()
        raise
    return client


def ssh_cmd(host, cmd):
    """Run *cmd* on *host* over SSH and return its stripped stdout.

    Returns None when the connection or the command execution raises; callers
    in this file rely on that (``int(None)`` then fails inside their own try).
    """
    client = None
    try:
        client = ssh_connect(host)
        _stdin, stdout, _stderr = client.exec_command(cmd)
        return stdout.read().strip()
    except Exception:
        return None
    finally:
        # The original never closed the connection, leaking one per command.
        if client is not None:
            client.close()


def ssh_cmd_check(host, cmd1):
    """Check whether *host* accepts an SSH login and can run *cmd1*.

    Returns 0 when connecting and executing raised no exception, 1 otherwise.
    The exit status of *cmd1* itself is not inspected.
    """
    client = _new_ssh_client()
    try:
        client.connect(host.strip(), SSH_PORT, username='root', timeout=20)
        _stdin, stdout, _stderr = client.exec_command(cmd1)
        stdout.read()
        return 0
    except Exception:
        return 1
    finally:
        client.close()


def run_cmd(ips, cmd, method, output, ignore_error, timeout):
    """Run *cmd* on *ips* through the company channel machine HTTP API.

    Returns the raw response body. With ``output="json"`` the body looks like
    ``{"RETURN": "{\\"ip\\": ..., \\"result\\": \\"...\\\\n\\"}"}``.

    Raises whatever urllib/urllib2 raises when the request fails. (The
    original swallowed that error and then died with a NameError on the
    unbound ``response``, hiding the real cause.)
    """
    argument = {
        'user': '***',
        'method': method,
        'output': output,
        'ignore_error': ignore_error,
        'key': '*****',
        'timeout': timeout,
        'ip': ips,
        'cmd': cmd,
    }
    url = '*****p.php'
    data = urllib.urlencode(argument)
    response = urllib2.urlopen(url, data)
    try:
        return response.read()
    finally:
        response.close()


def _channel_result_lines(raw):
    """Return the non-empty lines of the ``result`` field of a run_cmd JSON reply."""
    payload = json.loads(json.loads(raw)["RETURN"])
    return [line.strip() for line in payload['result'].split('\n') if line.strip()]


def select_in_3303(sql, host, user, port, passwd, db):
    """Run the query *sql* and return the first column of every row as str.

    Returns an empty list when connecting or querying fails.
    """
    conn = None
    try:
        conn = MySQLdb.connect(host=host, user=user, port=port, passwd=passwd,
                               db=db, connect_timeout=5, charset="utf8")
        cursor = conn.cursor()
        cursor.execute(sql)
        return [str(row[0]) for row in cursor.fetchall()]
    except Exception:
        return []
    finally:
        if conn is not None:
            conn.close()


def sql_insert(sql, port=3304, domain='*****', db='*****'):
    """Execute the write statement *sql* and commit it (currently unused).

    Errors are swallowed after a rollback. NOTE(review): the ``db`` parameter
    is ignored and the schema 'swordfish' is always used, as in the original.
    """
    conn = None
    try:
        conn = MySQLdb.connect(host=domain, user=HOST_USER, port=port,
                               passwd=HOST_PASSED, db='swordfish',
                               connect_timeout=3, charset="utf8")
        cursor = conn.cursor()
        cursor.execute(sql)
        conn.commit()
    except Exception:
        # Only roll back when a connection exists; the original called
        # rollback() on the ``db`` string when connect() itself had failed.
        if conn is not None:
            conn.rollback()
    finally:
        if conn is not None:
            conn.close()


def fix_list_salt():
    """Return the IPs of all hosts whose latest salt monitoring result is not ok.

    Always returns a list (empty on error) so the result can be fed to
    ``Pool.map`` directly.
    """
    saltsql = "select ip_in from salt_mon where salt_mon_value != 'ok' and salt_mon_info not like '%None%' and ctime = (select ctime from salt_mon order by ctime desc limit 1);"
    try:
        return select_in_3303(sql=saltsql, host=H3304m, user=HOST_USER,
                              port=P3304, passwd=HOST_PASSED, db='swordfish')
    except Exception as e:
        print(e)
        return []


def salt_exist_check(host):
    """Report whether any salt rpm is installed on *host*.

    Returns 0 when ``rpm -qa | grep salt`` finds nothing (salt NOT installed),
    1 when at least one package is found, and None when the check itself
    failed. Callers depend on exactly this mapping: removal succeeds on 0,
    installation succeeds on 1.
    """
    try:
        versionCmd = "rpm -qa | grep salt | wc -l"
        if int(ssh_cmd(host, versionCmd)) == 0:
            return 0
        return 1
    except Exception:
        return None


def salt_minion_restart(host):
    """Drop the cached master key on *host* and restart salt-minion.

    Returns 0 when a salt-minion process is running afterwards, 1 otherwise
    (including when any step raised).
    """
    try:
        Cmd1 = """sudo rm -f /etc/salt/pki/minion/minion_master.pub"""
        Cmd2 = """sudo /etc/init.d/salt-minion restart"""
        run_cmd(host, Cmd1, method="sync", output="text", ignore_error="true", timeout=10)
        time.sleep(5)
        run_cmd(host, Cmd2, method="sync", output="text", ignore_error="true", timeout=10)
        # Give the daemon a moment to come up before looking for its process.
        time.sleep(5)
        if salt_check(host) == 0:
            return 0
        return 1
    except Exception:
        return 1


def remove_salt_minion(host):
    """Stop salt-minion on *host* and yum-remove every installed salt package.

    Returns 0 when no salt rpm is left afterwards, 1 otherwise (including when
    any step raised).
    """
    try:
        versionCmd = "sudo rpm -qa | grep salt| grep -v grep"
        versionRes = run_cmd(host, versionCmd, method="sync", output="json",
                             ignore_error="true", timeout=10)
        packages = _channel_result_lines(versionRes)
        ssh_cmd(host, '/etc/init.d/salt-minion stop > /dev/null 2>&1 ')
        for package in packages:
            rmCmd = "sudo yum remove -y %s > /dev/null 2>&1 " % package
            rmRes = ssh_cmd(host, rmCmd)
            time.sleep(4)
            print(rmRes)
        if salt_exist_check(host) == 0:
            res = 0
        else:
            res = 1
        print('res:%s' % res)
        return res
    except Exception:
        return 1


def yum_check(host):
    """Check that the yum repositories on *host* offer the wanted salt build.

    Returns 0 when ``yum list`` shows at least one salt package matching
    "2015", 1 otherwise (including when the check failed).
    """
    try:
        checkCmd = "sudo yum list | grep salt | grep 2015 | wc -l"
        # ssh_cmd returns text; the original compared that text with the int 0,
        # which is always unequal, so the check could never fail.
        count = int(ssh_cmd(host, checkCmd))
    except Exception:
        return 1
    return 0 if count > 0 else 1


def yum_repaire(host):
    """Re-initialise yum on *host* (kill stale yum, rebuild rpm db, reset repos).

    Returns 0 when ``yum_check`` passes afterwards, 1 otherwise.
    """
    try:
        yumCmd1 = (
            " ([ `ps -ef | grep yum | grep -v grep | wc -l` -ne 0 ] "
            "&& sudo ps -ef | grep '/usr/bin/yum' | grep -v grep "
            "| awk '{print $2}' | xargs kill -9 || echo '') "
            "&& (cd /var/lib/rpm/ && sudo rm -f __db.00*) "
            "&& (sudo rpm --rebuilddb) "
            "&& (sudo yum clean all) "
            "&& (sudo chattr -i /etc/yum.conf) "
            "&& (sudo echo 'include=http://****/conf/yumconf.php' > /etc/yum.conf) "
            "&& (sudo rm -rf /etc/yum.repos.d/*) "
            "&& (sudo yum -y remove ****dbp > /dev/null 2>&1) "
            "&& (sudo yum -y install ****dbp > /dev/null 2>&1) "
        )
        ssh_cmd(host, yumCmd1)
        # Wait for yum to settle before verifying.
        time.sleep(60)
        if yum_check(host) == 0:
            return 0
        return 1
    except Exception:
        return 1


def salt_check(host):
    """Report whether a salt-minion process is running on *host*.

    Returns 0 when at least one process is found, 1 otherwise (including when
    the check failed).
    """
    try:
        checkCmd = "ps -ef | grep salt-minion | grep -v grep | wc -l"
        # Same string-vs-int comparison bug as yum_check in the original.
        count = int(ssh_cmd(host, checkCmd))
    except Exception:
        return 1
    return 0 if count > 0 else 1


# install_salt_minion (defined next) installs the salt-minion client;
# it returns 0 on success and 1 otherwise.
def install_salt_minion(host):
    """Install the salt and salt-minion packages on *host* via yum.

    Returns 0 when a salt rpm is present afterwards, 1 otherwise (including
    when any step raised).
    """
    try:
        inSaltCmd = (
            "([ `ps -ef | grep yum | grep -v grep | wc -l` -ne 0 ] "
            "&& sudo ps -ef | grep '/usr/bin/yum' | grep -v grep "
            "| awk '{print $2}' | xargs kill -9 || echo '') "
            "&& (sudo yum clean all) "
            "&& (sudo yum -y install salt.noarch salt-minion.noarch)"
        )
        ssh_cmd(host, inSaltCmd)
        time.sleep(20)
        # salt_exist_check reports 1 when a salt package is installed.
        if int(salt_exist_check(host)) == 1:
            return 0
        return 1
    except Exception:
        return 1


def ping_mon_by_host(host):
    """Ping *host* once from the local machine.

    Returns 0 when the ping succeeds, 1 otherwise (empty host, ping failure,
    or ping could not be started).
    """
    import subprocess

    target = (host or "").strip()
    # Reject empty values and anything ping would parse as an option.
    if not target or target.startswith('-'):
        return 1
    try:
        with open(os.devnull, 'w') as devnull:
            # Argument list instead of a shell string: the host value comes
            # from the database and must not be interpreted by a shell.
            ret = subprocess.call(['ping', '-c', '1', '-w', '2', target],
                                  stdout=devnull)
    except Exception:
        return 1
    return 0 if ret == 0 else 1


def check_salt_minion(host):
    """Verify from the master that the minion on *host* answers.

    Runs ``uptime`` through salt on the master. Always returns a dict
    ``{'status': 0|1, 'message': str}``; status 0 means the minion replied.
    """
    try:
        cmd = "salt '%s' -t 7 cmd.run 'uptime'" % host
        ret = ssh_cmd(MASTERIP, cmd)
    except Exception as e:
        # The original returned None here, which crashed the caller when it
        # subscripted the result.
        return {'status': 1, 'message': str(e)}
    if ret and 'load' in ret:
        return {'status': 0, 'message': 'ok'}
    if not ret:
        return {'status': 1, 'message': 'no response from master'}
    try:
        # salt prints "<minion id>: <error text>"; keep only the error text.
        msg = ret.split(':')[1].strip()
    except IndexError:
        msg = ret
    return {'status': 1, 'message': msg}


def _nfs_result_lines(raw):
    """Return the non-empty lines of the ``result`` field of a run_cmd JSON reply."""
    payload = json.loads(json.loads(raw)['RETURN'])
    return [line.strip() for line in payload['result'].split('\n') if line.strip()]


def nfs_check(host):
    """Make sure *host* has no knfs mount whose server is unreachable.

    When any mounted NFS server does not answer a ping from *host*, all knfs
    mount points are lazily unmounted. Returns 0 when the mounts are healthy
    or were removed, 1 when unmounting failed or any step raised.
    """
    mount_number_cmd = "mount | grep 'knfs'| wc -l"
    mount_data_cmd = "mount | grep 'knfs' | awk -F ' ' '{print $3}'"
    mount_ip_cmd = "mount | grep 'knfs' | awk -F ':' '{print $1}'"
    try:
        if int(ssh_cmd(host, mount_number_cmd)) == 0:
            return 0

        mount_ip = run_cmd(host, mount_ip_cmd, method="sync", output="json",
                           ignore_error="true", timeout=10)
        print(mount_ip)

        unreachable = False
        for server_ip in _nfs_result_lines(mount_ip):
            ping_Cmd = "ping -c 1 -w 1 %s | grep '0 received' | wc -l" % server_ip
            if int(ssh_cmd(host, ping_Cmd)) != 0:
                unreachable = True
                break
        if not unreachable:
            return 0

        umount = run_cmd(host, mount_data_cmd, method="sync", output="json",
                         ignore_error="true", timeout=10)
        for mount_point in _nfs_result_lines(umount):
            rmCmd = "umount -l %s > /dev/null 2>&1 " % mount_point
            ssh_cmd(host, rmCmd)
            time.sleep(2)

        # Any knfs mount that survived the lazy unmount counts as a failure.
        if int(ssh_cmd(host, mount_number_cmd)) != 0:
            return 1
        return 0
    except Exception:
        return 1


def _run_repair_steps(host):
    """Run the repair pipeline for *host*; return the final status message."""
    if ping_mon_by_host(host) != 0:
        return "%s: The host can not ping!" % host
    # Check that an SSH login works.
    if ssh_cmd_check(host, 'date') != 0:
        return "%s: bad ssh,go failed!" % host
    # Check that the NFS mounts are healthy.
    if nfs_check(host) != 0:
        return "%s:nfs err" % host
    # Repair the yum repositories.
    print('yum_repair')
    if yum_repaire(host) != 0:
        return "%s: yum occur error!" % host
    # Remove the old salt-minion client.
    print('remove salt')
    if remove_salt_minion(host) != 0:
        return "%s:remove salt minion failed!" % host
    # Install the salt-minion client.
    print('install salt')
    if install_salt_minion(host) != 0:
        return "%s:install salt minion failed!" % host
    # Start the salt-minion client.
    print('start salt')
    if salt_minion_restart(host) != 0:
        return "%s:salt minion restart error!" % host
    print('master-minion check')
    minionStatus = check_salt_minion(host)
    if minionStatus["status"] == 0:
        return '%s:ok' % host
    return '%s:%s' % (host, minionStatus["message"])


def salt_repaire(host):
    """Repair the salt-minion on *host* and return a one-line status message.

    Steps: ping, SSH login, NFS mounts, yum, remove, install, restart, and a
    final master-to-minion check; the first failing step ends the run. The
    message is also printed. (The original ended with ``return info`` on an
    undefined name, so it always fell into its except branch and returned
    None.)
    """
    try:
        msg = _run_repair_steps(host)
    except Exception:
        msg = "Salt repaire failed with host:%s " % host
    print(msg)
    return msg


def scheduler_repaire(workers=8):
    """Repair every host reported by fix_list_salt using *workers* processes."""
    minionList = fix_list_salt()
    if not minionList:
        # Nothing to repair (or the host list could not be loaded).
        return
    pool = Pool(workers)
    try:
        pool.map(salt_repaire, minionList)
    finally:
        pool.close()
        pool.join()


if __name__ == "__main__":
    scheduler_repaire()