公司數據庫有近三千臺服務器,用saltstack管理這近三千臺服務器時會出現各類問題,因此監控minion端的狀態非常重要。由於這種監控不屬於非常緊急、出了問題就必須馬上處理的故障,所以只需要寫好定時任務,定時執行就可以了
代碼邏輯:
一、首先從數據庫裏面查出所有數據庫平臺的機器
二、通過通道機api調用,在salt master執行一條命令uptime
三、對返回的結果進行分析,如果存在load關鍵字,則表明該minion的salt狀態良好;反之如果沒有關鍵字,則重啓salt minion,再檢測一次,如果還沒有返回,則表示該minion狀態爲bad,入庫
表:
show create table salt_mon \G;
*************************** 1. row ***************************
Table: salt_mon
Create Table: CREATE TABLE `salt_mon` (
`id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
`ip_in` varchar(16) NOT NULL DEFAULT '',
`salt_mon_value` varchar(10) NOT NULL DEFAULT '',
`salt_mon_info` varchar(100) NOT NULL DEFAULT '',
`ctime` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY (`id`),
KEY `salt_mon_value` (`salt_mon_value`),
KEY `ip_in` (`ip_in`),
KEY `ctime` (`ctime`)
) ENGINE=InnoDB AUTO_INCREMENT=4244 DEFAULT CHARSET=utf8
1 row in set (0.01 sec)
程序代碼:
#!/usr/bin/python26
# -*- coding: utf-8 -*-
"""
@author:mujibin
@modify time: 2015-11-05

Cron-driven health check for salt minions.

For every host returned by get_all_id() the script pings the host, runs
``uptime`` through salt via the company gateway HTTP API, restarts the
minion once if the first check fails, and records the OK/BAD result in the
``salt_mon`` table.
"""
import os
import sys
import time
import datetime
import re
import subprocess
import paramiko
import socket

# Kept in the original position: imports below may be resolved relative to
# the script directory / the extra path entry.
os.chdir(sys.path[0])
sys.path.append("/data1/salt/mysqlapi/salt/")

import urllib
import urllib2
import traceback
import json
import logging
import types
import commands
import MySQLdb
from multiprocessing import Pool

reload(sys)
sys.setdefaultencoding('utf8')

logger = logging.getLogger(__name__)

c_date = time.strftime("%Y%m%d", time.localtime())
# Single insert timestamp shared by the whole run; the front end uses it to
# find the latest batch of records.
c_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

H3306 = '**com.cn'
H3308 = '***com.cn'
P3306 = 3306
P3308 = 3308
dp_admin = 'dp_admin'
HOST_PORT = '3306'
HOST_USER = 'mysqlha'
HOST_PASSED = '*******'
db = 'test'

# Socket-level timeout (seconds) for one gateway API call, so that a stalled
# HTTP request cannot block a pool worker forever.
API_HTTP_TIMEOUT = 30


def getIp(domain):
    """Resolve *domain* and return the first address reported by getaddrinfo."""
    import socket
    myaddr = socket.getaddrinfo(domain, 'http')[0][4][0]
    return myaddr


MASTERDNS = "********.com.cn"
MASTERIP = getIp(MASTERDNS)  # IP address of the salt-master


def _close_quietly(conn):
    """Close a DB connection / SSH client, ignoring errors raised by close()."""
    if conn is None:
        return
    try:
        conn.close()
    except Exception:
        pass


def sql_select(sql, port=3306, domain='*****', db='salt', params=None):
    """Run a SELECT and return its rows as a list.

    :param sql: statement text; use ``%s`` placeholders together with *params*.
    :param port: MySQL port (converted with ``int``).
    :param domain: MySQL host name.
    :param db: schema name.
    :param params: optional sequence of values bound by the driver.
    :return: list of row tuples; an empty list when the query fails.
    """
    rows = []
    conn = None
    try:
        conn = MySQLdb.connect(host=domain, user=HOST_USER, port=int(port),
                               passwd=HOST_PASSED, db=db, connect_timeout=3,
                               charset="utf8")
        cursor = conn.cursor()
        cursor.execute(sql, params)
        rows = list(cursor.fetchall())
    except Exception as e:
        logger.warning("sql_select failed: %s", e)
    finally:
        # The original never closed the connection.
        _close_quietly(conn)
    return rows


def sql_insert(sql, port=3306, domain='****', db='salt', params=None):
    """Run an INSERT and commit it; roll back and log on failure.

    :param sql: statement text; use ``%s`` placeholders together with *params*.
    :param port: MySQL port (converted with ``int``).
    :param domain: MySQL host name.
    :param db: schema name.
    :param params: optional sequence of values bound by the driver.
    """
    conn = None
    try:
        conn = MySQLdb.connect(host=domain, user=HOST_USER, port=int(port),
                               passwd=HOST_PASSED, db=db, connect_timeout=3,
                               charset="utf8")
        cursor = conn.cursor()
        cursor.execute(sql, params)
        conn.commit()
    except Exception as e:
        logger.warning("sql_insert failed: %s", e)
        # Only roll back when a connection actually exists; the original
        # called rollback() on the schema-name string if connect() failed.
        if conn is not None:
            try:
                conn.rollback()
            except Exception:
                pass
    finally:
        _close_quietly(conn)


def ssh_connect_bak(host):
    """Open an SSH session to *host* as root using /root/.ssh/id_rsa."""
    client = paramiko.SSHClient()
    client.load_system_host_keys()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    privatekeyfile = os.path.expanduser('/root/.ssh/id_rsa')
    mykey = paramiko.RSAKey.from_private_key_file(privatekeyfile)
    host = host.strip()
    client.connect(host, 26387, username='root', timeout=2, pkey=mykey)
    return client


def ssh_connect(host):
    """Open an SSH session to *host* as root on port 26387."""
    client = paramiko.SSHClient()
    client.load_system_host_keys()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    host = host.strip()
    client.connect(host, 26387, username='root', timeout=2)
    return client


# Author's note: running the command on the master over SSH tended to hang
# once the number of servers grew, freezing the whole program, and setting a
# timeout did not help (root cause not found). Switching to the company
# gateway API avoided the problem, hence run_cmd() below.
def run_cmd(ips, cmd, method, output, ignore_error, timeout):
    """Execute *cmd* through the gateway HTTP API.

    :param ips: target IP passed to the API as ``ip``.
    :param cmd: command line passed to the API as ``cmd``.
    :param method: API ``method`` field.
    :param output: API ``output`` field.
    :param ignore_error: API ``ignore_error`` field.
    :param timeout: API ``timeout`` field.
    :return: response body, or ``None`` when the call fails.
    """
    url = 'https://*****.php'
    argument = {
        'user': 'litao',
        'method': method,
        'output': output,
        'ignore_error': ignore_error,
        'key': '*******',
        'timeout': timeout,
        'ip': ips,
        'cmd': cmd,
    }
    response = None
    try:
        data = urllib.urlencode(argument)
        response = urllib2.urlopen(url, data, API_HTTP_TIMEOUT)
        return response.read()
    except Exception as e:
        # The original fell through to ``response.read()`` with ``response``
        # unbound when urlopen raised.
        logger.warning("Call the api function error! ip=%s cmd=%s: %s",
                       ips, cmd, e)
        return None
    finally:
        _close_quietly(response)


def ssh_cmd(host, cmd):
    """Run *cmd* on *host* over SSH; return stdout, or ``None`` on failure."""
    client = None
    try:
        client = ssh_connect(host)
        i, o, e = client.exec_command(cmd)
        return o.read()
    except Exception as e:
        logger.warning("The host:%s and cmd:%s execute exception. %s",
                       host, cmd, e)
        return None
    finally:
        _close_quietly(client)


# Hosts that are never checked.
iplist = ['10.75.16.197']


def get_all_id():
    """Return the distinct ``ip_in`` values selected from ``node``.

    Hosts listed in ``iplist`` are skipped and the query order is preserved.
    """
    sql = """ select distinct ip_in from node \
    where depid=1 \
    and ip_in not in \
    (select node.ip_in from node, node2module where node.id=node2module.nid \
    and node2module.mname in ('dbstorage') ) """
    rows = sql_select(sql, port=3306, domain='***.com.cn', db='**')
    excluded = set(iplist)
    seen = set()
    ids = []
    for row in rows:
        ip = row[0]
        if ip in seen or ip in excluded:
            continue
        seen.add(ip)
        ids.append(ip)
    return ids


def salt_minion_mon(host, p_status, s_status, re_info):
    """Insert one check result into ``salt_mon``.

    :param host: minion IP.
    :param p_status: ping status (accepted for interface compatibility; not
        stored).
    :param s_status: salt status; ``1`` is stored as ``BAD``, anything else
        as ``OK``.
    :param re_info: detail text appended to ``salt_mon_info``.
    """
    try:
        salt_mon_value = "BAD" if s_status == 1 else "OK"
        salt_mon_info = "%s#NULL#NULL#NULL#NULL#%s " % (host, re_info)
        # Values are bound by the driver: re_info is text returned by remote
        # hosts and may contain quotes that would break a hand-built SQL
        # string.
        insert_sql = ("insert into salt_mon "
                      "(ip_in, salt_mon_value, salt_mon_info, ctime) "
                      "values(%s,%s,%s,%s)")
        sql_insert(insert_sql,
                   params=(host, salt_mon_value, salt_mon_info, c_time))
    except Exception as e:
        logger.warning("salt_minion_mon failed! host=%s: %s", host, e)


def ping_mon_by_host(host):
    """Ping *host* once (``ping -c 1 -w 2``).

    :return: ``{"status": 0|1, "msg": str}``; status 0 means the ping
        command exited with code 0. A dict is returned even if the ping
        command cannot be started.
    """
    try:
        devnull = open(os.devnull, 'w')
        try:
            # Argument list instead of a shell string, so the host value is
            # never interpreted by a shell.
            ret = subprocess.call(["ping", "-c", "1", "-w", "2", str(host)],
                                  stdout=devnull)
        finally:
            devnull.close()
    except Exception as e:
        # The original handler formatted the host with %d (TypeError) and
        # returned None, which the caller then tried to subscript.
        logger.warning("The host %s: ping_mon_by_host failed! %s", host, e)
        return {"status": 1, "msg": "The host %s ping failed" % host}

    if ret == 0:
        return {"status": 0, "msg": "The host %s ping ok" % host}
    return {"status": 1, "msg": "The host %s ping failed" % host}


def check_salt_minion(host):
    """Check the minion by running ``uptime`` through salt.

    The command is sent twice at most: if the first reply does not contain
    ``load`` the minion is restarted, and after a 3 second pause the check is
    repeated.

    :return: ``{'status': 0, 'message': 'ok'}`` on success, otherwise
        ``{'status': 1, 'message': <detail>}``.
    """
    try:
        cmd = "salt '%s' -t 7 cmd.run 'uptime'" % host
        # NOTE(review): the command is sent with the minion IP as target,
        # while the accompanying write-up says it runs on the salt master --
        # confirm how the gateway routes it (MASTERIP may be intended).
        ret = run_cmd(host, cmd, method="sync", output="text",
                      ignore_error="true", timeout=5)
        if ret and 'load' in ret:
            return {'status': 0, 'message': 'ok'}

        restart_salt_minion(host)
        time.sleep(3)
        ret1 = run_cmd(host, cmd, method="sync", output="text",
                       ignore_error="true", timeout=5)
        if ret1 and 'load' in ret1:
            return {'status': 0, 'message': 'ok'}

        try:
            # Keep the error detail reported by the client.
            msg = ret1.split(':')[1].strip()
        except Exception:
            msg = ret1
        return {'status': 1, 'message': msg}
    except Exception as e:
        # The original returned None here and the caller crashed on it.
        logger.warning("check_salt_minion failed for %s: %s", host, e)
        return {'status': 1, 'message': 'check failed'}


def restart_salt_minion(host):
    """Restart the minion on *host* through the gateway API.

    :return: ``{'status': 0|1, 'message': str}``; status 0 when the reply
        contains both ``Starting salt-minion daemon`` and ``OK``.
    """
    try:
        cmd = '/etc/init.d/salt-minion restart'
        ret = run_cmd(host, cmd, method="sync", output="text",
                      ignore_error="true", timeout=5)
        if (ret is not None and "Starting salt-minion daemon" in ret
                and "OK" in ret):
            status = 0
            msg = 'The host %s salt-minion restart successed.\n' % host
        else:
            status = 1
            msg = "The host %s salt-minion restart failed.\n" % host
        return {'status': status, 'message': msg}
    except Exception as e:
        logger.warning("restart_salt_minion failed for %s: %s", host, e)
        return {'status': 1,
                'message': "The host %s salt-minion restart failed.\n" % host}


def get_salt_minion_status(host):
    """Pool worker: check one host and record the salt status.

    Hosts that do not answer ping are not recorded (the original had that
    insert disabled).
    """
    try:
        ping_ret = ping_mon_by_host(host)
        if ping_ret["status"] != 0:
            return
        # check_salt_minion() restarts the minion once if the first check
        # fails.
        salt_ret = check_salt_minion(host)
        salt_minion_mon(host, ping_ret["status"],
                        salt_ret["status"], salt_ret["message"])
    except Exception:
        # An exception escaping a worker would abort pool.map() for every
        # remaining host.
        logger.exception("get_salt_minion_status failed for %s", host)


def main():
    """Check every host with a pool of 10 processes and print run statistics."""
    logging.basicConfig(level=logging.WARNING,
                        format="%(asctime)s %(levelname)s %(message)s")
    started = int(time.time())
    all_id = get_all_id()
    pool = Pool(10)
    try:
        pool.map(get_salt_minion_status, all_id)
    finally:
        pool.close()
        pool.join()
    # Whole-second difference; the original used timedelta.seconds, which
    # drops whole days.
    elapsed = int(time.time()) - started
    print('總執行時間:%s sec' % elapsed)
    print('統計的機器數 %s' % len(all_id))


if __name__ == "__main__":
    main()
把信息錄入數據庫,然後寫一個前端頁面,去定時獲取數據庫最新的信息就可以了,以ctime時間爲準