由於目前服務器規模較小,使用 Zabbix、Nagios 等開源監控系統的必要性並不高,再加上配置與維護所需的時間成本,因此決定通過自己編寫的腳本,配合 SaltStack 來處理。
監控原理很簡單:server 端負責處理監控信息,agent 端負責收集信息,並統一發送到 server 端。
腳本目錄
├── weixin.py
├── __init__.py
└── main.py
#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
"""Monitoring server.

Accepts TCP connections from agents, parses each received chunk as a JSON
report, compares the reported values with fixed thresholds and forwards an
alert through the ``weixin`` helper module when a threshold is exceeded.
"""
import json
import socket
import threading
import time

from weixin import senddata, gettoken

# Credentials handed to weixin.gettoken() (placeholders in this file).
CORP_ID = 'xxxxxxxxxxxx'
CORP_SECRET = 'xxxxxxxxxxxxxxxxx'

# Alert thresholds for resource reports (type == 1).
CPU_USE_LIMIT = 95
# TODO: scale the load limit with the number of CPU cores instead of a constant.
CPU_LOAD_LIMIT = 3
MEM_USE_LIMIT = 85
DISK_USE_LIMIT = 75

LISTEN_ADDR = ('0.0.0.0', 9999)


def tcplink(sock, addr):
    """Serve one agent connection until it closes or sends ``exit``.

    Every chunk returned by ``recv`` is handed to :func:`handler` as one
    message.

    :param sock: connected socket returned by ``accept()``.
    :param addr: ``(host, port)`` of the peer, used only for log lines.
    :returns: the last chunk received, or ``{}`` if nothing was received.
    """
    print('New Connection from %s:%s...' % addr)
    res = {}
    try:
        while True:
            # NOTE(review): a report longer than 1024 bytes, or one split
            # across TCP segments, reaches handler() in fragments.
            data = sock.recv(1024)
            # NOTE(review): the reason for this pause is not visible in the
            # code; kept so timing behaviour is unchanged.
            time.sleep(1)
            if not data or data == b'exit':
                break
            res = data
            handler(res)
    finally:
        # Close even when recv()/handler() raises, so the descriptor is not
        # leaked when the worker thread dies.
        sock.close()
    print('Connection from %s:%s closed.' % addr)
    return res


def report(data):
    """Send an alert.

    Joins the lines in *data* (one per line, each newline-terminated),
    obtains an access token and posts the text through ``senddata``.

    :param data: list of strings describing the alert.
    """
    content = ''.join(line + "\n" for line in data)
    print(content)
    accesstoken = gettoken(CORP_ID, CORP_SECRET)
    msg = senddata(accesstoken, content)
    print(msg)
    print(data)


def _resource_alert(data):
    """Return alert lines for a resource report, or ``[]`` if within limits."""
    ip = data['ip']
    name = data['name']
    cpu_use = data['cpu_use']      # CPU utilisation
    cpu_load = data['cpu_load']    # CPU load average
    mem_use = data['mem_use']      # memory utilisation
    disk_use = data['disk_use']    # disk utilisation
    print("%s %s %s %s %s" % (ip, cpu_use, cpu_load, mem_use, disk_use))

    message = ["ip: %s" % ip, "name: %s" % name]
    checks = (
        ("cpu_use", cpu_use, CPU_USE_LIMIT),
        ("cpu_load", cpu_load, CPU_LOAD_LIMIT),
        ("mem_use", mem_use, MEM_USE_LIMIT),
        ("disk_use", disk_use, DISK_USE_LIMIT),
    )
    for label, value, limit in checks:
        if value > limit:
            message.append("%s: %s" % (label, value))
    # The first two entries only identify the host; anything beyond them is
    # an exceeded threshold.
    return message if len(message) > 2 else []


def _service_alert(data):
    """Return alert lines for a service report, or ``[]`` if status is not 1."""
    print("service eyes...")
    print(data)
    if data["status"] == 1:
        return ["oops some service down!", "message: %s" % data]
    return []


def handler(res):
    """Process one client message and alert when a threshold is exceeded.

    :param res: JSON text (bytes or str) with a ``type`` field: ``1`` for a
        resource report, ``2`` for a service report.
    :returns: ``True`` when the report was processed; ``False`` when it is
        not valid JSON, is not a JSON object, lacks a required field or has
        an unknown ``type``.
    """
    try:
        if isinstance(res, bytes):
            res = res.decode('utf-8')
        data = json.loads(res)
    except (TypeError, ValueError) as e:
        print(e)
        print("Data type wrong.")
        return False
    if not isinstance(data, dict):
        print("Data type wrong.")
        return False

    m_type = data.get('type')
    try:
        if m_type == 1:
            message = _resource_alert(data)
        elif m_type == 2:
            message = _service_alert(data)
        else:
            return False
    except (KeyError, TypeError) as e:
        # A missing or non-comparable field must not kill the worker thread.
        print("Malformed report: %s" % e)
        return False

    if message:
        report(message)
    return True


def main():
    """Listen on ``LISTEN_ADDR`` and serve each connection in its own thread."""
    print("Minitor Service Listening on 9999 port.")
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # Allow an immediate restart while old connections linger in TIME_WAIT.
    s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    s.bind(LISTEN_ADDR)
    s.listen(5)
    while True:
        sock, addr = s.accept()
        t = threading.Thread(target=tcplink, args=(sock, addr))
        t.start()


if __name__ == "__main__":
    main()
"""Small client for the qyapi.weixin.qq.com message API.

Provides :func:`gettoken` to obtain an access token and :func:`senddata` to
post a text message with it.
"""
import json
import sys

import requests

API_BASE = 'https://qyapi.weixin.qq.com/cgi-bin'
REQUEST_TIMEOUT = 10  # seconds; keeps a stalled request from blocking the caller forever


def gettoken(corp_id, corp_secret):
    """Fetch an access token for the given credentials.

    :param corp_id: value sent as the ``corpid`` query parameter.
    :param corp_secret: value sent as the ``corpsecret`` query parameter.
    :returns: the ``access_token`` field of the JSON reply.
    :raises requests.RequestException: on a network failure or an HTTP
        error status (the error is printed before being re-raised).
    :raises KeyError: if the reply contains no ``access_token``.
    """
    try:
        response = requests.get(
            API_BASE + '/gettoken',
            # Passing params lets requests URL-encode the values.
            params={'corpid': corp_id, 'corpsecret': corp_secret},
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
    except requests.RequestException as e:
        print(e)
        raise
    token_json = json.loads(response.text)
    if 'access_token' not in token_json:
        raise KeyError('access_token missing from gettoken reply: %s' % token_json)
    return token_json['access_token']


def senddata(access_token, content, touser="187xxxxxxxx|185xxxxxxxx", agentid="17"):
    """Post *content* as a text message.

    :param access_token: token returned by :func:`gettoken`.
    :param content: message text; UTF-8 bytes are accepted and decoded.
    :param touser: ``|``-separated recipient list (default: the original
        hard-coded recipients).
    :param agentid: value sent as ``agentid`` (default: the original ``"17"``).
    :returns: the response body as text.
    """
    if isinstance(content, bytes):
        content = content.decode('utf-8')
    send_values = {
        "touser": touser,
        "msgtype": "text",
        "agentid": agentid,
        "text": {
            "content": content
        },
        "safe": "0",
    }
    # ensure_ascii=False keeps non-ASCII text readable; encode explicitly so
    # the result does not depend on the interpreter's default encoding.
    send_data = json.dumps(send_values, ensure_ascii=False)
    if not isinstance(send_data, bytes):
        send_data = send_data.encode('utf-8')
    response = requests.post(
        API_BASE + '/message/send',
        params={'access_token': access_token},
        data=send_data,
        timeout=REQUEST_TIMEOUT,
    )
    return response.text


# NOTE(review): Python 2-only hack that changes the process-wide default
# encoding at import time. The functions above no longer need it, but other
# modules importing this one may rely on the side effect, so it is kept.
# On Python 3 the default is already utf-8 and the branch is never entered.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding(default_encoding)
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""monitor.py - collect host resource metrics and push them to the monitor server."""
from __future__ import division

import json
import os
import socket

import psutil

# Address of the monitoring server ('server_ip' is a placeholder).
SERVER_ADDR = ('server_ip', 9999)
# Length of the CPU sampling window passed to psutil.cpu_percent().
CPU_SAMPLE_SECONDS = 5


def getMonitor():
    """Collect host metrics and return them as a JSON string.

    The JSON object has the keys ``type`` (always 1), ``ip``, ``name``,
    ``cpu_load``, ``cpu_use``, ``mem_use`` and ``disk_use``.  The collected
    dict is also printed.  Blocks for ``CPU_SAMPLE_SECONDS`` while sampling
    CPU usage.
    """
    # Host identity
    name = socket.getfqdn(socket.gethostname())
    ip = socket.gethostbyname(name)

    # Memory: psutil reports ``percent`` as the share of memory in use.  The
    # previous available/total ratio was the *free* share, which inverted
    # the server-side ``mem_use > 85`` alert.
    mem_use = int(psutil.virtual_memory().percent)

    # CPU: 5-minute load average, plus utilisation averaged over all CPUs.
    cpu_load = os.getloadavg()[1]
    per_cpu = psutil.cpu_percent(interval=CPU_SAMPLE_SECONDS, percpu=True)
    # Divide by the number of samples actually returned; psutil.cpu_count()
    # may return None.
    cpu_use = sum(per_cpu) / len(per_cpu) if per_cpu else 0.0

    # Disk: usage of the root filesystem only.
    disk_use = psutil.disk_usage('/').percent

    data = {
        "type": 1,
        "ip": ip,
        "name": name,
        "cpu_load": cpu_load,
        "cpu_use": cpu_use,
        "mem_use": mem_use,
        "disk_use": disk_use,
    }
    print(str(data))
    # json.dumps always yields valid JSON; swapping quotes in str(dict) does not.
    return json.dumps(data)


def main():
    """Collect the metrics, then send them to ``SERVER_ADDR`` over TCP."""
    # Sample first so the connection is not held open idle during the
    # CPU sampling window.
    payload = getMonitor().encode('utf-8')
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # Open the connection; a connect failure still propagates, as before.
        s.connect(SERVER_ADDR)
        try:
            s.sendall(payload)
        except socket.error as e:
            # Sending is best-effort: report the error and carry on.
            print(e)
    finally:
        s.close()


if __name__ == "__main__":
    main()
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Check a list of systemd services and report to the monitor server if any is down."""
from __future__ import division

import json
import os
import socket
import subprocess

try:
    import commands  # noqa: F401 - unused here; kept so the module namespace is unchanged
except ImportError:
    # The ``commands`` module does not exist on Python 3.
    commands = None

# Address of the monitoring server ('server_ip' is a placeholder).
SERVER_ADDR = ('server_ip', 9999)

# Services to monitor
service_lists = ['config.service', 'xxx.service', 'xxx.service']


def check_status(service_name):
    """Query the state of a service managed through ``systemctl``.

    Runs ``sudo systemctl status <service_name>`` with stdout discarded.

    :param service_name: unit name, e.g. ``'config.service'``.
    :returns: the command's exit code; callers treat any non-zero value as
        "down".  Returns 127 if the command could not be started at all.
    """
    # An argv list avoids shell interpretation of the service name.
    argv = ['sudo', 'systemctl', 'status', service_name]
    try:
        with open(os.devnull, 'w') as devnull:
            return subprocess.call(argv, stdout=devnull)
    except OSError:
        return 127


def get_status(service_lists, check=None):
    """Build the service report as a JSON string.

    Report types: ``type == 1`` hardware monitoring, ``type == 2`` service
    monitoring.

    :param service_lists: iterable of service names to check.
    :param check: optional callable ``name -> int`` used instead of
        :func:`check_status`; a non-zero result marks the service as down.
    :returns: JSON object with ``type`` 2, ``status`` (1 if any service is
        down, else 0) and one ``"<service>": "down"`` entry per failed
        service.  The dict is also printed.
    """
    if check is None:
        check = check_status
    data = {"type": 2, "status": 0}
    for service in service_lists:
        if check(service) != 0:
            data[service] = "down"
            data["status"] = 1
    print(str(data))
    # json.dumps always yields valid JSON; swapping quotes in str(dict) breaks
    # on names that contain a quote character.
    return json.dumps(data)


def main():
    """Check the configured services and notify the server only on failure."""
    data = get_status(service_lists)
    if json.loads(data)["status"] != 1:
        return
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # Open the connection; a connect failure still propagates, as before.
        s.connect(SERVER_ADDR)
        try:
            s.sendall(data.encode('utf-8'))
        except socket.error as e:
            # Sending is best-effort: report the error and carry on.
            print(e)
    finally:
        s.close()


if __name__ == "__main__":
    main()
客戶端
在 SaltStack 服務器上通過 crontab 定時執行監控腳本:
*/5 * * * * salt '*' cmd.script salt://scripts/monitor.py python_shell=true
*/5 * * * * salt '*' cmd.script salt://scripts/monitor_service_status.py python_shell=true
服務器
加入系統進程,偵聽 TCP 端口。