需求
因爲咱們的業務報警比較頻繁,以前是針對每一個報警進行具體處理,可是有時還會重複出現,或者後續處理有時忘記跟進等,所以進行報警短信的統計,能夠針對一些問題與業務跟進,明確後續的優化方向等。
實現
實現原理如下圖:
其中核心部分 zbx_statis,其實就是我編寫的一個 python 腳本,它會從 zabbix DB 中查詢過去一週的全部報警信息,並按不同維度統計每週的報表上傳到公司的 git 上,同時將一條彙總的 sql 插入到 cmdb 的庫表中展現。
報警格式依賴
報表的分析統計能夠分兩個維度:
報警類型維度;
業務維度;
無論從哪一個維度進行的統計,都須要一個前提:報警格式規範化。unix
針對報警內容的需求,咱們對zabbix的trigger名稱、主機名hostname等進行了規範化。
舉例:
[17][15:31:04][productname-test-mysql-00][PROBLEM][005][cpu idle too low (<30%)][0.10 %][負責人:張學巖][15:31:07]
productname-test-mysql-00 是主機名,按業務等級進行命名,用於報警統計的業務維度統計;
cpu idle too low (<30%) 是報警的類型,能夠據此項進行類型維度的統計;
經過維護一個主要業務列表,而後根據hostname匹配能夠從業務維度進行統計;
經過將報警類型規範化,用固定的格式放在報警信息的固定位置,能夠按類型進行統計。
報表展現
如下是截取的部分報表的展現。
按報警類型維度:
最下面的詳細信息跳轉即業務緯度的統計。
按業務維度:
CMDB統計圖表:
很直觀的展現每週的報警數量,若是優化比較好的話,會看到總體應該是降低的趨勢。
附件
上文提到的報警統計 python 腳本,寫的時間比較久了,如今看內容仍是比較雜亂,我也懶得改了,放出來供你們參考,內容以下:
#!/usr/bin/env python26
# encoding: utf-8
"""Weekly Zabbix alert statistics.

Reads the alerts of the previous calendar week (Monday 00:00 up to, but not
including, the current week's Monday 00:00) from the Zabbix database, groups
them by business group and by alert type, writes Markdown reports below
``BASE_DIR``, pushes them with git and inserts one summary row into the CMDB
database.

The syntax is kept valid for both Python 2.6 and Python 3.
"""
import MySQLdb
import traceback
import copy
import datetime
import time
import operator
import os
import sys

# Python 2 only: the report writers mix byte strings with the unicode values
# returned by the database, which relies on UTF-8 being the implicit codec.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding("utf-8")

HOST = 'zabbix_db_host'
DB = 'zabbix'
PORT = 3306
RETRY_TIMES = 3
# Business groups; an alert belongs to a group when the group name occurs
# anywhere in its subject.
GROUP_TYPE = ['A', 'B', 'C', '...', 'X', 'Y', ]
BASE_DIR = 'alerts_statistic/'
# Monday of the previous week and Monday of the current week, as YYYYMMDD.
START_TIME = (datetime.datetime.now() - datetime.timedelta(days=(7 + datetime.datetime.now().weekday()))).strftime("%Y%m%d")
END_TIME = (datetime.datetime.now() - datetime.timedelta(days=(datetime.datetime.now().weekday()))).strftime("%Y%m%d")
# Alerts at or after this hour count as daytime, earlier ones as night.
DAY_START_HOUR = 7
# Totals filled in by write_group() and read when building the CMDB insert.
DAY_SUM = 0
NIGHT_SUM = 0


class Connection(object):
    """Hold MySQL connection settings and open a connection with retries."""

    def __init__(self, *args, **kwargs):
        """Store the arguments that are later handed to ``MySQLdb.connect``.

        ``user``, ``passwd`` and ``connect_timeout`` are always overwritten
        here; ``port`` and ``db`` only get a default when the caller did not
        supply them.
        """
        self.args = args
        self.kwargs = kwargs
        self.kwargs['user'] = "user"
        self.kwargs['passwd'] = "password"
        self.kwargs.setdefault('port', 3306)
        self.kwargs.setdefault('db', "information_schema")
        self.kwargs['connect_timeout'] = 1

    def get_connection(self):
        """Try to connect up to ``RETRY_TIMES`` times.

        Returns a dict with ``errno`` (0 on success, -1 when the last attempt
        failed), ``errmsg`` (host plus error text) and ``value`` (the
        connection, or None).
        """
        ret = {"errno": 0, 'errmsg': "", 'value': None}
        for _ in range(RETRY_TIMES):
            # The try is per attempt so that a failure moves on to the next
            # attempt instead of ending the loop.
            try:
                conn = MySQLdb.connect(*self.args, **self.kwargs)
            except Exception as err:
                ret['errno'] = -1
                ret['errmsg'] = str(self.kwargs.get('host', '')) + str(err)
                traceback.print_exc()
                continue
            if conn:
                ret['errno'] = 0
                ret['errmsg'] = ""
                ret['value'] = conn
                break
        return ret


def create_connection(*args, **kwargs):
    """Return an open MySQL connection, or None when connecting failed."""
    ret = Connection(*args, **kwargs).get_connection()
    if ret['errno']:
        return None
    return ret['value']


def _midnight_timestamp(day):
    """Convert a ``YYYYMMDD`` string to the local-time epoch of its 00:00:00."""
    moment = datetime.datetime.strptime(day + ' 00:00:00', "%Y%m%d %H:%M:%S")
    return int(time.mktime(moment.timetuple()))


def get_alert():
    """Fetch ``(time, subject)`` rows for the alerts of the reporting week.

    Returns the rows from ``cursor.fetchall()``, or None when the connection
    or the query failed (the error is printed, not raised).
    """
    start_timestamp = _midnight_timestamp(START_TIME)
    end_timestamp = _midnight_timestamp(END_TIME)
    conn = create_connection(host=HOST, db=DB, port=PORT, charset='utf8')
    if not conn:
        return None
    # Only integers are substituted; the statement is executed without
    # parameters, so the literal '%' in the LIKE patterns needs no escaping.
    SQL = """select from_unixtime(a.clock),a.subject from alerts a,events b left join triggers c on b.objectid=c.triggerid where a.eventid=b.eventid and a.alerttype=0 and a.subject not like '%test-%' 
and a.subject not like '%-test%' and a.clock>={start_time} and a.clock< {end_time} group by a.subject order by a.clock"""
    SQL = SQL.format(start_time=start_timestamp, end_time=end_timestamp)
    print(SQL)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(SQL)
            return cursor.fetchall()
        finally:
            cursor.close()
    except Exception:
        traceback.print_exc()
        return None
    finally:
        conn.close()


def alert_statistic(alert_list):
    """Partition alerts by business group.

    Each alert goes to the first entry of ``GROUP_TYPE`` whose name occurs in
    the subject; unmatched alerts go to ``'other'``. The result also carries
    ``'status'``: 0 when there were alerts, 1 when ``alert_list`` was empty or
    None (in which case no group keys are present).
    """
    if not alert_list:
        return {'status': 1}
    result = {}
    remaining = list(alert_list)
    for group in GROUP_TYPE:
        result[group] = [alert for alert in remaining if group in alert[1]]
        remaining = [alert for alert in remaining if group not in alert[1]]
    result['other'] = remaining
    result['status'] = 0
    return result


def write_to_file(result, day):
    """Write the per-group detail report for one half of the day.

    ``result`` maps group name to a list of ``(time, subject)`` rows; ``day``
    is the label used in the file name and the heading. Nothing is written
    when ``result`` is empty.
    """
    if not result:
        return
    file_name = BASE_DIR + 'detail/' + START_TIME + '-' + END_TIME + '_' + day + '.md'
    # 'status' is bookkeeping, not a group, so it is excluded from both the
    # total and the sections.
    groups = [(name, alerts) for name, alerts in result.items() if name != 'status']
    alert_sum = sum(len(alerts) for _, alerts in groups)
    with open(file_name, 'w') as writer:
        writer.write('## ' + START_TIME + ' - ' + END_TIME + ': ' + day)
        writer.write('\n\n**短信總數:' + str(alert_sum) + '**')
        for name, alerts in groups:
            if not alerts:
                continue
            writer.write('\n\n### ' + name + '(' + str(len(alerts)) + ')' + '\n\n')
            writer.write('|報警內容|報警時間|\n|---|---|\n')
            for alert in alerts:
                writer.write('|' + str(alert[1]) + '|' + str(alert[0]) + '|\n')


def day_night_split(result, day='light'):
    """Keep only the daytime (``'light'``) or night (``'night'``) alerts.

    Daytime means the alert hour is >= ``DAY_START_HOUR``. The ``'status'``
    key is dropped. Any other ``day`` value keeps every alert.
    """
    results = {}
    if not result:
        return results
    for group in result:
        if group == 'status':
            continue
        kept = []
        for alert in result[group]:
            is_daytime = int(alert[0].strftime("%H")) >= DAY_START_HOUR
            if day == 'light' and not is_daytime:
                continue
            if day == 'night' and is_daytime:
                continue
            kept.append(alert)
        results[group] = kept
    return results


def _parse_subject(subject):
    """Return ``(hostname, alert_type)`` extracted from an alert subject.

    Subjects are ``][``-separated. Nine-field subjects carry the trigger id
    in field 4 and the type in field 5; otherwise the type is field 4. The
    hostname is field 2. A subject too short to hold a type is reported as
    type ``'unknown'`` so that one malformed row cannot abort the report.
    """
    parts = subject.split('][')
    if len(parts) < 5:
        hostname = parts[2] if len(parts) > 2 else ''
        return hostname, 'unknown'
    type_index = 5 if len(parts) == 9 else 4
    return parts[2], parts[type_index].split(',')[0]


def alert_groupby(alert_list):
    """Aggregate alerts by type.

    Returns a list of dicts with ``type``, ``hostlist`` (comma-joined unique
    hostnames in first-seen order) and ``count`` (as a string), sorted by
    count in descending order.
    """
    counts = {}
    hosts = {}
    order = []  # alert types in first-seen order
    for group in alert_list:
        if group == 'status':
            continue
        for alert in alert_list[group]:
            hostname, alert_type = _parse_subject(alert[1])
            if alert_type not in counts:
                counts[alert_type] = 0
                hosts[alert_type] = []
                order.append(alert_type)
            counts[alert_type] += 1
            if hostname not in hosts[alert_type]:
                hosts[alert_type].append(hostname)
    group_list = [
        {
            'type': alert_type,
            'hostlist': ",".join(hosts[alert_type]),
            'count': str(counts[alert_type]),
        }
        for alert_type in order
    ]
    group_list.sort(key=lambda item: int(item['count']), reverse=True)
    return group_list


def _write_type_section(writer, title, groups, detail_file):
    """Write one day/night section of the summary and return its alert total."""
    total = sum(int(group['count']) for group in groups)
    writer.write('### ' + title + str(total) + '\n\n')
    writer.write("|報警類型|報警數量|報警主機|\n|---|---|---|\n")
    for group in groups:
        writer.write("|" + group['type'] + "|" + group['count'] + "|" + group['hostlist'] + "|\n")
    writer.write("\n[詳細報警信息](detail/" + detail_file + ")\n\n")
    return total


def write_group(group_light, group_night):
    """Write the weekly summary report and record the day/night totals.

    ``group_light`` and ``group_night`` are the lists produced by
    ``alert_groupby``. The report is written as long as at least one of them
    has entries, so a week without night (or without daytime) alerts is still
    reported and counted. Sets the module globals ``DAY_SUM`` and
    ``NIGHT_SUM``.
    """
    global DAY_SUM, NIGHT_SUM
    if not group_light and not group_night:
        return
    file_name = BASE_DIR + START_TIME + '-' + END_TIME + '.md'
    file_detail_light = START_TIME + '-' + END_TIME + '_白天' + '.md'
    file_detail_night = START_TIME + '-' + END_TIME + '_夜間' + '.md'
    with open(file_name, 'w') as writer:
        writer.write('## ' + START_TIME + '-' + END_TIME + '\n\n')
        DAY_SUM = _write_type_section(writer, '白天:', group_light or [], file_detail_light)
        NIGHT_SUM = _write_type_section(writer, '夜間:', group_night or [], file_detail_night)


def write_trend(sql):
    """Execute ``sql`` against the CMDB database and commit.

    Returns the rows from ``cursor.fetchall()``, or None when the connection
    or the statement failed (the error is printed, not raised).
    """
    host = 'cmdb_host'
    db = 'cmdb_db'
    port = 3306
    conn = create_connection(host=host, db=db, port=port, charset='utf8')
    if not conn:
        return None
    try:
        print(sql)
        cursor = conn.cursor()
        try:
            cursor.execute(sql)
            ret = cursor.fetchall()
        finally:
            cursor.close()
        conn.commit()
        return ret
    except Exception as e:
        print(e)
        return None
    finally:
        conn.close()


def git_push():
    """Commit and push the generated reports with git."""
    # NOTE(review): the command works in 'alerts_statis' while the reports are
    # written below BASE_DIR ('alerts_statistic/') - confirm the layout.
    status = os.system("cd alerts_statis && git add alerts_statis && git commit -m 'update' && git push")
    if status != 0:
        print('git push exited with status ' + str(status))


def main():
    """Run the weekly statistics end to end."""
    # Alert list for the reporting week.
    alert_list = get_alert()
    # Group by business.
    result = alert_statistic(alert_list)
    if result['status'] != 0:
        print('There\'s no alert warning or something error.')
        return
    # Split into daytime and night.
    result_day = day_night_split(result, 'light')
    result_night = day_night_split(result, 'night')
    # Aggregate by alert type.
    light_alert = alert_groupby(result_day)
    night_alert = alert_groupby(result_night)
    # Write the reports.
    write_group(light_alert, night_alert)
    write_to_file(result_day, '白天')
    write_to_file(result_night, '夜間')
    git_push()
    sql = 'insert into alerts (start_time,end_time,all_count,day_count,night_count)values("' + START_TIME + '","' + END_TIME + '",' + str(DAY_SUM + NIGHT_SUM) + ',' + str(DAY_SUM) + ',' + str(NIGHT_SUM) + ');'
    # Store the weekly totals in the CMDB.
    write_trend(sql)


if __name__ == '__main__':
    main()