zabbix短信報警信息統計以及報表展現

需求

因爲咱們的業務報警比較頻繁,以前是針對每一個報警進行具體處理,可是有時還會重複出現,或者後續處理有時忘記跟進等,所以進行報警短信的統計,能夠針對一些問題與業務跟進,明確後續的優化方向等。

實現

實現原理以下圖:
zabbix短信報警信息統計以及報表展現zabbix短信報警信息統計以及報表展現

其中核心部分zbx_statis,其實就是我編寫的一個python腳本,它會從zabbixDB中查詢過去一週的全部報警信息,並按不一樣維度統計每週的報表上傳到公司的git上,同時將一條彙總的sql插入到cmdb的庫表中展現。

報警格式依賴

報表的分析統計能夠分兩個維度:

報警類型維度;

業務維度;

無論從哪一個維度進行的統計,都須要一個前提:報警格式規範化。

針對報警內容的需求,咱們對zabbix的trigger名稱、主機名hostname等進行了規範化。

舉例:

[17][15:31:04][productname-test-mysql-00][PROBLEM][005][cpu idle too low (<30%)][0.10 %][負責人:張學巖][15:31:07]

productname-test-mysql-00 是主機名,按業務等級進行命名,用於報警統計的業務緯度統計;

cpu idle too low (<30%) 是報警的類型,能夠據此項進行類型緯度的統計;

經過維護一個主要業務列表,而後根據hostname匹配能夠從業務緯度進行統計;

經過將報警類型規範化,用固定的格式放在報警信息的固定位置,能夠按類型進行統計。

報表展現

如下是截取的部分報表的展現。

按報警類型緯度:
zabbix短信報警信息統計以及報表展現zabbix短信報警信息統計以及報表展現

最下面的詳細信息跳轉即業務緯度的統計。

按業務緯度:
zabbix短信報警信息統計以及報表展現zabbix短信報警信息統計以及報表展現

CMDB統計圖表:
zabbix短信報警信息統計以及報表展現zabbix短信報警信息統計以及報表展現

很直觀的展現每週的報警數量,若是優化比較好的話,會看到總體應該是降低的趨勢。

附件

上文提到的報警統計 python 腳本,寫的時間比較久了,如今看內容仍是比較雜亂,我也懶得改了,放出來供你們參考,內容以下:

#!/usr/bin/env python26
# encoding: utf-8

import MySQLdb
import traceback
import copy
import datetime
import time
import operator

import sys
# Python 2-only workaround: reload(sys) is called so that
# sys.setdefaultencoding is available again, then the interpreter-wide default
# encoding is switched to UTF-8. The rest of the script relies on this when it
# mixes byte-string literals containing Chinese text with values passed
# through str().
# NOTE(review): neither reload() as a builtin nor sys.setdefaultencoding
# exists on Python 3 -- these lines must be removed if the script is ported.
reload(sys)
sys.setdefaultencoding( "utf-8" )

# Connection parameters of the Zabbix database the alerts are read from.
HOST = 'zabbix_db_host'
DB = 'zabbix'
PORT = 3306

# Upper bound of the connection-attempt loop in Connection.get_connection.
RETRY_TIMES = 3

# Business groups. An alert belongs to the first group whose name occurs as a
# substring of the alert subject (see alert_statistic).
GROUP_TYPE = [
    'A',
    'B',
    'C',
    '...',
    'X',
    'Y',
]

# Directory (relative to the working directory) the markdown reports go to.
BASE_DIR = 'alerts_statistic/'

# The reporting window is the previous calendar week: [Monday of last week,
# Monday of this week). The clock is read exactly once so that START_TIME and
# END_TIME are always derived from the same day, even if the script happens to
# start right around midnight.
_NOW = datetime.datetime.now()
_THIS_MONDAY = _NOW - datetime.timedelta(days=_NOW.weekday())

START_TIME = (_THIS_MONDAY - datetime.timedelta(days=7)).strftime("%Y%m%d")
END_TIME = _THIS_MONDAY.strftime("%Y%m%d")

# Totals of daytime / night-time alerts; filled in by write_group().
DAY_SUM = 0
NIGHT_SUM = 0

class Connection:
    """Builds MySQLdb connections with fixed credentials and a retry loop.

    Positional and keyword arguments are stored and later forwarded unchanged
    to ``MySQLdb.connect``.
    """

    def __init__(self, *args, **kwargs):
        """Remember the connect arguments and fill in the fixed settings.

        ``user``, ``passwd`` and ``connect_timeout`` are always overwritten;
        ``port`` and ``db`` only get a default when the caller omitted them.
        """
        self.args = args
        self.kwargs = kwargs
        self.kwargs['user'] = "user"
        self.kwargs['passwd'] = "password"
        self.kwargs.setdefault('port', 3306)
        self.kwargs.setdefault('db', "information_schema")
        self.kwargs['connect_timeout'] = 1

    def get_connection(self):
        """Try to connect up to RETRY_TIMES times.

        Returns:
            dict with the keys ``errno`` (0 on success, -1 on failure),
            ``errmsg`` (host followed by the last error text, empty on
            success) and ``value`` (the connection object, or None).
        """
        ret = {"errno": 0, 'errmsg': "", 'value': None}
        last_err = None

        for _ in range(RETRY_TIMES):
            # The try lives inside the loop: connect() signals failure by
            # raising, so a try around the whole loop would abort after the
            # first failed attempt and never retry.
            try:
                conn = MySQLdb.connect(*self.args, **self.kwargs)
            except Exception as err:
                last_err = err
                traceback.print_exc()
                continue
            if conn:
                ret['value'] = conn
                return ret

        # Every attempt failed. Report through 'errno', the key that
        # create_connection() actually inspects.
        ret['errno'] = -1
        ret['errmsg'] = str(self.kwargs.get('host', '')) + str(last_err)
        return ret

def create_connection(*args, **kwargs):
    """Open a database connection, or return None when that is not possible.

    All arguments are handed to ``Connection``; the status dict produced by
    its ``get_connection()`` is reduced to either the connection object
    (``errno`` is zero) or None (``errno`` is non-zero).
    """
    outcome = Connection(*args, **kwargs).get_connection()
    return None if outcome['errno'] else outcome['value']

def get_alert():
    """Fetch the alerts sent between START_TIME and END_TIME from Zabbix.

    Returns:
        The rows returned by ``cursor.fetchall()`` -- one
        ``(from_unixtime(clock), subject)`` pair per distinct subject, ordered
        by clock -- or None when no connection could be opened or the query
        failed (the traceback is printed in the latter case).
    """
    # Both bounds are local midnight of the respective day.
    start_timestamp = int(time.mktime(datetime.datetime.strptime(START_TIME, "%Y%m%d").timetuple()))
    end_timestamp = int(time.mktime(datetime.datetime.strptime(END_TIME, "%Y%m%d").timetuple()))

    conn = None
    try:
        conn = create_connection(host=HOST, db=DB, port=PORT, charset='utf8')
        if not conn:
            return None

        # The two values interpolated below are integers computed above, never
        # external input, so plain formatting is safe here.
        SQL = """select from_unixtime(a.clock),a.subject
                        from alerts a,events b left join triggers c on b.objectid=c.triggerid
                        where a.eventid=b.eventid and a.alerttype=0 and a.subject not like '%test-%' and a.subject not like '%-test%'
                        and a.clock>={start_time} and a.clock< {end_time} group by a.subject order by a.clock"""
        SQL = SQL.format(start_time=start_timestamp, end_time=end_timestamp)
        print(SQL)

        cursor = conn.cursor()
        try:
            cursor.execute(SQL)
            return cursor.fetchall()
        finally:
            cursor.close()
    except Exception:
        # Stay best-effort (the caller treats None as "nothing to report"),
        # but leave a trace instead of failing silently.
        traceback.print_exc()
        return None
    finally:
        # Close the connection on every path, including a failed query.
        if conn is not None:
            conn.close()

def alert_statistic(alert_list):
    """Bucket alerts by business group.

    Each alert is a sequence whose element 1 is the subject text. Groups are
    tried in GROUP_TYPE order; an alert lands in the first group whose name is
    a substring of its subject. Alerts that match no group are stored under
    'other'.

    Returns:
        ``{'status': 1}`` when *alert_list* is empty or None; otherwise a dict
        with one list per group, an 'other' list and ``'status': 0``.
    """
    if not alert_list:
        return {'status': 1}

    unassigned = list(alert_list)
    buckets = {}
    for group in GROUP_TYPE:
        # Claim the matching alerts, then keep only the rest for later groups.
        buckets[group] = [item for item in unassigned if group in item[1]]
        unassigned = [item for item in unassigned if group not in item[1]]

    buckets['other'] = unassigned
    buckets['status'] = 0
    return buckets

def write_to_file(result, day):
    """Write the per-group alert detail report as a markdown file.

    Args:
        result: dict mapping group name -> list of alerts, where alert[0] is
            the alert time and alert[1] the subject. A 'status' entry, if
            present, is ignored. Nothing is written when *result* is empty
            or None.
        day: label appended to the file name and shown in the heading.

    The file is created as
    ``BASE_DIR + 'detail/' + START_TIME-END_TIME_<day>.md``.
    """
    if not result:
        return

    # 'status' is a flag, not a list of alerts: leave it out of both the total
    # and the per-group tables.
    groups = [(name, alerts) for name, alerts in result.items() if name != 'status']
    alert_sum = sum(len(alerts) for _, alerts in groups)

    file_name = BASE_DIR + 'detail/' + START_TIME + '-' + END_TIME + '_' + day + '.md'
    # 'with' guarantees the file is closed even if a write fails.
    with open(file_name, 'w') as writer:
        writer.write('## ' + START_TIME + ' - ' + END_TIME + ': ' + day)
        writer.write('\n\n**短信總數:' + str(alert_sum) + '**')
        for name, alerts in groups:
            if not alerts:
                continue
            writer.write('\n\n### ' + name + '(' + str(len(alerts)) + ')' + '\n\n')
            writer.write('|報警內容|報警時間|\n|---|---|\n')
            for alert in alerts:
                writer.write('|' + str(alert[1]) + '|' + str(alert[0]) + '|\n')

def day_night_split(result, day='light'):
    """Keep only the daytime or only the night-time alerts of every group.

    Args:
        result: dict mapping group name -> list of alerts; alert[0] must offer
            ``strftime``. The 'status' entry is dropped from the output.
        day: 'light' keeps alerts whose hour is >= 7, 'night' keeps those
            whose hour is < 7; any other value keeps every alert.

    Returns:
        A new dict of new lists, or None when *result* is empty or None.
    """
    if not result:
        return None

    def _wanted(alert):
        # The hour is extracted first, for every alert and every mode.
        in_daytime = int(alert[0].strftime("%H")) >= 7
        if day == 'light':
            return in_daytime
        if day == 'night':
            return not in_daytime
        return True

    filtered = {}
    for name, alerts in result.items():
        if name == 'status':
            continue
        filtered[name] = [alert for alert in alerts if _wanted(alert)]
    return filtered

def _parse_alert_subject(subject):
    """Return ``(alert_type, hostname)`` extracted from one alert subject.

    Subjects are '][' separated fields. Subjects with exactly nine fields
    carry a trigger id in front of the type, which moves the type from
    field 4 to field 5. Only the text before the first comma of the type field
    is used. Subjects with too few fields yield ``('unknown', None)``.
    """
    fields = subject.split('][')
    type_index = 5 if len(fields) == 9 else 4
    if len(fields) <= type_index:
        return 'unknown', None
    return fields[type_index].split(',')[0], fields[2]


def alert_groupby(alert_list):
    """Aggregate alerts by alert type.

    Args:
        alert_list: dict mapping group name -> list of alerts (alert[1] is the
            subject); the 'status' entry is ignored. Empty or None gives [].

    Returns:
        A list of dicts with the keys 'type', 'hostlist' (comma-joined
        distinct host names in first-seen order) and 'count' (a string),
        sorted by count in descending order; types with equal counts keep
        their first-seen order.
    """
    if not alert_list:
        return []

    seen_order = []   # alert types in the order they were first met
    per_type = {}     # alert type -> {'count': int, 'hosts': [hostname, ...]}

    for group, alerts in alert_list.items():
        if group == 'status':
            continue
        for alert in alerts:
            # One extraction rule is used for both discovering and counting,
            # so an alert is counted for exactly one type.
            alert_type, hostname = _parse_alert_subject(alert[1])
            entry = per_type.get(alert_type)
            if entry is None:
                entry = {'count': 0, 'hosts': []}
                per_type[alert_type] = entry
                seen_order.append(alert_type)
            entry['count'] += 1
            if hostname is not None and hostname not in entry['hosts']:
                entry['hosts'].append(hostname)

    group_list = []
    for alert_type in seen_order:
        entry = per_type[alert_type]
        group_list.append({
            'type': alert_type,
            'hostlist': ",".join(entry['hosts']),
            'count': str(entry['count']),
        })
    group_list.sort(key=lambda item: int(item['count']), reverse=True)
    return group_list

def _write_alert_section(writer, title, groups, detail_file):
    """Write one summary section (heading, table, detail link); return its total."""
    total = sum(int(group['count']) for group in groups)
    writer.write('### ' + title + str(total) + '\n\n')
    writer.write("|報警類型|報警數量|報警主機|\n|---|---|---|\n")
    for group in groups:
        writer.write("|" + group['type'] + "|" + group['count'] + "|" + group['hostlist'] + "|\n")
    writer.write("\n[詳細報警信息](detail/" + detail_file + ")\n\n")
    return total


def write_group(group_light, group_night):
    """Write the weekly per-type summary and record the day / night totals.

    Args:
        group_light: list of dicts as returned by alert_groupby() for the
            daytime alerts (keys 'type', 'count', 'hostlist').
        group_night: the same for the night-time alerts.

    Side effects:
        Creates ``BASE_DIR + START_TIME-END_TIME.md`` and stores the two
        totals in the module globals DAY_SUM and NIGHT_SUM.

    Nothing is written only when BOTH lists are empty. A week in which just
    one half of the day had alerts still gets its summary (with an empty table
    for the quiet half), so the totals read afterwards are not left at zero.
    """
    global DAY_SUM, NIGHT_SUM

    group_light = group_light or []
    group_night = group_night or []
    if not (group_light or group_night):
        return

    period = START_TIME + '-' + END_TIME
    file_name = BASE_DIR + period + '.md'
    file_detail_light = period + '_白天' + '.md'
    file_detail_night = period + '_夜間' + '.md'

    # 'with' guarantees the file is closed even if a write fails.
    with open(file_name, 'w') as writer:
        writer.write('## ' + period + '\n\n')
        DAY_SUM = _write_alert_section(writer, '白天:', group_light, file_detail_light)
        NIGHT_SUM = _write_alert_section(writer, '夜間:', group_night, file_detail_night)


def write_trend(sql, params=None):
    """Execute one statement against the CMDB database and commit it.

    Args:
        sql: the statement to run.
        params: optional sequence/dict of values handed to
            ``cursor.execute`` as query arguments. When given, *sql* must use
            the driver's placeholders and any literal '%' in it must be
            doubled. The default (None) runs *sql* verbatim.

    Returns:
        The result of ``cursor.fetchall()``, or None when no connection could
        be opened or the statement failed (the error is printed).
    """
    host = 'cmdb_host'
    db = 'cmdb_db'
    port = 3306

    conn = None
    try:
        conn = create_connection(host=host, db=db, port=port, charset='utf8')
        if not conn:
            return None

        print(sql)
        cursor = conn.cursor()
        try:
            if params is None:
                cursor.execute(sql)
            else:
                cursor.execute(sql, params)
            ret = cursor.fetchall()
        finally:
            cursor.close()
        conn.commit()
        return ret
    except Exception as e:
        print(e)
        return None
    finally:
        # Close the connection on every path, including a failed statement.
        if conn is not None:
            conn.close()


def git_push():
    """Stage, commit and push the generated reports through the shell.

    Returns:
        The value returned by ``os.system`` (0 when the whole command chain
        succeeded). A non-zero value is also reported on stdout. Note that
        ``git commit`` itself exits non-zero when there is nothing to commit.
    """
    import os
    # NOTE(review): the reports are written below BASE_DIR
    # ('alerts_statistic/') while this command works on 'alerts_statis' --
    # confirm that both refer to the intended checkout.
    status = os.system("cd alerts_statis && git add alerts_statis && git commit -m 'update' && git push")
    if status != 0:
        print('git push failed, os.system returned ' + str(status))
    return status

def main():
    """Build last week's alert reports, push them and record the totals."""
    # Fetch the alert list and bucket it by business group.
    result = alert_statistic(get_alert())
    if result['status'] != 0:
        print("There's no alert warning or something error.")
        return

    # Separate daytime from night-time alerts.
    result_day = day_night_split(result, 'light')
    result_night = day_night_split(result, 'night')

    # Aggregate each half by alert type.
    light_alert = alert_groupby(result_day)
    night_alert = alert_groupby(result_night)

    # Write the summary and the two detail files, then publish them.
    write_group(light_alert, night_alert)
    write_to_file(result_day, '白天')
    write_to_file(result_night, '夜間')
    git_push()

    # Record the weekly totals in the CMDB. DAY_SUM / NIGHT_SUM are read here,
    # after write_group() has filled them in.
    sql = ''.join([
        'insert into alerts (start_time,end_time,all_count,day_count,night_count)values("',
        START_TIME,
        '","',
        END_TIME,
        '",',
        str(DAY_SUM + NIGHT_SUM),
        ',',
        str(DAY_SUM),
        ',',
        str(NIGHT_SUM),
        ');',
    ])
    write_trend(sql)


if __name__ == '__main__':
    main()

本文地址:https://www.linuxprobe.com/zabbix-sms-alarm.html

相關文章
相關標籤/搜索