openshift 容器雲從入門到崩潰之九《容器監控-報警》

容器狀態監控

主要是監控POD的狀態包括重啓、不健康等等這些k8s api 狀態自己會報出來,在配合zabbix報警node

導入zabbix模板關聯上oc master主機express

<?xml version="1.0" encoding="UTF-8"?>
<zabbix_export>
    <version>3.2</version>
    <date>2019-02-27T07:33:05Z</date>
    <groups>
        <group>
            <name>Templates</name>
        </group>
    </groups>
    <templates>
        <template>
            <template>OC Pods</template>
            <name>OC Pods</name>
            <description/>
            <groups>
                <group>
                    <name>Templates</name>
                </group>
            </groups>
            <applications>
                <application>
                    <name>restartCount</name>
                </application>
                <application>
                    <name>RunningStatus</name>
                </application>
            </applications>
            <items/>
            <discovery_rules>
                <discovery_rule>
                    <name>OC Pods Discover</name>
                    <type>0</type>
                    <snmp_community/>
                    <snmp_oid/>
                    <key>oc.pod.status[discover,discover]</key>
                    <delay>300</delay>
                    <status>0</status>
                    <allowed_hosts/>
                    <snmpv3_contextname/>
                    <snmpv3_securityname/>
                    <snmpv3_securitylevel>0</snmpv3_securitylevel>
                    <snmpv3_authprotocol>0</snmpv3_authprotocol>
                    <snmpv3_authpassphrase/>
                    <snmpv3_privprotocol>0</snmpv3_privprotocol>
                    <snmpv3_privpassphrase/>
                    <delay_flex/>
                    <params/>
                    <ipmi_sensor/>
                    <authtype>0</authtype>
                    <username/>
                    <password/>
                    <publickey/>
                    <privatekey/>
                    <port/>
                    <filter>
                        <evaltype>0</evaltype>
                        <formula/>
                        <conditions/>
                    </filter>
                    <lifetime>7</lifetime>
                    <description/>
                    <item_prototypes>
                        <item_prototype>
                            <name>Pod {#POD_NAME} Get Status</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.pod.status[{#POD_NAME},get_status]</key>
                            <delay>300</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>0</status>
                            <value_type>4</value_type>
                            <allowed_hosts/>
                            <units/>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>RunningStatus</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>Pod {#POD_NAME} Restarts</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.pod.status[{#POD_NAME},restarts]</key>
                            <delay>300</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>0</status>
                            <value_type>4</value_type>
                            <allowed_hosts/>
                            <units/>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>restartCount</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>Pod {#POD_NAME} Running</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.pod.status[{#POD_NAME},running]</key>
                            <delay>300</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>0</status>
                            <value_type>4</value_type>
                            <allowed_hosts/>
                            <units/>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>RunningStatus</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                    </item_prototypes>
                    <trigger_prototypes>
                        <trigger_prototype>
                            <expression>{OC Pods:oc.pod.status[{#POD_NAME},running].str(Running_true)}=0&#13;
and&#13;
{OC Pods:oc.pod.status[{#POD_NAME},running].str(Pod deleted)}=0</expression>
                            <recovery_mode>0</recovery_mode>
                            <recovery_expression/>
                            <name>Pod {#POD_NAME} Not Running</name>
                            <correlation_mode>0</correlation_mode>
                            <correlation_tag/>
                            <url/>
                            <status>0</status>
                            <priority>1</priority>
                            <description/>
                            <type>0</type>
                            <manual_close>1</manual_close>
                            <dependencies/>
                            <tags/>
                        </trigger_prototype>
                        <trigger_prototype>
                            <expression>{OC Pods:oc.pod.status[{#POD_NAME},restarts].str(Warning)}=1</expression>
                            <recovery_mode>1</recovery_mode>
                            <recovery_expression>{OC Pods:oc.pod.status[{#POD_NAME},restarts].str(Warning,#3)}=0</recovery_expression>
                            <name>Pod {#POD_NAME} restarted Warning</name>
                            <correlation_mode>0</correlation_mode>
                            <correlation_tag/>
                            <url/>
                            <status>0</status>
                            <priority>1</priority>
                            <description/>
                            <type>0</type>
                            <manual_close>1</manual_close>
                            <dependencies/>
                            <tags/>
                        </trigger_prototype>
                    </trigger_prototypes>
                    <graph_prototypes/>
                    <host_prototypes/>
                </discovery_rule>
            </discovery_rules>
            <httptests/>
            <macros/>
            <templates/>
            <screens/>
        </template>
    </templates>
</zabbix_export>

zabbix客戶端配置json

修改zabbix_agentd.confapi

Timeout=30
UserParameter=oc.pod.status[*],/data/app/zabbix/etc/oc_pod_monitor.sh $1 $2

oc_pod_monitor.sh內容bash

#!/bin/bash
TOKEN=""
ENDPOINT=""
POD_NAME="`echo "$1" |sed 's/.*=\(.*$\)/\1/'`"
Monitoring_type="$2"
WORKSPACE="/data/tmp/oc_monitor"
mkdir -p $WORKSPACE

#經過pod name得到pod所在的namespace
NAMESPACE="`jq -r '.items |.[] |.metadata |.name,.namespace' $WORKSPACE/all_pods.json |grep -A1 $POD_NAME |grep -v $POD_NAME`"

#驗證pod是否存在
if [ "$POD_NAME" == "discover" ]; then
  echo
elif [ ! -n "$NAMESPACE" ]; then
  echo "Pod deleted"
  exit 0
fi
##自動發現
case $Monitoring_type in
   discover)
     #獲取全部pod只保留pod name
     curl -k \
       -H "Authorization: Bearer $TOKEN" \
       -H 'Accept: application/json' \
       https://$ENDPOINT/api/v1/pods 2>/dev/null  > $WORKSPACE/all_pods.json

     Pod_Name=(`jq -r '.items | .[] | .metadata | .name' $WORKSPACE/all_pods.json |egrep -v 'build|deploy|debug'`)
     #轉換爲json格式
     printf "{\n"
     printf '\t"data":[\n'
     for ((i=0;i<${#Pod_Name[@]};i++))
     do
        NAMESPACE="`jq -r '.items |.[] |.metadata |.name,.namespace' $WORKSPACE/all_pods.json |grep -A1 ${Pod_Name[i]} |grep -v ${Pod_Name[i]}`"
        Pod_Name_N=""$NAMESPACE"="${Pod_Name[i]}""
        printf '\t\t{\n'
        num=$(echo $((${#Pod_Name[@]}-1)))
        if [ "$i" == ${num} ];
        then
                printf "\t\t\t\"{#POD_NAME}\":\"${Pod_Name_N}\"}\n"
        else
                printf "\t\t\t\"{#POD_NAME}\":\"${Pod_Name_N}\"},\n"
        fi
     done
     printf "\t]\n"
     printf "}\n"   
     exit 0
  ;;

   get_status)#獲取pod狀態以供全部項目調用
     curl -k \
       -H "Authorization: Bearer $TOKEN" \
       -H 'Accept: application/json' \
       https://${ENDPOINT}/api/v1/namespaces/$NAMESPACE/pods/$POD_NAME/status 2>/dev/null > $WORKSPACE/${NAMESPACE}-${POD_NAME}.status
     Pod_NotFound="`cat $WORKSPACE/${NAMESPACE}-${POD_NAME}.status |grep '"code": 404'`"
     if [ -n "$Pod_NotFound" ]; then
       echo "Pod_Status=NotFound"
       exit 0
     else
       echo "Success"
       exit 0 
     fi
   ;;
esac

#獲取pod狀態數據
if [ -f "$WORKSPACE/${NAMESPACE}-${POD_NAME}.status" ];then
   Pod_Status="`cat $WORKSPACE/${NAMESPACE}-${POD_NAME}.status`"
else
   echo "" > $WORKSPACE/${NAMESPACE}-${POD_NAME}.status
   Pod_Status="`cat $WORKSPACE/${NAMESPACE}-${POD_NAME}.status`"
fi

#處理Pod_Status的異常
if [ ! -n "$Pod_Status" ]; then  #處理Pod_Status的爲空的異常
   echo "Running_true Pod_Status=Null"
   exit 0
elif [ -n "`echo "$Pod_Status" |grep '"code": 404'`" ]; then  #處理pod不存在可是all_pods.json還沒更新的異常
   echo "Pod_Status=NotFound"
   exit 0
elif [ "`echo "$Pod_Status" |jq -r '.status |.phase'`" = "Pending" ]; then  #驗證容器是否在Pending狀態
   echo "Pending"
   exit 0
fi

#選擇要獲取的數據
case $Monitoring_type in
   restarts)#監控pod是否重啓過
     #判斷是不是新pod
     if [ ! -f "$WORKSPACE/${NAMESPACE}-${POD_NAME}.restartCount" ]; then
       echo "Warning New Pod"
       echo "0" > $WORKSPACE/${NAMESPACE}-${POD_NAME}.restartCount
       exit 0
     fi
    
     ##獲取上次的值
     A_line=`sed -n 1p $WORKSPACE/${NAMESPACE}-${POD_NAME}.restartCount`
     B_line_null="`sed -n 2p $WORKSPACE/${NAMESPACE}-${POD_NAME}.restartCount`"
     if [ ! -n "$B_line_null" ]; then  #處理有兩個restartCount值的pod
       B_line="0"
     else
       B_line=`sed -n 2p $WORKSPACE/${NAMESPACE}-${POD_NAME}.restartCount`
     fi
     Last_state=`expr $A_line + $B_line`
     ##
     
     ##獲取本次的值
     echo "$Pod_Status" |jq -r '.status |.containerStatuses |.[] |.restartCount' > $WORKSPACE/${NAMESPACE}-${POD_NAME}.restartCount
     A_line=`sed -n 1p $WORKSPACE/${NAMESPACE}-${POD_NAME}.restartCount`
     B_line_null="`sed -n 2p $WORKSPACE/${NAMESPACE}-${POD_NAME}.restartCount`"
     if [ ! -n "$B_line_null" ]; then  #處理有兩個restartCount值的pod
       B_line="0"
       else
       B_line=`sed -n 2p $WORKSPACE/${NAMESPACE}-${POD_NAME}.restartCount`
     fi
     Current_state=`expr $A_line + $B_line`
     ##
 
     #對比本次拿到的restartCount值與上此的restartCount值
     if [ "$Current_state" -gt "$Last_state" ]; then
       Restart_status="Warning restart_count=$Current_state"
     else
       Restart_status="Normal restart_count=$Current_state"
     fi
     echo "$Restart_status"
  ;;

   running)#監控pod的運行狀態和容器的狀態返回字符串
     
     #獲取pod和容器的狀態
     running_status=`echo "$Pod_Status" |jq -r '.status |.phase'`
     Container_status="`echo "$Pod_Status" |jq -r '.status |.containerStatuses |.[] |.ready' |grep false`"
     if [ ! -n "$Container_status" ]; then
        Container_status="_true"
     else
        Container_status="_false"
     fi
     echo "${running_status}${Container_status}"
  ;;
 
   *)
     echo "Error parameters"
     exit 0
  ;;

esac
exit 0

這樣POD重啓或者新建都會報出來app

集羣NODE節點監控

主要監控node節點的不健康狀態,還有lvm卷容量監控curl

導入zabbix模板關聯上oc master主機flex

<?xml version="1.0" encoding="UTF-8"?>
<zabbix_export>
    <version>3.2</version>
    <date>2019-02-27T07:47:32Z</date>
    <groups>
        <group>
            <name>Templates</name>
        </group>
    </groups>
    <templates>
        <template>
            <template>OC Node Status</template>
            <name>OC Node Status</name>
            <description/>
            <groups>
                <group>
                    <name>Templates</name>
                </group>
            </groups>
            <applications>
                <application>
                    <name>oc_node</name>
                </application>
            </applications>
            <items/>
            <discovery_rules>
                <discovery_rule>
                    <name>OC Nodes Discover</name>
                    <type>0</type>
                    <snmp_community/>
                    <snmp_oid/>
                    <key>oc.node.status[discover,discover]</key>
                    <delay>60</delay>
                    <status>0</status>
                    <allowed_hosts/>
                    <snmpv3_contextname/>
                    <snmpv3_securityname/>
                    <snmpv3_securitylevel>0</snmpv3_securitylevel>
                    <snmpv3_authprotocol>0</snmpv3_authprotocol>
                    <snmpv3_authpassphrase/>
                    <snmpv3_privprotocol>0</snmpv3_privprotocol>
                    <snmpv3_privpassphrase/>
                    <delay_flex/>
                    <params/>
                    <ipmi_sensor/>
                    <authtype>0</authtype>
                    <username/>
                    <password/>
                    <publickey/>
                    <privatekey/>
                    <port/>
                    <filter>
                        <evaltype>0</evaltype>
                        <formula/>
                        <conditions/>
                    </filter>
                    <lifetime>7</lifetime>
                    <description/>
                    <item_prototypes>
                        <item_prototype>
                            <name>Node {#NODE_NAME}  DiskPressure</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.node.status[{#NODE_NAME},DiskPressure]</key>
                            <delay>30</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>1</status>
                            <value_type>4</value_type>
                            <allowed_hosts/>
                            <units/>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>oc_node</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>Node {#NODE_NAME} Get Status</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.node.status[{#NODE_NAME},get_status]</key>
                            <delay>30</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>0</status>
                            <value_type>4</value_type>
                            <allowed_hosts/>
                            <units/>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications/>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>Node {#NODE_NAME}  MemoryPressure</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.node.status[{#NODE_NAME},MemoryPressure]</key>
                            <delay>30</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>1</status>
                            <value_type>4</value_type>
                            <allowed_hosts/>
                            <units/>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>oc_node</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>Node {#NODE_NAME}  Ready</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.node.status[{#NODE_NAME},node_ready]</key>
                            <delay>30</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>0</status>
                            <value_type>4</value_type>
                            <allowed_hosts/>
                            <units/>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>oc_node</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>Node {#NODE_NAME} CPU Limits</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.node.status[{#NODE_NAME},node_resources,cpu_limits]</key>
                            <delay>120</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>0</status>
                            <value_type>3</value_type>
                            <allowed_hosts/>
                            <units>%</units>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>oc_node</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>Node {#NODE_NAME} CPU Requests</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.node.status[{#NODE_NAME},node_resources,cpu_requests]</key>
                            <delay>120</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>0</status>
                            <value_type>3</value_type>
                            <allowed_hosts/>
                            <units>%</units>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>oc_node</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>Node {#NODE_NAME} Memory Limits</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.node.status[{#NODE_NAME},node_resources,memory_limits]</key>
                            <delay>120</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>0</status>
                            <value_type>3</value_type>
                            <allowed_hosts/>
                            <units>%</units>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>oc_node</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>Node {#NODE_NAME} Memory Requests</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.node.status[{#NODE_NAME},node_resources,memory_requests]</key>
                            <delay>120</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>0</status>
                            <value_type>3</value_type>
                            <allowed_hosts/>
                            <units>%</units>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>oc_node</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>Node {#NODE_NAME}  OutOfDisk</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.node.status[{#NODE_NAME},OutOfDisk]</key>
                            <delay>30</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>1</status>
                            <value_type>4</value_type>
                            <allowed_hosts/>
                            <units/>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>oc_node</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                    </item_prototypes>
                    <trigger_prototypes>
                        <trigger_prototype>
                            <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_resources,cpu_limits].last()}&gt;150</expression>
                            <recovery_mode>0</recovery_mode>
                            <recovery_expression/>
                            <name>Node {#NODE_NAME} CPU Limits 150%</name>
                            <correlation_mode>0</correlation_mode>
                            <correlation_tag/>
                            <url/>
                            <status>0</status>
                            <priority>1</priority>
                            <description/>
                            <type>0</type>
                            <manual_close>1</manual_close>
                            <dependencies/>
                            <tags/>
                        </trigger_prototype>
                        <trigger_prototype>
                            <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_resources,cpu_requests].last()}&gt;100</expression>
                            <recovery_mode>0</recovery_mode>
                            <recovery_expression/>
                            <name>Node {#NODE_NAME} CPU Requests 100%</name>
                            <correlation_mode>0</correlation_mode>
                            <correlation_tag/>
                            <url/>
                            <status>0</status>
                            <priority>2</priority>
                            <description/>
                            <type>0</type>
                            <manual_close>1</manual_close>
                            <dependencies/>
                            <tags/>
                        </trigger_prototype>
                        <trigger_prototype>
                            <expression>{OC Node Status:oc.node.status[{#NODE_NAME},DiskPressure].str(DiskPressure_False)}=0</expression>
                            <recovery_mode>0</recovery_mode>
                            <recovery_expression/>
                            <name>Node {#NODE_NAME} DiskPressure</name>
                            <correlation_mode>0</correlation_mode>
                            <correlation_tag/>
                            <url/>
                            <status>1</status>
                            <priority>5</priority>
                            <description/>
                            <type>0</type>
                            <manual_close>1</manual_close>
                            <dependencies/>
                            <tags/>
                        </trigger_prototype>
                        <trigger_prototype>
                            <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_resources,memory_limits].last()}&gt;150</expression>
                            <recovery_mode>0</recovery_mode>
                            <recovery_expression/>
                            <name>Node {#NODE_NAME} Memory Limits 150%</name>
                            <correlation_mode>0</correlation_mode>
                            <correlation_tag/>
                            <url/>
                            <status>0</status>
                            <priority>1</priority>
                            <description/>
                            <type>0</type>
                            <manual_close>1</manual_close>
                            <dependencies/>
                            <tags/>
                        </trigger_prototype>
                        <trigger_prototype>
                            <expression>{OC Node Status:oc.node.status[{#NODE_NAME},MemoryPressure].str(MemoryPressure_False)}=0</expression>
                            <recovery_mode>0</recovery_mode>
                            <recovery_expression/>
                            <name>Node {#NODE_NAME} MemoryPressure</name>
                            <correlation_mode>0</correlation_mode>
                            <correlation_tag/>
                            <url/>
                            <status>1</status>
                            <priority>5</priority>
                            <description/>
                            <type>0</type>
                            <manual_close>1</manual_close>
                            <dependencies/>
                            <tags/>
                        </trigger_prototype>
                        <trigger_prototype>
                            <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_resources,memory_requests].last()}&gt;95</expression>
                            <recovery_mode>0</recovery_mode>
                            <recovery_expression/>
                            <name>Node {#NODE_NAME} Memory Requests 95%</name>
                            <correlation_mode>0</correlation_mode>
                            <correlation_tag/>
                            <url/>
                            <status>0</status>
                            <priority>2</priority>
                            <description/>
                            <type>0</type>
                            <manual_close>1</manual_close>
                            <dependencies/>
                            <tags/>
                        </trigger_prototype>
                        <trigger_prototype>
                            <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_ready].str(Ready_True)}=0</expression>
                            <recovery_mode>0</recovery_mode>
                            <recovery_expression/>
                            <name>Node {#NODE_NAME} Not Ready</name>
                            <correlation_mode>0</correlation_mode>
                            <correlation_tag/>
                            <url/>
                            <status>0</status>
                            <priority>5</priority>
                            <description/>
                            <type>0</type>
                            <manual_close>1</manual_close>
                            <dependencies/>
                            <tags/>
                        </trigger_prototype>
                        <trigger_prototype>
                            <expression>{OC Node Status:oc.node.status[{#NODE_NAME},OutOfDisk].str(OutOfDisk_False)}=0</expression>
                            <recovery_mode>0</recovery_mode>
                            <recovery_expression/>
                            <name>Node {#NODE_NAME} OutOfDisk</name>
                            <correlation_mode>0</correlation_mode>
                            <correlation_tag/>
                            <url/>
                            <status>1</status>
                            <priority>5</priority>
                            <description/>
                            <type>0</type>
                            <manual_close>1</manual_close>
                            <dependencies/>
                            <tags/>
                        </trigger_prototype>
                    </trigger_prototypes>
                    <graph_prototypes/>
                    <host_prototypes/>
                </discovery_rule>
            </discovery_rules>
            <httptests/>
            <macros/>
            <templates/>
            <screens/>
        </template>
    </templates>
</zabbix_export>

zabbix客戶端配置ui

修改zabbix_agentd.confurl

Timeout=30
UserParameter=oc.node.status[*],/data/app/zabbix/etc/oc_node_monitor.sh $1 $2 $3

oc_node_monitor.sh的內容

#!/bin/bash
TOKEN=""
ENDPOINT=""
NODE_NAME="$1"
Monitoring_type="$2"
WORKSPACE="/data/tmp/oc_monitor"
mkdir -p $WORKSPACE

case $Monitoring_type in
   discover)#自動發現節點
     Node_Name=(`curl -k \
                   -H "Authorization: Bearer $TOKEN" \
                   -H 'Accept: application/json' \
                    https://$ENDPOINT/api/v1/nodes 2>/dev/null |jq -r '.items|.[]|.metadata|.name'`)

     printf "{\n"
     printf '\t"data":[\n'
     for ((i=0;i<${#Node_Name[@]};i++))
     do
        printf '\t\t{\n'
        num=$(echo $((${#Node_Name[@]}-1)))
        if [ "$i" == ${num} ];
        then
                printf "\t\t\t\"{#NODE_NAME}\":\"${Node_Name[$i]}\"}\n"
        else
                printf "\t\t\t\"{#NODE_NAME}\":\"${Node_Name[$i]}\"},\n"
        fi
     done
     printf "\t]\n"
     printf "}\n"
     exit 0
;;
   get_status)#獲取node狀態以供全部項目調用
     curl -k \
       -H "Authorization: Bearer $TOKEN" \
       -H 'Accept: application/json' \
       https://${ENDPOINT}/api/v1/nodes/$NODE_NAME 2>/dev/null > $WORKSPACE/${NODE_NAME}.status
     if [ -n "`cat $WORKSPACE/${NODE_NAME}.status |grep '"code": 404'`" ]; then
       echo "Node_Status=NotFound"
       exit 0
     elif [ ! -n "`cat $WORKSPACE/${NODE_NAME}.status`" ]; then
       echo "Node_Status=null"
       exit 0
     else
       echo "Success"
       exit 0
     fi
   ;;
esac 

case $Monitoring_type in
   OutOfDisk)#監控node是否磁盤空間不足
     Node_Status="`cat $WORKSPACE/${NODE_NAME}.status |jq -r '.status|.conditions|.[]|.status' | sed -n 1p`"
     if [ "$Node_Status" == "False" ]; then 
       echo "OutOfDisk_False"
     elif [ ! -n "$Node_Status" ]; then
       echo "OutOfDisk_False"
     else
       echo "OutOfDisk_$Node_Status"
     fi
  ;;

   MemoryPressure)#監控node是否磁盤空間不足
     Node_Status="`cat $WORKSPACE/${NODE_NAME}.status |jq -r '.status|.conditions|.[]|.status' | sed -n 2p`"
     if [ "$Node_Status" == "False" ]; then
       echo "MemoryPressure_False"
     elif [ ! -n "$Node_Status" ]; then
       echo "MemoryPressure_False"
     else
       echo "MemoryPressure_$Node_Status"
     fi
  ;;
  
   DiskPressure)#監控node是否磁盤壓力太大
     Node_Status="`cat $WORKSPACE/${NODE_NAME}.status |jq -r '.status|.conditions|.[]|.status' | sed -n 3p`"
     if [ "$Node_Status" == "False" ]; then
       echo "DiskPressure_False"
     elif [ ! -n "$Node_Status" ]; then
       echo "DiskPressure_False"
     else
       echo "DiskPressure_$Node_Status"
     fi
  ;;

   node_ready)#監控node是否準備好了
     Node_Status="`cat $WORKSPACE/${NODE_NAME}.status |jq -r '.status|.conditions|.[]|.status' | sed -n 4p`"
     if [ "$Node_Status" == "True" ]; then
       echo "Ready_True"
     elif [ ! -n "$Node_Status" ]; then
       echo "Ready_True"
     else
       echo "Ready_$Node_Status"
     fi
  ;;

   node_resources)#監控node資源分配狀況
     null="`cat $WORKSPACE/${NODE_NAME}.resources |awk '{print $2}'`"
     if [ ! -n "$null" ]; then
        sleep 1
     fi
     if [ "$3" == "cpu_requests" ]; then
        data="`cat $WORKSPACE/${NODE_NAME}.resources |awk '{print $2}' |grep -o '[0-9]*'`"
        if [ $data -gt 0 ]; then
          echo $data
        else
          echo 0
        fi  
     elif [ "$3" == "cpu_limits" ]; then 
        data="`cat $WORKSPACE/${NODE_NAME}.resources |awk '{print $4}' |grep -o '[0-9]*'`"
        if [ $data -gt 0 ]; then
          echo $data
        else
          echo 0
        fi

     elif [ "$3" == "memory_requests" ]; then
        data="`cat $WORKSPACE/${NODE_NAME}.resources |awk '{print $6}' |grep -o '[0-9]*'`"
        if [ "$data" -gt 0 ]; then
          echo $data
        else
          echo 0
        fi 

     elif [ "$3" == "memory_limits" ]; then
        data="`cat $WORKSPACE/${NODE_NAME}.resources |awk '{print $8}' |grep -o '[0-9]*'`"
        if [ $data -gt 0 ]; then
          echo $data
        else
          echo 0
        fi 
     fi
  ;;
esac

crontab -e

*/2 * * * * /data/scripts/oc_master_crontab.sh >/dev/null 2>&1

oc_master_crontab.sh內容

node_name=(`oc get node |grep -v "NAME" |awk '{print $1}'`)
for ((i=0;i<${#node_name[*]};i++))
do
oc describe node "${node_name[i]}" |grep -B 1 "Events"  |grep -v "Events"  > /data/tmp/oc_monitor/${node_name[i]}.resources
chmod -R 777 /data/tmp/
done
相關文章
相關標籤/搜索