#!/bin/bash
#
# Watchdog for Kubernetes jobs in a single namespace.
#
# Once per poll interval, list the pods in NAMESPACE and count the output
# lines that mention both "job" and "error" (case-insensitive). When at least
# one such line exists, append a message to LOG_FILE and SIGKILL every process
# whose command line matches MONITOR_PATTERN.
#
# NOTE(review): the kill is presumably aimed at this script itself, so that an
# external process-liveness check notices the missing process and raises the
# alert — confirm the script is deployed under a name matching MONITOR_PATTERN.

readonly NAMESPACE="weifeng"
readonly MONITOR_PATTERN="k8s_job_status_monitor"
readonly LOG_FILE="/tmp/k8s_job_error_no.log"
readonly POLL_INTERVAL_SECONDS=60

while true; do
  # grep -c always prints a number (0 when kubectl prints nothing or fails),
  # so the arithmetic comparison below never sees an empty value.
  job_error_count=$(kubectl get pod -n "${NAMESPACE}" | grep -i "job" | grep -ci error)

  if (( job_error_count > 0 )); then
    # Write the log entry BEFORE killing: if the pattern matches this script,
    # SIGKILL terminates it immediately and nothing after the kill would run.
    printf '%s\n' "k8s job running is not stable " >> "${LOG_FILE}"

    # pkill -f matches against the full command line and never matches itself,
    # replacing the ps | grep | grep -v grep | awk | xargs kill pipeline.
    pkill -9 -f "${MONITOR_PATTERN}"
  fi

  sleep "${POLL_INTERVAL_SECONDS}"
done
若 k8s 集羣 job 狀態出現 error,腳本自動 kill 掉本身的 monitor 進程,透過阿里雲的雲監控進程監控來觸發報警。
阿里雲雲監控 進程監控 文檔:http://www.javashuo.com/article/p-shznrfkw-hz.html