1. 四個配置文件
[root@kubemaster01 alertmanager]# ls -l
-rw-r--r-- 1 root root  676 Oct 28 15:43 alertmanager-configmap.yaml
-rw-r--r-- 1 root root 2183 Oct 28 15:36 alertmanager-deployment.yaml
-rw-r--r-- 1 root root  331 Oct 28 15:36 alertmanager-pvc.yaml
-rw-r--r-- 1 root root  372 Oct 28 15:36 alertmanager-service.yaml
2. 修改 PVC 以及 config 的地址
# [root@kubemaster01 alertmanager]# cat alertmanager-pvc.yaml
#
# PersistentVolumeClaim named "alertmanager" in kube-system: requests 2Gi,
# ReadWriteOnce, from the "managed-nfs-storage" StorageClass. Change
# storageClassName to a StorageClass that exists in your cluster.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: alertmanager
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: EnsureExists
spec:
  storageClassName: managed-nfs-storage
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: "2Gi"
---
# [root@kubemaster01 alertmanager]# cat alertmanager-configmap.yaml
#
# ConfigMap carrying alertmanager.yml. It defines one e-mail receiver
# ("default-receiver") and routes every alert to it.
#
# Values to adapt: smtp_smarthost, smtp_from, smtp_auth_username,
# smtp_auth_password and the receiver's "to" address.
# NOTE(review): smtp_auth_password is stored here in plain text and looks like
# a placeholder; substitute your own credential and do not commit a real one.
# NOTE(review): repeat_interval is 1m, so a still-firing alert is re-sent about
# every minute; presumably chosen for demonstration, confirm before production.
#
# Comments are kept outside the block scalar on purpose so that the bytes of
# alertmanager.yml itself are unchanged.
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: EnsureExists
data:
  alertmanager.yml: |
    global:
      resolve_timeout: 5m
      smtp_smarthost: 'smtp.163.com:25'
      smtp_from: 'ww763004768@163.com'
      smtp_auth_username: 'ww763004768@163.com'
      smtp_auth_password: '123456'
      smtp_require_tls: false
    receivers:
    - name: default-receiver
      email_configs:
      - to: "w673004768@163.com"
    route:
      group_interval: 1m
      group_wait: 10s
      receiver: default-receiver
      repeat_interval: 1m
3. 部署
# Apply the four Alertmanager manifests as separate commands, in the original
# order: configuration, storage claim, deployment, service.
# (Written on one line they would be parsed as a single kubectl invocation
# with stray "kubectl apply" arguments.)
kubectl apply -f alertmanager-configmap.yaml
kubectl apply -f alertmanager-pvc.yaml
kubectl apply -f alertmanager-deployment.yaml
kubectl apply -f alertmanager-service.yaml
4. Prometheus 和 Alertmanager 通信配置
修改 Prometheus 的配置 ConfigMap，然後重新應用（apply）。
5. 查看是否生效
6. 修改 ConfigMap：修改 Prometheus 的報警規則
(kubectl apply -f prometheus-configmap.yaml)
建立 ConfigMap
# Create or update the alerting-rules manifest.
kubectl apply -f prometheus-rules.yaml
# [root@kubemaster01 prometheus]# cat prometheus-rules.yaml
#
# ConfigMap "prometheus-rules" (kube-system) holding two Prometheus rule files.
#
# general.rules
#   InstanceDown         up == 0 for 1m                          severity: error
# node.rules
#   NodeFilesystemUsage  ext4/xfs filesystem usage > 80% for 1m  severity: warning
#   NodeMemoryUsage      memory usage > 80% for 1m               severity: warning
#                        (MemFree + Cached + Buffers count as available)
#   NodeCPUUsage         non-idle CPU > 60% per instance for 1m  severity: warning
#
# The InstanceDown description previously said "5 minutes" although the rule
# fires after `for: 1m`; the text now matches the rule. If a 5-minute grace
# period is what you want, change `for:` to 5m and the text back instead.
#
# Comments are kept outside the block scalars so the rule-file contents stay
# free of extra lines.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules
  namespace: kube-system
data:
  general.rules: |
    groups:
    - name: general.rules
      rules:
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: error
        annotations:
          summary: "Instance {{ $labels.instance }} 中止工作"
          description: "{{ $labels.instance }} job {{ $labels.job }} 已經中止1分鐘以上."
  node.rules: |
    groups:
    - name: node.rules
      rules:
      - alert: NodeFilesystemUsage
        expr: 100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} : {{ $labels.mountpoint }} 分區使用率太高"
          description: "{{ $labels.instance }}: {{ $labels.mountpoint }} 分區使用大於80% (當前值: {{ $value }})"
      - alert: NodeMemoryUsage
        expr: 100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} 內存使用率太高"
          description: "{{ $labels.instance }}內存使用大於80% (當前值: {{ $value }})"
      - alert: NodeCPUUsage
        expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 60
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} CPU使用率太高"
          description: "{{ $labels.instance }}CPU使用大於60% (當前值: {{ $value }})"
prometheus服務掛載configmap