設置告警和通知的主要步驟以下:
1、部署Alertmanager
2、配置Prometheus與Alertmanager通訊
3、配置告警
1. prometheus指定rules目錄
2. configmap存儲告警規則
3. configmap掛載到容器rules目錄node
已經修改好的配置文件web
apiVersion: v1 kind: ConfigMap metadata: # 配置文件名稱 name: alertmanager-config namespace: kube-system labels: kubernetes.io/cluster-service: "true" addonmanager.kubernetes.io/mode: EnsureExists data: alertmanager.yml: | global: resolve_timeout: 5m # 告警自定義郵件 smtp_smarthost: 'smtp.163.com:25' smtp_from: 'baojingtongzhi@163.com' smtp_auth_username: 'baojingtongzhi@163.com' smtp_auth_password: 'liang123' receivers: - name: default-receiver email_configs: - to: "zhenliang369@163.com" route: group_interval: 1m group_wait: 10s receiver: default-receiver repeat_interval: 1m
apiVersion: apps/v1 kind: Deployment metadata: name: alertmanager namespace: kube-system labels: k8s-app: alertmanager kubernetes.io/cluster-service: "true" addonmanager.kubernetes.io/mode: Reconcile version: v0.14.0 spec: replicas: 1 selector: matchLabels: k8s-app: alertmanager version: v0.14.0 template: metadata: labels: k8s-app: alertmanager version: v0.14.0 annotations: scheduler.alpha.kubernetes.io/critical-pod: '' spec: priorityClassName: system-cluster-critical containers: - name: prometheus-alertmanager image: "prom/alertmanager:v0.14.0" imagePullPolicy: "IfNotPresent" args: - --config.file=/etc/config/alertmanager.yml - --storage.path=/data - --web.external-url=/ ports: - containerPort: 9093 readinessProbe: httpGet: path: /#/status port: 9093 initialDelaySeconds: 30 timeoutSeconds: 30 volumeMounts: - name: config-volume mountPath: /etc/config - name: storage-volume mountPath: "/data" subPath: "" resources: limits: cpu: 10m memory: 50Mi requests: cpu: 10m memory: 50Mi - name: prometheus-alertmanager-configmap-reload image: "jimmidyson/configmap-reload:v0.1" imagePullPolicy: "IfNotPresent" args: - --volume-dir=/etc/config - --webhook-url=http://localhost:9093/-/reload volumeMounts: - name: config-volume mountPath: /etc/config readOnly: true resources: limits: cpu: 10m memory: 10Mi requests: cpu: 10m memory: 10Mi volumes: - name: config-volume configMap: name: alertmanager-config - name: storage-volume persistentVolumeClaim: claimName: alertmanager
apiVersion: v1 kind: PersistentVolumeClaim metadata: name: alertmanager namespace: kube-system labels: kubernetes.io/cluster-service: "true" addonmanager.kubernetes.io/mode: EnsureExists spec: # 使用本身的動態PV storageClassName: managed-nfs-storage accessModes: - ReadWriteOnce resources: requests: storage: "2Gi"
apiVersion: v1 kind: PersistentVolumeClaim metadata: name: alertmanager namespace: kube-system labels: kubernetes.io/cluster-service: "true" addonmanager.kubernetes.io/mode: EnsureExists spec: # 使用本身的動態PV storageClassName: managed-nfs-storage accessModes: - ReadWriteOnce resources: requests: storage: "2Gi" [root@Master1 ~]# cat alertmanager-service.yaml apiVersion: v1 kind: Service metadata: name: alertmanager namespace: kube-system labels: kubernetes.io/cluster-service: "true" addonmanager.kubernetes.io/mode: Reconcile kubernetes.io/name: "Alertmanager" spec: ports: - name: http port: 80 protocol: TCP targetPort: 9093 selector: k8s-app: alertmanager type: "ClusterIP"
一、建立pvc、configmap、deployment、servicevim
kubectl apply -f alertmanager-pvc.yaml kubectl create -f alertmanager-configmap.yaml kubectl apply -f alertmanager-deployment.yaml kubectl apply -f alertmanager-service.yaml
二、查看Pod狀態api
kubectl get pod -n kube-systemapp
NAME READY STATUS RESTARTS AGE
alertmanager-5bb796cb48-fwztv 2/2 Running 0 2m29s
三、查看service狀態
kubectl get svc -n kube-systemide
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
alertmanager ClusterIP 10.0.0.126 <none> 80/TCP
一、編輯 prometheus-configmap.yaml 配置文件添加綁定信息url
# Prometheus configuration format https://prometheus.io/docs/prometheus/latest/configuration/configuration/ apiVersion: v1 kind: ConfigMap metadata: name: prometheus-config namespace: kube-system labels: kubernetes.io/cluster-service: "true" addonmanager.kubernetes.io/mode: EnsureExists data: # 存放prometheus配置文件 prometheus.yml: | # 配置採集目標 scrape_configs: - job_name: prometheus static_configs: - targets: # 採集自身 - localhost:9090 prometheus.yml: | # 配置採集目標 scrape_configs: - job_name: kubernetes-nodes static_configs: - targets: # 採集自身 - 192.168.1.110:9100 - 192.168.1.111:9100 # 採集:Apiserver 生存指標 # 建立的job name 名稱爲 kubernetes-apiservers - job_name: kubernetes-apiservers # 基於k8s的服務發現 kubernetes_sd_configs: - role: endpoints # 使用通訊標記標籤 relabel_configs: # 保留正則匹配標籤 - action: keep # 已經包含 regex: default;kubernetes;https source_labels: - __meta_kubernetes_namespace - __meta_kubernetes_service_name - __meta_kubernetes_endpoint_port_name # 使用方法爲https、默認http scheme: https tls_config: # promethus訪問Apiserver使用認證 ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt # 跳過https認證 insecure_skip_verify: true # promethus訪問Apiserver使用認證 bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token # 採集:Kubelet 生存指標 - job_name: kubernetes-nodes-kubelet kubernetes_sd_configs: # 發現集羣中全部的Node - role: node relabel_configs: # 經過regex獲取關鍵信息 - action: labelmap regex: __meta_kubernetes_node_label_(.+) scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt insecure_skip_verify: true bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token # 採集:nodes-cadvisor 信息 - job_name: kubernetes-nodes-cadvisor kubernetes_sd_configs: - role: node relabel_configs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) # 重命名標籤 - target_label: __metrics_path__ replacement: /metrics/cadvisor scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt insecure_skip_verify: true bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token # 採集:service-endpoints 信息 - job_name: kubernetes-service-endpoints # 選定指標 kubernetes_sd_configs: - role: endpoints relabel_configs: - action: keep regex: true # 指定源標籤 source_labels: - __meta_kubernetes_service_annotation_prometheus_io_scrape - action: replace regex: (https?) source_labels: - __meta_kubernetes_service_annotation_prometheus_io_scheme # 重命名標籤採集 target_label: __scheme__ - action: replace regex: (.+) source_labels: - __meta_kubernetes_service_annotation_prometheus_io_path target_label: __metrics_path__ - action: replace regex: ([^:]+)(?::\d+)?;(\d+) replacement: $1:$2 source_labels: - __address__ - __meta_kubernetes_service_annotation_prometheus_io_port target_label: __address__ - action: labelmap regex: __meta_kubernetes_service_label_(.+) - action: replace source_labels: - __meta_kubernetes_namespace target_label: kubernetes_namespace - action: replace source_labels: - __meta_kubernetes_service_name target_label: kubernetes_name # 採集:kubernetes-services 服務指標 - job_name: kubernetes-services kubernetes_sd_configs: - role: service # 黑盒探測,探測IP與端口是否可用 metrics_path: /probe params: module: - http_2xx relabel_configs: - action: keep regex: true source_labels: - __meta_kubernetes_service_annotation_prometheus_io_probe - source_labels: - __address__ target_label: __param_target # 使用 blackbox進行黑盒探測 - replacement: blackbox target_label: __address__ - source_labels: - __param_target target_label: instance - action: labelmap regex: __meta_kubernetes_service_label_(.+) - source_labels: - __meta_kubernetes_namespace target_label: kubernetes_namespace - source_labels: - __meta_kubernetes_service_name target_label: kubernetes_name # 採集: kubernetes-pods 信息 - job_name: kubernetes-pods kubernetes_sd_configs: - role: pod relabel_configs: - action: keep regex: true source_labels: # 只保留採集的信息 - __meta_kubernetes_pod_annotation_prometheus_io_scrape - action: replace regex: (.+) source_labels: - __meta_kubernetes_pod_annotation_prometheus_io_path target_label: __metrics_path__ - action: replace regex: ([^:]+)(?::\d+)?;(\d+) replacement: $1:$2 source_labels: # 採集地址 - __address__ # 採集端口 - __meta_kubernetes_pod_annotation_prometheus_io_port target_label: __address__ - action: labelmap regex: __meta_kubernetes_pod_label_(.+) - action: replace source_labels: - __meta_kubernetes_namespace target_label: kubernetes_namespace - action: replace source_labels: - __meta_kubernetes_pod_name target_label: kubernetes_pod_name alerting: # 告警配置文件 alertmanagers: # 修改:使用靜態綁定 - static_configs: # 修改:targets、指定地址與端口 - targets: ["alertmanager:80"]
二、應用加載配置文件spa
kubectl apply -f prometheus-configmap.yaml
三、web控制檯查看配置是否生效3d
http://192.168.1.110:42575/config
一、編輯 prometheus-configmap.yaml 添加報警信息code
apiVersion: v1 kind: ConfigMap metadata: name: prometheus-config namespace: kube-system labels: kubernetes.io/cluster-service: "true" addonmanager.kubernetes.io/mode: EnsureExists data: prometheus.yml: | # 添加:指定讀取rules配置 rules_files: - /etc/config/rules/*.rules ......
二、生效配置文件
kubectl apply -f prometheus-configmap.yaml
一、建立yaml文件同過configmap存儲告警規則
vim prometheus-rules.yaml
apiVersion: v1 kind: ConfigMap metadata: name: prometheus-rules namespace: kube-system data: # 通用角色 general.rules: | groups: - name: general.rules rules: - alert: InstanceDown expr: up == 0 for: 1m labels: severity: error annotations: summary: "Instance {{ $labels.instance }} 中止工做" description: "{{ $labels.instance }} job {{ $labels.job }} 已經中止5分鐘以上." # Node對全部資源的監控 node.rules: | groups: - name: node.rules rules: - alert: NodeFilesystemUsage expr: 100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 80 for: 1m labels: severity: warning annotations: summary: "Instance {{ $labels.instance }} : {{ $labels.mountpoint }} 分區使用率太高" description: "{{ $labels.instance }}: {{ $labels.mountpoint }} 分區使用大於80% (當前值: {{ $value }})" - alert: NodeMemoryUsage expr: 100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 80 for: 1m labels: severity: warning annotations: summary: "Instance {{ $labels.instance }} 內存使用率太高" description: "{{ $labels.instance }}內存使用大於80% (當前值: {{ $value }})" - alert: NodeCPUUsage expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 60 for: 1m labels: severity: warning annotations: summary: "Instance {{ $labels.instance }} CPU使用率太高" description: "{{ $labels.instance }}CPU使用大於60% (當前值: {{ $value }})"
一、修改掛載點位置,使用以前部署的prometheus動態PV
vim prometheus-statefulset.yaml
apiVersion: apps/v1 kind: StatefulSet metadata: name: prometheus # 部署命名空間 namespace: kube-system labels: k8s-app: prometheus kubernetes.io/cluster-service: "true" addonmanager.kubernetes.io/mode: Reconcile version: v2.2.1 spec: serviceName: "prometheus" replicas: 1 podManagementPolicy: "Parallel" updateStrategy: type: "RollingUpdate" selector: matchLabels: k8s-app: prometheus template: metadata: labels: k8s-app: prometheus annotations: scheduler.alpha.kubernetes.io/critical-pod: '' spec: priorityClassName: system-cluster-critical serviceAccountName: prometheus # 初始化容器 initContainers: - name: "init-chown-data" image: "busybox:latest" imagePullPolicy: "IfNotPresent" command: ["chown", "-R", "65534:65534", "/data"] volumeMounts: - name: prometheus-data mountPath: /data subPath: "" containers: - name: prometheus-server-configmap-reload image: "jimmidyson/configmap-reload:v0.1" imagePullPolicy: "IfNotPresent" args: - --volume-dir=/etc/config - --webhook-url=http://localhost:9090/-/reload volumeMounts: - name: config-volume mountPath: /etc/config readOnly: true resources: limits: cpu: 10m memory: 10Mi requests: cpu: 10m memory: 10Mi - name: prometheus-server # 主要使用鏡像 image: "prom/prometheus:v2.2.1" imagePullPolicy: "IfNotPresent" args: - --config.file=/etc/config/prometheus.yml - --storage.tsdb.path=/data - --web.console.libraries=/etc/prometheus/console_libraries - --web.console.templates=/etc/prometheus/consoles - --web.enable-lifecycle ports: - containerPort: 9090 readinessProbe: # 健康檢查 httpGet: path: /-/ready port: 9090 initialDelaySeconds: 30 timeoutSeconds: 30 livenessProbe: httpGet: path: /-/healthy port: 9090 initialDelaySeconds: 30 timeoutSeconds: 30 # based on 10 running nodes with 30 pods each resources: limits: cpu: 200m memory: 1000Mi requests: cpu: 200m memory: 1000Mi # 數據卷 volumeMounts: - name: config-volume mountPath: /etc/config - name: prometheus-data mountPath: /data # 添加:指定rules的configmap配置文件名稱 - name: prometheus-rules mountPath: /etc/config/rules subPath: "" terminationGracePeriodSeconds: 300 volumes: - name: config-volume configMap: name: prometheus-config # 添加:name rules - name: prometheus-rules # 添加:配置文件 configMap: # 添加:定義文件名稱 name: prometheus-rules volumeClaimTemplates: - metadata: name: prometheus-data spec: # 使用動態PV storageClassName: managed-nfs-storage accessModes: - ReadWriteOnce resources: requests: storage: "16Gi"
二、建立configmap並更新PV
kubectl apply -f prometheus-rules.yaml
kubectl apply -f prometheus-statefulset.yaml
三、查看Pod
kubectl get pod -n kube-system
NAME READY STATUS RESTARTS AGE
prometheus-0 1/2 Running 0 42s