# Install:
#
# Write the Compose file for the monitoring stack: Prometheus, Alertmanager,
# the DingTalk webhook bridge and Grafana.
# The heredoc delimiter is quoted so the shell copies the body verbatim.
cat > docker-compose.yaml <<'EOF'
version: '2'
services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prom-server
    restart: unless-stopped
    ports:
      - "9090:9090"
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - /data/prom/server/config:/etc/prometheus
      - /data/prom/server/data:/prometheus
    command:
      - --config.file=/etc/prometheus/prometheus.yml
      - --storage.tsdb.path=/prometheus
      - --web.console.libraries=/etc/prometheus/console_libraries
      - --web.console.templates=/etc/prometheus/consoles

  alertmanager:
    image: prom/alertmanager:latest
    container_name: prom-alert
    restart: unless-stopped
    ports:
      - "9093:9093"
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - /data/prom/alertmanager:/etc/alertmanager
    command:
      - --config.file=/etc/alertmanager/alertmanager.yml
      - --storage.path=/alertmanager

  dingtalk:
    image: timonwong/prometheus-webhook-dingtalk
    container_name: prom-dingtalk
    restart: unless-stopped
    network_mode: host
    # NOTE(review): Docker normally ignores published ports when the container
    # uses host networking - confirm whether this entry is still wanted.
    ports:
      - "8060:8060"
    volumes:
      - /data/prom/alertmanager:/etc/alertmanager
    command:
      # NOTE(review): the webhook URL after "webhook1=" is empty in this copy;
      # append the real DingTalk robot URL before starting the service.
      - --ding.profile=webhook1=
      - --template.file=/etc/alertmanager/mydd.tmpl

  grafana:
    image: grafana/grafana:5.4.2
    container_name: grafana
    restart: unless-stopped
    ports:
      - "3000:3000"
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - /data/prom/grafana:/var/lib/grafana:rw
    env_file:
      - gf_env
EOF

# Environment file loaded by the grafana service (env_file: gf_env).
# NOTE(review): the admin password is hard-coded here; change it before use
# and keep this file out of version control.
cat > gf_env <<'EOF'
GF_SERVER_ROOT_URL=http://localhost:3000
GF_SECURITY_ADMIN_PASSWORD=zxcasd123
GF_USERS_ALLOW_SIGN_UP=false
EOF
# Main Prometheus configuration.
# The heredoc delimiter is quoted: the body contains a backtick span
# (`job=<job_name>`) that an unquoted heredoc would run as a command.
# NOTE(review): several target addresses and both "replacement" values are
# blank in this copy; fill in host:port for each before using the file.
cat > prometheus.yml <<'EOF'
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 30s # Evaluate rules every 30 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - 'alert.rules'

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'node'
    file_sd_configs:
      - files:
          - node_targets.yml

  - job_name: 'federate'
    honor_labels: true
    metrics_path: '/federate'
    params:
      'match[]':
        - '{job=~"kubernetes-.*"}'
    static_configs:
      - targets:
          -
        labels:
          instance: k8s-prom

  # - job_name: 'gluster'
  #   static_configs:
  #     - targets:
  #         -
  #       labels:
  #         instance: gluster-staging1
  #     - targets:
  #         -
  #       labels:
  #         instance: gluster-staging2

  - job_name: 'ping'
    scrape_interval: 1m
    metrics_path: /probe
    params:
      module: [icmp]
    file_sd_configs:
      - files:
          - ping_targets.yml
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - target_label: __address__
        replacement:  # blank in the source; set to the host:port that serves /probe

  - job_name: 'http_get_all'
    scrape_interval: 30s
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
          -
          -
          -
          -
          -
          -
          -
          # -
          -
          # -
          # -
          # -
          # -
          -
          -
          -
          # -
          -
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement:  # blank in the source; set to the host:port that serves /probe

  - job_name: 'ceph'
    honor_labels: true
    static_configs:
      - targets:
          -
          -
          -
        labels:
          cluster: ceph
EOF

# Alerting rules written for the metric names of node_exporter >= 0.16.
# The delimiter is quoted so "{{ $labels.* }}" and "{{ $value }}" reach the
# file intact instead of being expanded (to nothing) by the shell.
cat > alert.rules <<'EOF'
groups:
  - name: node.rules
    rules:
      - alert: nodeExporterDown
        expr: up{job="node"} == 0
        for: 10m
        labels:
          severity: page
        annotations:
          summary: "Prometheus could not scrape a node-exporter"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 10 minutes."

      - alert: nodeFilesystemUsage
        expr: (node_filesystem_size_bytes{device=~"rootfs|/dev/mapper/.*|/dev/vd.*"} - node_filesystem_free_bytes{device=~"rootfs|/dev/mapper/.*|/dev/vd.*"}) / node_filesystem_size_bytes{device=~"rootfs|/dev/mapper/.*|/dev/vd.*"} * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }}: High Filesystem usage detected"
          description: "{{ $labels.instance }}: Filesystem usage {{ $labels.device }} is above 80% ( current value is {{ $value | humanize }}%)"

      - alert: nodeMemoryUsage
        expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 > 85
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }}: High Memory usage detected"
          description: "{{ $labels.instance }}: Memory usage is above 85% ( current value is {{ $value | humanize }}%)"

      - alert: nodeCpuUsage
        expr: (100 - (avg by (instance) (irate(node_cpu_seconds_total{job="node",mode="idle"}[5m])) * 100)) > 85
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }}: High CPU usage detected"
          description: "{{ $labels.instance }}: CPU usage is above 85% ( current value is {{ $value | humanize }}%)"

      - alert: I/O_overloaded
        expr: (avg by (instance) (irate(node_cpu_seconds_total{job="node",mode="iowait"}[1m])) * 100) > 60
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }}: Disk I/O is overloaded"
          description: "{{ $labels.instance }}: Disk I/O is overloaded ( current value is {{ $value | humanize }}%)"

  - name: k8s.rules
    rules:
      - alert: k8sJobFailed
        # expr: kube_job_status_failed{job="kubernetes-service-endpoints",kubernetes_app="kube-state-metrics"}==1
        expr: kube_job_status_failed > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "K8S Job Failed"
          description: "Job {{ $labels.namespace }}/{{ $labels.job }} failed to complete"

      - alert: k8sNodeNotReady
        expr: kube_node_status_condition{condition="Ready",status="true"}==0
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "Node status is NotReady"
          description: "{{ $labels.node }} has set itself to NotReady, for more than half an hour"

      - alert: K8SManyNodesNotReady
        # The "and" operator keeps the left-hand sample, so $value is the
        # number of not-ready nodes, not a percentage.
        expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0) > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
        for: 1m
        labels:
          severity: critical
        annotations:
          description: "{{ $value | humanize }} Kubernetes nodes are not ready (more than 20% of all nodes)"

      - alert: k8sPVCNotUse
        expr: kube_persistentvolumeclaim_status_phase{phase="Lost"}==1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "PVC Not Use"
          description: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} not use."

      - alert: k8sPodFaild
        expr: kube_pod_status_phase{phase=~"Failed|Unknown", pod !~ "jenkins-slave.*"}==1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Pod start Faild"
          description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} boot failure."

      - alert: PodFrequentlyRestarting
        expr: increase(kube_pod_container_status_restarts_total{namespace !~ ".*test.*"}[15m]) >= 1
        for: 15m
        labels:
          severity: warning
        annotations:
          description: "Pod {{$labels.namespace}}/{{$labels.pod}} is was restarted {{$value | humanize}} times within the last 15m"
          summary: "Pod is restarting frequently"

  - name: net.rules
    rules:
      - alert: netDown
        expr: probe_success == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "The network impassability"
          description: "{{ $labels.job }} {{ $labels.instance }} is impassability "

  - name: ceph.rules
    rules:
      - alert: ceph_mon Down
        expr: ceph_mon_quorum_status == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Ceph monitor service down"
          description: "{{ $labels.ceph_daemon }} is down"

      - alert: ceph_osd Down
        expr: ceph_osd_up == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Ceph osd service down"
          description: "{{ $labels.ceph_daemon }} is down"
EOF

# File-based service discovery targets for the 'node' scrape job.
# NOTE(review): every target address is blank in this copy; add host:port
# under each "targets:" entry.
cat > node_targets.yml <<'EOF'
- targets:
    -
  labels:
    instance: k8s1
- targets:
    -
  labels:
    instance: k8s2
- targets:
    -
  labels:
    instance: k8s3
- targets:
    -
  labels:
    instance: k8s4
- targets:
    -
  labels:
    instance: k8s5
- targets:
    -
  labels:
    instance: k8s16
- targets:
    -
  labels:
    instance: k8s17
- targets:
    -
  labels:
    instance: k8s18
- targets:
    -
  labels:
    instance: graylog1
- targets:
    -
  labels:
    instance: graylog2
- targets:
    -
  labels:
    instance: k8s10
- targets:
    -
  labels:
    instance: k8s11
- targets:
    -
  labels:
    instance: k8s12
- targets:
    -
  labels:
    instance: k8s13
- targets:
    -
  labels:
    instance: k8s14
- targets:
    -
  labels:
    instance: devops
- targets:
    -
  labels:
    instance: registry
- targets:
    -
  labels:
    instance: hknexus
#- targets:
#    -
#  labels:
#    instance: ceph1
- targets:
    -
  labels:
    instance: ceph2
- targets:
    -
  labels:
    instance: ceph3
EOF

# Different node_exporter versions expose different metric names: the
# alert.rules file above is for node_exporter >= 0.16, the file below is the
# equivalent for node_exporter 0.15.
cat > alert.rules-0.15 <<'EOF'
groups:
  - name: node.rules
    rules:
      - alert: nodeExporterDown
        expr: up{job="node"} == 0
        for: 10m
        labels:
          severity: page
        annotations:
          summary: "Prometheus could not scrape a node-exporter"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 10 minutes."

      - alert: nodeFilesystemUsage
        expr: (node_filesystem_size{device=~"rootfs|/dev/mapper/.*|/dev/vd.*"} - node_filesystem_free{device=~"rootfs|/dev/mapper/.*|/dev/vd.*"}) / node_filesystem_size{device=~"rootfs|/dev/mapper/.*|/dev/vd.*"} * 100 > 85
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }}: High Filesystem usage detected"
          description: "{{ $labels.instance }}: Filesystem usage {{ $labels.device }} is above 85% ( current value is {{ $value | humanize }}%)"

      - alert: nodeMemoryUsage
        expr: (node_memory_MemTotal - (node_memory_MemFree + node_memory_Buffers + node_memory_Cached)) / node_memory_MemTotal * 100 > 85
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }}: High Memory usage detected"
          description: "{{ $labels.instance }}: Memory usage is above 85% ( current value is {{ $value | humanize }}%)"

      - alert: nodeCpuUsage
        expr: (100 - (avg by (instance) (irate(node_cpu{job="node",mode="idle"}[5m])) * 100)) > 85
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }}: High CPU usage detected"
          description: "{{ $labels.instance }}: CPU usage is above 85% ( current value is {{ $value | humanize }}%)"

      - alert: I/O_overloaded
        expr: (avg by (instance) (irate(node_cpu{job="node",mode="iowait"}[1m])) * 100) > 20
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }}: Disk I/O is overloaded"
          description: "{{ $labels.instance }}: Disk I/O is overloaded ( current value is {{ $value | humanize }}%)"

  - name: k8s.rules
    rules:
      - alert: k8sJobFailed
        # expr: kube_job_status_failed{job="kubernetes-service-endpoints",kubernetes_app="kube-state-metrics"}==1
        expr: kube_job_status_failed > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "K8S Job Failed"
          description: "Job {{ $labels.namespace }}/{{ $labels.job }} failed to complete"

      - alert: k8sNodeNotReady
        expr: kube_node_status_condition{condition="Ready",status="true"}==0
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "Node status is NotReady"
          description: "{{ $labels.node }} has set itself to NotReady, for more than half an hour"

      - alert: K8SManyNodesNotReady
        # The "and" operator keeps the left-hand sample, so $value is the
        # number of not-ready nodes, not a percentage.
        expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0) > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
        for: 1m
        labels:
          severity: critical
        annotations:
          description: "{{ $value | humanize }} Kubernetes nodes are not ready (more than 20% of all nodes)"

      - alert: k8sPVCNotUse
        expr: kube_persistentvolumeclaim_status_phase{phase="Lost"}==1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "PVC Not Use"
          description: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} not use."

      - alert: k8sPodFaild
        expr: kube_pod_status_phase{phase=~"Failed|Unknown", pod !~ "jenkins-slave.*"}==1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Pod start Faild"
          description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} boot failure."

      - alert: PodFrequentlyRestarting
        expr: increase(kube_pod_container_status_restarts_total{namespace !~ ".*test.*"}[15m]) >= 1
        for: 15m
        labels:
          severity: warning
        annotations:
          description: "Pod {{$labels.namespace}}/{{$labels.pod}} is was restarted {{$value | humanize}} times within the last 15m"
          summary: "Pod is restarting frequently"

  - name: net.rules
    rules:
      - alert: netDown
        expr: probe_success == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "The network impassability"
          description: "{{ $labels.job }} {{ $labels.instance }} is impassability "

  - name: ceph.rules
    rules:
      - alert: ceph_mon Down
        expr: ceph_mon_quorum_status == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Ceph monitor service down"
          description: "{{ $labels.ceph_daemon }} is down"

      - alert: ceph_osd Down
        expr: ceph_osd_up == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Ceph osd service down"
          description: "{{ $labels.ceph_daemon }} is down"
EOF

# File-based service discovery targets for the 'ping' scrape job.
# NOTE(review): every target address is blank in this copy; add the address
# to probe under each "targets:" entry.
cat > ping_targets.yml <<'EOF'
- targets:
    -
  labels:
    instance: k8s1
- targets:
    -
  labels:
    instance: k8s2
- targets:
    -
  labels:
    instance: k8s3
EOF