Prometheus Installation and Configuration

prometheus: node

Installation:

cat > docker-compose.yaml <<EOF
version: '2'

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prom-server
    restart: unless-stopped
    ports:
      - 9090:9090
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - /data/prom/server/config:/etc/prometheus
      - /data/prom/server/data:/prometheus
    command:
      - --config.file=/etc/prometheus/prometheus.yml
      - --storage.tsdb.path=/prometheus
      - --web.console.libraries=/etc/prometheus/console_libraries
      - --web.console.templates=/etc/prometheus/consoles

  alertmanager:
    image: prom/alertmanager:latest
    container_name: prom-alert
    restart: unless-stopped
    ports:
      - 9093:9093
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - /data/prom/alertmanager:/etc/alertmanager
    command:
      - --config.file=/etc/alertmanager/alertmanager.yml
      - --storage.path=/alertmanager

  dingtalk:
    image: timonwong/prometheus-webhook-dingtalk
    container_name: prom-dingtalk
    restart: unless-stopped
    # host networking publishes 8060 directly; a ports mapping would be ignored
    network_mode: host
    volumes:
      - /data/prom/alertmanager:/etc/alertmanager
    command:
      - --ding.profile=webhook1=https://oapi.dingtalk.com/robot/send?access_token=6f8d6fd8ec8bff71dc5df80f3452d153c439cf0c226c6de37d46dc557b6e64c9
      - --template.file=/etc/alertmanager/mydd.tmpl

  grafana:
    image: grafana/grafana:5.4.2
    container_name: grafana
    restart: unless-stopped
    ports:
      - 3000:3000
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - /data/prom/grafana:/var/lib/grafana:rw
    env_file:
      - gf_env
EOF

cat > gf_env <<EOF
GF_SERVER_ROOT_URL=http://localhost:3000
GF_SECURITY_ADMIN_PASSWORD=zxcasd123
GF_USERS_ALLOW_SIGN_UP=false
EOF
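
Before bringing the stack up, create the host directories and start it (a sketch; the UID values assume recent prom/prometheus and Grafana 5.x images -- adjust if your versions differ):

mkdir -p /data/prom/server/config /data/prom/server/data /data/prom/alertmanager /data/prom/grafana
chown -R 65534:65534 /data/prom/server/data   # prometheus runs as nobody (uid 65534)
chown -R 472:472 /data/prom/grafana           # grafana 5.x runs as uid 472
docker-compose up -d
docker-compose ps                             # all four containers should be Up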

Configuration: web

cat > prometheus.yml <<EOF
# my global config
global:
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 30s # Evaluate rules every 30 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets:
       - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - 'alert.rules'

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'node'
    file_sd_configs:
    - files:
      - node_targets.yml

  - job_name: 'federate'
    honor_labels: true
    metrics_path: '/federate'
    params:
      'match[]':
        - '{job=~"kubernetes-.*"}'
    static_configs:
      - targets:
        - 172.17.0.8:30000
        labels:
          instance: k8s-prom

#  - job_name: 'gluster'
#    static_configs:
#      - targets:
#        - 172.17.0.27:9189
#        labels:
#          instance: gluster-staging1
#      - targets:
#        - 172.17.0.2:9189
#        labels:
#          instance: gluster-staging2

  - job_name: 'ping'
    scrape_interval: 1m
    metrics_path: /probe
    params:
      module: [icmp]
    file_sd_configs:
    - files:
      - ping_targets.yml
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - target_label: __address__
        replacement: 172.17.32.6:9115

  - job_name: 'http_get_all'
    scrape_interval: 30s
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
        - https://ops.umarkcloud.com
        - https://registry.umarkcloud.com:8080
        - https://nexus.umarkcloud.com
        - https://trombone-staging.umarkcloud.com
        - http://staging.umarkcloud.com
        - https://www.umarkcloud.com
        - https://umarkcloud.com
        #- https://trombone-chaoyang.umarkcloud.cn
        - https://chaoyang.umarkcloud.cn
        #- https://trombone-lvdi.umarkcloud.cn
        #- https://lvdi.umarkcloud.cn
        #- http://lvdi-int.umarkcloud.cn
        #- http://trombone-lvdi-int.umarkcloud.cn
        - https://trombone.sunnysct.cn
        - https://console.sunnysct.cn
        - https://piccolo-updates.umarkcloud.com:8081
        #- http://oboe-lvdi-int.umarkcloud.cn
        - http://oboe.sunnysct.cn
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 172.17.32.6:9115

  - job_name: 'ceph'
    honor_labels: true
    static_configs:
      - targets:
        - 172.17.32.2:9283
        - 172.17.32.14:9283
        - 172.17.32.15:9283
        labels:
          cluster: ceph
EOF
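
A quick sanity check before loading the config (assuming the files were written to /data/prom/server/config; promtool ships inside the prom/prometheus image and also validates the rule files referenced by rule_files):

docker run --rm -v /data/prom/server/config:/config \
  --entrypoint promtool prom/prometheus check config /config/prometheus.yml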

cat > alert.rules <<EOF
groups:
- name: node.rules
  rules:
  - alert: nodeExporterDown
    expr: up{job="node"} == 0
    for: 10m
    labels:
      severity: page
    annotations:
      summary: "Prometheus could not scrape a node-exporter"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 10 minutes."

  - alert: nodeFilesystemUsage
    expr: (node_filesystem_size_bytes{device=~"rootfs|/dev/mapper/.*|/dev/vd.*"} - node_filesystem_free_bytes{device=~"rootfs|/dev/mapper/.*|/dev/vd.*"}) / node_filesystem_size_bytes{device=~"rootfs|/dev/mapper/.*|/dev/vd.*"} * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.instance }}: High Filesystem usage detected"
      description: "{{ $labels.instance }}: Filesystem usage {{ $labels.device }} is above 80% ( current value is {{ $value | humanize }}%)"

  - alert: nodeMemoryUsage
    expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 > 85
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.instance }}: High Memory usage detected"
      description: "{{ $labels.instance }}: Memory usage is above 85% ( current value is {{ $value | humanize }}%)"

  - alert: nodeCpuUsage
    expr: (100 - (avg by (instance) (irate(node_cpu_seconds_total{job="node",mode="idle"}[5m])) * 100)) > 85
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.instance }}: High CPU usage detected"
      description: "{{ $labels.instance }}: CPU usage is above 85% ( current value is {{ $value | humanize }}%)"

  - alert: IO_overloaded
    expr: (avg by (instance) (irate(node_cpu_seconds_total{job="node",mode="iowait"}[1m])) * 100) > 60
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.instance }}: Disk I/O is overloaded"
      description: "{{ $labels.instance }}: Disk I/O is overloaded  ( current value is {{ $value | humanize }}%)"

- name: k8s.rules
  rules:
  - alert: k8sJobFailed 
#    expr: kube_job_status_failed{job="kubernetes-service-endpoints",kubernetes_app="kube-state-metrics"}==1
    expr: kube_job_status_failed > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "K8S Job Failed"
      description: "Job {{ $labels.namespace }}/{{ $labels.job }} failed to complete" 

  - alert: k8sNodeNotReady
    expr: kube_node_status_condition{condition="Ready",status="true"}==0
    for: 30m
    labels:
      severity: warning
    annotations:
      summary: "Node status is NotReady"
      description:  "{{ $labels.node }} has set itself to NotReady, for more than half an hour"

  - alert: K8SManyNodesNotReady
    expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0) > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
    for: 1m
    labels:
      severity: critical
    annotations:
      description: "{{ $value }} Kubernetes nodes are NotReady (more than 20% of the cluster)"

  - alert: k8sPVCNotUse
    expr: kube_persistentvolumeclaim_status_phase{phase="Lost"}==1
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "PVC Not Use"
      description: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} not use."

  - alert: k8sPodFailed
    expr: kube_pod_status_phase{phase=~"Failed|Unknown", pod !~ "jenkins-slave.*"}==1
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Pod start Failed"
      description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} failed to start."

  - alert: PodFrequentlyRestarting
    expr: increase(kube_pod_container_status_restarts_total{namespace !~ ".*test.*"}[15m]) >= 1
    for: 15m
    labels:
      severity: warning
    annotations:
      description: "Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value | humanize}} times within the last 15m"
      summary: "Pod is restarting frequently"

- name: net.rules
  rules:
  - alert: netDown
    expr: probe_success == 0
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Network unreachable"
      description: "{{ $labels.job }} {{ $labels.instance }} is unreachable"

- name: ceph.rules
  rules:
  - alert: CephMonDown
    expr: ceph_mon_quorum_status == 0
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Ceph monitor service down"
      description: "{{ $labels.ceph_daemon }} is down"

  - alert: CephOsdDown
    expr: ceph_osd_up == 0
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Ceph osd service down"
      description: "{{ $labels.ceph_daemon }} is down"
EOF
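
The compose file also expects alertmanager.yml and mydd.tmpl under /data/prom/alertmanager, neither of which is shown above. A minimal alertmanager.yml sketch -- <host-ip> is an assumption, substitute the address of the host running the dingtalk container (it uses host networking, so it is reached via the host IP); prometheus-webhook-dingtalk serves /dingtalk/<profile>/send for each --ding.profile:

cat > alertmanager.yml <<EOF
route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: dingtalk
receivers:
- name: dingtalk
  webhook_configs:
  - url: http://<host-ip>:8060/dingtalk/webhook1/send
    send_resolved: true
EOF

The mydd.tmpl template referenced by the dingtalk container is site-specific and not reproduced here.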

cat > node_targets.yml <<EOF
- targets: 
  - 172.17.0.8:9100
  labels:
    instance: k8s1
- targets: 
  - 172.17.0.13:9100
  labels:
    instance: k8s2
- targets: 
  - 172.17.0.6:9100
  labels:
    instance: k8s3
- targets: 
  - 172.17.0.12:9100
  labels:
    instance: k8s4
- targets: 
  - 172.17.0.14:9100
  labels:
    instance: k8s5
- targets: 
  - 172.17.0.49:9100
  labels:
    instance: k8s16
- targets: 
  - 172.17.0.20:9100
  labels:
    instance: k8s17
- targets: 
  - 172.17.0.31:9100
  labels:
    instance: k8s18
- targets: 
  - 172.17.0.28:9100
  labels:
    instance: graylog1
- targets: 
  - 172.17.0.46:9100
  labels:
    instance: graylog2
- targets: 
  - 172.17.0.16:9100
  labels:
    instance: k8s10
- targets: 
  - 172.17.0.24:9100
  labels:
    instance: k8s11
- targets: 
  - 172.17.0.29:9100
  labels:
    instance: k8s12
- targets: 
  - 172.17.0.17:9100
  labels:
    instance: k8s13
- targets: 
  - 172.17.0.44:9100
  labels:
    instance: k8s14
- targets: 
  - 172.17.0.5:9100
  labels:
    instance: devops
- targets: 
  - 172.17.0.9:9100
  labels:
    instance: registry
- targets: 
  - 119.28.221.165:9100
  labels:
    instance: hknexus
#- targets:
#  - 172.17.32.2:9100
#  labels:
#    instance: ceph1
- targets:
  - 172.17.32.14:9100
  labels:
    instance: ceph2
- targets:
  - 172.17.32.15:9100
  labels:
    instance: ceph3
EOF
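
Targets listed via file_sd_configs (node_targets.yml, ping_targets.yml) are picked up automatically; changes to prometheus.yml or alert.rules require a reload. Since --web.enable-lifecycle is not set in the compose command, send SIGHUP instead:

docker kill -s HUP prom-server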

Different node_exporter versions expose different metric names: the alert.rules above targets node_exporter >= 0.16, while the file below targets 0.15:

cat > alert.rules-0.15 <<EOF
groups:
- name: node.rules
  rules:
  - alert: nodeExporterDown
    expr: up{job="node"} == 0
    for: 10m
    labels:
      severity: page
    annotations:
      summary: "Prometheus could not scrape a node-exporter"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 10 minutes."

  - alert: nodeFilesystemUsage
    expr: (node_filesystem_size{device=~"rootfs|/dev/mapper/.*|/dev/vd.*"} - node_filesystem_free{device=~"rootfs|/dev/mapper/.*|/dev/vd.*"}) / node_filesystem_size{device=~"rootfs|/dev/mapper/.*|/dev/vd.*"} * 100 > 85
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.instance }}: High Filesystem usage detected"
      description: "{{ $labels.instance }}: Filesystem usage {{ $labels.device }} is above 85% ( current value is {{ $value | humanize }}%)"

  - alert: nodeMemoryUsage
    expr: (node_memory_MemTotal - (node_memory_MemFree + node_memory_Buffers + node_memory_Cached)) / node_memory_MemTotal * 100 > 85
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.instance }}: High Memory usage detected"
      description: "{{ $labels.instance }}: Memory usage is above 85% ( current value is {{ $value | humanize }}%)"

  - alert: nodeCpuUsage
    expr: (100 - (avg by (instance) (irate(node_cpu{job="node",mode="idle"}[5m])) * 100)) > 85
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.instance }}: High CPU usage detected"
      description: "{{ $labels.instance }}: CPU usage is above 85% ( current value is {{ $value | humanize }}%)"

  - alert: IO_overloaded
    expr: (avg by (instance) (irate(node_cpu{job="node",mode="iowait"}[1m])) * 100) > 20
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.instance }}: Disk I/O is overloaded"
      description: "{{ $labels.instance }}: Disk I/O is overloaded  ( current value is {{ $value | humanize }}%)"

- name: k8s.rules
  rules:
  - alert: k8sJobFailed 
#    expr: kube_job_status_failed{job="kubernetes-service-endpoints",kubernetes_app="kube-state-metrics"}==1
    expr: kube_job_status_failed > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "K8S Job Failed"
      description: "Job {{ $labels.namespace }}/{{ $labels.job }} failed to complete" 

  - alert: k8sNodeNotReady
    expr: kube_node_status_condition{condition="Ready",status="true"}==0
    for: 30m
    labels:
      severity: warning
    annotations:
      summary: "Node status is NotReady"
      description:  "{{ $labels.node }} has set itself to NotReady, for more than half an hour"

  - alert: K8SManyNodesNotReady
    expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0) > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
    for: 1m
    labels:
      severity: critical
    annotations:
      description: "{{ $value }} Kubernetes nodes are NotReady (more than 20% of the cluster)"

  - alert: k8sPVCNotUse
    expr: kube_persistentvolumeclaim_status_phase{phase="Lost"}==1
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "PVC Not Use"
      description: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} not use."

  - alert: k8sPodFailed
    expr: kube_pod_status_phase{phase=~"Failed|Unknown", pod !~ "jenkins-slave.*"}==1
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Pod start Failed"
      description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} failed to start."

  - alert: PodFrequentlyRestarting
    expr: increase(kube_pod_container_status_restarts_total{namespace !~ ".*test.*"}[15m]) >= 1
    for: 15m
    labels:
      severity: warning
    annotations:
      description: "Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value | humanize}} times within the last 15m"
      summary: "Pod is restarting frequently"

- name: net.rules
  rules:
  - alert: netDown
    expr: probe_success == 0
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Network unreachable"
      description: "{{ $labels.job }} {{ $labels.instance }} is unreachable"

- name: ceph.rules
  rules:
  - alert: CephMonDown
    expr: ceph_mon_quorum_status == 0
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Ceph monitor service down"
      description: "{{ $labels.ceph_daemon }} is down"

  - alert: CephOsdDown
    expr: ceph_osd_up == 0
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Ceph osd service down"
      description: "{{ $labels.ceph_daemon }} is down"

EOF
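
To tell which rules file a node needs, probe whether it exposes the 0.16+ metric names (host/port below are examples taken from node_targets.yml):

curl -s http://172.17.0.8:9100/metrics | grep -m1 node_filesystem_size
# node_filesystem_size_bytes => node_exporter >= 0.16, use alert.rules
# node_filesystem_size       => node_exporter 0.15,    use alert.rules-0.15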

cat > ping_targets.yml <<EOF
- targets: 
  - 172.17.0.8
  labels:
    instance: k8s1
- targets: 
  - 172.17.0.13
  labels:
    instance: k8s2
- targets: 
  - 172.17.0.6
  labels:
    instance: k8s3

EOF
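
The ping and http_get_all jobs point at a blackbox_exporter on 172.17.32.6:9115, which is not installed above. A minimal run under default settings (the stock blackbox.yml ships with http_2xx and icmp modules; ICMP probes may need the NET_RAW capability):

docker run -d --name blackbox --restart unless-stopped \
  --cap-add NET_RAW -p 9115:9115 prom/blackbox-exporter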