現在環境是這樣:
ceph 4臺:
192.168.100.21 ceph-node1
192.168.100.22 ceph-node2
192.168.100.23 ceph-node3
192.168.100.25 ceph-node5
#已經部署好一個 ceph cluster 集羣:四個 osd、三個 mon;沒有使用塊存儲,所以沒有 mod
監控服務端一臺:
192.168.100.26 Grafana,上面的服務都是以容器方式部署:
Prometheus:
Grafana:
alertmanager:
prometheus-webhook-alert:
cAdvisor:
docker-compose 編排如下:
# Docker Compose stack for the monitoring host: Prometheus, the DingTalk
# webhook bridge, Alertmanager, Grafana and cAdvisor. Every service joins the
# single "monitor" bridge network and publishes its port on the host.
version: "2"

networks:
  monitor:
    driver: bridge

services:
  prometheus:
    image: prom/prometheus
    # NOTE(review): "prometheu" looks like a typo for "prometheus"; kept as-is
    # because existing tooling may refer to this container/host name.
    container_name: prometheu
    hostname: prometheu
    restart: always
    volumes:
      # NOTE(review): this mount uses an absolute host path while the rule file
      # below uses a path relative to the compose file - confirm both resolve
      # to the same config directory.
      - /Prometheus/config/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./config/alertmanager-rule.yml:/etc/prometheus/alertmanager-rule.yml
      - /etc/localtime:/etc/localtime
    ports:
      - "9090:9090"
    networks:
      - monitor

  prometheus-webhook-alert:
    image: timonwong/prometheus-webhook-dingtalk:v0.3.0
    container_name: prometheus-webhook-alertmanagers
    hostname: webhook-alertmanagers
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime
    ports:
      - "8060:8060"
    # Replace the "https://****" placeholder with your own DingTalk robot
    # webhook URL. The profile name "webhook1" is the one used in the
    # Alertmanager webhook path (/dingtalk/webhook1/send).
    entrypoint: /bin/prometheus-webhook-dingtalk --ding.profile="webhook1=https://****"
    networks:
      - monitor

  alertmanager:
    image: prom/alertmanager
    container_name: alertmanager
    hostname: alertmanager
    restart: always
    volumes:
      - ./config/alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - /etc/localtime:/etc/localtime
    ports:
      - "9093:9093"
    networks:
      - monitor

  grafana:
    image: grafana/grafana
    container_name: grafana
    hostname: grafana
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime
      # Pie-chart panel plugin supplied from a local directory.
      - ./grafana-piechart:/var/lib/grafana/plugins/grafana-piechart-panel
    ports:
      - "3000:3000"
    networks:
      - monitor

  cadvisor:
    image: google/cadvisor:latest
    container_name: cadvisor
    hostname: cadvisor
    restart: always
    volumes:
      # Host paths exposed to cAdvisor; all read-only except /var/run.
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /etc/localtime:/etc/localtime
    ports:
      - "8080:8080"
    networks:
      - monitor
幾處關鍵配置文件如下:
#普羅米修斯配置文件
cat ./config/prometheus.yml
# Prometheus server configuration: global timings, the Alertmanager endpoint,
# the alert rule file, and the static scrape targets for this environment.
global:
  # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  scrape_interval: 15s
  # Evaluate rules every 15 seconds. The default is every 1 minute.
  evaluation_interval: 15s

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['192.168.100.26:9093']

# Load rules once and periodically evaluate them according to the global
# 'evaluation_interval'.
rule_files:
  - 'alertmanager-rule.yml'

scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries
  # scraped from this config.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['192.168.100.26:9090']

  # Monitoring host (192.168.100.26).
  - job_name: 'cadvisor-1'
    static_configs:
      - targets: ['192.168.100.26:8080']

  # NOTE(review): port 9100 is presumably node_exporter; it is not part of the
  # compose stack shown, so confirm it is running on the host.
  - job_name: 'node-1'
    scrape_interval: 4s  # overrides the 15s global interval for this job
    static_configs:
      - targets: ['192.168.100.26:9100']

  # ceph-node5 (192.168.100.25).
  - job_name: 'cadvisor-2'
    static_configs:
      - targets: ['192.168.100.25:8080']

  - job_name: 'node-2'
    scrape_interval: 4s
    static_configs:
      - targets: ['192.168.100.25:9100']

  # Ceph metrics endpoint on ceph-node1 (192.168.100.21).
  - job_name: 'ceph'
    scrape_interval: 4s
    static_configs:
      - targets: ['192.168.100.21:9128']
#監控報警組件 壓制 合併 過濾配置文件 並配置webhook地址
cat ./config/alertmanager.yml
# Alertmanager configuration: groups alerts by name, sends everything to a
# single webhook receiver, and inhibits warnings while a matching critical
# alert is firing.
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'

receivers:
  - name: 'web.hook'
    webhook_configs:
      # DingTalk bridge published on port 8060; "webhook1" is the profile name
      # passed to prometheus-webhook-dingtalk via --ding.profile.
      - url: 'http://192.168.100.26:8060/dingtalk/webhook1/send'
        send_resolved: true

inhibit_rules:
  # A 'critical' alert mutes 'warning' alerts that share the same alertname,
  # dev and instance label values.
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
#監控報警規則配置文件
cat ./config/alertmanager-rule.yml
# Prometheus alerting rules for the Ceph test cluster.
# NOTE(review): neither rule sets a `severity` label, so the severity-based
# inhibit rule in alertmanager.yml cannot match these alerts - confirm whether
# that is intended.
groups:
  - name: ceph-rule
    rules:
      # Fires when ceph_osd_down has been above 0 for 2 minutes.
      - alert: Ceph OSD Down
        expr: ceph_osd_down > 0
        for: 2m
        labels:
          product: Ceph測試集羣
        annotations:
          Warn: "{{$labels.instance}}: 有{{ $value }}OSD,down: {{$labels}}"
          Description: "{{$labels.instance}}:有{{ $labels.osd }}當前狀態爲{{ $labels.status }}"

      # Fires when used/capacity has stayed above 80 percent for 2 minutes.
      - alert: 集羣空間使用率
        expr: ceph_cluster_used_bytes / ceph_cluster_capacity_bytes * 100 > 80
        for: 2m
        labels:
          product: Ceph測試集羣
        annotations:
          Warn: "{{$labels.instance}}:集羣空間不足"
          Description: "{{$labels.instance}}:當前空間使用率爲{{ $value }}"
node-exporter: json模板下載 https://grafana.com/grafana/dashboards/10645
cadvisor: json模板下載: https://grafana.com/grafana/dashboards/3125
ceph cluster: json模板下載: https://grafana.com/grafana/dashboards/917
最後來一張完成 成果圖