Official documentation: https://prometheus.io/docs/introduction/overview/
~]# useradd -r -m -d /var/lib/prometheus prometheus
wget https://github.com/prometheus/prometheus/releases/download/v2.14.0/prometheus-2.14.0.linux-amd64.tar.gz
tar -xf prometheus-2.14.0.linux-amd64.tar.gz -C /usr/local/
cd /usr/local
ln -sv prometheus-2.14.0.linux-amd64 prometheus
vim /usr/lib/systemd/system/prometheus.service
[Unit]
Description=The Prometheus 2 monitoring system and time series database.
Documentation=https://prometheus.io
After=network.target

[Service]
EnvironmentFile=-/etc/sysconfig/prometheus
User=prometheus
ExecStart=/usr/local/prometheus/prometheus \
    --storage.tsdb.path=/home/prometheus/prometheus \
    --config.file=/usr/local/prometheus/prometheus.yml \
    --web.listen-address=0.0.0.0:9090 \
    --web.external-url= $PROM_EXTRA_ARGS
Restart=on-failure
StartLimitInterval=1
RestartSec=3

[Install]
WantedBy=multi-user.target
Other runtime options: ./prometheus --help
systemctl daemon-reload
systemctl start prometheus.service
iptables -I INPUT -p tcp --dport 9090 -s NETWORK/MASK -j ACCEPT
http://IP:PORT
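A quick way to confirm the server actually came up is Prometheus's built-in health endpoint; a minimal sketch (adjust the host and port to your deployment):

systemctl status prometheus.service
curl http://localhost:9090/-/healthy   # returns HTTP 200 with a short health message when the server is up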
$ docker run --name prometheus -d -v ./prometheus:/etc/prometheus/ -v ./db/:/prometheus -p 9090:9090 prom/prometheus --config.file=/etc/prometheus/prometheus.yml --web.listen-address="0.0.0.0:9090" --storage.tsdb.path=/prometheus --web.console.libraries=/usr/share/prometheus/console_libraries --web.console.templates=/usr/share/prometheus/consoles --storage.tsdb.retention=30d
--config.file=/etc/prometheus/prometheus.yml                  # main configuration file
--web.listen-address="0.0.0.0:9090"                           # listen address and port
--storage.tsdb.path=/prometheus                               # TSDB (database) directory
--web.console.libraries=/usr/share/prometheus/console_libraries
--web.console.templates=/usr/share/prometheus/consoles        # console libraries and templates
--storage.tsdb.retention=60d                                  # data retention period, default 15d
Prometheus's main configuration file is prometheus.yml.
It consists mainly of the global, rule_files, scrape_configs, alerting, remote_write, and remote_read sections:
- global: global configuration section.
- rule_files: paths to the alerting/recording rule files.
- scrape_configs: the set of scrape configurations, defining the collections of monitored targets and the parameters describing how their metrics are scraped. Each scrape configuration usually corresponds to one job, and its targets can either be listed statically (static_configs) or discovered automatically through any service-discovery mechanism Prometheus supports.
- job_name: 'nodes'
  static_configs:          # static targets: every host:port listed here is scraped at /metrics
  - targets: ['localhost:9100']
  - targets: ['172.20.94.1:9100']
- job_name: 'docker_host'
  file_sd_configs:         # file-based service discovery: the host:port entries defined in the files (YAML or JSON) are scraped at /metrics
  - files:
    - ./sd_files/docker_host.yml
    refresh_interval: 30s
- alerting: the set of Alertmanager instances available to Prometheus, and the parameters describing how to interact with them.
Each Alertmanager can be defined statically (static_configs) or configured automatically through any service-discovery mechanism Prometheus supports.
- remote_write: configures remote writes. Define this section when Prometheus should persist data to an external storage system (e.g. InfluxDB); Prometheus then sends sample data over HTTP to the adaptor identified by the URL.
- remote_read:
Configures remote reads. Prometheus hands incoming query requests to the adapter identified by the URL; the adapter translates them into the remote storage system's query format and converts the response back into a format Prometheus can use.
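A minimal sketch of what these two sections can look like, assuming a remote-storage adapter (for example an InfluxDB adapter) listening at a hypothetical address:

remote_write:
  - url: "http://10.10.11.50:9201/write"   # adapter endpoint (placeholder address)

remote_read:
  - url: "http://10.10.11.50:9201/read"    # adapter endpoint (placeholder address)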
Monitoring and alerting rule files: *.yml
rule_files: - "test_rules.yml" # 指定配置告警規則的文件路徑
Service discovery definition files: both YAML and JSON formats are supported.
file_sd_configs:
- files:
  - ./sd_files/http.yml
  refresh_interval: 30s
global:
  scrape_interval: 15s       # scrape metrics every 15 seconds
  evaluation_interval: 15s   # evaluate alerting rules every 15 seconds
alerting:
  alertmanagers:
  - static_configs:
    - targets: ["localhost:9093"]   # where alerts are pushed, normally the Alertmanager address
rule_files:
  - "test_rules.yml"               # path to the file that defines the alerting rules
scrape_configs:
  - job_name: 'node'               # a job name you define yourself
    static_configs:                # static rules: scrape the listed ip:port directly
    - targets: ['localhost:9100']
  - job_name: 'CDG-MS'
    honor_labels: true
    metrics_path: '/prometheus'
    static_configs:
    - targets: ['localhost:8089']
    relabel_configs:
    - target_label: env
      replacement: dev
  - job_name: 'eureka'
    file_sd_configs:               # file-based service discovery
    - files:
      - "/app/enmonster/basic/prometheus/prometheus-2.2.1.linux-amd64/eureka.json"   # JSON and YAML are both supported
      refresh_interval: 30s        # the file is re-read every 30s, so edits take effect without a manual reload
    relabel_configs:
    - source_labels: [__job_name__]
      regex: (.*)
      target_label: job
      replacement: ${1}
    - target_label: env
      replacement: dev
Example alerting rule configuration file:
[root@host40 monitor-bak]# cat prometheus/rules/docker_monitor.yml
groups:
- name: "container monitor"
  rules:
File-based service discovery definition files: *.yml
[root@host40 monitor]# cat prometheus/sd_files/virtual_lan.yml
- targets: ['10.10.11.179:9100']
- targets: ['10.10.11.178:9100']
[root@host40 monitor]# cat prometheus/sd_files/tcp.yml
- targets: ['10.10.11.178:8001']
  labels:
    server_name: http_download
- targets: ['10.10.11.178:3307']
  labels:
    server_name: xiaojing_db
- targets: ['10.10.11.178:3001']
  labels:
    server_name: test_web
node_exporter is installed on each monitored node; it collects host metrics and exposes them over HTTP for Prometheus to scrape.
Download and extract:
wget https://github.com/prometheus/node_exporter/releases/download/v0.18.1/node_exporter-0.18.1.linux-amd64.tar.gz
tar xf node_exporter-0.18.1.linux-amd64.tar.gz -C /usr/local/
cd /usr/local
ln -sv node_exporter-0.18.1.linux-amd64/ node_exporter
Create the user:
useradd -r -m -d /var/lib/prometheus prometheus
Configure the unit file:
vim /usr/lib/systemd/system/node_exporter.service
[Unit]
Description=Prometheus exporter for machine metrics, written in Go with pluggable metric collectors.
Documentation=https://github.com/prometheus/node_exporter
After=network.target

[Service]
EnvironmentFile=-/etc/sysconfig/node_exporter
User=prometheus
ExecStart=/usr/local/node_exporter/node_exporter \
    $NODE_EXPORTER_OPTS
Restart=on-failure
StartLimitInterval=1
RestartSec=3

[Install]
WantedBy=multi-user.target
Start the service:
systemctl daemon-reload
systemctl start node_exporter.service
You can test manually that metrics are being served:
curl http://localhost:9100/metrics
Open the firewall:
iptables -I INPUT -p tcp --dport 9100 -s NET/MASK -j ACCEPT
image: quay.io/prometheus/node-exporter,prom/node-exporter
Start command:
docker run -d --net="host" --pid="host" -v "/:/host:ro,rslave" --name monitor-node-exporter --restart always quay.io/prometheus/node-exporter --path.rootfs=/host --web.listen-address=:9100
On some older Docker versions this fails with: Error response from daemon: linux mounts: Could not find source mount of /
Workaround: change -v "/:/host:ro,rslave" to -v "/:/host:ro"
Enabling and disabling collectors:
./node_exporter --help   # lists all supported collectors; enable or disable individual collectors as needed
For example, --no-collector.cpu stops collecting CPU-related metrics.
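Since the unit file above reads $NODE_EXPORTER_OPTS from /etc/sysconfig/node_exporter, the collector flags are usually kept there; a minimal sketch (the particular collectors chosen here are only illustrative, and some such as systemd may depend on your node_exporter version):

# /etc/sysconfig/node_exporter
NODE_EXPORTER_OPTS="--no-collector.wifi --no-collector.nfs --collector.systemd"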
Textfile collector
The textfile collector is enabled with the startup flag --collector.textfile.directory="DIR". It reads every *.prom file in that directory; the metrics must be in the Prometheus text exposition format.
Example:
echo my_batch_job_completion_time $(date +%s) > /path/to/directory/my_batch_job.prom.$$
mv /path/to/directory/my_batch_job.prom.$$ /path/to/directory/my_batch_job.prom

echo 'role{role="application_server"} 1' > /path/to/directory/role.prom.$$
mv /path/to/directory/role.prom.$$ /path/to/directory/role.prom

rpc_duration_seconds{quantile="0.5"} 4773
http_request_duration_seconds_bucket{le="0.5"} 129389
In other words, if node_exporter cannot collect a metric you need by itself, you can gather it with a script, write it to a file, and let node_exporter expose it for Prometheus to scrape.
This can remove the need for a Pushgateway.
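For periodic collection this is typically wired up with cron; a minimal sketch, assuming a hypothetical collection script and a textfile directory of /var/lib/node_exporter/textfiles (adjust both to your setup):

# /etc/cron.d/textfile-metrics  (hypothetical)
*/5 * * * * root /usr/local/bin/collect_app_metrics.sh > /var/lib/node_exporter/textfiles/app.prom.$$ && mv /var/lib/node_exporter/textfiles/app.prom.$$ /var/lib/node_exporter/textfiles/app.prom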
Example: prometheus.yml
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'nodes'
    static_configs:
    - targets: ['localhost:9100']
  - job_name: 'node_real_lan'
    file_sd_configs:
    - files:
      - ./sd_files/real_lan.yml
sudo docker run \
  --volume=/:/rootfs:ro \
  --volume=/var/run:/var/run:ro \
  --volume=/sys:/sys:ro \
  --volume=/var/lib/docker/:/var/lib/docker:ro \
  --volume=/dev/disk/:/dev/disk:ro \
  --publish=9080:8080 \
  --detach=true \
  --name=cadvisor \
  --privileged \
  --device=/dev/kmsg \
  google/cadvisor:v0.33.0
Configuration example:
- job_name: 'docker'
  static_configs:
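The target list is truncated above; a minimal sketch of a complete job, assuming cAdvisor is published on host port 9080 as in the docker run command earlier (the addresses are placeholders):

- job_name: 'docker'
  static_configs:
  - targets: ['10.10.11.178:9080', '10.10.11.179:9080']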
Download and install:
wget https://dl.grafana.com/oss/release/grafana-7.2.2-1.x86_64.rpm
sudo yum install grafana-7.2.2-1.x86_64.rpm
Prepare the service file:
[Unit]
Description=Grafana instance
Documentation=http://docs.grafana.org
Wants=network-online.target
After=network-online.target
After=postgresql.service mariadb.service mysqld.service

[Service]
EnvironmentFile=/etc/sysconfig/grafana-server
User=grafana
Group=grafana
Type=notify
Restart=on-failure
WorkingDirectory=/usr/share/grafana
RuntimeDirectory=grafana
RuntimeDirectoryMode=0750
ExecStart=/usr/sbin/grafana-server \
    --config=${CONF_FILE} \
    --pidfile=${PID_FILE_DIR}/grafana-server.pid \
    --packaging=rpm \
    cfg:default.paths.logs=${LOG_DIR} \
    cfg:default.paths.data=${DATA_DIR} \
    cfg:default.paths.plugins=${PLUGINS_DIR} \
    cfg:default.paths.provisioning=${PROVISIONING_CFG_DIR}
LimitNOFILE=10000
TimeoutStopSec=20

[Install]
WantedBy=multi-user.target
Start Grafana:
systemctl enable grafana-server.service
systemctl restart grafana-server.service
It listens on port 3000 by default.
Open the firewall:
iptables -I INPUT -p tcp --dport 3000 -s NET/MASK -j ACCEPT
image: grafana/grafana
docker run -d --name=grafana -p 3000:3000 grafana/grafana:7.2.2
Access the web UI:
http://ip:port
On first login you are asked to set your own credentials.
In version 7.2 you log in with the initial credentials (admin/admin) and are then prompted to reset the password.
Usage workflow:
Commonly used dashboard template IDs:
Resetting the admin password:
Check the Grafana configuration file to find the path of grafana.db.
Configuration file path: /etc/grafana/grafana.ini
[paths]
;data = /var/lib/grafana
[database]
# For "sqlite3" only, path relative to data_path setting
;path = grafana.db
From the configuration file, the full path of grafana.db is:
/var/lib/grafana/grafana.db
Use sqlite3 to reset the admin password:
sqlite3 /var/lib/grafana/grafana.db
sqlite> update user set password = '59acf18b94d7eb0694c61e60ce44c110c7a683ac6a8f09580d626f90f4a242000746579358d77dd9e570e83fa24faa88a8a6', salt = 'F3FAxVm33R' where login = 'admin';
.exit
Then log in with admin/admin.
Configure the SMTP server and sender mailbox in grafana-server:
vim /etc/grafana/grafana.ini
[smtp]
enabled = true
host = smtp.126.com:465
user = USER@126.com
password = PASS
skip_verify = false
from_address = USER@126.com
from_name = Grafana Alert
Add a notification channel in the Grafana UI:
Alerting -> Notification Channel; before saving you can use Send Test.
Open a dashboard and add alert rules.
PromQL is the query language Prometheus uses against its database. It turns the metrics collected by the exporters into graphable data for dashboards, and it is also used to express alerting rules.
gauges: return a single numeric value, e.g.:
node_boot_time_seconds{instance="10.10.11.40:9100",job="node_real_lan"} 1574040030
counters: monotonically increasing counts.
histograms: describe the distribution of the observed data, e.g. maximum, minimum, median, and percentiles.
node_boot_time_seconds{instance="10.10.11.40:9100",job="node_real_lan"}
In the example above, instance and job are labels.
You can also define your own labels in the configuration file, e.g.:
- targets: ['10.10.11.178:3001']
  labels:
    server_name: test_web
The added label can then be used when querying data in Prometheus:
metric{server_name=...,}
Calculating CPU usage:
(1-((sum(increase(node_cpu_seconds_total{mode="idle"}[1m])) by (instance))/(sum(increase(node_cpu_seconds_total[1m])) by (instance)))) * 100
The metrics involved:
node_cpu_seconds_total                # total CPU time
node_cpu_seconds_total{mode="idle"}   # idle CPU time; other mode labels: user, system, steal, softirq, irq, nice, iowait, idle
The functions used:
increase(metric[1m])   # increase over the last 1 minute
sum()
sum() by (TAG)         # TAG is a label; here instance identifies the host. Sum per host, otherwise multiple hosts collapse into a single line.
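The same pattern works for other resources; for example, a memory usage percentage can be computed from node_exporter's memory metrics (a sketch, thresholds and label grouping are up to you):

(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100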
Label matching operators:
=    Select labels that are exactly equal to the provided string.
!=   Select labels that are not equal to the provided string.
=~   Select labels that regex-match the provided string.
!~   Select labels that do not regex-match the provided string.
Examples:
node_cpu_seconds_total{mode="idle"} # mode : 標籤,metric自帶屬性。 api_http_requests_total{method="POST", handler="/messages"}
http_requests_total{environment=~"staging|testing|development",method!="GET"}
Note: you must specify a metric name or at least one label matcher that does not match the empty string.
{job=~".*"} # Bad! {job=~".+"} # Good! {job=~".*",method="get"} # Good!
Time ranges:
s - seconds, m - minutes, h - hours, d - days, w - weeks, y - years
Operators:
+ (addition)  - (subtraction)  * (multiplication)  / (division)  % (modulo)  ^ (power/exponentiation)
== (equal)  != (not-equal)  > (greater-than)  < (less-than)  >= (greater-or-equal)  <= (less-or-equal)
and (intersection)  or (union)  unless (complement)
Aggregation operators:
sum (calculate sum over dimensions)
min (select minimum over dimensions)
max (select maximum over dimensions)
avg (calculate the average over dimensions)
stddev (calculate population standard deviation over dimensions)
stdvar (calculate population standard variance over dimensions)
count (count number of elements in the vector)
count_values (count number of elements with the same value)
bottomk (smallest k elements by sample value)
topk (largest k elements by sample value)
quantile (calculate φ-quantile (0 ≤ φ ≤ 1) over dimensions)
sum() by (instance)   # sum, grouped by the given labels
increase()   # increase over a time range, for counter-type metrics
Example:
increase(node_network_receive_bytes_total[30s])   # received traffic
rate()   # designed for counter-type metrics: over the given time window, the average per-second increase
Example:
rate(node_network_receive_bytes_total[30s])*8   # inbound bandwidth (bits/s)
topk()   # given a number x, return the x highest-valued series
Examples:
topk(5,node_cpu_seconds_total)                            # top 5 series by total CPU time
topk(5,increase(node_network_receive_bytes_total[10m]))   # top 5 by traffic received in the last 10 minutes
Note:
In a graph, topk produces scattered, discontinuous series; run it in the console to take a single instantaneous value.
count()   # count, e.g. count(node_load1 > 5)
avg() by (label)   # average, grouped by label
Recording rules aggregate scraped data into new time series.
Example:
Record the following rule into the file prometheus.rules.yml:
avg(rate(rpc_durations_seconds_count[5m])) by (job, service)
In prometheus.rules.yml:
groups:
- name: example
  rules:
  - record: job_service:rpc_durations_seconds_count:avg_rate5m
    expr: avg(rate(rpc_durations_seconds_count[5m])) by (job, service)
Then, in prometheus.yml:
rule_files:
  - "prometheus.rules.yml"
This effectively produces a new metric, except that it is computed rather than scraped.
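Rule and configuration files can be validated before reloading with promtool, which ships in the same tarball as the prometheus binary; a sketch (adjust the paths to your installation):

cd /usr/local/prometheus
./promtool check rules prometheus.rules.yml
./promtool check config prometheus.yml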
Documentation: https://prometheus.io/docs/alerting/latest/configuration/
Download and install:
wget https://github.com/prometheus/alertmanager/releases/download/v0.20.0/alertmanager-0.20.0.linux-amd64.tar.gz
tar -xf alertmanager-0.20.0.linux-amd64.tar.gz -C /usr/local
cd /usr/local && ln -sv alertmanager-0.20.0.linux-amd64/ alertmanager && cd alertmanager
Start:
nohup ./alertmanager --config.file="alertmanager.yml" --storage.path="data/" --web.listen-address=":9093" &
image: prom/alertmanager
docker run
docker run -dit --name monitor-alertmanager -v ./alertmanager/db/:/alertmanager -v ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml -v ./alertmanager/templates/:/etc/alertmanager/templates -p 9093:9093 --restart always --privileged=true prom/alertmanager --config.file="/etc/alertmanager/alertmanager.yml" --storage.path="/alertmanager" --web.listen-address=":9093"
Grouping
Example:
During a network partition, dozens or hundreds of service instances are running in a cluster, and half of them can no longer reach the database. Prometheus's alerting rules are configured to fire an alert for every service instance that cannot talk to the database, so hundreds of alerts are sent to Alertmanager. As a user you only want a single page, while still being able to see exactly which service instances are affected. Alertmanager can therefore be configured to group alerts by cluster and alert name, so that it sends one compact notification.
Inhibition
Inhibition is the concept of suppressing notifications for certain alerts when certain other alerts are already firing.
For example, an alert is firing that tells you an entire cluster is unreachable. Alertmanager can be configured to mute all other alerts concerning that cluster, which prevents notifications for hundreds or thousands of firing alerts that are unrelated to the actual problem.
Silences: a simple way to temporarily mute alerts. Alerts matching a silence's matchers are not sent while the silence is active; silences are configured in Alertmanager's web UI.
alerting:
  alertmanagers:
  - static_configs:
    - targets: ["localhost:9093"]

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*.yml"

scrape_configs:
Example:
[root@xiang-03 /usr/local/prometheus]# cat rules/node.yml
groups:
- name: "system info"
  rules:
What needs to be configured in the main configuration file:
Let's look at an example first:
vim alertmanager.yml
global:
  smtp_smarthost: 'xxx'
  smtp_from: 'xxx'
  smtp_auth_username: 'xxx'
  smtp_auth_password: 'xxx'
  smtp_require_tls: false
templates:
  - 'test.tmpl'
...
receivers:
- name: ...
  email_configs:
  - to: 'xxx@xx.xx'
    html: '{{ template "xx.html" . }}'
    headers: { Subject: "{{ second-route match test }}" }
vim test.tmpl
{{ define "xx.html" }}
<table border="5">
<tr><td>Alert</td>
<td>Instance</td>
<td>Value</td>
<td>Start time</td>
</tr>
{{ range $i, $alert := .Alerts }}
<tr><td>{{ index $alert.Labels "alertname" }}</td>
<td>{{ index $alert.Labels "instance" }}</td>
<td>{{ index $alert.Annotations "value" }}</td>
<td>{{ $alert.StartsAt }}</td>
</tr>
{{ end }}
</table>
{{ end }}
global:
resolve_timeout:   # how long to wait before declaring an alert resolved once it stops firing
route:   # the root route that every alert enters; defines how alerts are dispatched
group_by: ['LABEL_NAME','alertname', 'cluster','job','instance',...]
The label list here re-groups incoming alerts. For example, many incoming alerts carrying labels such as cluster=A
and alertname=LatencyHigh would be aggregated into a single group.
group_wait: 30s
When a new alert group is created, wait at least group_wait before sending the initial notification; this leaves enough time to collect several alerts for the same group and fire them together.
group_interval: 5m
After the first notification has been sent, wait group_interval before sending notifications about new alerts added to the group.
repeat_interval: 5m
If a notification has already been sent successfully, wait repeat_interval before re-sending it.
match:
  label_name: NAME
Exact-match routing: alerts whose labels satisfy the condition are sent to the receiver.
match_re:
  label_name: <regex>, ...
Regular-expression matching: alerts whose labels satisfy the condition are sent to the receiver.
receiver: receiver_name
Sends alerts that satisfy match and match_re to the backend notification medium (email, webhook, PagerDuty, WeChat, ...).
A default receiver is required, otherwise: err="root route must specify a default receiver"
routes: - <route> ...
Defines additional child routes.
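Putting the routing options together, a small route tree might look like the following sketch (the label values and receiver names are placeholders, not taken from the original configuration):

route:
  receiver: 'default'              # fallback receiver, mandatory
  group_by: ['alertname', 'instance']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  routes:
    - match:
        severity: critical
      receiver: 'oncall-email'
    - match_re:
        job: 'node_.*'
      receiver: 'default'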
templates: [ - <filepath> ... ]
Template configuration, e.g. the page templates used for email alerts.
receivers: - <receiver> ...   # a list
- name: receiver_name   # the name referenced from route.receiver
email_configs:   # email alert configuration
- to: <tmpl_string>
  send_resolved: <boolean> | default = false   # whether to send a notification once the alert resolves
Configures the mailbox that receives alert mail; a per-receiver sender mailbox can also be configured. See the official documentation:
https://prometheus.io/docs/alerting/latest/configuration/#email_config
- name: ...
  wechat_configs:
  - send_resolved: <boolean> | default = false
    api_secret: <secret> | default = global.wechat_api_secret
    api_url: <string> | default = global.wechat_api_url
    corp_id: <string> | default = global.wechat_api_corp_id
    message: <tmpl_string> | default = '{{ template "wechat.default.message" . }}'
    agent_id: <string> | default = '{{ template "wechat.default.agent_id" . }}'
    to_user: <string> | default = '{{ template "wechat.default.to_user" . }}'
    to_party: <string> | default = '{{ template "wechat.default.to_party" . }}'
    to_tag: <string> | default = '{{ template "wechat.default.to_tag" . }}'

# Notes
to_user:    WeChat Work user ID
to_party:   ID of the department/group to notify
corp_id:    the unique ID of the WeChat Work account, shown under "My Enterprise"
agent_id:   the application ID, shown under Application Management -> your custom application
api_secret: the application secret

WeChat Work registration: https://work.weixin.qq.com
Official WeChat Work API documentation: https://work.weixin.qq.com/api/doc#90002/90151/90854
WeChat Work (企業微信) alert configuration.
inhibit_rules:
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  equal: ['alertname', 'dev', 'instance']
Inhibition configuration.
Register an enterprise: https://work.weixin.qq.com
You can register an unverified enterprise (up to 200 members); binding a personal WeChat account is enough to use the web console.
Official WeChat Work API documentation: https://work.weixin.qq.com/api/doc#90002/90151/90854
After registering, bind your personal WeChat account and scan the QR code to enter the management console.
The application used to send alerts must be created manually; the procedure is simple.
Parameters to pay attention to:
receivers:
- name: 'default'
  email_configs:
  - to: 'XXX'
    send_resolved: true
  wechat_configs:
  - send_resolved: true
    corp_id: 'XXX'
    api_secret: 'XXX'
    agent_id: 1000002
    to_user: XXX
    to_party: 2
    message: '{{ template "wechat.html" . }}'
template:
Because Alertmanager's default WeChat alert template is ugly and verbose, a custom template is used; the default email template is actually acceptable.
cat wechat.tmpl
{{ define "wechat.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}{{ range .Alerts }}
[ALERT]
Instance: {{ .Labels.instance }}
Summary: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
Value: {{ .Annotations.value }}
Time: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end -}}
{{- if gt (len .Alerts.Resolved) 0 -}}{{ range .Alerts }}
[RESOLVED]
Instance: {{ .Labels.instance }}
Summary: {{ .Annotations.summary }}
Time: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
Resolved: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end -}}
{{- end }}
Reference: https://blog.csdn.net/knight_zhou/article/details/106323719
Custom Prometheus/Alertmanager email templates use UTC time by default.
Trigger time: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
After adjusting to CST (+8h): {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
vim rules/docker_monitor.yml
groups:
- name: "container monitor"
  rules:
  - alert: "Container down: env1"
    expr: time() - container_last_seen{name="env1"} > 60
    for: 30s
    labels:
      severity: critical
    annotations:
      summary: "Container down: {{$labels.instance}} name={{$labels.name}}"
Note:
This metric only detects that a container has gone down; it cannot reliably detect recovery. Even if the container never comes back up, a resolved notification will arrive after a while.
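If that behaviour is a problem, one workaround (a sketch, not from the original setup) is to alert on the series disappearing entirely with absent(), which only resolves once cAdvisor reports the container again:

- alert: "Container missing: env1"
  expr: absent(container_last_seen{name="env1"})
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: "Container env1 is no longer reported by cAdvisor"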
groups:
- name: host-status-alerts
  rules:
  - alert: Host down
    expr: up == 0
    for: 1m
    labels:
      status: critical
    annotations:
      summary: "{{$labels.instance}}: server down"
      description: "{{$labels.instance}}: server has been unreachable for more than 5 minutes"
  - alert: CPU usage
    expr: 100-(avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 60
    for: 1m
    labels:
      status: warning
    annotations:
      summary: "{{$labels.mountpoint}} CPU usage too high!"
      description: "{{$labels.mountpoint }} CPU usage above 60% (currently: {{$value}}%)"
  - alert: CPU usage too high   # the query adds the hostname label
    expr: (100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100)) * on(instance) group_left(nodename) (node_uname_info) > 85
    for: 5m
    labels:
      region: Chengdu
    annotations:
      summary: "{{$labels.instance}}({{$labels.nodename}}) CPU usage too high!"
      description: 'Server {{$labels.instance}}({{$labels.nodename}}) CPU usage above 85% (currently: {{$value}}%)'
  - alert: System load too high
    expr: (node_load1/count without (cpu, mode) (node_cpu_seconds_total{mode="system"}))* on(instance) group_left(nodename) (node_uname_info) > 1.1
    for: 3m
    labels:
      region: Chengdu
    annotations:
      summary: "{{$labels.instance}}({{$labels.nodename}}) system load too high!"
      description: '{{$labels.instance}}({{$labels.nodename}}) load above threshold, currently {{printf "%.2f" $value}}'
  - alert: Memory low
    expr: (100 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100)* on(instance) group_left(nodename) (node_uname_info) > 80
    for: 3m
    labels:
      region: Chengdu
    annotations:
      summary: "{{$labels.instance}}({{$labels.nodename}}) memory usage too high!"
      description: 'Server {{$labels.instance}}({{$labels.nodename}}) memory usage above 80% (currently: {{$value}}%)'
  - alert: IO time too high
    expr: 100-(avg(irate(node_disk_io_time_seconds_total[1m])) by(instance)* 100) < 40
    for: 1m
    labels:
      status: critical
    annotations:
      summary: "{{$labels.mountpoint}} disk IO usage too high!"
      description: "{{$labels.mountpoint }} disk IO above 60% (currently: {{$value}})"
  - alert: Network inbound
    expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance)) / 100) > 102400
    for: 1m
    labels:
      status: critical
    annotations:
      summary: "{{$labels.mountpoint}} inbound bandwidth too high!"
      description: "{{$labels.mountpoint }} inbound bandwidth above 100M for 2 minutes. RX usage {{$value}}"
  - alert: Network outbound
    expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance)) / 100) > 102400
    for: 1m
    labels:
      status: critical
    annotations:
      summary: "{{$labels.mountpoint}} outbound bandwidth too high!"
      description: "{{$labels.mountpoint }} outbound bandwidth above 100M for 2 minutes. TX usage {{$value}}"
  - alert: network in
    expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
    for: 1m
    labels:
      name: network
      severity: Critical
    annotations:
      summary: "{{$labels.mountpoint}} inbound bandwidth too high"
      description: "{{$labels.mountpoint }} abnormal inbound traffic, above 100M"
      value: "{{ $value }}"
  - alert: network out
    expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
    for: 1m
    labels:
      name: network
      severity: Critical
    annotations:
      summary: "{{$labels.mountpoint}} outbound bandwidth too high"
      description: "{{$labels.mountpoint }} abnormal outbound traffic, above 100M"
      value: "{{ $value }}"
  - alert: TCP sessions
    expr: node_netstat_Tcp_CurrEstab > 1000
    for: 1m
    labels:
      status: critical
    annotations:
      summary: "{{$labels.mountpoint}} too many TCP_ESTABLISHED connections!"
      description: "{{$labels.mountpoint }} TCP_ESTABLISHED above 1000 (currently: {{$value}})"
  - alert: Disk capacity
    expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes{fstype=~"ext4|xfs"}*100) > 80
    for: 1m
    labels:
      status: critical
    annotations:
      summary: "{{$labels.mountpoint}} disk partition usage too high!"
      description: "{{$labels.mountpoint }} disk partition usage above 80% (currently: {{$value}}%)"
  - alert: Disk space low   # the query result carries extra labels such as the hostname
    expr: (100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes{fstype=~"ext4|xfs"}*100))* on(instance) group_left(nodename) (node_uname_info) > 80
    for: 3m
    labels:
      region: Chengdu
    annotations:
      summary: "{{$labels.instance}}({{$labels.nodename}}) disk usage too high!"
      description: 'Server {{$labels.instance}}({{$labels.nodename}}) disk usage above 80% (currently: {{$value}}%)'
  - alert: volume full in four days   # the disk is predicted to fill up within 4 days
    expr: predict_linear(node_filesystem_free_bytes[2h], 4 * 24 * 3600) < 0
    for: 5m
    labels:
      name: disk
      severity: Critical
    annotations:
      summary: "{{$labels.mountpoint}} host disk space is predicted to fill up within 4 days"
      description: "{{$labels.mountpoint }}"
      value: "{{ $value }}%"
  - alert: disk write rate
    expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
    for: 1m
    labels:
      name: disk
      severity: Critical
    annotations:
      summary: "disk write rate (instance {{ $labels.instance }})"
      description: "disk write rate above 50MB/s"
      value: "{{ $value }}%"
  - alert: disk read latency
    expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1
    for: 1m
    labels:
      name: disk
      severity: Critical
    annotations:
      summary: "unusual disk read latency (instance {{ $labels.instance }})"
      description: "disk read latency above 100 milliseconds"
      value: "{{ $value }}%"
  - alert: disk write latency
    expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1
    for: 1m
    labels:
      name: disk
      severity: Critical
    annotations:
      summary: "unusual disk write latency (instance {{ $labels.instance }})"
      description: "disk write latency above 100 milliseconds"
      value: "{{ $value }}%"
GET  /-/healthy
GET  /-/ready
POST /-/reload
curl -u monitor:fosafer.com 127.0.0.1:9093/-/healthy
OK
curl -XPOST -u monitor:fosafer.com 127.0.0.1:9093/-/reload
[root@host40 monitor]# curl -XPOST -u monitor:fosafer.com 127.0.0.1:9093/-/reload
failed to reload config: yaml: unmarshal errors:
  line 26: field receiver already set in type config.plain
Reloading this way is equivalent to docker exec -it monitor-alertmanager kill -1 1, except that the HTTP endpoint reports an error when the reload fails.
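Prometheus itself exposes the same /-/healthy, /-/ready and /-/reload endpoints, but POST /-/reload is only accepted when the server is started with the --web.enable-lifecycle flag; a sketch (add the flag to the startup options shown earlier):

# add to the prometheus startup options
--web.enable-lifecycle

# then the configuration can be reloaded over HTTP
curl -XPOST http://127.0.0.1:9090/-/reload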
blackbox_exporter is one of the official Prometheus exporters; it collects probe data over HTTP, DNS, TCP, and ICMP.
Use cases:
HTTP probes: define request headers; check the HTTP status, response headers, and body content
TCP probes: monitor the listening state of service ports; define and check application-layer protocols
ICMP probes: host liveness checks
POST probes: API connectivity
SSL certificate expiry time
Download and extract:
wget https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-amd64.tar.gz
tar -xf blackbox_exporter-0.18.0.linux-amd64.tar.gz -C /usr/local/
cd /usr/local
ln -sv blackbox_exporter-0.18.0.linux-amd64 blackbox_exporter
cd blackbox_exporter
./blackbox_exporter --version
Add a systemd service unit:
vim /lib/systemd/system/blackbox_exporter.service
[Unit]
Description=blackbox_exporter
After=network.target

[Service]
User=root
Type=simple
ExecStart=/usr/local/blackbox_exporter/blackbox_exporter --config.file=/usr/local/blackbox_exporter/blackbox.yml
Restart=on-failure

[Install]
WantedBy=multi-user.target
systemctl daemon-reload
systemctl enable blackbox_exporter
systemctl start blackbox_exporter
image: prom/blackbox-exporter:master
docker run:
docker run --rm -d -p 9115:9115 --name blackbox_exporter -v `pwd`:/config prom/blackbox-exporter:master --config.file=/config/blackbox.yml
Default configuration file:
The default blackbox_exporter configuration already covers most needs; for custom configuration, see the official documentation and the example configuration file in the project.
cat blackbox.yml
modules:
  http_2xx:
    prober: http
  http_post_2xx:
    prober: http
    http:
      method: POST
  tcp_connect:
    prober: tcp
  pop3s_banner:
    prober: tcp
    tcp:
      query_response:
      - expect: "^+OK"
      tls: true
      tls_config:
        insecure_skip_verify: false
  ssh_banner:
    prober: tcp
    tcp:
      query_response:
      - expect: "^SSH-2.0-"
  irc_banner:
    prober: tcp
    tcp:
      query_response:
      - send: "NICK prober"
      - send: "USER prober prober prober :prober"
      - expect: "PING :([^ ]+)"
        send: "PONG ${1}"
      - expect: "^:[^ ]+ 001"
  icmp:
    prober: icmp
Official reference: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config
Notes:
labels:
  job: job_name
  __address__: <host>:<port>
  instance: defaults to __address__ unless it is relabeled
  __scheme__: scheme
  __metrics_path__: path
  __param_<name>: the first <name> parameter appearing in the URL
scrape_configs:
  - job_name: 'blackbox'
    metrics_path: /probe
    params:
      module: [http_2xx]  # Look for a HTTP 200 response.
    static_configs:
      - targets:
        - http://prometheus.io    # Target to probe with http.
        - https://prometheus.io   # Target to probe with https.
        - http://example.com:8080 # Target to probe with http on port 8080.
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.
- job_name: "blackbox_telnet_port]" scrape_interval: 5s metrics_path: /probe params: module: [tcp_connect] static_configs: - targets: [ '1x3.x1.xx.xx4:443' ] labels: group: 'xxxidc機房ip監控' - targets: ['10.xx.xx.xxx:443'] labels: group: 'Process status of nginx(main) server' relabel_configs: - source_labels: [__address__] target_label: __param_target - source_labels: [__param_target] target_label: instance - target_label: __address__ replacement: 10.xxx.xx.xx:9115
- job_name: 'blackbox00_ping_idc_ip'
  scrape_interval: 10s
  metrics_path: /probe
  params:
    module: [icmp]   # ping
  static_configs:
    - targets: [ '1x.xx.xx.xx' ]
      labels:
        group: 'xx nginx virtual IP'
  relabel_configs:
    - source_labels: [__address__]
      regex: (.*)(:80)?
      target_label: __param_target
      replacement: ${1}
    - source_labels: [__param_target]
      regex: (.*)
      target_label: ping
      replacement: ${1}
    - source_labels: []
      regex: .*
      target_label: __address__
      replacement: 1x.xxx.xx.xx:9115
- job_name: 'blackbox_http_2xx_post'
  scrape_interval: 10s
  metrics_path: /probe
  params:
    module: [http_post_2xx_query]
  static_configs:
    - targets:
      - https://xx.xxx.com/api/xx/xx/fund/query.action
      labels:
        group: 'Interface monitoring'
  relabel_configs:
    - source_labels: [__address__]
      target_label: __param_target
    - source_labels: [__param_target]
      target_label: instance
    - target_label: __address__
      replacement: 1x.xx.xx.xx:9115  # The blackbox exporter's real hostname:port.
cat << 'EOF' > prometheus.yml
rule_files:
- ssl_expiry.rules
scrape_configs:
- job_name: 'blackbox'
  metrics_path: /probe
  params:
    module: [http_2xx]  # Look for a HTTP 200 response.
  static_configs:
    - targets:
      - example.com   # Target to probe
  relabel_configs:
    - source_labels: [__address__]
      target_label: __param_target
    - source_labels: [__param_target]
      target_label: instance
    - target_label: __address__
      replacement: 127.0.0.1:9115  # Blackbox exporter.
EOF

cat << 'EOF' > ssl_expiry.rules
groups:
- name: ssl_expiry.rules
  rules:
  - alert: SSLCertExpiringSoon
    expr: probe_ssl_earliest_cert_expiry{job="blackbox"} - time() < 86400 * 30
    for: 10m
EOF
This is equivalent to a request like:
curl 'http://172.16.10.65:9115/probe?target=prometheus.io&module=http_2xx&debug=true'
For icmp, tcp, http, and post probes, check the probe_success metric to see whether connectivity is healthy:
probe_success == 0   ## connectivity broken
probe_success == 1   ## connectivity OK
Alerting also checks this metric: if it equals 0, an alert is fired.
[sss@prometheus01 prometheus]$ cat rules/blackbox-alert.rules
groups:
- name: blackbox_network_stats
  rules:
prometheus alertmanager grafana nginx node_exporter cadvisor blackbox_exporter
prom/prometheus
prom/alertmanager
quay.io/prometheus/node-exporter, prom/node-exporter
gcr.io/google_containers/cadvisor[:v0.36.0]   # requires access to Google registries
google/cadvisor:v0.33.0                       # Docker Hub image; older than the Google one
grafana/grafana
nginx
After pulling the images, re-tag them and push them to the local Harbor registry.
image: 10.10.11.40:80/base/nginx:1.19.3
image: 10.10.11.40:80/base/prometheus:2.22.0
image: 10.10.11.40:80/base/grafana:7.2.2
image: 10.10.11.40:80/base/alertmanager:0.21.0
image: 10.10.11.40:80/base/node_exporter:1.0.1
image: 10.10.11.40:80/base/cadvisor:v0.33.0
image: 10.10.11.40:80/base/blackbox-exporter:0.18.0
Directory layout overview:
mkdir /home/deploy/monitor
cd /home/deploy/monitor
[root@host40 monitor]# tree .
.
├── alertmanager
│   ├── alertmanager.yml
│   ├── db
│   │   ├── nflog
│   │   └── silences
│   └── templates
│       └── wechat.tmpl
├── blackbox_exporter
│   └── blackbox.yml
├── docker-compose.yml
├── grafana
│   └── db
│       ├── grafana.db
│       ├── plugins
│       ...
├── nginx
│   ├── auth
│   └── nginx.conf
├── node-exporter
│   └── textfiles
├── node_exporter_install_docker.sh
├── prometheus
│   ├── db
│   ├── prometheus.yml
│   ├── rules
│   │   ├── docker_monitor.yml
│   │   ├── system_monitor.yml
│   │   └── tcp_monitor.yml
│   └── sd_files
│       ├── docker_host.yml
│       ├── http.yml
│       ├── icmp.yml
│       ├── real_lan.yml
│       ├── real_wan.yml
│       ├── sedFDm5Rw
│       ├── tcp.yml
│       ├── virtual_lan.yml
│       └── virtual_wan.yml
└── sd_controler.sh
File required for nginx basic auth:
[root@host40 monitor-bak]# ls nginx/auth/ -a . .. .htpasswd
Permissions on some of the mounted paths:
The db directories of prometheus, grafana, and alertmanager need mode 777.
Individually mounted configuration files (alertmanager.yml, prometheus.yml, nginx.conf) need mode 666, as sketched below.
For better security, put the configuration files in a dedicated mounted directory and point the startup options under command at them instead.
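A sketch of the corresponding commands, run from the /home/deploy/monitor directory described above:

chmod 777 prometheus/db grafana/db alertmanager/db
chmod 666 prometheus/prometheus.yml alertmanager/alertmanager.yml nginx/nginx.conf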
[root@host40 monitor-bak]# cat docker-compose.yml
version: "3"
services:
  nginx:
    image: 10.10.11.40:80/base/nginx:1.19.3
    hostname: nginx
    container_name: monitor-nginx
    restart: always
    privileged: false
    ports:
      - 3001:3000
      - 9090:9090
      - 9093:9093
    volumes:
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf
      - ./nginx/auth:/etc/nginx/basic_auth
    networks:
      monitor:
        aliases:
          - nginx
    logging:
      driver: json-file
      options:
        max-file: '5'
        max-size: 50m
  prometheus:
    image: 10.10.11.40:80/base/prometheus:2.22.0
    container_name: monitor-prometheus
    hostname: prometheus
    restart: always
    privileged: true
    volumes:
      - ./prometheus/db/:/prometheus/
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./prometheus/rules/:/etc/prometheus/rules/
      - ./prometheus/sd_files/:/etc/prometheus/sd_files/
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      - '--storage.tsdb.retention=60d'
    networks:
      monitor:
        aliases:
          - prometheus
    logging:
      driver: json-file
      options:
        max-file: '5'
        max-size: 50m
  grafana:
    image: 10.10.11.40:80/base/grafana:7.2.2
    container_name: monitor-grafana
    hostname: grafana
    restart: always
    privileged: true
    volumes:
      - ./grafana/db/:/var/lib/grafana
    networks:
      monitor:
        aliases:
          - grafana
    logging:
      driver: json-file
      options:
        max-file: '5'
        max-size: 50m
  alertmanager:
    image: 10.10.11.40:80/base/alertmanager:0.21.0
    container_name: monitor-alertmanager
    hostname: alertmanager
    restart: always
    privileged: true
    volumes:
      - ./alertmanager/db/:/alertmanager
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - ./alertmanager/templates/:/etc/alertmanager/templates
    networks:
      monitor:
        aliases:
          - alertmanager
    logging:
      driver: json-file
      options:
        max-file: '5'
        max-size: 50m
  node-exporter:
    image: 10.10.11.40:80/base/node_exporter:1.0.1
    container_name: monitor-node-exporter
    hostname: host40
    restart: always
    privileged: true
    volumes:
      - /:/host:ro,rslave
      - ./node-exporter/textfiles/:/textfiles
    network_mode: "host"
    command:
      - '--path.rootfs=/host'
      - '--web.listen-address=:9100'
      - '--collector.textfile.directory=/textfiles'
    logging:
      driver: json-file
      options:
        max-file: '5'
        max-size: 50m
  cadvisor:
    image: 10.10.11.40:80/base/cadvisor:v0.33.0
    container_name: monitor-cadvisor
    hostname: cadvisor
    restart: always
    privileged: true
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    ports:
      - 9080:8080
    networks:
      monitor:
    logging:
      driver: json-file
      options:
        max-file: '5'
        max-size: 50m
  blackbox_exporter:
    image: 10.10.11.40:80/base/blackbox-exporter:0.18.0
    container_name: monitor-blackbox
    hostname: blackbox-exporter
    restart: always
    privileged: true
    volumes:
      - ./blackbox_exporter/:/etc/blackbox_exporter
    networks:
      monitor:
        aliases:
          - blackbox
    command:
      - '--config.file=/etc/blackbox_exporter/blackbox.yml'
    logging:
      driver: json-file
      options:
        max-file: '5'
        max-size: 50m
networks:
  monitor:
    ipam:
      config:
        - subnet: 192.168.17.0/24
Because Prometheus and Alertmanager have no built-in authentication, nginx sits in front to handle routing and basic auth, proxying the backend listen ports behind a single entry point for easier management.
prometheus: 9090
grafana: 3000
alertmanager: 9093
node_exporter: 9100
cadvisor: 8080 (on the monitored hosts)
echo monitor:`openssl passwd -crypt 123456` > .htpasswd
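If the httpd-tools/apache2-utils package is available, the same file can also be generated with htpasswd (an alternative sketch using the same placeholder credentials):

htpasswd -bc nginx/auth/.htpasswd monitor 123456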
When a configuration file is bind-mounted individually, edits on the host may not propagate into the container (you can mount the directory instead of the single file).
chmod 666 nginx.conf
Reload the configuration inside the nginx container:
docker exec -it web-director nginx -s reload
nginx.conf
[root@host40 monitor-bak]# cat nginx/nginx.conf
user nginx;
worker_processes auto;
error_log /var/log/nginx/error.log;
pid /run/nginx.pid;
include /usr/share/nginx/modules/*.conf;
events {
    worker_connections 10240;
}
http {
    log_format  main  '$remote_addr - $remote_user [$time_local] "$request" '
                      '$status $body_bytes_sent "$http_referer" '
                      '"$http_user_agent" "$http_x_forwarded_for"';
    access_log  /var/log/nginx/access.log  main;
    sendfile on;
    tcp_nopush on;
    tcp_nodelay on;
    keepalive_timeout 65;
    types_hash_max_size 2048;
    include /etc/nginx/mime.types;
    default_type application/octet-stream;
proxy_connect_timeout 500ms;
proxy_send_timeout 1000ms;
proxy_read_timeout 3000ms;
proxy_buffers 64 8k;
proxy_busy_buffers_size 128k;
proxy_temp_file_write_size 64k;
proxy_redirect off;
proxy_next_upstream error invalid_header timeout http_502 http_504;
proxy_http_version 1.1;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Real-Port $remote_port;
proxy_set_header Host $http_host;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
client_max_body_size 10m;
client_body_buffer_size 512k;
client_body_timeout 180;
client_header_timeout 10;
send_timeout 240;
gzip on;
gzip_min_length 1k;
gzip_buffers 4 16k;
gzip_comp_level 2;
gzip_types application/javascript application/x-javascript text/css text/javascript image/jpeg image/gif image/png;
gzip_vary off;
gzip_disable "MSIE [1-6].";
server {
listen 3000;
server_name _;
location / {
proxy_pass http://grafana:3000;
}
}
server {
listen 9090;
server_name _;
location / {
auth_basic "auth for monitor";
auth_basic_user_file /etc/nginx/basic_auth/.htpasswd;
proxy_pass http://prometheus:9090;
}
}
server {
listen 9093;
server_name _;
location / {
auth_basic "auth for monitor";
auth_basic_user_file /etc/nginx/basic_auth/.htpasswd;
proxy_pass http://alertmanager:9093;
}
}
}
### 9.5 prometheus
Note: the db directory must be writable (mode 777).
#### 9.5.1 Main configuration file: prometheus.yml
[root@host40 monitor-bak]# cat prometheus/prometheus.yml
global:
  scrape_interval: 15s     # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.

alerting:
  alertmanagers:
  - static_configs:
    - targets: ["alertmanager:9093"]

rule_files:
  - "rules/*.yml"

scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    static_configs:
    - targets: ['localhost:9090']
  - job_name: 'node_real_lan'
    file_sd_configs:
    - files:
      - sd_files/real_lan.yml
  - job_name: 'node_virtual_lan'
    file_sd_configs:
    - files:
      - sd_files/virtual_lan.yml
  - job_name: 'node_real_wan'
    file_sd_configs:
    - files:
      - sd_files/real_wan.yml
  - job_name: 'node_virtual_wan'
    file_sd_configs:
    - files:
      - sd_files/virtual_wan.yml
ls prometheus/sd_files/
docker_host.yml  http.yml  icmp.yml  real_lan.yml  real_wan.yml  sedFDm5Rw  tcp.yml  virtual_lan.yml  virtual_wan.yml
cat prometheus/sd_files/docker_host.yml
- targets: ['10.10.11.178:9080']
- targets: ['10.10.11.99:9080']
- targets: ['10.10.11.40:9080']
- targets: ['10.10.11.35:9080']
- targets: ['10.10.11.45:9080']
- targets: ['10.10.11.46:9080']
- targets: ['10.10.11.48:9080']
- targets: ['10.10.11.47:9080']
- targets: ['10.10.11.65:9081']
- targets: ['10.10.11.61:9080']
- targets: ['10.10.11.66:9080']
- targets: ['10.10.11.68:9080']
- targets: ['10.10.11.98:9080']
- targets: ['10.10.11.75:9080']
- targets: ['10.10.11.97:9080']
- targets: ['10.10.11.179:9080']
cat prometheus/sd_files/tcp.yml
- targets: ['10.10.11.178:8001']
  labels:
    server_name: http_download
- targets: ['10.10.11.178:3307']
  labels:
    server_name: xiaojing_db
- targets: ['10.10.11.178:3001']
  labels:
    server_name: test_web
cat prometheus/rules/docker_monitor.yml
groups:
- name: "container monitor"
  rules:
  - alert: "Container down: env1"
    expr: time() - container_last_seen{name="env1"} > 60
    for: 30s
    labels:
      severity: critical
    annotations:
      summary: "Container down: {{$labels.instance}} name={{$labels.name}}"
tcp rules:
cat prometheus/rules/tcp_monitor.yml
groups:
- name: blackbox_network_stats
  rules:
  - alert: blackbox_network_stats
    expr: probe_success == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Instance {{ $labels.instance }} ,server-name: {{ $labels.server_name }} is down"
      description: "Connection unreachable..."
cat prometheus/rules/system_monitor.yml
groups:
- name: "system info"
  rules:
  - alert: "Server down"
    expr: up == 0
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: "{{$labels.instance}}: server down"
      description: "{{$labels.instance}}: server has been unreachable for more than 3 minutes"
  - alert: "System load too high"
    expr: (node_load1/count without (cpu, mode) (node_cpu_seconds_total{mode="system"}))* on(instance) group_left(nodename) (node_uname_info) > 1.1
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: "{{$labels.instance}}: system load too high"
      description: "{{$labels.instance}}: system load too high."
      value: "{{$value}}"
  - alert: "CPU usage above 90%"
    expr: 100-(avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 90
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: "{{$labels.instance}}: CPU usage above 90%"
      description: "{{$labels.instance}}: CPU usage above 90%."
      value: "{{$value}}"
  - alert: "Memory usage above 80%"
    expr: (100 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100)* on(instance) group_left(nodename) (node_uname_info) > 80
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: "{{$labels.instance}}: memory usage above 80%"
      description: "{{$labels.instance}}: memory usage above 80%"
      value: "{{$value}}"
  - alert: "IO time above 60%"
    expr: 100-(avg(irate(node_disk_io_time_seconds_total[1m])) by(instance)* 100) < 40
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: "{{$labels.instance}}: IO time above 60%"
      description: "{{$labels.instance}}: IO time above 60%"
      value: "{{$value}}"
  - alert: "Disk partition usage above 85%"
    expr: (100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes{fstype=~"ext4|xfs"}*100))* on(instance) group_left(nodename) (node_uname_info) > 85
    for: 3m
    labels:
      severity: longtime
    annotations:
      summary: "{{$labels.instance}}: disk partition usage above 85%"
      description: "{{$labels.instance}}: disk partition usage above 85%"
      value: "{{$value}}"
  - alert: "Disk will fill up within 4 days"
    expr: predict_linear(node_filesystem_free_bytes[2h], 4 * 24 * 3600) < 0
    for: 3m
    labels:
      severity: longtime
    annotations:
      summary: "{{$labels.instance}}: a disk partition is predicted to fill up within 4 days"
      description: "{{$labels.instance}}: a disk partition is predicted to fill up within 4 days"
      value: "{{$value}}"
Note: the db directory must be writable.
Main configuration file:
cat alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtphz.qiye.163.com:25'
  smtp_from: 'XXX@fosafer.com'
  smtp_auth_username: 'XXX@fosafer.com'
  smtp_auth_password: 'XXX'
  smtp_hello: 'qiye.163.com'
  smtp_require_tls: true
route:
  group_by: ['instance']
  group_wait: 30s
  receiver: default
  routes:
  - group_interval: 3m
    repeat_interval: 10m
    match:
      severity: warning
    receiver: 'default'
  - group_interval: 3m
    repeat_interval: 30m
    match:
      severity: critical
    receiver: 'default'
  - group_interval: 5m
    repeat_interval: 24h
    match:
      severity: longtime
    receiver: 'default'
templates:
  - '/etc/alertmanager/templates/*.tmpl'
receivers:
- name: 'default'
  email_configs:
  - ...
  wechat_configs:
  - ...
- name: 'critical'
  email_configs:
  - ...
Alert template file:
cat alertmanager/templates/wechat.tmpl
{{ define "wechat.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}{{ range .Alerts }}
[ALERT]
Instance: {{ .Labels.instance }}
Summary: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
Value: {{ .Annotations.value }}
Time: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end -}}
{{- if gt (len .Alerts.Resolved) 0 -}}{{ range .Alerts }}
[RESOLVED]
Instance: {{ .Labels.instance }}
Summary: {{ .Annotations.summary }}
Time: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
Resolved: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end -}}
{{- end }}
Installation script:
http://10.10.11.178:8001/node_exporter_install.sh
Installation script:
http://10.10.11.178:8001/node_exporter_install_docker.sh
Required images: for Docker hosts that have not added the 10.10.11.40:80 registry, download the saved images, load them first, and then install.
http://10.10.11.178:8001/monitor-client.tgz
All jobs use file-based service discovery, so adding a target only requires writing it into the corresponding sd_file; there is no need to re-read the main configuration file.
Based on this, a small text-processing script acts as a front end to the sd_files: targets can be added and removed from the command line without editing the files by hand.
Script name: sd_controler.sh
Script usage: run ./sd_controler.sh with no arguments to see the usage.
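Typical invocations, following the usage text built into the script (the addresses are placeholders):

./sd_controler.sh rl add 10.10.10.10:9100              # add a node_exporter target to the real-LAN job
./sd_controler.sh tcp add 10.10.10.10:3306 web-mysql   # tcp/http/icmp targets require a server-name label
./sd_controler.sh http del www.baidu.com               # remove a target
./sd_controler.sh rl show                              # list the targets of a job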
The full script:
[root@host40 monitor]# cat sd_controler.sh
#!/bin/bash
#version: 1.0
#Description: add | del | show instance from|to prometheus file_sd_files.
# rl | vl | dk | rw | vw | tcp | http | icmp : short for job name, each one means a sd_file.
# tcp | http | icmp ( because with ports for service ) add with label (server_name by default) to easy read in alert emails.
# each time can only add|del for one instance.
#Notes: adds, removes, and lists entries (e.g. IP:PORT pairs) in Prometheus's file-based service discovery files.
# rl | vl | dk | rw | vw | tcp | http | icmp are abbreviations of the Prometheus job names; each one maps to the sd_file used by that job.
# For tcp | http | icmp it is often impossible to tell from a port alone which service went down, so adding a target requires a server_name
#   label, letting whoever receives the alert email know immediately which service is affected.
# Only one record can be added or removed per invocation; for bulk changes, edit the files with vim or wrap the script in a for loop.

### vars
SD_DIR=./prometheus/sd_files
DOCKER_SD=$SD_DIR/docker_host.yml
RL_HOST_SD=$SD_DIR/real_lan.yml
VL_HOST_SD=$SD_DIR/virtual_lan.yml
RW_HOST_SD=$SD_DIR/real_wan.yml
VW_HOST_SD=$SD_DIR/virtual_wan.yml
TCP_SD=$SD_DIR/tcp.yml
HTTP_SD=$SD_DIR/http.yml
ICMP_SD=$SD_DIR/icmp.yml
SDFILE=

### funcs
usage(){
  echo -e "Usage: $0 < rl | vl | dk | rw | vw | tcp | http | icmp > < add | del | show > [ IP:PORT | FQDN ] [ server-name ]"
  echo -e " example: \n\t node add:\t $0 rl add | del 10.10.10.10:9100\n\t tcp,http,icmp add:\t $0 tcp add 10.10.10.10:3306 web-mysql\n\t del:\t $0 http del www.baidu.com\n\t show:\t $0 rl | vl | dk | rw | vw | tcp | http | icmp show."
  exit
}
add(){
  # $1: SDFILE, $2: IP:PORT
  grep -q $2 $1 || echo -e "- targets: ['$2']" >> $1
}
del(){
  # $1: SDFILE, $2: IP:PORT
  sed -i '/'$2'/d' $1
}
add_with_label(){
  # $1: SDFILE, $2: [IP:[PORT]|FQDN], $3: SERVER-NAME
  LABEL_01="server_name"
  if ! grep -q "$2" $1;then
    echo -e "- targets: ['$2']" >> $1
    echo -e "  labels:" >> $1
    echo -e "    ${LABEL_01}: $3" >> $1
  fi
}
del_with_label(){
  # $1: SDFILE, $2: [IP:[PORT]|FQDN]
  NUM=`cat -n $SDFILE |grep "'$2'"|awk '{print $1}'`
  let ENDNUM=NUM+2
sed -i $NUM,${ENDNUM}d $1
}
action(){
if [ "$1" == "add" ];then
add $SDFILE $2
elif [ "$1" == "del" ];then
del $SDFILE $2
elif [ "$1" == "show" ];then
cat $SDFILE
fi
}
action_with_label(){
if [ "$1" == "add" ];then
add_with_label $SDFILE $2 $3
elif [ "$1" == "del" ];then
del_with_label $SDFILE $2 $3
elif [ "$1" == "show" ];then
cat $SDFILE
fi
}
[ "$2" == "" ] || [[ ! "$2" =~ ^(add|del|show)$ ]] && usage
curl --version &>/dev/null || { echo -e "no curl found. " && exit 15; }
if [[ $1 =~ ^(rl|vl|rw|vw|dk)$ ]] && [ "$2" == "add" ];then
[ "$3" == "" ] && usage
if [ "$4" != "-f" ];then
COOD=$(curl -IL -o /dev/null --retry 3 --connect-timeout 3 -s -w "%{http_code}" http://$3/metrics)
[ "$COOD" != "200" ] && echo -e "http://$3/metrics is not arriable. check it again. or you can use -f to ignor it." && exit 11
fi
fi
if [[ $1 =~ ^(tcp|http|icmp)$ ]] && [ "$2" == "add" ];then
[ "$4" == "" ] && echo -e "監聽 tcp http icmp 服務時必須指明 server-name." && usage
fi
case $1 in
  rl)
    SDFILE=$RL_HOST_SD
    action $2 $3 && echo $2 OK
    ;;
  vl)
    SDFILE=$VL_HOST_SD
    action $2 $3 && echo $2 OK
    ;;
  dk)
    SDFILE=$DOCKER_SD
    action $2 $3 && echo $2 OK
    ;;
  rw)
    SDFILE=$RW_HOST_SD
    action $2 $3 && echo $2 OK
    ;;
  vw)
    SDFILE=$VW_HOST_SD
    action $2 $3 && echo $2 OK
    ;;
  tcp)
    SDFILE=$TCP_SD
    action_with_label $2 $3 $4 && echo $2 OK
    ;;
  http)
    SDFILE=$HTTP_SD
    action_with_label $2 $3 $4 && echo $2 OK
    ;;
  icmp)
    SDFILE=$ICMP_SD
    action_with_label $2 $3 $4 && echo $2 OK
    ;;
  *)
    usage
    ;;
esac