Prometheus是一個開源的系統監控和告警工具包。自2012年創建以來,已有許多公司和組織採用了Prometheus。它如今是一個獨立的開源項目,不隸屬於任何公司,獨立維護。
在2016年,Prometheus加入雲原生計算基金會(CNCF),成爲繼Kubernetes之後的第二個託管項目。
```
wget https://github.com/prometheus/prometheus/releases/download/v2.5.0/prometheus-2.5.0.linux-amd64.tar.gz
groupadd prometheus
useradd -g prometheus prometheus -d /app/prometheus
tar -xvf prometheus-2.5.0.linux-amd64.tar.gz
cd prometheus-2.5.0.linux-amd64/
mv * /app/prometheus/
cd /app/prometheus/
mkdir {data,cfg,logs,bin} -p
mv prometheus promtool bin/
mv prometheus.yml cfg/
chown -R prometheus.prometheus *
```
```
vim /etc/profile
PATH=/app/prometheus/bin:$PATH:$HOME/bin
source /etc/profile
```
```
# promtool check config /app/prometheus/cfg/prometheus.yml
Checking /app/prometheus/cfg/prometheus.yml
  SUCCESS: 0 rule files found
```
```
vim /etc/systemd/system/prometheus.service

[Unit]
Description=Prometheus
Documentation=https://prometheus.io/
After=network.target

[Service]
Type=simple
User=prometheus
ExecStart=/app/prometheus/bin/prometheus --config.file=/app/prometheus/cfg/prometheus.yml --storage.tsdb.path=/app/prometheus/data
Restart=on-failure

[Install]
WantedBy=multi-user.target
```
```
systemctl daemon-reload
systemctl enable prometheus.service
systemctl start prometheus.service
```
# systemctl status prometheus.service ● prometheus.service - Prometheus Loaded: loaded (/etc/systemd/system/prometheus.service; enabled; vendor preset: disabled) Active: active (running) since 日 2018-12-09 22:21:52 CST; 4min 59s ago Docs: https://prometheus.io/ Main PID: 1308 (prometheus) CGroup: /system.slice/prometheus.service └─1308 /app/prometheus/bin/prometheus --config.file=/app/prometheus/cfg/prometheus.yml --storage.tsdb.path=/app/prometheus/data 12月 09 22:21:52 qas-prometheus prometheus[1308]: level=info ts=2018-12-09T14:21:52.190312051Z caller=main.go:245 build_context="(go=go1.11.1, user=root@578ab108d0b9, date=20...-11:40:44)" 12月 09 22:21:52 qas-prometheus prometheus[1308]: level=info ts=2018-12-09T14:21:52.190327105Z caller=main.go:246 host_details="(Linux 3.10.0-862.el7.x86_64 #1 SMP Fri Apr 20...us (none))" 12月 09 22:21:52 qas-prometheus prometheus[1308]: level=info ts=2018-12-09T14:21:52.190342191Z caller=main.go:247 fd_limits="(soft=1024, hard=4096)" 12月 09 22:21:52 qas-prometheus prometheus[1308]: level=info ts=2018-12-09T14:21:52.190351846Z caller=main.go:248 vm_limits="(soft=unlimited, hard=unlimited)" 12月 09 22:21:52 qas-prometheus prometheus[1308]: level=info ts=2018-12-09T14:21:52.192559162Z caller=main.go:562 msg="Starting TSDB ..." 12月 09 22:21:52 qas-prometheus prometheus[1308]: level=info ts=2018-12-09T14:21:52.204059097Z caller=main.go:572 msg="TSDB started" 12月 09 22:21:52 qas-prometheus prometheus[1308]: level=info ts=2018-12-09T14:21:52.204101343Z caller=main.go:632 msg="Loading configuration file" filename=/app/prometheus/cf...metheus.yml 12月 09 22:21:52 qas-prometheus prometheus[1308]: level=info ts=2018-12-09T14:21:52.204905309Z caller=main.go:658 msg="Completed loading of configuration file" filename=/app/...metheus.yml 12月 09 22:21:52 qas-prometheus prometheus[1308]: level=info ts=2018-12-09T14:21:52.204919014Z caller=main.go:531 msg="Server is ready to receive web requests." 
12月 09 22:21:52 qas-prometheus prometheus[1308]: level=info ts=2018-12-09T14:21:52.20493548Z caller=web.go:399 component=web msg="Start listening for connections" address=0.0.0.0:9090 Hint: Some lines were ellipsized, use -l to show in full.
wget https://github.com/prometheus/node_exporter/releases/download/v0.17.0/node_exporter-0.17.0.linux-amd64.tar.gz
```
tar -xvf node_exporter-0.17.0.linux-amd64.tar.gz -C /app/prometheus/
cd /app/prometheus/
mv node_exporter-0.17.0.linux-amd64 node_exporter
```
```
groupadd prometheus
useradd -g prometheus prometheus -d /app/prometheus
chown -R prometheus.prometheus node_exporter
```
```
# vim /usr/lib/systemd/system/node_exporter.service

[Unit]
Description=node_exporter
Documentation=https://prometheus.io/
After=network.target

[Service]
Type=simple
User=prometheus
ExecStart=/app/prometheus/node_exporter/node_exporter
Restart=on-failure

[Install]
WantedBy=multi-user.target
```
```
systemctl daemon-reload
systemctl enable node_exporter.service
systemctl start node_exporter.service
```
]# systemctl status node_exporter.service ● node_exporter.service - node_exporter Loaded: loaded (/usr/lib/systemd/system/node_exporter.service; enabled; vendor preset: disabled) Active: active (running) since 日 2018-12-09 22:45:10 CST; 4min 8s ago Docs: https://prometheus.io/ Main PID: 1515 (node_exporter) CGroup: /system.slice/node_exporter.service └─1515 /app/prometheus/node_exporter/node_exporter 12月 09 22:45:10 qas-prometheus node_exporter[1515]: time="2018-12-09T22:45:10+08:00" level=info msg=" - sockstat" source="node_exporter.go:97" 12月 09 22:45:10 qas-prometheus node_exporter[1515]: time="2018-12-09T22:45:10+08:00" level=info msg=" - stat" source="node_exporter.go:97" 12月 09 22:45:10 qas-prometheus node_exporter[1515]: time="2018-12-09T22:45:10+08:00" level=info msg=" - textfile" source="node_exporter.go:97" 12月 09 22:45:10 qas-prometheus node_exporter[1515]: time="2018-12-09T22:45:10+08:00" level=info msg=" - time" source="node_exporter.go:97" 12月 09 22:45:10 qas-prometheus node_exporter[1515]: time="2018-12-09T22:45:10+08:00" level=info msg=" - timex" source="node_exporter.go:97" 12月 09 22:45:10 qas-prometheus node_exporter[1515]: time="2018-12-09T22:45:10+08:00" level=info msg=" - uname" source="node_exporter.go:97" 12月 09 22:45:10 qas-prometheus node_exporter[1515]: time="2018-12-09T22:45:10+08:00" level=info msg=" - vmstat" source="node_exporter.go:97" 12月 09 22:45:10 qas-prometheus node_exporter[1515]: time="2018-12-09T22:45:10+08:00" level=info msg=" - xfs" source="node_exporter.go:97" 12月 09 22:45:10 qas-prometheus node_exporter[1515]: time="2018-12-09T22:45:10+08:00" level=info msg=" - zfs" source="node_exporter.go:97" 12月 09 22:45:10 qas-prometheus node_exporter[1515]: time="2018-12-09T22:45:10+08:00" level=info msg="Listening on :9100" source="node_exporter.go:111"
訪問:http://172.16.9.253:9100/metrics,查看從exporter具體能抓到的數據,如下:
```
wget https://dl.grafana.com/oss/release/grafana-5.4.0-1.x86_64.rpm
yum localinstall grafana-5.4.0-1.x86_64.rpm
```
```
systemctl daemon-reload
systemctl enable grafana-server.service
systemctl start grafana-server.service
```
默認帳號/密碼:admin/admin
http://172.16.9.253:3000
登錄後在首頁點擊"Configuration-Data Sources"按鈕,跳轉到添加數據源頁面,配置如下:

- Name: prometheus
- Type: prometheus
- URL: http://172.16.9.253:9090/
- Access: Server

取消Default的勾選,其餘保持默認,點擊"Add",如下:
在"Dashboards"頁簽下"import"自帶的模版,如下:
```
wget https://github.com/prometheus/alertmanager/releases/download/v0.15.3/alertmanager-0.15.3.linux-amd64.tar.gz
tar -xvf alertmanager-0.15.3.linux-amd64.tar.gz -C /app/prometheus/
cd /app/prometheus/
mv alertmanager-0.15.3.linux-amd64 alertmanager
cd alertmanager/
mkdir {bin,cfg,data}
mv alertmanager amtool bin/
mv alertmanager.yml cfg/
chown -R prometheus.prometheus *
```
```
# vim /usr/lib/systemd/system/alertmanager.service

[Unit]
Description=alertmanager
Documentation=https://prometheus.io/
After=network.target

[Service]
Type=simple
User=prometheus
ExecStart=/app/prometheus/alertmanager/bin/alertmanager \
    --config.file=/app/prometheus/alertmanager/cfg/alertmanager.yml \
    --web.listen-address=172.16.9.201:9093 \
    --cluster.listen-address=0.0.0.0:8001 \
    --storage.path=/app/prometheus/alertmanager/data \
    --log.level=info
Restart=on-failure
LimitNOFILE=65536

[Install]
WantedBy=multi-user.target
```
```
systemctl daemon-reload
systemctl enable alertmanager.service
systemctl start alertmanager.service
```
# tail -f /var/log/messages Dec 11 10:51:11 prometheus-node2 systemd: Stopping alertmanager... Dec 11 10:51:11 prometheus-node2 alertmanager: level=info ts=2018-12-11T02:51:11.118658711Z caller=main.go:426 msg="Received SIGTERM, exiting gracefully..." Dec 11 10:51:11 prometheus-node2 systemd: Started alertmanager. Dec 11 10:51:11 prometheus-node2 systemd: Starting alertmanager... Dec 11 10:51:11 prometheus-node2 alertmanager: level=info ts=2018-12-11T02:51:11.156033311Z caller=main.go:174 msg="Starting Alertmanager" version="(version=0.15.3, branch=HEAD, revision=d4a7697cc90f8bce62efe7c44b63b542578ec0a1)" Dec 11 10:51:11 prometheus-node2 alertmanager: level=info ts=2018-12-11T02:51:11.156186095Z caller=main.go:175 build_context="(go=go1.11.2, user=root@4ecc17c53d26, date=20181109-15:40:48)" Dec 11 10:51:11 prometheus-node2 alertmanager: level=info ts=2018-12-11T02:51:11.179081721Z caller=cluster.go:155 component=cluster msg="setting advertise address explicitly" addr=172.16.9.202 port=8001 Dec 11 10:51:11 prometheus-node2 alertmanager: level=info ts=2018-12-11T02:51:11.182933235Z caller=main.go:322 msg="Loading configuration file" file=/app/prometheus/alertmanager/cfg/alertmanager.yml Dec 11 10:51:11 prometheus-node2 alertmanager: level=info ts=2018-12-11T02:51:11.1953798Z caller=main.go:398 msg=Listening address=172.16.9.202:9093 Dec 11 10:51:11 prometheus-node2 alertmanager: level=info ts=2018-12-11T02:51:11.203980995Z caller=cluster.go:570 component=cluster msg="Waiting for gossip to settle..." interval=2s Dec 11 10:51:13 prometheus-node2 alertmanager: level=info ts=2018-12-11T02:51:13.205051348Z caller=cluster.go:595 component=cluster msg="gossip not settled" polls=0 before=0 now=1 elapsed=2.000725532s Dec 11 10:51:21 prometheus-node2 alertmanager: level=info ts=2018-12-11T02:51:21.208105947Z caller=cluster.go:587 component=cluster msg="gossip settled; proceeding" elapsed=10.003795489s
```
vim alertmanager.yml

global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.163.com:25'       # 郵箱smtp服務器代理
  smtp_from: 'xxxxxxx@163.com'            # 發送郵箱名稱
  smtp_auth_username: 'xxxxxx@163.com'    # 郵箱名稱
  smtp_auth_password: 'xxxxx'             # 郵箱密碼或授權碼
  smtp_require_tls: false

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'mail'

receivers:
- name: 'mail'
  email_configs:
  - to: 'xxxxxxxx@qq.com'
```
### rules配置告警規則

```
vim qas.yml

groups:
- name: 主機狀態-監控告警
  rules:
  - alert: 主機狀態
    expr: up == 0
    for: 1m
    labels:
      status: 很是嚴重
    annotations:
      summary: "{{$labels.instance}}:服務器宕機"
      description: "{{$labels.instance}}:服務器延時超過5分鐘"

  - alert: CPU使用狀況
    expr: 100-(avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 60
    for: 1m
    labels:
      status: 通常告警
    annotations:
      summary: "{{$labels.mountpoint}} CPU使用率太高!"
      description: "{{$labels.mountpoint }} CPU使用大於60%(目前使用:{{$value}}%)"

  - alert: 內存使用
    expr: 100 -(node_memory_MemTotal_bytes -node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes ) / node_memory_MemTotal_bytes * 100> 80
    for: 1m
    labels:
      status: 嚴重告警
    annotations:
      summary: "{{$labels.mountpoint}} 內存使用率太高!"
      description: "{{$labels.mountpoint }} 內存使用大於80%(目前使用:{{$value}}%)"

  - alert: IO性能
    expr: 100-(avg(irate(node_disk_io_time_seconds_total[1m])) by(instance)* 100) < 60
    for: 1m
    labels:
      status: 嚴重告警
    annotations:
      summary: "{{$labels.mountpoint}} 流入磁盤IO使用率太高!"
      description: "{{$labels.mountpoint }} 流入磁盤IO大於60%(目前使用:{{$value}})"

  - alert: 網絡
    expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance)) / 100) > 102400
    for: 1m
    labels:
      status: 嚴重告警
    annotations:
      summary: "{{$labels.mountpoint}} 流入網絡帶寬太高!"
      description: "{{$labels.mountpoint }}流入網絡帶寬持續2分鐘高於100M. RX帶寬使用率{{$value}}"

  - alert: 網絡
    expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance)) / 100) > 102400
    for: 1m
    labels:
      status: 嚴重告警
    annotations:
      summary: "{{$labels.mountpoint}} 流出網絡帶寬太高!"
      description: "{{$labels.mountpoint }}流出網絡帶寬持續2分鐘高於100M. TX帶寬使用率{{$value}}"

  - alert: TCP會話
    expr: node_netstat_Tcp_CurrEstab > 1000
    for: 1m
    labels:
      status: 嚴重告警
    annotations:
      summary: "{{$labels.mountpoint}} TCP_ESTABLISHED太高!"
      description: "{{$labels.mountpoint }} TCP_ESTABLISHED大於1000%(目前使用:{{$value}}%)"

  - alert: 磁盤容量
    expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 80
    for: 1m
    labels:
      status: 嚴重告警
    annotations:
      summary: "{{$labels.mountpoint}} 磁盤分區使用率太高!"
      description: "{{$labels.mountpoint }} 磁盤分區使用大於80%(目前使用:{{$value}}%)"
```
![](https://s1.51cto.com/images/blog/201906/12/40ef2698ad7ba396dd77678e3e58cbbf.jpg?x-oss-process=image/watermark,size_16,text_QDUxQ1RP5Y2a5a6i,color_FFFFFF,t_100,g_se,x_10,y_10,shadow_90,type_ZmFuZ3poZW5naGVpdGk=)

### 已經收到郵件內容告警

![](https://s1.51cto.com/images/blog/201906/12/201c5fe81bdd31ad4277788cb3847b94.jpg?x-oss-process=image/watermark,size_16,text_QDUxQ1RP5Y2a5a6i,color_FFFFFF,t_100,g_se,x_10,y_10,shadow_90,type_ZmFuZ3poZW5naGVpdGk=)