一、下載及解壓安裝包
# Download Prometheus, unpack it into /data0/prometheus/prometheus_server,
# sort the files into bin/ config/ data/ logs/ and hand the tree to a
# dedicated "prometheus" account. Intended to be run as root.
cd /usr/local/src/ || exit 1
export VER="2.13.1"
wget "https://github.com/prometheus/prometheus/releases/download/v${VER}/prometheus-${VER}.linux-amd64.tar.gz"

mkdir -p /data0/prometheus
# Create the group/user only when missing so the step can be re-run safely.
getent group prometheus >/dev/null || groupadd prometheus
id -u prometheus >/dev/null 2>&1 || useradd -g prometheus -d /data0/prometheus prometheus

tar -xvf "prometheus-${VER}.linux-amd64.tar.gz"
mv "prometheus-${VER}.linux-amd64" /data0/prometheus/prometheus_server

# Abort instead of moving files around in the wrong directory if cd fails.
cd /data0/prometheus/prometheus_server/ || exit 1
mkdir -p data config logs bin
mv prometheus promtool bin/
mv prometheus.yml config/

# "user:group" is the supported separator; the "user.group" form is deprecated.
chown -R prometheus:prometheus /data0/prometheus
二、設置環境變量
# Put the Prometheus binaries on PATH for login shells by appending one line
# to /etc/profile (non-interactive replacement for editing the file by hand).
# Single quotes are intentional: $PATH and $HOME must be expanded when the
# profile is sourced, not when this line is written.
profile_line='PATH=/data0/prometheus/prometheus_server/bin:$PATH:$HOME/bin'
# Append only if the exact line is not present yet, so re-runs do not stack
# duplicate PATH entries.
grep -qxF -- "${profile_line}" /etc/profile || printf '%s\n' "${profile_line}" >> /etc/profile
source /etc/profile
三、檢查配置文件
# Validate the Prometheus configuration file with promtool.
promtool check config /data0/prometheus/prometheus_server/config/prometheus.yml
# Expected output:
#   Checking /data0/prometheus/prometheus_server/config/prometheus.yml
#   SUCCESS: 0 rule files found
四、建立prometheus.service 的 systemd unit 文件
# Install a systemd unit that runs Prometheus as the "prometheus" user,
# then enable and (re)start it.
# The delimiter is quoted so the unit text is written verbatim.
sudo tee /etc/systemd/system/prometheus.service <<'EOF'
[Unit]
Description=Prometheus
Documentation=https://prometheus.io/
After=network.target

[Service]
Type=simple
User=prometheus
ExecStart=/data0/prometheus/prometheus_server/bin/prometheus \
  --config.file=/data0/prometheus/prometheus_server/config/prometheus.yml \
  --storage.tsdb.path=/data0/prometheus/prometheus_server/data \
  --storage.tsdb.retention.time=60d
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF

# systemd must re-read unit files before a new or changed unit is usable.
systemctl daemon-reload
systemctl enable prometheus.service
# restart stops a running instance first, so no separate stop is needed.
systemctl restart prometheus.service
systemctl status prometheus.service
# Alternative to the systemd unit: run Prometheus under supervisord.
# NOTE(review): this looks like an either/or choice with prometheus.service;
# running both would start two Prometheus instances on the same data dir.

# supervisor is shipped in EPEL, so the repository has to be installed (and
# therefore enabled) before supervisor itself can be resolved.
yum install -y epel-release
yum install -y supervisor

# Quoted delimiter: the ini text is written verbatim.
sudo tee /etc/supervisord.d/prometheus.ini <<'EOF'
[program:prometheus]
# 啓動程序的命令;
command = /data0/prometheus/prometheus_server/bin/prometheus --config.file=/data0/prometheus/prometheus_server/config/prometheus.yml --storage.tsdb.path=/data0/prometheus/prometheus_server/data --storage.tsdb.retention.time=60d
# 在supervisord啓動的時候也自動啓動;
autostart = true
# 程序異常退出後自動重啓;
autorestart = true
# 啓動5秒後沒有異常退出,就看成已經正常啓動了;
startsecs = 5
# 啓動失敗自動重試次數,默認是3;
startretries = 3
# 啓動程序的用戶;
user = prometheus
# 把stderr重定向到stdout,默認false;
redirect_stderr = true
# 標準日誌輸出;
stdout_logfile=/data0/prometheus/prometheus_server/logs/out-prometheus.log
# 錯誤日誌輸出;
stderr_logfile=/data0/prometheus/prometheus_server/logs/err-prometheus.log
# 標準日誌文件大小,默認50MB;
stdout_logfile_maxbytes = 20MB
# 標準日誌文件備份數;
stdout_logfile_backups = 20
EOF

systemctl daemon-reload
systemctl enable supervisord
# restart stops a running supervisord first, so no separate stop is needed.
systemctl restart supervisord
supervisorctl restart prometheus
supervisorctl status
五、prometheus.yml配置文件
# Write the Prometheus main configuration, the file-based target list and the
# four alert rule files, then validate the result with promtool.
# All heredoc delimiters are quoted so "{{ $labels... }}" / "{{ $value }}"
# reach the files literally instead of being expanded by the shell.

# Create the alert rule files referenced by rule_files below.
mkdir -p /data0/prometheus/prometheus_server/rules/
touch /data0/prometheus/prometheus_server/rules/node_down.yml
touch /data0/prometheus/prometheus_server/rules/memory_over.yml
touch /data0/prometheus/prometheus_server/rules/disk_over.yml
touch /data0/prometheus/prometheus_server/rules/cpu_over.yml

# Prometheus configuration file.
cat > /data0/prometheus/prometheus_server/config/prometheus.yml <<'EOF'
# my global config
global:
  scrape_interval: 15s # 設置抓取(pull)時間間隔,默認是1m
  evaluation_interval: 15s # 設置rules評估時間間隔,默認是1m
  # scrape_timeout is set to the global default (10s).

# 告警管理配置,默認配置
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 192.168.56.11:9093 # 這裏修改成 alertmanagers 的地址

# 加載rules,並根據設置的時間間隔按期評估
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - "/data0/prometheus/prometheus_server/rules/node_down.yml" # 實例存活報警規則文件
  - "/data0/prometheus/prometheus_server/rules/memory_over.yml" # 內存報警規則文件
  - "/data0/prometheus/prometheus_server/rules/disk_over.yml" # 磁盤報警規則文件
  - "/data0/prometheus/prometheus_server/rules/cpu_over.yml" # cpu報警規則文件

# 抓取(pull),即監控目標配置
# 默認只有主機自己的監控配置
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    # 可覆蓋全局配置設置的抓取間隔,由15秒重寫成10秒。
    scrape_interval: 10s
    static_configs:
      - targets: ['localhost:9090', 'localhost:9100']

  - job_name: 'DMC_HOST'
    file_sd_configs:
      - files: ['./hosts.json']
    # 被監控的主機,能夠經過static_configs羅列全部機器,這裏經過file_sd_configs參數加載文件的形式讀取
    # 被監控的主機,能夠json或yaml格式書寫,我這裏以json格式書寫,target裏面寫監控機器的ip,labels非必須,能夠由你本身定
EOF

# Host list loaded through file_sd_configs (written next to prometheus.yml,
# which is where the './hosts.json' reference above points).
cat > /data0/prometheus/prometheus_server/config/hosts.json <<'EOF'
[
  {
    "targets": [
      "192.168.56.11:9100",
      "192.168.56.12:9100",
      "192.168.56.13:9100"
    ],
    "labels": {
      "service": "db_node"
    }
  },
  {
    "targets": [
      "192.168.56.14:9100",
      "192.168.56.15:9100",
      "192.168.56.16:9100"
    ],
    "labels": {
      "service": "web_node"
    }
  }
]
EOF

# Instance-down alert.
cat > /data0/prometheus/prometheus_server/rules/node_down.yml <<'EOF'
groups:
- name: 實例存活告警規則
  rules:
  - alert: 實例存活告警
    expr: up == 0
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes."
EOF

# Memory usage alert.
cat > /data0/prometheus/prometheus_server/rules/memory_over.yml <<'EOF'
groups:
- name: 內存報警規則
  rules:
  - alert: 內存使用率告警
    expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 80
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      description: "服務器: 內存使用超過80%!(當前值: {{ $value }}%)"
EOF

# Disk usage alert.
cat > /data0/prometheus/prometheus_server/rules/disk_over.yml <<'EOF'
groups:
- name: 磁盤報警規則
  rules:
  - alert: 磁盤使用率告警
    expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 80
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      description: "服務器: 磁盤設備: 使用超過80%!(掛載點: {{ $labels.mountpoint }} 當前值: {{ $value }}%)"
EOF

# CPU usage alert.
cat > /data0/prometheus/prometheus_server/rules/cpu_over.yml <<'EOF'
groups:
- name: CPU報警規則
  rules:
  - alert: CPU使用率告警
    expr: 100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 90
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      description: "服務器: CPU使用超過90%!(當前值: {{ $value }}%)"
EOF

# Validate the main config together with the rule files it references, so a
# YAML or PromQL mistake is caught before Prometheus is restarted.
/data0/prometheus/prometheus_server/bin/promtool check config /data0/prometheus/prometheus_server/config/prometheus.yml
六、查看UI
Prometheus自帶有簡單的UI, http://192.168.56.11:9090/
http://192.168.56.11:9090/targets http://192.168.56.11:9090/graph
一、下載及解壓安裝包
# Download node_exporter, unpack it into /data0/prometheus/node_exporter and
# hand the tree to the "prometheus" account. Intended to be run as root.
cd /usr/local/src/ || exit 1
export VER="0.18.1"
wget "https://github.com/prometheus/node_exporter/releases/download/v${VER}/node_exporter-${VER}.linux-amd64.tar.gz"

mkdir -p /data0/prometheus
# Create the group/user only when missing so the step can be re-run safely
# (for example on a host where the Prometheus server was already installed).
getent group prometheus >/dev/null || groupadd prometheus
id -u prometheus >/dev/null 2>&1 || useradd -g prometheus -d /data0/prometheus prometheus

tar -xvf "node_exporter-${VER}.linux-amd64.tar.gz"
mv "node_exporter-${VER}.linux-amd64" /data0/prometheus/node_exporter

# "user:group" is the supported separator; the "user.group" form is deprecated.
chown -R prometheus:prometheus /data0/prometheus
二、建立node_exporter.service的 systemd unit 文件
# Write the node_exporter unit into the vendor unit directory
# (/usr/lib/systemd/system). The unit text contains no shell expansions.
cat <<EOF > /usr/lib/systemd/system/node_exporter.service
[Unit]
Description=node_exporter
Documentation=https://prometheus.io/
After=network.target

[Service]
Type=simple
User=prometheus
ExecStart=/data0/prometheus/node_exporter/node_exporter
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF
# Write the node_exporter unit into the administrator unit directory
# (/etc/systemd/system). The unit text contains no shell expansions.
cat <<EOF > /etc/systemd/system/node_exporter.service
[Unit]
Description=node_exporter
Documentation=https://prometheus.io/
After=network.target

[Service]
Type=simple
User=prometheus
ExecStart=/data0/prometheus/node_exporter/node_exporter
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF
三、啓動服務
# Reload unit definitions, then stop, enable and restart node_exporter
# (same order as the individual commands this replaces).
systemctl daemon-reload
for action in stop enable restart; do
  systemctl "${action}" node_exporter.service
done
四、運行狀態
# Show the current state of the node_exporter service.
systemctl status node_exporter.service
五、客戶監控端數據彙報
訪問:http://192.168.56.11:9100/metrics 查看從exporter具體能抓到的數據,如下:
一、下載及解壓安裝包
# Download Alertmanager, unpack it into /data0/prometheus/alertmanager and
# hand the tree to the "prometheus" account. Intended to be run as root.
cd /usr/local/src/ || exit 1
export VER="0.19.0"
wget "https://github.com/prometheus/alertmanager/releases/download/v${VER}/alertmanager-${VER}.linux-amd64.tar.gz"

mkdir -p /data0/prometheus
# Create the group/user only when missing so the step can be re-run safely
# (for example on a host where the Prometheus server was already installed).
getent group prometheus >/dev/null || groupadd prometheus
id -u prometheus >/dev/null 2>&1 || useradd -g prometheus -d /data0/prometheus prometheus

tar -xvf "alertmanager-${VER}.linux-amd64.tar.gz"
mv "alertmanager-${VER}.linux-amd64" /data0/prometheus/alertmanager

# "user:group" is the supported separator; the "user.group" form is deprecated.
chown -R prometheus:prometheus /data0/prometheus
二、配置Alertmanager
alertmanager的webhook集成了釘釘報警,釘釘機器人對文件格式有嚴格要求,因此必須經過特定的格式轉換,才能發送給你釘釘的機器人。有人已經寫了轉換插件,那就直接用吧(https://github.com/timonwong/prometheus-webhook-dingtalk.git)
# Write the Alertmanager configuration: alerts are grouped by alertname and
# routed to two webhook receivers (ops_notify / info_notify) that point at
# the local DingTalk webhook bridge on port 8060.
# Quoted delimiter: the YAML is written verbatim.
cat > /data0/prometheus/alertmanager/alertmanager.yml <<'EOF'
# 全局配置項
global:
  resolve_timeout: 5m # 處理超時時間,默認爲5min

# 定義路由樹信息
route:
  group_by: [alertname] # 報警分組依據
  receiver: ops_notify # 設置默認接收人
  group_wait: 30s # 最初即第一次等待多久時間發送一組警報的通知
  group_interval: 60s # 在發送新警報前的等待時間
  repeat_interval: 1h # 重複發送告警時間。默認1h
  routes:

  - receiver: ops_notify # 基礎告警通知
    group_wait: 10s
    match_re:
      alertname: 實例存活告警|磁盤使用率告警 # 匹配告警規則中的名稱發送

  - receiver: info_notify # 消息告警通知
    group_wait: 10s
    match_re:
      alertname: 內存使用率告警|CPU使用率告警

# 定義基礎告警接收者
receivers:
- name: ops_notify
  webhook_configs:
  - url: http://localhost:8060/dingtalk/ops_dingding/send
    send_resolved: true # 警報被解決以後是否通知

# 定義消息告警接收者
- name: info_notify
  webhook_configs:
  - url: http://localhost:8060/dingtalk/info_dingding/send
    send_resolved: true

# 一個inhibition規則是在與另外一組匹配器匹配的警報存在的條件下,使匹配一組匹配器的警報失效的規則。兩個警報必須具備一組相同的標籤。
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
EOF

# Validate the file with amtool (shipped in the Alertmanager release tarball)
# so a YAML mistake is caught before the service is (re)started.
/data0/prometheus/alertmanager/amtool check-config /data0/prometheus/alertmanager/alertmanager.yml
三、啓動alertmanager
# Install a systemd unit for Alertmanager, then enable and (re)start it.
# Quoted delimiter: the unit text is written verbatim.
cat > /lib/systemd/system/alertmanager.service <<'EOF'
[Unit]
Description=Prometheus: the alerting system
Documentation=http://prometheus.io/docs/
After=prometheus.service

[Service]
ExecStart=/data0/prometheus/alertmanager/alertmanager --config.file=/data0/prometheus/alertmanager/alertmanager.yml
Restart=always
StartLimitInterval=0
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF

# systemd must re-read unit files before a new or changed unit is usable.
systemctl daemon-reload
systemctl enable alertmanager.service
# restart stops a running instance first, so no separate stop is needed.
systemctl restart alertmanager.service
systemctl status alertmanager.service

# Check that the Alertmanager port is listening.
netstat -anpt | grep 9093
四、將釘釘接入 Prometheus AlertManager WebHook
# Send a test message to each DingTalk robot from the command line to verify
# that delivery works. prometheus-webhook-dingtalk sometimes reports HTTP 422;
# that is caused by DingTalk's security restrictions (the policy used here
# requires the message text to contain "prometheus").
#
# The robot access tokens are secrets: supply them through the environment
# instead of hard-coding them.
: "${DINGTALK_OPS_TOKEN:?set DINGTALK_OPS_TOKEN to the access_token of the ops robot}"
: "${DINGTALK_INFO_TOKEN:?set DINGTALK_INFO_TOKEN to the access_token of the info robot}"

for robot_token in "${DINGTALK_OPS_TOKEN}" "${DINGTALK_INFO_TOKEN}"; do
  # The URL is quoted because it contains '?', which is a glob character.
  curl -H "Content-Type: application/json" \
    -d '{"msgtype":"text","text":{"content":"prometheus alert test"}}' \
    "https://oapi.dingtalk.com/robot/send?access_token=${robot_token}"
done
# Install the prebuilt prometheus-webhook-dingtalk release and run it as a
# systemd service with two profiles (ops_dingding / info_dingding).
#
# The robot access tokens are secrets: they are taken from the environment
# and stored in a root-only EnvironmentFile instead of being hard-coded in
# the world-readable unit file.
: "${DINGTALK_OPS_TOKEN:?set DINGTALK_OPS_TOKEN to the access_token of the ops robot}"
: "${DINGTALK_INFO_TOKEN:?set DINGTALK_INFO_TOKEN to the access_token of the info robot}"

cd /usr/local/src/ || exit 1
export VER="0.3.0"
wget "https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v${VER}/prometheus-webhook-dingtalk-${VER}.linux-amd64.tar.gz"
tar -zxvf "prometheus-webhook-dingtalk-${VER}.linux-amd64.tar.gz"
mv "prometheus-webhook-dingtalk-${VER}.linux-amd64" /data0/prometheus/alertmanager/prometheus-webhook-dingtalk

# Create the secrets file with mode 0600 first, then fill it in.
# Unquoted delimiter: the shell substitutes the token values here.
install -m 0600 /dev/null /etc/sysconfig/prometheus-webhook-dingtalk
cat > /etc/sysconfig/prometheus-webhook-dingtalk <<EOF
DINGTALK_OPS_TOKEN=${DINGTALK_OPS_TOKEN}
DINGTALK_INFO_TOKEN=${DINGTALK_INFO_TOKEN}
EOF

# Usage: prometheus-webhook-dingtalk --ding.profile=<profile name>=<webhook URL>
# Quoted delimiter: ${DINGTALK_*_TOKEN} stays literal in the unit and is
# substituted by systemd from the EnvironmentFile when the service starts.
cat > /etc/systemd/system/prometheus-webhook-dingtalk.service <<'EOF'
[Unit]
Description=prometheus-webhook-dingtalk
After=network-online.target

[Service]
Restart=on-failure
EnvironmentFile=/etc/sysconfig/prometheus-webhook-dingtalk
ExecStart=/data0/prometheus/alertmanager/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk \
  --ding.profile=ops_dingding=https://oapi.dingtalk.com/robot/send?access_token=${DINGTALK_OPS_TOKEN} \
  --ding.profile=info_dingding=https://oapi.dingtalk.com/robot/send?access_token=${DINGTALK_INFO_TOKEN}

[Install]
WantedBy=multi-user.target
EOF

systemctl daemon-reload
# Enable the unit so the bridge comes back after a reboot.
systemctl enable prometheus-webhook-dingtalk
# restart stops a running instance first, so no separate stop is needed.
systemctl restart prometheus-webhook-dingtalk
systemctl status prometheus-webhook-dingtalk

# Check that the webhook bridge port is listening.
netstat -nltup | grep 8060
# Docker variant of the DingTalk webhook bridge.
# NOTE(review): this publishes port 8060, the same port the systemd-managed
# bridge is checked on, so it looks like an alternative to that service
# rather than an additional step.
#
# The robot access tokens are secrets: supply them through the environment
# instead of hard-coding them.
: "${DINGTALK_OPS_TOKEN:?set DINGTALK_OPS_TOKEN to the access_token of the ops robot}"
: "${DINGTALK_INFO_TOKEN:?set DINGTALK_INFO_TOKEN to the access_token of the info robot}"

docker pull timonwong/prometheus-webhook-dingtalk:v0.3.0

# Generic form:
#   docker run -d --restart always -p 8060:8060 timonwong/prometheus-webhook-dingtalk:v0.3.0 --ding.profile="<web-hook-name>=<dingtalk-webhook>"
docker run -d --restart always -p 8060:8060 timonwong/prometheus-webhook-dingtalk:v0.3.0 \
  --ding.profile="ops_dingding=https://oapi.dingtalk.com/robot/send?access_token=${DINGTALK_OPS_TOKEN}" \
  --ding.profile="info_dingding=https://oapi.dingtalk.com/robot/send?access_token=${DINGTALK_INFO_TOKEN}"

# The two placeholders explained:
#
# <web-hook-name>: prometheus-webhook-dingtalk supports several DingTalk
#   webhooks; each one is mapped to its URL by name. To serve several
#   webhooks, repeat the --ding.profile option, for example:
#     sudo docker run -d --restart always -p 8060:8060 timonwong/prometheus-webhook-dingtalk:v0.3.0 --ding.profile="webhook1=https://oapi.dingtalk.com/robot/send?access_token=token1" --ding.profile="webhook2=https://oapi.dingtalk.com/robot/send?access_token=token2"
#   The name-to-URL rule is: for ding.profile="webhook1=......" the API URL is
#     http://localhost:8060/dingtalk/webhook1/send
#
# <dingtalk-webhook>: the DingTalk webhook URL obtained earlier.
# Build prometheus-webhook-dingtalk from source and start it in the
# background.
#
# The robot access tokens are secrets: supply them through the environment
# instead of hard-coding them.
: "${DINGTALK_OPS_TOKEN:?set DINGTALK_OPS_TOKEN to the access_token of the ops robot}"
: "${DINGTALK_INFO_TOKEN:?set DINGTALK_INFO_TOKEN to the access_token of the info robot}"

# Install the Go toolchain.
cd /usr/local/src/ || exit 1
wget https://dl.google.com/go/go1.13.4.linux-amd64.tar.gz
tar -zxvf go1.13.4.linux-amd64.tar.gz
mv go/ /usr/local/

# Persist GOROOT / GOPATH / PATH in /etc/profile (non-interactive replacement
# for editing the file by hand). $GOPATH/bin is put on PATH so binaries
# installed by "go get" can be found.
# Single quotes are intentional: the variables must be expanded when the
# profile is sourced, not when the lines are written.
mkdir -p /opt/path
for profile_line in \
  'export GOROOT=/usr/local/go' \
  'export GOPATH=/opt/path' \
  'export PATH=$PATH:$GOROOT/bin:$GOPATH/bin'; do
  # Append only lines that are not present yet, so re-runs stay idempotent.
  grep -qxF -- "${profile_line}" /etc/profile || printf '%s\n' "${profile_line}" >> /etc/profile
done
source /etc/profile

# Fetch and build the plugin.
cd /usr/local/src/ || exit 1
git clone https://github.com/timonwong/prometheus-webhook-dingtalk.git
cd prometheus-webhook-dingtalk || exit 1
go get github.com/timonwong/prometheus-webhook-dingtalk/cmd/prometheus-webhook-dingtalk
make
# (A successful make produces a prometheus-webhook-dingtalk binary.)

# Copy the binary to the exact path that is launched below; the directory is
# created first so cp cannot create a plain file with the directory's name.
mkdir -p /data0/prometheus/alertmanager/prometheus-webhook-dingtalk
cp prometheus-webhook-dingtalk /data0/prometheus/alertmanager/prometheus-webhook-dingtalk/

# Start the service. stdout is redirected first and stderr is then duplicated
# onto it, so both streams end up in the log file.
nohup /data0/prometheus/alertmanager/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk \
  --ding.profile="ops_dingding=https://oapi.dingtalk.com/robot/send?access_token=${DINGTALK_OPS_TOKEN}" \
  --ding.profile="info_dingding=https://oapi.dingtalk.com/robot/send?access_token=${DINGTALK_INFO_TOKEN}" \
  >/tmp/dingding.log 2>&1 &

# Check that the port is listening.
netstat -anpt | grep 8060
一、下載及安裝
# Fetch the Grafana OSS RPM for the pinned version into /usr/local/src and
# install it with yum.
cd /usr/local/src/
export VER="6.4.3"
grafana_rpm=grafana-${VER}-1.x86_64.rpm
wget https://dl.grafana.com/oss/release/${grafana_rpm}
yum localinstall -y ${grafana_rpm}
二、啓動服務
# Reload unit definitions, then enable, stop and restart grafana-server
# (same order as the individual commands this replaces).
systemctl daemon-reload
for action in enable stop restart; do
  systemctl "${action}" grafana-server.service
done
三、訪問WEB界面
默認帳號/密碼:admin/admin http://192.168.56.11:3000
四、Grafana添加數據源
在登錄首頁,點擊"Configuration-Data Sources"按鈕,跳轉到添加數據源頁面,配置如下: Name: prometheus Type: prometheus URL: http://192.168.56.11:9090 Access: Server 取消Default的勾選,其他默認,點擊"Add",如下: 需要安裝餅圖的插件 grafana-cli plugins install grafana-piechart-panel systemctl restart grafana-server.service 請確保安裝後能正常添加餅圖。 安裝consul數據源插件 grafana-cli plugins install sbueringer-consul-datasource systemctl restart grafana-server.service
https://grafana.com/dashboards
https://grafana.com/grafana/dashboards/11074 基礎監控-new https://grafana.com/dashboards/8919 基礎監控 https://grafana.com/dashboards/7362 數據庫監控
參考文檔:
https://www.jianshu.com/p/e59cfd15612e Centos 7 部署 Prometheus、Alertmanager、Grafana 監控 Linux 主機
https://juejin.im/entry/5c2c4a7f6fb9a049b82a90ee 使用 Prometheus 監控 Ceph
http://www.javashuo.com/article/p-kcxjmuso-mg.html CentOS7.5 Prometheus2.5+Grafana5.4監控部署
http://www.javashuo.com/article/p-vxfsodwe-dn.html Grafana+Prometheus打造全方位立體監控系統
https://www.cnblogs.com/sfnz/p/6566951.html 安裝prometheus+grafana監控mysql redis kubernetes等
https://blog.csdn.net/hzs33/article/details/86553259 prometheus+grafana監控mysql、canal服務器