想獲得更好的閱讀體驗,建議點擊下方的原文連結。
原文地址:http://maoqide.live/post/practice/kubernetes-monitoring/node
完整地記錄 kubernetes 監控從部署到配置的過程。
git
https://github.com/kubernetes/kubernetes/tree/master/cluster/addons/prometheus
# Deploy the Prometheus addon manifests (from cluster/addons/prometheus) into
# the "monitoring" namespace instead of the default "kube-system".
# Run from the directory that contains the addon manifests.
# NOTE(review): the "monitoring" namespace is assumed to exist already — confirm.

# Change namespace: rewrite every manifest in the current directory in place.
# The sed script is quoted so the shell never interprets it.
sed -i 's/kube-system/monitoring/g' ./*

# Prometheus itself.
# NOTE(review): the original note here reads "dynamic provision storage class";
# presumably the StatefulSet needs a StorageClass that can provision its
# volume — verify before applying.
kubectl create -f prometheus-configmap.yaml
kubectl create -f prometheus-rbac.yaml
kubectl create -f prometheus-statefulset.yaml
kubectl create -f prometheus-service.yaml

# kube-state-metrics (RBAC first, so its objects exist before the Deployment).
kubectl create -f kube-state-metrics-rbac.yaml
kubectl create -f kube-state-metrics-deployment.yaml
kubectl create -f kube-state-metrics-service.yaml

# node-exporter
kubectl create -f node-exporter-ds.yml
kubectl create -f node-exporter-service.yaml
/etc/systemd/system/kubelet.service.d/10-kubeadm.conf
[Service]
Environment="KUBELET_EXTRA_ARGS=--pod-infra-container-image=harbor.guahao-inc.com/kubernetes/pause-amd64:3.1 --hostname-override=172.27.32.165"
Environment="KUBELET_KUBECONFIG_ARGS=--bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.conf --kubeconfig=/etc/kubernetes/kubelet.conf"
Environment="KUBELET_SYSTEM_PODS_ARGS=--pod-manifest-path=/etc/kubernetes/manifests --allow-privileged=true"
Environment="KUBELET_NETWORK_ARGS=--network-plugin=cni --cni-conf-dir=/etc/cni/net.d --cni-bin-dir=/opt/cni/bin"
Environment="KUBELET_DNS_ARGS=--cluster-dns=10.254.0.10 --cluster-domain=cluster.local"
Environment="KUBELET_AUTHZ_ARGS=--authorization-mode=Webhook --client-ca-file=/etc/kubernetes/pki/ca.crt"
Environment="KUBELET_CADVISOR_ARGS=--cadvisor-port=0"
Environment="KUBELET_CGROUP_ARGS=--cgroup-driver=systemd"
Environment="KUBELET_CERTIFICATE_ARGS=--rotate-certificates=true --cert-dir=/var/lib/kubelet/pki"
ExecStart=
ExecStart=/usr/bin/kubelet $KUBELET_KUBECONFIG_ARGS $KUBELET_SYSTEM_PODS_ARGS $KUBELET_NETWORK_ARGS $KUBELET_DNS_ARGS $KUBELET_AUTHZ_ARGS $KUBELET_CADVISOR_ARGS $KUBELET_CGROUP_ARGS $KUBELET_CERTIFICATE_ARGS $KUBELET_EXTRA_ARGS
# Adjust kubelet and control-plane settings, following
# https://github.com/coreos/prometheus-operator/blob/master/Documentation/troubleshooting.md

# --- On all nodes ---
KUBEADM_SYSTEMD_CONF=/etc/systemd/system/kubelet.service.d/10-kubeadm.conf

# Drop every line that contains "cadvisor-port=0" from the kubelet drop-in.
sed -e "/cadvisor-port=0/d" -i "$KUBEADM_SYSTEMD_CONF"

# Add --authentication-token-webhook=true in front of --authorization-mode=Webhook,
# but only if the flag is not already present, so re-running is harmless.
if ! grep -q "authentication-token-webhook=true" "$KUBEADM_SYSTEMD_CONF"; then
  sed -e "s/--authorization-mode=Webhook/--authentication-token-webhook=true --authorization-mode=Webhook/" -i "$KUBEADM_SYSTEMD_CONF"
fi

# Reload unit files and restart kubelet so the edited drop-in takes effect.
systemctl daemon-reload
systemctl restart kubelet

# --- On the master ---
# Switch the controller-manager and scheduler --address flag from 127.0.0.1
# to 0.0.0.0 in their static pod manifests.
sed -e "s/- --address=127.0.0.1/- --address=0.0.0.0/" -i /etc/kubernetes/manifests/kube-controller-manager.yaml
sed -e "s/- --address=127.0.0.1/- --address=0.0.0.0/" -i /etc/kubernetes/manifests/kube-scheduler.yaml
[Service]
Environment="KUBELET_EXTRA_ARGS=--pod-infra-container-image=harbor.guahao-inc.com/kubernetes/pause-amd64:3.1 --hostname-override=172.27.32.165"
Environment="KUBELET_KUBECONFIG_ARGS=--bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.conf --kubeconfig=/etc/kubernetes/kubelet.conf"
Environment="KUBELET_SYSTEM_PODS_ARGS=--pod-manifest-path=/etc/kubernetes/manifests --allow-privileged=true"
Environment="KUBELET_NETWORK_ARGS=--network-plugin=cni --cni-conf-dir=/etc/cni/net.d --cni-bin-dir=/opt/cni/bin"
Environment="KUBELET_DNS_ARGS=--cluster-dns=10.254.0.10 --cluster-domain=cluster.local"
Environment="KUBELET_AUTHZ_ARGS=--authentication-token-webhook=true --authorization-mode=Webhook --client-ca-file=/etc/kubernetes/pki/ca.crt"
Environment="KUBELET_CGROUP_ARGS=--cgroup-driver=systemd"
Environment="KUBELET_CERTIFICATE_ARGS=--rotate-certificates=true --cert-dir=/var/lib/kubelet/pki"
ExecStart=
ExecStart=/usr/bin/kubelet $KUBELET_KUBECONFIG_ARGS $KUBELET_SYSTEM_PODS_ARGS $KUBELET_NETWORK_ARGS $KUBELET_DNS_ARGS $KUBELET_AUTHZ_ARGS $KUBELET_CADVISOR_ARGS $KUBELET_CGROUP_ARGS $KUBELET_CERTIFICATE_ARGS $KUBELET_EXTRA_ARGS
# Start Grafana in a container named "grafana", published on host port 3000.
# The image reference is "grafana/grafana" (repository), not "grafana:grafana"
# (which would mean an image called "grafana" with the tag "grafana").
docker run -d -p 3000:3000 --name grafana grafana/grafana

# Install the kubernetes plugin.
# wget would otherwise save the file as "download" (the last URL segment),
# so name the output explicitly to match the unzip step below.
wget -O grafana-kubernetes-app-31da38a.zip \
  https://grafana.com/api/plugins/grafana-kubernetes-app/versions/1.0.1/download
unzip grafana-kubernetes-app-31da38a.zip

# Copy the unpacked plugin into the container's plugin directory, then restart
# Grafana.
docker cp grafana-kubernetes-app-31da38a/ grafana:/var/lib/grafana/plugins/
docker restart grafana
配置好 grafana 的 kubernetes 地址和證書相關配置,就可以看到 kubernetes 集羣相關的監控圖表信息了。
https://github.com/prometheus/prometheus/wiki/FAQ#error-file-already-closed
prometheus /targets 頁面全部監控都是 down 狀態,報錯:
WAL log samples: log series: write /data/wal/000003: file already closed
log series: no space left on device
shell
原因爲磁盤已滿:
/data/wal $ ls
/data/wal $ pwd
echo > *
清理即可。
prometheus 需要設置合理的 retention 時間(例如透過啓動參數 --storage.tsdb.retention.time),保證磁盤空間不會被佔滿。