1、概述
線上部署的k8s已經扛過了雙11的洗禮,期間經過對網絡和監控的優化,順利度過了雙11,而且表現良好。先簡單介紹一下咱們kubernetes的使用方式:
物理機系統:Ubuntu-16.04(kernel 升級到4.17)
kubernetes-version:1.13.2
網絡組件:calico(採用的是BGP模式+bgp reflector)
kube-proxy:使用的是ipvs模式
監控:prometheus+grafana
日誌: fluentd + ES
metrics: metrics-server
HPA:cpu + memory
告警:釘釘
CI/CD: gitlab-ci/gitlab-runner
應用管理工具:helm、chartmuseum(不建議直接使用helm,helm charts可讀性不好,學習成本較高)
因爲k8s、物理環境共存,須要打通網絡提供訪問:kube-gateway
有的地方涉及到公司內部的東西不方便寫出來,可是絕大部分在我以前的博客都有介紹,有興趣的能夠參考一下。
自己的反思:
開始的時候,k8s集羣在線上跑了一段時間,可是我發現我對集羣內部的變化沒有辦法把控得很清楚,好比某個pod被重新調度了、某個node節點上的image gc失敗了、某個hpa被觸發了等等。這些都是能夠經過events拿到的:events包含了集羣各類資源的狀態變化,可是它並非永久存儲的。因此咱們能夠經過收集、分析events來了解整個集羣內部的變化。經過一番探索,找到一個開源的eventrouter來收集events事件,再經過一些改造使其符合咱們的業務場景,改名爲eventrouter-kafka(https://github.com/cuishuaigit/eventrouter-kafka):修改後只需簡單配置就能將events直傳kafka,而不須要各類配置(感覺原版的配置有些繁瑣,不是很好用);並且咱們的日誌也是走kafka隊列的,這樣能夠減輕ES的寫壓力。如今的events收集流程:
eventrouter---->kafka---->logstash(過濾、解析)---->ES---->elastalert---->釘釘
經過添加上面的events收集流程,k8s集羣又完善了一步。
2、簡述流程
一、部署eventrouter
eventrouter是使用golang寫的,能夠根據自己的需求二次開發,部署很簡單,參考:https://github.com/cuishuaigit/eventrouter-kafka 。這裏就不細述了。
二、kafka集羣
參考:https://github.com/cuishuaigit/k8s-kafka
三、logstash
下載相應版本的logstash,下載地址:https://www.elastic.co/guide/en/logstash/6.5/installing-logstash.html
而後進行配置,這裏貼一下我的測試配置:
input{ kafka{ bootstrap_servers => ["kafka-0.kafka-svc.kafka.svc.cluster.local:9092,kafka-1.kafka-svc.kafka.svc.cluster.local:9092,kafka-2.kafka-svc.kafka.svc.cluster.local:9092"] client_id => "eventrouter-prod" #auto_offset_reset => "latest" group_id => "eventrouter" consumer_threads => 2 #decorate_events => true
id => "eventrouter" topics => ["eventrouter"] } } filter { if [message] =~ 'DNSConfigForming' { drop { } } json { source => "message" } mutate { remove_field => [ "message","old_event" ] } } output{ elasticsearch { hosts => "10.4.9.28:9200" index => "eventrouter-%{+YYYY-MM-dd}" } }
四、ES
version: '2' services: elasticsearch: image: docker.elastic.co/elasticsearch/elasticsearch:6.5.1 container_name: elasticsearch environment: - cluster.name=docker-cluster - bootstrap.memory_lock=true
- "ES_JAVA_OPTS=-Xms4096m -Xmx4096m" ulimits: memlock: soft: -1 hard: -1 volumes: - /data/es1:/usr/share/elasticsearch/data - /data/backups:/usr/share/elasticsearch/backups - /data/longterm_backups:/usr/share/elasticsearch/longterm_backups - ./config/jvm.options:/usr/share/elasticsearch/config/jvm.options ports: - "9200:9200" networks: - esnet # elasticsearch2: # image: docker.elastic.co/elasticsearch/elasticsearch:6.5.1 # container_name: elasticsearch2 # environment: # - cluster.name=docker-cluster # - bootstrap.memory_lock=true # - "ES_JAVA_OPTS=-Xms512m -Xmx512m" # - "discovery.zen.ping.unicast.hosts=elasticsearch" # ulimits: # memlock: # soft: -1 # hard: -1 # volumes: # - /data/es2:/usr/share/elasticsearch/data # networks: # - esnet kibana: image: docker.elastic.co/kibana/kibana:6.5.1 container_name: kibana environment: SERVER_NAME: kibana SERVER_HOST: "0.0.0.0" ELASTICSEARCH_URL: http://elasticsearch:9200
XPACK_MONITORING_UI_CONTAINER_ELASTICSEARCH_ENABLED: "true" volumes: - /data/plugin:/usr/share/kibana/plugin - /tmp/:/etc/archives ports: - "5601:5601" networks: - esnet depends_on: - elasticsearch networks: esnet: driver: bridge
cat config/jvm.options
## JVM configuration ################################################################ ## IMPORTANT: JVM heap size ################################################################ ## ## You should always set the min and max JVM heap ## size to the same value. For example, to set ## the heap to 4 GB, set: ## ## -Xms4g ## -Xmx4g ## ## See https://www.elastic.co/guide/en/elasticsearch/reference/current/heap-size.html
## for more information ## ################################################################ # Xms represents the initial size of total heap space # Xmx represents the maximum size of total heap space -Xms2g -Xmx2g ################################################################ ## Expert settings ################################################################ ## ## All settings below this section are considered ## expert settings. Don't tamper with them unless
## you understand what you are doing ## ################################################################ ## GC configuration -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=75
-XX:+UseCMSInitiatingOccupancyOnly ## G1GC Configuration # NOTE: G1GC is only supported on JDK version 10 or later. # To use G1GC uncomment the lines below. # 10-:-XX:-UseConcMarkSweepGC # 10-:-XX:-UseCMSInitiatingOccupancyOnly # 10-:-XX:+UseG1GC # 10-:-XX:InitiatingHeapOccupancyPercent=75 ## optimizations # pre-touch memory pages used by the JVM during initialization -XX:+AlwaysPreTouch ## basic # explicitly set the stack size -Xss1m # set to headless, just in case
-Djava.awt.headless=true # ensure UTF-8 encoding by default (e.g. filenames) -Dfile.encoding=UTF-8 # use our provided JNA always versus the system one -Djna.nosys=true # turn off a JDK optimization that throws away stack traces for common # exceptions because stack traces are important for debugging -XX:-OmitStackTraceInFastThrow # flags to configure Netty -Dio.netty.noUnsafe=true
-Dio.netty.noKeySetOptimization=true
-Dio.netty.recycler.maxCapacityPerThread=0 # log4j 2
-Dlog4j.shutdownHookEnabled=false
-Dlog4j2.disable.jmx=true
-Djava.io.tmpdir=${ES_TMPDIR} ## heap dumps # generate a heap dump when an allocation from the Java heap fails # heap dumps are created in the working directory of the JVM -XX:+HeapDumpOnOutOfMemoryError # specify an alternative path for heap dumps; ensure the directory exists and # has sufficient space -XX:HeapDumpPath=data # specify an alternative path for JVM fatal error logs -XX:ErrorFile=logs/hs_err_pid%p.log ## JDK 8 GC logging 8:-XX:+PrintGCDetails 8:-XX:+PrintGCDateStamps 8:-XX:+PrintTenuringDistribution 8:-XX:+PrintGCApplicationStoppedTime 8:-Xloggc:logs/gc.log 8:-XX:+UseGCLogFileRotation 8:-XX:NumberOfGCLogFiles=32
8:-XX:GCLogFileSize=64m # JDK 9+ GC logging 9-:-Xlog:gc*,gc+age=trace,safepoint:file=logs/gc.log:utctime,pid,tags:filecount=32,filesize=64m # due to internationalization enhancements in JDK 9 Elasticsearch need to set the provider to COMPAT otherwise # time/date parsing will break in an incompatible way for some date patterns and locals 9-:-Djava.locale.providers=COMPAT # temporary workaround for C2 bug with JDK 10 on hardware with AVX-512
10-:-XX:UseAVX=2
五、elastalert
部署參考https://github.com/Yelp/elastalert.git
使用:
mkdir /etc/elastalert
將clone的elastalert目錄下面的config.yaml.example拷貝到上面建立的目錄裏面:
cp elastalert/config.yaml.example /etc/elastalert/config.yaml
只須要修改:
rules_folder、es_host、es_port;若是ES設置了用戶名和密碼,還須要修改es_username、es_password。
建立rules
mkdir /etc/elastalert/rules
六、釘釘
建立機器人參考我其餘的博客,獲取token,而後下載釘釘plugin:https://github.com/xuyaoqiang/elastalert-dingtalk-plugin
將elastalert_modules拷貝到/etc/elastalert目錄下面
cp -r elastalert-dingtalk-plugin/elastalert_modules /etc/elastalert/elastalert
rules example
# Alert when the rate of events exceeds a threshold # (Optional) # Elasticsearch host es_host: 10.2.9.28 # (Optional) # Elasticsearch port es_port: 9200 # (OptionaL) Connect with SSL to Elasticsearch #use_ssl: True # (Optional) basic-auth username and password for Elasticsearch #es_username: someusername #es_password: somepassword # (Required) # Rule name, must be unique name: Other event frequency rule # (Required) # Type of alert. # the frequency rule type alerts when num_events events occur with timeframe time type: frequency # (Required) # Index to search, wildcard supported index: eventrouter-* # (Required, frequency specific) # Alert when this many documents matching the query occur within a timeframe num_events: 5 # (Required, frequency specific) # num_events must occur within this amount of time to trigger an alert timeframe: #hours: 4 minutes: 15 # (Required) # A list of Elasticsearch filters used for find events # These filters are joined with AND and nested in a filtered query # For more info: http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl.html
filter: #- term: # some_field: "some_value"
- query: query_string: query: "event.type: Warning NOT event.involvedObject.kind: Node" # (Required) # The alert is use when a match is found #smtp_host: smtp.exmail.qq.com #smtp_port: 25 #smtp_auth_file: /etc/elastalert/smtp_auth_file.yaml #email_reply_to: ci@qq.com #from_addr: ci@qq.com realert: minutes: 5 exponential_realert: hours: 1 alert: #- "email"
- "elastalert_modules.dingtalk_alert.DingTalkAlerter" dingtalk_webhook: "https://oapi.dingtalk.com/robot/send?access_token=47194e6904c6e3133a9080980984444c8e5d7745e1f76c12cefa99c8c8ac718dd88d4c" dingtalk_msgtype: "text" alert_text_type: alert_text_only alert_text: " ====elastalert message====\n EventTime>>: {0}\n Event_involvedObject_name>>: {1}\n Event_involvedObject_kind>>: {2}\n Event_involvedObject_namespace>>: {3}\n Message>>: {4}\n Event_reason>>: {5}\n verb>>: {6} " alert_text_args: - "@timestamp"
- event.involvedObject.name - event.source.component - event.involvedObject.namespace - event.message - event.reason - verb # (required, email specific) # a list of email addresses to send alerts to #email: #- "ci@qq.com"
自己定製的告警消息格式:
alert: #- "email"
- "elastalert_modules.dingtalk_alert.DingTalkAlerter" dingtalk_webhook: "https://oapi.dingtalk.com/robot/send?access_token=47194e6904c6e3133a9080980984444c8e5d7745e1f76c12cefa99c8c8ac718dd88d4c" dingtalk_msgtype: "text" alert_text_type: alert_text_only alert_text: " ====elastalert message====\n EventTime>>: {0}\n Event_involvedObject_name>>: {1}\n Event_involvedObject_kind>>: {2}\n Event_involvedObject_namespace>>: {3}\n Message>>: {4}\n Event_reason>>: {5}\n verb>>: {6} " alert_text_args: - "@timestamp"
- event.involvedObject.name - event.source.component - event.involvedObject.namespace - event.message - event.reason - verb
詳細信息參考官網:https://elastalert.readthedocs.io/en/latest/recipes/writing_filters.html#writingfilters