elasticsearch-cn-out-of-box


https://github.com/hangxin1940/elasticsearch-cn-out-of-box
An out-of-the-box Elasticsearch distribution that bundles a set of practical plugins and ready-made configuration.

======

Site plugins:

Word-segmentation (analysis) plugins

Other plugins

Adds a custom-analyzer preview to the inquisitor plugin, among other tweaks


Usage

Open the plugin in your browser: http://localhost:9200/_plugin/oob

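Once the node is up, the _analyze API is the quickest way to confirm that the analyzers configured below are actually wired in. Here is a minimal sketch (Python, standard library only), assuming a local node on port 9200; the throwaway "test" index exists only so the index-level analyzer definitions from elasticsearch.yml are in scope:

import json
import urllib.error
import urllib.parse
import urllib.request

ES = "http://localhost:9200"

# create a throwaway index so the index-level analysis settings apply
req = urllib.request.Request(ES + "/test", data=b"", method="PUT")
try:
    urllib.request.urlopen(req)
except urllib.error.HTTPError:
    pass  # index already exists

def analyze(analyzer, text, index="test"):
    """Return the tokens `analyzer` produces for `text` via the _analyze API."""
    qs = urllib.parse.urlencode({"analyzer": analyzer, "text": text})
    with urllib.request.urlopen(ES + "/" + index + "/_analyze?" + qs) as resp:
        return [t["token"] for t in json.load(resp)["tokens"]]

print(analyze("ik", "中华人民共和国"))
print(analyze("mmseg_maxword", "中国人民银行"))  # expect 中国 / 人民 / 银行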

elasticsearch.yml

# cluster name
cluster.name: "cn-out-of-box"
# node name
node.name: "node1"
# whether this node may be elected master
node.master: true
# whether this node stores index data
node.data: true
# default number of shards per index
index.number_of_shards: 3
# default number of replicas per index
index.number_of_replicas: 1
# temp file path
#path.work: "/tmp/elasticsearch"
# log file path
#path.logs:  "/var/log/elasticsearch/logs"
# tcp transport port
transport.tcp.port: 9300
# whether to compress tcp transport data
transport.tcp.compress: true
# http port
http.port: 9200
# whether to enable the http service
#http.enabled: true
# whether to enable multicast node discovery
discovery.zen.ping.multicast.enabled: true

# slow query log thresholds
#index.search.slowlog.threshold.query.warn: 10s
#index.search.slowlog.threshold.query.info: 5s
#index.search.slowlog.threshold.query.debug: 2s
#index.search.slowlog.threshold.query.trace: 500ms

#index.search.slowlog.threshold.fetch.warn: 1s
#index.search.slowlog.threshold.fetch.info: 800ms
#index.search.slowlog.threshold.fetch.debug: 500ms
#index.search.slowlog.threshold.fetch.trace: 200ms


# serve http through the jetty plugin
http.type: com.sonian.elasticsearch.http.jetty.JettyHttpServerTransport


# sonian.elasticsearch.http.jetty:
    # ==== enable https
    #ssl_port: 9443
    #config: jetty.xml,jetty-ssl.xml, jetty-gzip.xml
    #keystore_password: "OBF:1nc01vuz1w8f1w1c1rbu1rac1w261w9b1vub1ndq"
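    # (the OBF: value is an obfuscated password generated with jetty's Password utility)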
    
    # ==== enable user authentication (http basic auth)
    # config: jetty.xml,jetty-hash-auth.xml,jetty-restrict-all.xml
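    # (user names and passwords for hash auth are defined in the jetty plugin's realm.properties)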
    
    

# index settings
index:

  # analysis settings
  analysis:

    # tokenizers
    tokenizer:

      index_ansj_token:
        type: ansj_index_token
        is_name: false
        is_num: false
        is_quantifier: false

      query_ansj_token:
        type: ansj_query_token
        is_name: false
        is_num: false
        is_quantifier: false

# ======== analysis-pinyin ========
      # full pinyin
      my_pinyin:
        type: pinyin
        first_letter: prefix
        padding_char: ' '

      # pinyin first letters only
      pinyin_first_letter:
        type: pinyin
        first_letter: only
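      #       example, roughly: 劉德華 > "ldh liu de hua" (my_pinyin) / "ldh" (pinyin_first_letter)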

# ======== analysis-mmseg ========
      # simple forward maximum matching
      #       example: 一個勁兒的說話
      #       一個
      #       一個勁
      #       一個勁兒
      #       一個勁兒的
      mmseg_simple:
        type: mmseg
        seg_type: simple
        
      # finds all candidate "chunks of three words"
      # and disambiguates with four rules (maximum matching, largest average word length,
      # smallest variance of word lengths, largest sum of natural logs of single-character word frequencies)
      #       example: 研究生命起源
      #       研_究_生
      #       研_究_生命
      #       研究生_命_起源
      #       研究_生命_起源
      mmseg_complex:
        type: mmseg
        seg_type: complex
        
      # max-word segmentation built on complex
      #       example: 中國人民銀行
      #       中國|人民|銀行
      mmseg_maxword:
        type: mmseg
        seg_type: max_word

# ======== analysis-stconvert ========
      # simplified-to-traditional conversion, traditional output only
      s2t_convert:
        type: stconvert
        delimiter: ","
        convert_type: s2t

      # traditional-to-simplified conversion, simplified output only
      t2s_convert:
        type: stconvert
        delimiter: ","
        convert_type: t2s

      # simplified-to-traditional conversion, outputs both forms
      s2t_keep_both_convert:
        type: stconvert
        delimiter: ","
        keep_both: 'true'
        convert_type: s2t

      # traditional-to-simplified conversion, outputs both forms
      t2s_keep_both_convert:
        type: stconvert
        delimiter: ","
        keep_both: 'true'
        convert_type: t2s
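      #       example: s2t_convert: 中国 > 中國; t2s_convert: 中國 > 中国
      #       (keep_both variants emit both forms joined by the delimiter, e.g. 中国,中國)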
        
# ======== analysis-pattern ========
      # regex tokenizer, split on ";"
      semicolon_spliter:
        type: pattern
        pattern: ";"

      # regex tokenizer, split on "%"
      pct_spliter:
        type: pattern
        pattern: "[%]+"
 
 # ======== analysis-nGram ========
      # grams of 1 to 2 characters
      ngram_1_to_2:
        type: nGram
        min_gram: 1
        max_gram: 2

      # grams of 1 to 3 characters
      ngram_1_to_3:
        type: nGram
        min_gram: 1
        max_gram: 3
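      #       example: ngram_1_to_2 on "abc" > a, ab, b, bc, c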

    # token filters
    filter:

 # ======== ngram filter ========     
      ngram_min_3:
        max_gram: 10
        min_gram: 3
        type: nGram
      ngram_min_2:
        max_gram: 10
        min_gram: 2
        type: nGram
      ngram_min_1:
        max_gram: 10
        min_gram: 1
        type: nGram

 # ======== length filter ========    
      min2_length:
        min: 2
        max: 4
        type: length
      min3_length:
        min: 3
        max: 4
        type: length
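      # (length filters keep only tokens whose character count falls within [min, max])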

 # ======== string2int filter ========   
#      my_string2int:
#        type: string2int
#        redis_server: 127.0.0.1
#        redis_port: 6379
#        redis_key: index1_type2_name2

 # ======== pinyin filter ========  
      pinyin_first_letter:
        type: pinyin
        first_letter: only
        
    # analyzers
    analyzer:
    
      lowercase_keyword:
        type: custom
        filter:
        - lowercase
        tokenizer: standard

      lowercase_keyword_ngram_min_size1:
        type: custom
        filter:
        - lowercase
        - stop
        - trim
        - unique
        tokenizer: nGram

      lowercase_keyword_ngram_min_size2:
        type: custom
        filter:
        - lowercase
        - min2_length
        - stop
        - trim
        - unique
        tokenizer: nGram

      lowercase_keyword_ngram_min_size3:
        type: custom
        filter:
        - lowercase
        - min3_length
        - stop
        - trim
        - unique
        tokenizer: ngram_1_to_3

      lowercase_keyword_ngram:
        type: custom
        filter:
        - lowercase        
        - stop
        - trim
        - unique
        tokenizer: ngram_1_to_3

      lowercase_keyword_without_standard:
        type: custom
        filter:
        - lowercase
        tokenizer: keyword

      lowercase_whitespace:
        type: custom
        filter:
        - lowercase
        tokenizer: whitespace

 # ======== ik ========
      # the ik analyzer
      ik:
        alias:
        - ik_analyzer
        type: org.elasticsearch.index.analysis.IkAnalyzerProvider

      # ik finest-grained segmentation (max word)
      ik_max_word:
        type: ik
        use_smart: false

      # ik smart (coarse-grained) segmentation
      ik_smart:
        type: ik
        use_smart: true

 # ======== mmseg  ========    
      # the mmseg analyzer
      mmseg:
        alias:
        - mmseg_analyzer
        type: org.elasticsearch.index.analysis.MMsegAnalyzerProvider
        
      mmseg_maxword:
        type: custom
        filter:
        - lowercase
        tokenizer: mmseg_maxword
        
      mmseg_complex:
        type: custom
        filter:
        - lowercase
        tokenizer: mmseg_complex
        
      mmseg_simple:
        type: custom
        filter:
        - lowercase
        tokenizer: mmseg_simple

 # ======== pattern (regex) analyzers ========
      comma_spliter:
        type: pattern
        pattern: "[,|\\s]+"
        
      pct_spliter:
        type: pattern
        pattern: "[%]+"
        
        
      custom_snowball_analyzer:
        type: snowball
        language: English
        
      simple_english_analyzer:
        type: custom
        tokenizer: whitespace
        filter:
        - standard
        - lowercase
        - snowball
        
      edge_ngram:
        type: custom
        tokenizer: edgeNGram
        filter:
        - lowercase
 
  # ======== pinyin analysis ========
      pinyin_ngram_analyzer:
        type: custom
        tokenizer: my_pinyin
        filter:
        - lowercase
        - nGram
        - trim
        - unique

  # ======== pinyin first-letter analysis ========
      pinyin_first_letter_analyzer:
        type: custom
        tokenizer: pinyin_first_letter
        filter:
        - standard
        - lowercase
 
   # ======== pinyin first letters, keyword-tokenized and filtered ========
      pinyin_first_letter_keyword_analyzer:
        alias:
        - pinyin_first_letter_analyzer_keyword
        type: custom
        tokenizer: keyword
        filter:
        - pinyin_first_letter
        - lowercase
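        # example, roughly: 劉德華 (kept whole by the keyword tokenizer) > ldh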

   # ======== simplified/traditional Chinese conversion ========
      stconvert:
        alias:
        - st_analyzer
        type: org.elasticsearch.index.analysis.STConvertAnalyzerProvider
        
      s2t_convert:
        type: stconvert
        delimiter: ","
        convert_type: s2t
        
      t2s_convert:
        type: stconvert
        delimiter: ","
        convert_type: t2s
        
      s2t_keep_both_convert:
        type: stconvert
        delimiter: ","
        keep_both: 'true'
        convert_type: s2t
        
      t2s_keep_both_convert:
        type: stconvert
        delimiter: ","
        keep_both: 'true'
        convert_type: t2s
        
   
      #string2int:
        #type: org.elasticsearch.index.analysis.String2IntAnalyzerProvider
        # redis_server: 127.0.0.1
        # redis_port: 6379
        # redis_key: index1_type1_name1
        
      #custom_string2int:
        #type: custom
        #tokenizer: whitespace
        #filter:
        #- string2int
        #- lowercase
       
      # path analysis
      path_analyzer: 
        type: custom
        tokenizer: path_hierarchy
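        #       example: /a/b/c > /a, /a/b, /a/b/c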
        
# ======== ansj ========
      index_ansj:
        alias:
        - ansj_index_analyzer
        type: ansj_index
        user_path: ansj/user
        ambiguity: ansj/ambiguity.dic
        stop_path: ansj/stopLibrary.dic
        #is_name: false
        #is_num: true
        #is_quantifier: true
        redis: false
          #pool:
            #maxactive: 20
            #maxidle: 10
            #maxwait: 100
            #testonborrow: true
          #ip: 127.0.0.1:6379
          #channel: ansj_term
          
      query_ansj:
        alias:
        - ansj_query_analyzer
        type: ansj_query
        user_path: ansj/user
        ambiguity: ansj/ambiguity.dic
        stop_path: ansj/stopLibrary.dic
        #is_name: false
        # is_num: true
        # is_quantifier: true
        redis: false
          #pool:
            #maxactive: 20
            #maxidle: 10
            #maxwait: 100
            #testonborrow: true
          #ip: 127.0.0.1:6379
          #channel: ansj_term
          
      uax_url_email: 
        tokenizer: uax_url_email 
        filter: [standard, lowercase, stop] 
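        # example: "mail me at john@example.com" > mail, me, john@example.com ("at" dropped as a stopword)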
 
 # ======== combo ========       
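      # combo runs every sub-analyzer over the same input and merges their token streams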
      combo:
        type: combo
        sub_analyzers: 
         - ansj_index
         - ik_smart
         - mmseg_complex
         - uax_url_email
         - s2t_convert
         - t2s_convert
         - smartcn
         - simple_english_analyzer

# default analyzer
index.analysis.analyzer.default.type: combo
# thread pool settings
threadpool:   
    index:   
        type: fixed   
        size: 30   
        queue: -1   
        reject_policy: caller
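Since index.analysis.analyzer.default.type is combo, every document field and every query string is analyzed by all of the sub-analyzers listed above, and their token streams are merged. Here is a hedged end-to-end sketch of what that buys you (the "demo" index and the sample document are placeholders, not something this repo ships):

import json
import urllib.error
import urllib.request

ES = "http://localhost:9200"

def request(method, path, body=None):
    """Tiny JSON-over-HTTP helper, standard library only."""
    data = None if body is None else json.dumps(body).encode("utf-8")
    req = urllib.request.Request(ES + path, data=data, method=method)
    try:
        with urllib.request.urlopen(req) as resp:
            return json.load(resp)
    except urllib.error.HTTPError as err:
        return json.loads(err.read() or b"{}")

request("PUT", "/demo")                                 # picks up the yml analysis settings
request("PUT", "/demo/doc/1", {"title": "中国人民银行"})  # indexed through the combo default
request("POST", "/demo/_refresh")

# combo also analyzes the query; its t2s_convert sub-analyzer maps 中國 to 中国,
# so a traditional-script query should, in principle, match the simplified document
resp = request("POST", "/demo/_search", {"query": {"match": {"title": "中國"}}})
print(resp["hits"]["total"])  # should print 1 if the cross-script match works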