【10】把 Elasticsearch 當數據庫使：Drill Down 下鑽

時間 2019-11-07

原文鏈接

使用 https://github.com/taowen/es-monitor 可以用 SQL 進行 Elasticsearch 的查詢。傳統 SQL 的語義是每一層查詢都是對下一層查詢的綜合，也就是每 SELECT 一層，行數就會變少。比如：

SELECT sum(per_sector) AS total FROM (
    SELECT sector, count(*) AS per_sector FROM (
        SELECT sector, ipo_year FROM symbol
    )
)

Elasticsearch 是不支持這樣的嵌套 SELECT 的。它支持一個更實用的功能：嵌套下鑽（Drill Down）。與傳統 SQL 的語義正好相反，Elasticsearch 的嵌套不是 SELECT FROM 而是 SELECT INSIDE，每經過一層 SELECT，其行數不減少反而增加。因此我把 Elasticsearch 的這種嵌套查詢不稱之爲 SELECT FROM 而是 SELECT INSIDE 以示區別；當然，爲了習慣，用 SELECT FROM 也是支持的。

SELECT INSIDE all_symbols GROUP BY ipo_year

SQL

$ cat << EOF | ./es_query.py http://127.0.0.1:9200
    WITH all_symbols AS (SELECT MAX(market_cap) AS max_all_times FROM symbol);
    WITH per_ipo_year AS (SELECT ipo_year, MAX(market_cap) AS max_this_year INSIDE all_symbols 
        GROUP BY ipo_year LIMIT 2)
EOF

第一行 SELECT 定義了 all_symbols，第二行 SELECT 是在第一次查詢的基礎之上進行自己的二次聚合。這樣多層 SELECT 的好處是每一層可以定義自己的 SELECT 字段。比如 GROUP BY a,b,c 其實做了三次下鑽，但是限於 SQL 的語法，這樣寫是無法給每一層下鑽的分桶計算指標的；而用上面這種 WITH 的語法，就可以每鑽一層計算一層的指標。

{"_bucket_path": ["per_ipo_year"], "max_this_year": 54171930444.0, "max_all_times": 522690000000.0, "ipo_year": 2014}
{"_bucket_path": ["per_ipo_year"], "max_this_year": 5416144671.0, "max_all_times": 522690000000.0, "ipo_year": 2015}

Elasticsearch

{
  "aggs": {
    "max_all_times": {
      "max": {
        "field": "market_cap"
      }
    }, 
    "ipo_year": {
      "terms": {
        "field": "ipo_year", 
        "size": 2
      }, 
      "aggs": {
        "max_this_year": {
          "max": {
            "field": "market_cap"
          }
        }
      }
    }
  }, 
  "size": 0
}

下鑽的概念直接看 Elasticsearch 的查詢其實更清楚。每下鑽一層，括號就往右深了一層。從某種意義上來說，Elasticsearch 的 DSL 其實更接近聚合的實質。

{
  "hits": {
    "hits": [], 
    "total": 6714, 
    "max_score": 0.0
  }, 
  "_shards": {
    "successful": 1, 
    "failed": 0, 
    "total": 1
  }, 
  "took": 2, 
  "aggregations": {
    "max_all_times": {
      "value": 522690000000.0
    }, 
    "ipo_year": {
      "buckets": [
        {
          "max_this_year": {
            "value": 54171930444.0
          }, 
          "key": 2014, 
          "doc_count": 390
        }, 
        {
          "max_this_year": {
            "value": 5416144671.0
          }, 
          "key": 2015, 
          "doc_count": 334
        }
      ], 
      "sum_other_doc_count": 2174, 
      "doc_count_error_upper_bound": 0
    }
  }, 
  "timed_out": false
}

Profile

[
  {
    "query": [
      {
        "query_type": "MatchAllDocsQuery",
        "lucene": "*:*",
        "time": "0.2003040000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 9025,
          "next_doc": 162380,
          "match": 0,
          "build_scorer": 28899,
          "advance": 0
        }
      }
    ],
    "rewrite_time": 2523,
    "collector": [
      {
        "name": "MultiCollector",
        "reason": "search_multi",
        "time": "2.325354000ms",
        "children": [
          {
            "name": "TotalHitCountCollector",
            "reason": "search_count",
            "time": "0.2740410000ms"
          },
          {
            "name": "BucketCollector: [[max_all_times, ipo_year]]",
            "reason": "aggregation",
            "time": "1.295439000ms"
          }
        ]
      }
    ]
  }
]

SELECT INSIDE 多次

SQL

$ cat << EOF | ./es_query.py http://127.0.0.1:9200
    WITH all_symbols AS (SELECT MAX(market_cap) AS max_all_times FROM symbol);
    WITH per_ipo_year AS (SELECT ipo_year, MAX(market_cap) AS max_this_year INSIDE all_symbols 
        GROUP BY ipo_year LIMIT 2);
    WITH per_sector AS (SELECT sector, MAX(market_cap) AS max_this_sector INSIDE per_ipo_year 
        GROUP BY sector LIMIT 2)
EOF

這個和 GROUP BY ipo_year, sector 其實是差不多的，區別在於對每一層下鑽都可以選取這一層的指標出來。

{"sector": "Health Care", "_bucket_path": ["per_ipo_year", "per_sector"], "max_this_year": 54171930444.0, "ipo_year": 2014, "max_all_times": 522690000000.0, "max_this_sector": 2660000000.0}
{"sector": "Finance", "_bucket_path": ["per_ipo_year", "per_sector"], "max_this_year": 54171930444.0, "ipo_year": 2014, "max_all_times": 522690000000.0, "max_this_sector": 5530000000.0}
{"sector": "Finance", "_bucket_path": ["per_ipo_year", "per_sector"], "max_this_year": 5416144671.0, "ipo_year": 2015, "max_all_times": 522690000000.0, "max_this_sector": 2740000000.0}
{"sector": "Health Care", "_bucket_path": ["per_ipo_year", "per_sector"], "max_this_year": 5416144671.0, "ipo_year": 2015, "max_all_times": 522690000000.0, "max_this_sector": 5416144671.0}

Elasticsearch

{
  "aggs": {
    "max_all_times": {
      "max": {
        "field": "market_cap"
      }
    }, 
    "ipo_year": {
      "terms": {
        "field": "ipo_year", 
        "size": 2
      }, 
      "aggs": {
        "sector": {
          "terms": {
            "field": "sector", 
            "size": 2
          }, 
          "aggs": {
            "max_this_sector": {
              "max": {
                "field": "market_cap"
              }
            }
          }
        }, 
        "max_this_year": {
          "max": {
            "field": "market_cap"
          }
        }
      }
    }
  }, 
  "size": 0
}

{
  "hits": {
    "hits": [], 
    "total": 6714, 
    "max_score": 0.0
  }, 
  "_shards": {
    "successful": 1, 
    "failed": 0, 
    "total": 1
  }, 
  "took": 8, 
  "aggregations": {
    "max_all_times": {
      "value": 522690000000.0
    }, 
    "ipo_year": {
      "buckets": [
        {
          "sector": {
            "buckets": [
              {
                "max_this_sector": {
                  "value": 2660000000.0
                }, 
                "key": "Health Care", 
                "doc_count": 104
              }, 
              {
                "max_this_sector": {
                  "value": 5530000000.0
                }, 
                "key": "Finance", 
                "doc_count": 70
              }
            ], 
            "sum_other_doc_count": 216, 
            "doc_count_error_upper_bound": 0
          }, 
          "max_this_year": {
            "value": 54171930444.0
          }, 
          "key": 2014, 
          "doc_count": 390
        }, 
        {
          "sector": {
            "buckets": [
              {
                "max_this_sector": {
                  "value": 2740000000.0
                }, 
                "key": "Finance", 
                "doc_count": 92
              }, 
              {
                "max_this_sector": {
                  "value": 5416144671.0
                }, 
                "key": "Health Care", 
                "doc_count": 92
              }
            ], 
            "sum_other_doc_count": 150, 
            "doc_count_error_upper_bound": 0
          }, 
          "max_this_year": {
            "value": 5416144671.0
          }, 
          "key": 2015, 
          "doc_count": 334
        }
      ], 
      "sum_other_doc_count": 2174, 
      "doc_count_error_upper_bound": 0
    }
  }, 
  "timed_out": false
}

Profile

[
  {
    "query": [
      {
        "query_type": "MatchAllDocsQuery",
        "lucene": "*:*",
        "time": "0.2576120000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 63193,
          "next_doc": 165400,
          "match": 0,
          "build_scorer": 29019,
          "advance": 0
        }
      }
    ],
    "rewrite_time": 3205,
    "collector": [
      {
        "name": "MultiCollector",
        "reason": "search_multi",
        "time": "6.292688000ms",
        "children": [
          {
            "name": "TotalHitCountCollector",
            "reason": "search_count",
            "time": "0.2599140000ms"
          },
          {
            "name": "BucketCollector: [[max_all_times, ipo_year]]",
            "reason": "aggregation",
            "time": "5.172211000ms"
          }
        ]
      }
    ]
  }
]

相信看過前面的例子，你應該理解什麼叫下鑽了。每下鑽一層，前面的一行就會被再次分裂到多個桶裏。每一層都可以搞自己的指標計算。