【11】把 Elasticsearch 當數據庫使：Filter 下鑽

時間 2019-11-07

原文原文鏈接

使用 https://github.com/taowen/es-monitor 可以用 SQL 進行 elasticsearch 的查詢。下鑽未必一定是 GROUP BY，每經過一層 GROUP BY，桶的數量就會增加一層。還有一種下鑽是用 filter 來下鑽，這種下鑽方式會使下一層 match 的文檔數量變少，但是桶的數量不變。

SELECT INSIDE WHERE ipo_year=2000

SQL

$ cat << EOF | ./es_query.py http://127.0.0.1:9200
    WITH all_symbols AS (SELECT MAX(market_cap) AS max_all_times FROM symbol);
    WITH year_2000 AS (SELECT MAX(market_cap) AS max_at_2000 FROM all_symbols 
        WHERE ipo_year=2000)
EOF

{"max_at_2000": 20310000000.0, "max_all_times": 522690000000.0}

可以看到桶的數量還是同一個，只是增加了一個 max_at_2000 結果字段。
Elasticsearch

{
  "aggs": {
    "year_2000": {
      "filter": {
        "term": {
          "ipo_year": 2000
        }
      }, 
      "aggs": {
        "max_at_2000": {
          "max": {
            "field": "market_cap"
          }
        }
      }
    }, 
    "max_all_times": {
      "max": {
        "field": "market_cap"
      }
    }
  }, 
  "size": 0
}

{
  "hits": {
    "hits": [], 
    "total": 6714, 
    "max_score": 0.0
  }, 
  "_shards": {
    "successful": 1, 
    "failed": 0, 
    "total": 1
  }, 
  "took": 3, 
  "aggregations": {
    "year_2000": {
      "max_at_2000": {
        "value": 20310000000.0
      }, 
      "doc_count": 58
    }, 
    "max_all_times": {
      "value": 522690000000.0
    }
  }, 
  "timed_out": false
}

Profile

[
  {
    "query": [
      {
        "query_type": "TermQuery",
        "lucene": "ipo_year:`P",
        "time": "0.3492430000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 220149,
          "next_doc": 0,
          "match": 0,
          "build_scorer": 95037,
          "advance": 34057
        }
      },
      {
        "query_type": "MatchAllDocsQuery",
        "lucene": "*:*",
        "time": "0.1883710000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 3980,
          "next_doc": 169730,
          "match": 0,
          "build_scorer": 14661,
          "advance": 0
        }
      }
    ],
    "rewrite_time": 3158,
    "collector": [
      {
        "name": "MultiCollector",
        "reason": "search_multi",
        "time": "8.789594000ms",
        "children": [
          {
            "name": "TotalHitCountCollector",
            "reason": "search_count",
            "time": "0.2768050000ms"
          },
          {
            "name": "BucketCollector: [[year_2000, max_all_times]]",
            "reason": "aggregation",
            "time": "7.667765000ms"
          }
        ]
      }
    ]
  }
]

從 Profile 的結果來看，實際上是一次性做了兩個查詢，一個是所有文檔，一個是 ipo_year=2000 的文檔，然後統一聚合。

SELECT INSIDE 往不同方向過濾下鑽

SQL

$ cat << EOF | ./es_query.py http://127.0.0.1:9200
    WITH all_symbols AS (SELECT MAX(market_cap) AS max_all_times FROM symbol);
    WITH year_2000 AS (SELECT MAX(market_cap) AS max_at_2000 FROM all_symbols 
        WHERE ipo_year=2000);
    WITH year_2001 AS (SELECT MAX(market_cap) AS max_at_2001 FROM all_symbols 
        WHERE ipo_year=2001)
EOF

這個寫法其實和 CASE WHEN 很類似，但是其表達能力更強大，更靈活。

$ cat << EOF | ./es_query.py http://127.0.0.1:9200
    SELECT per_ipo_year, MAX(market_cap) AS max_all_times FROM symbol 
        GROUP BY CASE 
            WHEN ipo_year=2000 THEN 'year_2000' 
            WHEN ipo_year=2001 THEN 'year_2001' 
        END AS per_ipo_year
EOF

上面 WITH 寫法的 SQL 查詢結果是：

{"max_at_2000": 20310000000.0, "max_all_times": 522690000000.0, "max_at_2001": 8762940000.0}

Elasticsearch

{
  "aggs": {
    "year_2001": {
      "filter": {
        "term": {
          "ipo_year": 2001
        }
      }, 
      "aggs": {
        "max_at_2001": {
          "max": {
            "field": "market_cap"
          }
        }
      }
    }, 
    "year_2000": {
      "filter": {
        "term": {
          "ipo_year": 2000
        }
      }, 
      "aggs": {
        "max_at_2000": {
          "max": {
            "field": "market_cap"
          }
        }
      }
    }, 
    "max_all_times": {
      "max": {
        "field": "market_cap"
      }
    }
  }, 
  "size": 0
}

{
  "hits": {
    "hits": [], 
    "total": 6714, 
    "max_score": 0.0
  }, 
  "_shards": {
    "successful": 1, 
    "failed": 0, 
    "total": 1
  }, 
  "took": 2, 
  "aggregations": {
    "year_2001": {
      "max_at_2001": {
        "value": 8762940000.0
      }, 
      "doc_count": 38
    }, 
    "year_2000": {
      "max_at_2000": {
        "value": 20310000000.0
      }, 
      "doc_count": 58
    }, 
    "max_all_times": {
      "value": 522690000000.0
    }
  }, 
  "timed_out": false
}

Profile

[
  {
    "query": [
      {
        "query_type": "TermQuery",
        "lucene": "ipo_year:`Q",
        "time": "0.2518270000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 186032,
          "next_doc": 0,
          "match": 0,
          "build_scorer": 48664,
          "advance": 17131
        }
      },
      {
        "query_type": "TermQuery",
        "lucene": "ipo_year:`P",
        "time": "0.1200760000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 77254,
          "next_doc": 0,
          "match": 0,
          "build_scorer": 25184,
          "advance": 17638
        }
      },
      {
        "query_type": "MatchAllDocsQuery",
        "lucene": "*:*",
        "time": "0.1968800000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 3573,
          "next_doc": 180136,
          "match": 0,
          "build_scorer": 13171,
          "advance": 0
        }
      }
    ],
    "rewrite_time": 4250,
    "collector": [
      {
        "name": "MultiCollector",
        "reason": "search_multi",
        "time": "2.459413000ms",
        "children": [
          {
            "name": "TotalHitCountCollector",
            "reason": "search_count",
            "time": "0.2160950000ms"
          },
          {
            "name": "BucketCollector: [[year_2001, year_2000, max_all_times]]",
            "reason": "aggregation",
            "time": "1.455703000ms"
          }
        ]
      }
    ]
  }
]

SELECT INSIDE 往同一方向連續過濾下鑽

SQL

$ cat << EOF | ./es_query.py http://127.0.0.1:9200
    WITH SELECT MAX(market_cap) AS max_all_times FROM symbol AS all_symbols;
    WITH SELECT MAX(market_cap) AS max_at_2000 FROM all_symbols 
        WHERE ipo_year=2000 AS year_2000;
    WITH SELECT MAX(market_cap) AS max_at_2001_finance FROM year_2000 
        WHERE sector='Finance' AS year_2000_finance
EOF

{"max_at_2000": 20310000000.0, "max_all_times": 522690000000.0, "max_at_2001_finance": 985668354.0}

Elasticsearch

{
  "aggs": {
    "year_2000": {
      "filter": {
        "term": {
          "ipo_year": 2000
        }
      }, 
      "aggs": {
        "max_at_2000": {
          "max": {
            "field": "market_cap"
          }
        }, 
        "year_2000_finance": {
          "filter": {
            "term": {
              "sector": "Finance"
            }
          }, 
          "aggs": {
            "max_at_2001_finance": {
              "max": {
                "field": "market_cap"
              }
            }
          }
        }
      }
    }, 
    "max_all_times": {
      "max": {
        "field": "market_cap"
      }
    }
  }, 
  "size": 0
}

{
  "hits": {
    "hits": [], 
    "total": 6714, 
    "max_score": 0.0
  }, 
  "_shards": {
    "successful": 1, 
    "failed": 0, 
    "total": 1
  }, 
  "took": 2, 
  "aggregations": {
    "year_2000": {
      "max_at_2000": {
        "value": 20310000000.0
      }, 
      "year_2000_finance": {
        "max_at_2001_finance": {
          "value": 985668354.0
        }, 
        "doc_count": 2
      }, 
      "doc_count": 58
    }, 
    "max_all_times": {
      "value": 522690000000.0
    }
  }, 
  "timed_out": false
}

Profile

[
  {
    "query": [
      {
        "query_type": "TermQuery",
        "lucene": "ipo_year:`P",
        "time": "0.1897790000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 145762,
          "next_doc": 0,
          "match": 0,
          "build_scorer": 26216,
          "advance": 17801
        }
      },
      {
        "query_type": "TermQuery",
        "lucene": "sector:Finance",
        "time": "0.2380290000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 57770,
          "next_doc": 0,
          "match": 0,
          "build_scorer": 55497,
          "advance": 124762
        }
      },
      {
        "query_type": "MatchAllDocsQuery",
        "lucene": "*:*",
        "time": "0.1965630000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 3500,
          "next_doc": 178347,
          "match": 0,
          "build_scorer": 14716,
          "advance": 0
        }
      }
    ],
    "rewrite_time": 4190,
    "collector": [
      {
        "name": "MultiCollector",
        "reason": "search_multi",
        "time": "2.466917000ms",
        "children": [
          {
            "name": "TotalHitCountCollector",
            "reason": "search_count",
            "time": "0.2712430000ms"
          },
          {
            "name": "BucketCollector: [[year_2000, max_all_times]]",
            "reason": "aggregation",
            "time": "1.370663000ms"
          }
        ]
      }
    ]
  }
]

有了 GROUP BY 下鑽和 FILTER 下鑽，很多複雜的查詢可以一條就查詢出來。而這種邊下鑽邊聚合指標的查詢能力甚至是傳統 SQL 都不具備的。而且稍微訓練一下，就會非常習慣這種下鑽的思維方式，寫查詢也會很自然。