MySQL Internals-Index Merge優化 - 心中無碼 - 博客園

時間 2019-11-07

標籤 mysql internals index merge 優化心中博客欄目 MySQL 简体版

原文原文鏈接

MySQL Internals-Index Merge優化

Louis Hust

0 前言

以前搞錯了，以爲Index Merge是MySQL 5.6的新特性，原來不是，發現5.5也有；看了下manual，發現5.0的manual中就已經存在了，可以說是一個歷史悠久的優化手段了。好吧，無論怎麼樣，今天就撥開其神祕的面紗，看看其內部到底是如何生成這種Index Merge計劃的。這裏只詳細介紹Intersect操作，對於Union和Sort-Union的具體代碼，還沒開始研究。

1 Index Merge理論基礎

Index Merge——索引歸併，即針對一張表，同時使用多個索引進行查詢，然後將各個索引查出來的結果進行進一步的操作，可以是求交——Intersect，也可以是求並——Union；針對Union還有一種補充算法——Sort-Union。很奇怪爲什麼沒有Sort-Intersect，按道理也是可以做的。

什麼情況下，同時使用多個索引會有利呢？比如說WHERE條件是C1=10 AND C2=100，但是隻有分別針對C1和C2的索引，而沒有(C1,C2)這種索引，這時兩個索引同時使用纔有意義：通過兩個索引都可以快速定位到一批數據，然後對這兩批數據進行進一步的求交或求並操作即可，這樣的效率可能比全表掃描，或者只使用其中一個索引進行掃描然後再回主索引查詢要快。

Intersect和Union都要求所使用的索引是ROR的，也就是ROWID ORDERED，即不同的索引掃描出來的數據必須都是按照ROWID排序的。這裏的ROWID其實也就是InnoDB的主鍵（如果不定義主鍵，InnoDB會隱式添加ROWID列作爲主鍵）。只有每個索引都是ROR的，才能進行歸併，你懂的。當然你可能會有疑惑：查出記錄後內部進行一次sort不一樣麼，何必一定要ROR呢？不錯，所以有了SORT-UNION。SORT-UNION就是每個非ROR的索引掃描結果先排序後再進行Merge。至於爲什麼沒有SORT-INTERSECT，我也很是迷茫。

2 初始化數據

mysql> show create table im\G
*************************** 1. row ***************************
       Table: im
Create Table: CREATE TABLE `im` (
  `c1` int(11) DEFAULT NULL,
  `c2` int(11) DEFAULT NULL,
  `c3` int(11) DEFAULT NULL,
  KEY `c1` (`c1`,`c3`),
  KEY `c2` (`c2`,`c1`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1
1 row in set (0.00 sec)

mysql> show create procedure fill_im1\G
*************************** 1. row ***************************
           Procedure: fill_im1
            sql_mode: NO_ENGINE_SUBSTITUTION
    Create Procedure: CREATE DEFINER=`root`@`127.0.0.1` PROCEDURE `fill_im1`(cnt int)
begin declare i int default 0; repeat insert into im values(100, 50, 100); set i=i+1; until i > cnt end repeat; end
character_set_client: utf8
collation_connection: utf8_general_ci
  Database Collation: latin1_swedish_ci
1 row in set (0.07 sec)

mysql> show create procedure fill_im2\G
*************************** 1. row ***************************
           Procedure: fill_im2
            sql_mode: NO_ENGINE_SUBSTITUTION
    Create Procedure: CREATE DEFINER=`root`@`127.0.0.1` PROCEDURE `fill_im2`(cnt int)
begin declare i int default 0; repeat insert into im values(100, 100, 50); set i=i+1; until i > cnt end repeat; end
character_set_client: utf8
collation_connection: utf8_general_ci
  Database Collation: latin1_swedish_ci
1 row in set (0.00 sec)

mysql> call fill_im1(2000)
mysql> call fill_im2(2000)

mysql> insert into im values(100,50,50);
Query OK, 1 row affected (0.00 sec)
mysql> insert into im values(100,50,50);
Query OK, 1 row affected (0.00 sec)

mysql> commit;
Query OK, 0 rows affected (0.05 sec)

mysql> select * from im where c1=100 and c2 = 50 and c3 = 50\G
*************************** 1. row ***************************
c1: 100
c2: 50
c3: 50
*************************** 2. row ***************************
c1: 100
c2: 50
c3: 50
2 rows in set (0.13 sec)

3 執行計劃

mysql> explain select * from im where c1=100 and c2 = 50 and c3 = 50\G
*************************** 1. row ***************************
           id: 1
  select_type: SIMPLE
        table: im
         type: index_merge
possible_keys: c1,c2
          key: c1,c2
      key_len: 10,10
          ref: NULL
         rows: 1001
        Extra: Using intersect(c1,c2); Using where; Using index
1 row in set (0.00 sec)

4 代碼分析

從生成數據的方法可以看出來，數據是專門針對查詢語句構造的。不管是根據(c1,c3)的索引查詢還是根據(c2,c1)的索引查詢，都會查出一半的數據，即效率接近於全表掃描的一半。但是如果利用兩個索引同時進行過濾，那麼過濾出來的數據就很少了，也就是結果中的兩條。

也就是說如果單獨查詢各個索引，過濾效果不明顯，但是如果聯合兩個索引進行MERGE過濾，那麼效果可能很明顯。這裏所說的過濾，用更專業的詞來說是選擇因子——selectivity。而計劃選擇時的代價計算，便是計算這個選擇因子。如果綜合多個索引後選擇因子很小，從而使索引merge出來的結果集很小的話，那麼計劃就更傾向於Index Merge，反之則不然。

下面是選擇因子計算的代碼：

/**
  Compute the selectivity multiplier that 'scan' contributes when it is
  considered together with the ROR-intersection state held in 'info'.

  The function walks the key parts of 'scan' (the sel_arg->next_key_part
  chain). For every key part it checks whether the part's field is set in
  info->covered_fields. Each time that "covered" state flips between two
  consecutive key parts, it estimates the number of rows matching the key
  prefix that precedes the current part. Every run of uncovered key parts
  then contributes the factor

    rows(prefix at the end of the run) / rows(prefix at the start of the run)

  where the row count for the empty prefix is table->file->stats.records and
  the row count for the full key is table->quick_rows[scan->keynr].

  @param info  intersection state; its covered_fields bitmap and its param
               (table, use_index_statistics) are read
  @param scan  candidate scan; its keynr and sel_arg chain are read

  @return the product of the per-run factors (1.0 when no factor applies).
          NOTE(review): the value is expected to be <= 1.0, but the assert
          checking this is commented out at the end of the function.
*/
static double ror_scan_selectivity(const ROR_INTERSECT_INFO *info, const ROR_SCAN_INFO *scan)
{
  double selectivity_mult= 1.0;
  const TABLE * const table= info->param->table;
  const KEY_PART_INFO * const key_part= table->key_info[scan->keynr].key_part;
  /**
    key values tuple, used to store both min_range.key and
    max_range.key. This function is only called for equality ranges;
    open ranges (e.g. "min_value < X < max_value") cannot be used for
    rowid ordered retrieval, so in this function we know that
    min_range.key == max_range.key
  */
  uchar key_val[MAX_KEY_LENGTH+MAX_FIELD_WIDTH];
  /* Write cursor into key_val; passed by address to store_min() below. */
  uchar *key_ptr= key_val;
  /*
    sel_arg   - key part currently inspected by the loop
    tuple_arg - last key part already stored into key_val; NULL until the
                first covered/uncovered flip is seen. It is kept across
                loop iterations so the tuple is only ever extended.
  */
  SEL_ARG *sel_arg, *tuple_arg= NULL;
  /* One low-order bit per key part stored in key_val so far. */
  key_part_map keypart_map= 0;
  bool cur_covered;
  /* Covered state of the first key part of the index (key_part[0]). */
  bool prev_covered= test(bitmap_is_set(&info->covered_fields,
                                        key_part->fieldnr-1));
  /* Both range ends point at the same buffer (see the key_val comment). */
  key_range min_range;
  key_range max_range;
  min_range.key= key_val;
  min_range.flag= HA_READ_KEY_EXACT;
  max_range.key= key_val;
  max_range.flag= HA_READ_AFTER_KEY;
  /*
    Denominator for the next factor: starts as the table's row count and is
    replaced by a prefix row count on every covered -> uncovered flip.
  */
  ha_rows prev_records= table->file->stats.records;
  DBUG_ENTER("ror_scan_selectivity");

  for (sel_arg= scan->sel_arg; sel_arg;
       sel_arg= sel_arg->next_key_part)
  {
    DBUG_PRINT("info",("sel_arg step"));
    cur_covered= test(bitmap_is_set(&info->covered_fields,
                                    key_part[sel_arg->part].fieldnr-1));
    /* Work is only done where the covered state changes. */
    if (cur_covered != prev_covered)
    {
      /* create (part1val, ..., part{n-1}val) tuple. */
      bool is_null_range= false;
      ha_rows records;
      if (!tuple_arg)
      {
        /* First flip: start the tuple with the first key part of 'scan'. */
        tuple_arg= scan->sel_arg;
        /* Here we use the length of the first key part */
        tuple_arg->store_min(key_part[0].store_length, &key_ptr, 0);
        is_null_range|= tuple_arg->is_null_interval();
        keypart_map= 1;
      }
      /* Extend the tuple up to, but not including, the current key part. */
      while (tuple_arg->next_key_part != sel_arg)
      {
        tuple_arg= tuple_arg->next_key_part;
        tuple_arg->store_min(key_part[tuple_arg->part].store_length,
                             &key_ptr, 0);
        is_null_range|= tuple_arg->is_null_interval();
        keypart_map= (keypart_map << 1) | 1;
      }
      /* Key length is the number of bytes written into key_val so far. */
      min_range.length= max_range.length= (size_t) (key_ptr - key_val);
      min_range.keypart_map= max_range.keypart_map= keypart_map;

      /*
        Get the number of rows in this range. This is done by calling
        records_in_range() unless all these are true:
          1) The user has requested that index statistics should be used
             for equality ranges to avoid the incurred overhead of
             index dives in records_in_range()
          2) The range is not on the form "x IS NULL". The reason is
             that the number of rows with this value are likely to be
             very different than the values in the index statistics
          3) Index statistics is available.
        @see key_val
      */
      if (!info->param->use_index_statistics ||        // (1)
          is_null_range ||                             // (2)
          !(records= table->key_info[scan->keynr].
                     rec_per_key[tuple_arg->part]))    // (3)
      {
        DBUG_EXECUTE_IF("crash_records_in_range", DBUG_SUICIDE(););
        DBUG_ASSERT(min_range.length > 0);
        records= (table->file->
                  records_in_range(scan->keynr, &min_range, &max_range));
      }
      if (cur_covered)
      {
        /*
          uncovered -> covered: an uncovered run just ended, so apply its
          factor (rows for the prefix now / rows when the run started).
        */
        double tmp= rows2double(records)/rows2double(prev_records);
        DBUG_PRINT("info", ("Selectivity multiplier: %g", tmp));
        selectivity_mult *= tmp;
        /* Not read again before the next covered -> uncovered flip sets it. */
        prev_records= HA_POS_ERROR;
      }
      else
      {
        /* covered -> uncovered: remember the row count the run starts from */
        prev_records= records;
      }
    }
    prev_covered= cur_covered;
  }
  /*
    The key ended inside an uncovered run: close it using the row estimate
    stored for the whole scan in table->quick_rows.
  */
  if (!prev_covered)
  {
    double tmp= rows2double(table->quick_rows[scan->keynr]) /
                rows2double(prev_records);
    DBUG_PRINT("info", ("Selectivity multiplier: %g", tmp));
    selectivity_mult *= tmp;
  }
  // Todo: This assert fires in PB sysqa RQG tests.
  // DBUG_ASSERT(selectivity_mult <= 1.0);
  DBUG_PRINT("info", ("Returning multiplier: %g", selectivity_mult));
  DBUG_RETURN(selectivity_mult);
}

剛看到這段代碼時，確實有點犯懵，代碼的註釋給了很大的幫助：

/*
  Get selectivity of adding a ROR scan to the ROR-intersection.

  SYNOPSIS
    ror_scan_selectivity()
      info  ROR-intersection, an intersection of ROR index scans
      scan  ROR scan that may or may not improve the selectivity
            of 'info'

  NOTES
    Suppose we have conditions on several keys
    cond=k_11=c_11 AND k_12=c_12 AND ...  // key_parts of first key in 'info'
         k_21=c_21 AND k_22=c_22 AND ...  // key_parts of second key in 'info'
          ...
         k_n1=c_n1 AND k_n3=c_n3 AND ...  (1) //key_parts of 'scan'

    where k_ij may be the same as any k_pq (i.e. keys may have common parts).

    Note that for ROR retrieval, only equality conditions are usable so there
    are no open ranges (e.g., k_ij > c_ij) in 'scan' or 'info'

    A full row is retrieved if entire condition holds.

    The recursive procedure for finding P(cond) is as follows:

    First step:
    Pick 1st part of 1st key and break conjunction (1) into two parts:
      cond= (k_11=c_11 AND R)

    Here R may still contain condition(s) equivalent to k_11=c_11.
    Nevertheless, the following holds:

      P(k_11=c_11 AND R) = P(k_11=c_11) * P(R | k_11=c_11).

    Mark k_11 as fixed field (and satisfied condition) F, save P(F),
    save R to be cond and proceed to recursion step.

    Recursion step:
    We have a set of (fixed fields/satisfied conditions) F, probability P(F),
    and remaining conjunction R
    Pick next key part on current key and its condition "k_ij=c_ij".
    We will add "k_ij=c_ij" into F and update P(F).
    Lets denote k_ij as t,  R = t AND R1, where R1 may still contain t. Then

     P((t AND R1)|F) = P(t|F) * P(R1|t|F) = P(t|F) * P(R1|(t AND F)) (2)

    (where '|' mean conditional probability, not "or")

    Consider the first multiplier in (2). One of the following holds:
    a) F contains condition on field used in t (i.e. t AND F = F).
      Then P(t|F) = 1

    b) F doesn't contain condition on field used in t. Then F and t are
     considered independent.

     P(t|F) = P(t|(fields_before_t_in_key AND other_fields)) =
          = P(t|fields_before_t_in_key).

     P(t|fields_before_t_in_key) = #records(fields_before_t_in_key, t) /
                                   #records(fields_before_t_in_key)

    The second multiplier is calculated by applying this step recursively.

  IMPLEMENTATION
    This function calculates the result of application of the "recursion step"
    described above for all fixed key members of a single key, accumulating set
    of covered fields, selectivity, etc.

    The calculation is conducted as follows:
    Lets denote #records(keypart1, ... keypartK) as n_k. We need to calculate

     n_{k1}      n_{k2}
    --------- * ---------  * .... (3)
     n_{k1-1}    n_{k2-1}

    where k1,k2,... are key parts which fields were not yet marked as fixed
    ( this is result of application of option b) of the recursion step for
      parts of a single key).
    Since it is reasonable to expect that most of the fields are not marked
    as fixed, we calculate (3) as

                                  n_{i1}      n_{i2}
    (3) = n_{max_key_part}  / (   --------- * ---------  * ....  )
                                  n_{i1-1}    n_{i2-1}

    where i1,i2, .. are key parts that were already marked as fixed.

    In order to minimize number of expensive records_in_range calls we
    group and reduce adjacent fractions. Note that on the optimizer's
    request, index statistics may be used instead of records_in_range
    @see RANGE_OPT_PARAM::use_index_statistics.

  RETURN
    Selectivity of given ROR scan, a number between 0 and 1. 1 means that
    adding 'scan' to the intersection does not improve the selectivity.
*/

註釋想說明的就是選擇因子的概率如何進行計算，其實就是不同INDEX之間有差異的索引列會使選擇因子不斷變小，即Index之間差異性越大，過濾掉的記錄就越多，選擇出來的數據集就會越小。INDEX的差異性就是看索引列是否重複出現在不同索引之間，兩個INDEX越相似，那麼MERGE的結果集越大。具體的實現大家自己看看吧，明白了原理，實現都是浮雲了。

BTW，5.6的Optimizer trace十分好用，對於想要跟蹤Optimizer內部的同學來說，可以先把詳細的計劃生成流程通過Optimizer trace打印出來，對照優化流程，就能更好地定位到代碼。