【GreatSQL 优化器 -18】GROUP_INDEX_SKIP_SCAN

2025-03-26
福建
本文字数：12352 字
阅读完需：约 41 分钟

【GreatSQL 优化器-18】GROUP_INDEX_SKIP_SCAN

一、GROUP_INDEX_SKIP_SCAN 介绍

GreatSQL 优化器的分组索引跳跃扫描（GROUP Index Skip Scan）是一种优化查询的技术，尤其在联合索引中用于减少扫描的无效行数。group by 操作在没有合适的索引可用的时候，通常先扫描整个表提取数据并创建一个临时表，然后按照 group by 指定的列进行排序。在这个临时表里面，对于每一个 group 的数据行来说是连续在一起的。完成排序之后，就可以发现所有的 groups，并可以执行聚集函数（aggregate function）。可以看到，在没有使用索引的时候，需要创建临时表和排序。在执行计划中通常可以看到“Using temporary; Using filesort”

GROUP 索引跳跃扫描利用的是联合索引中非首列（非最左前缀）的索引列，来提高查询效率。例如，假如有个查询需求，需要查某个列的 distinct 值，或者 group by 之后的值 MIN()/MAX() 值，最简单的方式是扫描整个数据页，然后分组排序后，取 DISTINCT/MIN/MAX 值，但由于索引本身就有序并且完成了 group by 工作，如果可以直接借助于这个索引的有序性，那么扫描整个索引就可以避免二次排序的开销。这个功能支持带有聚合函数+GROUP BY 或者聚合函数+DISTINCT 或者 DISTINCT 的 SQL 使用。

这个功能类似于 INDEX_SKIP_SCAN，做一次对相同的 key 值进行 skip 动作，即可以跳过了索引上相同的段，这样相比较与索引扫描而言，减少了很多的索引扫描，索引稀疏性越好，性能就会相对更好。

下面用一个简单的例子来说明 GROUP_INDEX_SKIP_SCAN 怎么应用在优化器。

CREATE TABLE t1(c1 INT, c2 INT, c3 INT, c4 INT);CREATE UNIQUE INDEX i1_t1 ON t1(c1,c2,c3);INSERT INTO t1 VALUES (1,1,1,1), (1,1,2,2), (1,3,3,3), (1,4,4,4), (1,5,5,5),                      (2,1,1,1), (2,2,2,2), (2,3,3,3), (2,4,4,4), (2,5,5,5);INSERT INTO t1 SELECT c1, c2, c3+5, c4+10  FROM t1;INSERT INTO t1 SELECT c1, c2, c3+10, c4+20 FROM t1;INSERT INTO t1 SELECT c1, c2, c3+20, c4+40 FROM t1;INSERT INTO t1 SELECT c1, c2, c3+40, c4+80 FROM t1;ANALYZE TABLE t1;
-- 下面的结果用到了group index skip scan，扫描方式是RANGE SCAN。greatsql> explain SELECT c1, MIN(c2) FROM t1 GROUP BY c1;+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+--------------------------+| id | select_type | table | partitions | type  | possible_keys | key   | key_len | ref  | rows | filtered | Extra                    |+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+--------------------------+|  1 | SIMPLE      | t1    | NULL       | range | i1_t1         | i1_t1 | 10      | NULL |    3 |   100.00 | Using index for group-by |+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+--------------------------+
/* 运行过程：1、先找到第一列的唯一值，得到结果为1和2SELECT distinct c1 FROM t1; 2、根据第一列的分组结果在分组内对c2执行范围扫描SELECT c1, MIN(c2) FROM t1 WHERE c1 = 1;SELECT c1, MIN(c2) FROM t1 WHERE c1 = 2;*/
-- 如果没有联合索引，那么这条sql内部要创建临时表用于数据的临时处理，扫描方式是全表扫描。可见用GROUP_INDEX_SKIP_SCAN方式执行groupby可以节省空间和开销，提升执行效率。
greatsql> DROP INDEX i1_t1 ON t1;greatsql> explain SELECT c1, MIN(c2) FROM t1 GROUP BY c1;+----+-------------+-------+------------+------+---------------+------+---------+------+------+----------+-----------------+| id | select_type | table | partitions | type | possible_keys | key  | key_len | ref  | rows | filtered | Extra           |+----+-------------+-------+------------+------+---------------+------+---------+------+------+----------+-----------------+|  1 | SIMPLE      | t1    | NULL       | ALL  | NULL          | NULL | NULL    | NULL |  160 |   100.00 | Using temporary |+----+-------------+-------+------------+------+---------------+------+---------+------+------+----------+-----------------+

复制代码

二、get_best_group_min_max 代码解释

group index skip scan是根据条件和聚合函数列以及 distinct 列，以及查找对应的联合索引进行判定能不能用group index skip scan，其中联合索引中聚合函数列或者 distinct 列之前的列为 prefix 列，即用来作为分组的依据的列，如果有多个除了第一列之外的范围列，那么选取遇到的第一个聚合函数列之前的列为前缀进行分组。

group index skip scan功能没有相关可以开启关闭的关键词，默认满足条件就可以使用，因此如果不想使用就要改变条件或者 group by 分组，只要不满足条件就不会使用联合索引。

int test_quick_select() {  // 先判断GROUP_INDEX_SKIP_SCAN是否满足条件，不满足的话执行索引合并。  AccessPath *group_path = get_best_group_min_max();  // 不满足GROUP_INDEX_SKIP_SCAN执行索引合并，相关知识看之前知识介绍}
AccessPath *get_best_group_min_max() {  // 遍历表所有索引，找到联合索引  for (uint cur_param_idx = 0; cur_param_idx < param->keys; ++cur_param_idx) {    // 处理group by语句    if (!join->group_list.empty()) {      // 找到联合索引内的列字段相关mm tree      if (tree) cur_tree = get_index_range_tree(cur_index, tree, param);      // 该条件是否是等号条件或者IS NULL条件          is_eq_range_pred = !(range->min_flag & is_open_range) &&                             !(range->max_flag & is_open_range) &&                             ((range->maybe_null() && range->min_value[0] &&                               range->max_value[0]) ||                              memcmp(range->min_value, range->max_value,                                     cur_part->store_length) == 0);    }    // 处理distinct语句    if ((join->group_list.empty() && join->select_distinct) ||        is_agg_distinct) {      检查distinct后面的列字段，必须包含联合索引前缀列或者所有列    }    // 获取第一个不在GROUP BY的索引字段    first_non_group_part =        (cur_group_key_parts < actual_key_parts(cur_index_info))            ? cur_index_info->key_part + cur_group_key_parts            : nullptr;    // 获取聚合函数的索引字段    first_non_infix_part = cur_min_max_arg_part                               ? (cur_min_max_arg_part < last_part)                                     ? cur_min_max_arg_part                                     : nullptr                               : nullptr;    // 接下来是对sql语句的各种检查，确认能否用group index skip scan，代码略，具体见表一
    // 获取聚合函数列与第一个非GROUP BY列之间的差值    key_infix_parts = cur_key_infix_len                          ? (uint)(first_non_infix_part - first_non_group_part)                          : 0;    // 获取GROUP BY列与聚合函数列之间的列个数    cur_used_key_parts = cur_group_key_parts + key_infix_parts;    // 计算range范围内行数    if (tree) {      cur_quick_prefix_records = check_quick_select();    }    // 计算rows和cost    cost_group_min_max();  }  // 最后生成AccessPath  AccessPath *path = new (param->return_mem_root) AccessPath;  path->type = AccessPath::GROUP_INDEX_SKIP_SCAN;}
// 计算rows和cost，这里主要展示的是rows的计算公式static void cost_group_min_max() {  table_records = table->file->stats.records;  keys_per_block = (table->file->stats.block_size / 2 /                        (index_info->key_length + table->file->ref_length) +                    1);  num_blocks = (uint)(table_records / keys_per_block) + 1;  if (index_info->has_records_per_key(group_key_parts - 1))    // Use index statistics    keys_per_group = index_info->records_per_key(group_key_parts - 1);  else    /* If there is no statistics try to guess */    keys_per_group = guess_rec_per_key(table, index_info, group_key_parts);
  if (single_group)    num_groups = 1;  else {    num_groups = (uint)(table_records / keys_per_group) + 1;    if (range_tree && (quick_prefix_records != HA_POS_ERROR)) {      quick_prefix_selectivity =          (double)quick_prefix_records / (double)table_records;      num_groups = (uint)rint(num_groups * quick_prefix_selectivity);      num_groups = std::max(num_groups, 1U);    }  }}
最后具体实现过程见代码GroupIndexSkipScanIterator::Read()，这里不再展开赘述。

复制代码

表一：不能使用 GROUP_INDEX_SKIP_SCAN 的场合

三、实际例子说明

接下来看几个例子来说明上面的代码。

greatsql> explain SELECT c1,c2,MIN(c3) FROM t1 where c1>1 and c2=1  GROUP BY c1;+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+---------------------------------------+| id | select_type | table | partitions | type  | possible_keys | key   | key_len | ref  | rows | filtered | Extra                                 |+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+---------------------------------------+|  1 | SIMPLE      | t1    | NULL       | range | i1_t1         | i1_t1 | 15      | NULL |    2 |   100.00 | Using where; Using index for group-by |+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+---------------------------------------+                 "group_index_range": {                    "potential_group_range_indexes": [                      {                        "index": "i1_t1",                        "covering": true,                        "index_dives_for_eq_ranges": true,                        "ranges": [                          "1 < c1"                        ],                        "rows": 2, 用group index skip scan扫描满足条件c1>1 and c2=1的一共2条数据，计算见下面                        "cost": 0.55                      }                    ]                  },                  "best_group_range_summary": {                    "type": "index_group",                    "index": "i1_t1",                    "group_attribute": "c3",                    "min_aggregate": true,                    "max_aggregate": false,                    "distinct_aggregate": false,                    "rows": 2,                    "cost": 0.55,                    "key_parts_used_for_access": [                      "c1",                      "c2",                      "c3"                    ],                    "ranges": [                      "1 < c1"                    ],                    "chosen": true                  },                  "skip_scan_range": {                    "chosen": false,                    "cause": "has_group_by"                  },                  "analyzing_range_alternatives": {                    "range_scan_alternatives": [                      {                        "index": "i1_t1",                        "ranges": [                          "1 < c1"                        ],                        "index_dives_for_eq_ranges": true,                        "rowid_ordered": false,                        "using_mrr": false,                        "index_only": true,                        "in_memory": 1,                        "rows": 80, 单独用索引i1_t1满足1 < c1条件的有80条                        "cost": 8.31051,                        "chosen": false,                        "cause": "cost"                      }                    ],                    "analyzing_roworder_intersect": {                      "usable": false,                      "cause": "too_few_roworder_scans"                    }                  },                  "chosen_range_access_summary": {                    "range_access_plan": {                      "type": "index_group",                      "index": "i1_t1",                      "group_attribute": "c3",                      "min_aggregate": true,                      "max_aggregate": false,                      "distinct_aggregate": false,                      "rows": 2,                      "cost": 0.55,                      "key_parts_used_for_access": [                        "c1",                        "c2",                        "c3"                      ],                      "ranges": [                        "1 < c1"                      ]                    },                    "rows_for_plan": 2,                    "cost_for_plan": 0.55,                    "chosen": true                  }                }
/* 计算方式：table_records = 160keys_per_block = 391num_blocks = (table_records / keys_per_block) + 1 = 160 / 391 + 1 = 1keys_per_group = 80 -- group by字段分组后每组的行数num_groups = (table_records / keys_per_group) + 1 = 160 / 80 + 1 = 3quick_prefix_selectivity = quick_prefix_records / table_records = 80 / 160 = 0.5 -- 计算前缀选择系数num_groups = num_groups * quick_prefix_selectivity = 3 * 0.5 = 2 */

复制代码

下面看一下 distinct 的例子：

greatsql> explain SELECT distinct c1,c2 from t1;+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+--------------------------+| id | select_type | table | partitions | type  | possible_keys | key   | key_len | ref  | rows | filtered | Extra                    |+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+--------------------------+|  1 | SIMPLE      | t1    | NULL       | range | i1_t1         | i1_t1 | 10      | NULL |   10 |   100.00 | Using index for group-by |+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+--------------------------+                 "group_index_range": {                    "distinct_query": true, -- 这里说明用到了distinct关键字                    "potential_group_range_indexes": [                      {                        "index": "i1_t1",                        "covering": true,                        "rows": 10,                        "cost": 1.75                      }                    ]                  },                  "best_group_range_summary": {                    "type": "index_group",                    "index": "i1_t1",                    "group_attribute": null,                    "min_aggregate": false,                    "max_aggregate": false,                    "distinct_aggregate": false,                    "rows": 10,                    "cost": 1.75,                    "key_parts_used_for_access": [                      "c1",                      "c2"                    ],                    "ranges": [                    ],                    "chosen": true                  },
-- 当然也支持聚合函数+distinct+group by的组合。greatsql> explain SELECT c1,sum(distinct c2) from t1 group by c1;+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+--------------------------+| id | select_type | table | partitions | type  | possible_keys | key   | key_len | ref  | rows | filtered | Extra                    |+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+--------------------------+|  1 | SIMPLE      | t1    | NULL       | range | i1_t1         | i1_t1 | 10      | NULL |   10 |   100.00 | Using index for group-by |+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+--------------------------+

复制代码

下面例子都是对上面表格的例子：

-- 查询条件既包含MIN/MAX列又包含非MIN/MAX列greatsql> explain SELECT c1, MIN(c2) FROM t1 where c3>1 or c2<6 GROUP BY c1;+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+--------------------------+| id | select_type | table | partitions | type  | possible_keys | key   | key_len | ref  | rows | filtered | Extra                    |+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+--------------------------+|  1 | SIMPLE      | t1    | NULL       | index | i1_t1         | i1_t1 | 15      | NULL |  160 |    55.55 | Using where; Using index |+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+--------------------------+                  "best_covering_index_scan": {                    "index": "i1_t1",                    "cost": 17.4066,                    "chosen": true                  },                  "setup_range_conditions": [                  ],                  "range_scan_possible": false,                  "cause": "condition_always_true",                  "group_index_range": {                    "chosen": false,                    "cause": "minmax_keypart_in_disjunctive_query"                  },                  "skip_scan_range": {                    "chosen": false,                    "cause": "has_group_by"                  }
-- group by字段是联合索引的第一列greatsql> explain SELECT c2, MIN(c1) FROM t1 GROUP BY c2 ;+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+------------------------------+| id | select_type | table | partitions | type  | possible_keys | key   | key_len | ref  | rows | filtered | Extra                        |+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+------------------------------+|  1 | SIMPLE      | t1    | NULL       | index | i1_t1         | i1_t1 | 15      | NULL |  160 |   100.00 | Using index; Using temporary |+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+------------------------------+                  "group_index_range": {                    "potential_group_range_indexes": [                      {                        "index": "i1_t1",                        "covering": true,                        "usable": false,                        "cause": "group_attribute_not_prefix_in_index"                      }                    ]                  }
-- 带有聚合函数的group by语句且没有条件，聚合函数列大于group by的列greatsql> explain SELECT c1, MIN(c3) FROM t1 GROUP BY c1;+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+-------------+| id | select_type | table | partitions | type  | possible_keys | key   | key_len | ref  | rows | filtered | Extra       |+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+-------------+|  1 | SIMPLE      | t1    | NULL       | index | i1_t1         | i1_t1 | 15      | NULL |  160 |   100.00 | Using index |+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+-------------+                  "group_index_range": {                    "potential_group_range_indexes": [                      {                        "index": "i1_t1",                        "covering": true,                        "usable": false,                        "cause": "no_nongroup_keypart_predicate"                      }                    ]                  },
-- 带有聚合函数的group by语句的sql涉及聚合函数之后的列greatsql> explain SELECT c1,MIN(c2) FROM t1 where c3=2 GROUP BY c1;+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+--------------------------+| id | select_type | table | partitions | type  | possible_keys | key   | key_len | ref  | rows | filtered | Extra                    |+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+--------------------------+|  1 | SIMPLE      | t1    | NULL       | index | i1_t1         | i1_t1 | 15      | NULL |  160 |    10.00 | Using where; Using index |+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+--------------------------+                  "group_index_range": {                    "potential_group_range_indexes": [                      {                        "index": "i1_t1",                        "covering": true,                        "usable": false,                        "cause": "keypart_after_infix_in_query"                      }                    ]                  },
-- 带有聚合函数的group by语句的条件不是聚合函数列之前一个列的等号条件greatsql> explain SELECT c1,c2,MIN(c3) FROM t1 where c3=2 GROUP BY c1;+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+--------------------------+| id | select_type | table | partitions | type  | possible_keys | key   | key_len | ref  | rows | filtered | Extra                    |+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+--------------------------+|  1 | SIMPLE      | t1    | NULL       | index | i1_t1         | i1_t1 | 15      | NULL |  160 |    10.00 | Using where; Using index |+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+--------------------------+                  "group_index_range": {                    "potential_group_range_indexes": [                      {                        "index": "i1_t1",                        "covering": true,                        "usable": false,                        "cause": "non_equality_gap_attribute"                      }                    ]                  },                  -- 聚合函数列小于等于group by字段greatsql> explain SELECT c1,c2,MIN(c2) FROM t1 where c1=2 and c2=2  GROUP BY c1,c2;+----+-------------+-------+------------+------+---------------+-------+---------+-------------+------+----------+-------------+| id | select_type | table | partitions | type | possible_keys | key   | key_len | ref         | rows | filtered | Extra       |+----+-------------+-------+------------+------+---------------+-------+---------+-------------+------+----------+-------------+|  1 | SIMPLE      | t1    | NULL       | ref  | i1_t1         | i1_t1 | 10      | const,const |   16 |   100.00 | Using index |+----+-------------+-------+------------+------+---------------+-------+---------+-------------+------+----------+-------------+                  "group_index_range": {                    "potential_group_range_indexes": [                      {                        "index": "i1_t1",                        "covering": true,                        "usable": false,                        "cause": "aggregate_column_not_suffix_in_idx"                      }                    ]                  },
-- 带有聚合函数的group by语句且没有条件，聚合函数列大于group by的列greatsql> explain SELECT c1,c2,MIN(c3) FROM t1  GROUP BY c1;+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+-------------+| id | select_type | table | partitions | type  | possible_keys | key   | key_len | ref  | rows | filtered | Extra       |+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+-------------+|  1 | SIMPLE      | t1    | NULL       | index | i1_t1         | i1_t1 | 15      | NULL |  160 |   100.00 | Using index |+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+-------------+                  "group_index_range": {                    "potential_group_range_indexes": [                      {                        "index": "i1_t1",                        "covering": true,                        "usable": false,                        "cause": "no_nongroup_keypart_predicate"                      }                    ]                  },
-- 没有聚合函数但是有group by语句的条件带有非select和group by的条件列greatsql> explain SELECT c1,c2 FROM t1 where c1=1 or c3=1 GROUP BY c1,c2;+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+--------------------------+| id | select_type | table | partitions | type  | possible_keys | key   | key_len | ref  | rows | filtered | Extra                    |+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+--------------------------+|  1 | SIMPLE      | t1    | NULL       | index | i1_t1         | i1_t1 | 15      | NULL |  160 |    55.00 | Using where; Using index |+----+-------------+-------+------------+-------+---------------+-------+---------+------+------+----------+--------------------------+                  "group_index_range": {                    "potential_group_range_indexes": [                      {                        "index": "i1_t1",                        "covering": true,                        "usable": false,                        "cause": "keypart_reference_from_where_clause"                      }                    ]                  },

复制代码

四、总结

从上面group index skip iscan的代码我们了解了组别跳跃扫描的使用条件和使用场景，以及使用规则，这个功能让一些涉及分组和聚合函数执行的时候直接读联合索引就可以很快得到分组数据，避免了无效列的读取，提高了效率，但 sql 使用限制比较多，因此创建联合索引和使用的时候要注意 sql 语句与联合索引的匹配才能用到这个功能。

发布于: 刚刚阅读数: 3

原文链接:【http://xie.infoq.cn/article/ccb3c4553445b7e51ec56e219】。文章转载请联系作者。

GreatSQL

关注

GreatSQL社区 2023-01-31 加入

GreatSQL是由万里数据库维护的MySQL分支，专注于提升MGR可靠性及性能，支持InnoDB并行查询特性，是适用于金融级应用的MySQL分支版本。社区：https://greatsql.cn/ Gitee: https://gitee.com/GreatSQL/GreatSQL

发布

暂无评论

创作场景

【GreatSQL 优化器 -18】GROUP_INDEX_SKIP_SCAN

【GreatSQL 优化器-18】GROUP_INDEX_SKIP_SCAN

一、GROUP_INDEX_SKIP_SCAN 介绍

二、get_best_group_min_max 代码解释

三、实际例子说明

四、总结

GreatSQL

评论