【GreatSQL 优化器 -02】索引和 Sargable 谓词
- 2024-11-15 福建
本文字数:11967 字
阅读完需:约 39 分钟
【GreatSQL 优化器-02】索引和 Sargable 谓词
一、Sargable 谓词介绍
GreatSQL 的优化器在有过滤条件的时候,需要先把条件按照是否有索引来进行区分,可以用索引来加速查询的条件称为 Sargable,其中 arge 来源于 Search Argument(搜索参数)的首字母拼成的"SARG"。GreatSQL 用keyuse_array
索引数组和 Sargables 数组来储存 Sargable 谓词,其中 Sargable 数组是对keyuse_array
的补充使用,比如``a<b``
条件如果 a 列上建立有索引并且 b 不是常量,a 存入keyuse_array
数组,而这个 b 就存入 Sargable 数组。因此 Sargable 也可以叫做 可索引谓词。
Sargable 谓词在优化器执行make_join_plan
的一开始就通过update_ref_and_keys
提取出来了,用于对keyuse_array
索引数组没有保存到的索引做补充。
生成的keyuse_array
数组在后面的JOIN::optimize_keyuse_array
函数里面会给每个索引的ref_table_rows
变量赋值:一个索引数据对应多少行表数据,这样在后续Optimize_TABLE_order::find_best_ref
计算最佳执行计划表顺序的时候可以根据ref_table_rows
变量估计扫描开销,提取到的keyuse_array
会让表的扫描方式走ref
或者eq_ref
方式扫描(见下表<<join_type 表扫描方式>>),如果没有keyuse_array
的话就是用固定数值进行预估,这样算出来的表的read_cost
是不准确的,表的扫描方式走range
或者scan
方式扫描,这样执行的是全表或索引扫描,这会引起查询的性能下降。
下面用一个简单的例子来说明 Sargable 谓词是什么。
CREATE TABLE t1 (c1 INT PRIMARY KEY, c2 INT,date1 DATETIME);
INSERT INTO t1 VALUES (1,10,'2021-03-25 16:44:00.123456'),(2,1,'2022-03-26 16:44:00.123456'),(3,4,'2023-03-27 16:44:00.123456');
CREATE TABLE t2 (cc1 INT PRIMARY KEY, cc2 INT);
INSERT INTO t2 VALUES (1,3),(2,1),(3,2),(4,3);
CREATE INDEX idx1 ON t1(c2);
CREATE INDEX idx2 ON t1(c2,date1);
CREATE INDEX idx2_1 ON t2(cc2);
SET optimizer_trace = 'enabled=ON' ;
greatsql> SELECT * FROM t1 where c1 in (SELECT cc1 FROM t2) and c1 >c2;
+----+------+---------------------+
| c1 | c2 | date1 |
+----+------+---------------------+
| 2 | 1 | 2022-03-26 16:44:00 |
+----+------+---------------------+
> SELECT * FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
| SELECT * FROM t1 where c1 in (SELECT cc1 FROM t2) and c1 >c2 | {
"steps": [
{// 1、sql语句转换成更快执行的语句
"join_preparatiON": {
"SELECT#": 1,
"steps": [
{
"join_preparatiON": {
"SELECT#": 2,
"steps": [
{
"expanded_query": "/* SELECT#2 */ SELECT `t2`.`cc1` FROM `t2`"
}
]
}
},
{
"expanded_query": "/* SELECT#1 */ SELECT `t1`.`c1` AS `c1`,`t1`.`c2` AS `c2`,`t1`.`date1` AS `date1` FROM `t1` where (`t1`.`c1` in (/* SELECT#2 */ SELECT `t2`.`cc1` FROM `t2`) and (`t1`.`c1` > `t1`.`c2`))"
},
{
"transformatiON": {
"SELECT#": 2,
"FROM": "IN (SELECT)",
"to": "semijoin",
"chosen": true,
"transformatiON_to_semi_join": {
"subquery_predicate": "`t1`.`c1` in (/* SELECT#2 */ SELECT `t2`.`cc1` FROM `t2`)",
"embedded in": "WHERE",
"semi-join cONditiON": "(`t1`.`c1` = `t2`.`cc1`)",
"decorrelated_predicates": [
{
"outer": "`t1`.`c1`",
"inner": "`t2`.`cc1`"
}
]
}
}
},
{
"transformatiONs_to_nested_joins": {
"transformatiONs": [
"semijoin"
],// 这个sql语句被转换成以下最终的语句,可以发现最后以semi join的形式查询的。
"expanded_query": "/* SELECT#1 */ SELECT `t1`.`c1` AS `c1`,`t1`.`c2` AS `c2`,`t1`.`date1` AS `date1` FROM `t1` semi join (`t2`) where ((`t1`.`c1` > `t1`.`c2`) and (`t1`.`c1` = `t2`.`cc1`))"
}
}
]
}
},
{// 2、优化器执行计划生成
"join_optimizatiON": {
"SELECT#": 1,
"steps": [
{
"cONditiON_processing": {
"cONditiON": "WHERE",
"original_cONditiON": "((`t1`.`c1` > `t1`.`c2`) and (`t1`.`c1` = `t2`.`cc1`))",
"steps": [
{
"transformatiON": "equality_propagatiON",
"resulting_cONditiON": "((`t1`.`c1` > `t1`.`c2`) and multiple equal(`t1`.`c1`, `t2`.`cc1`))"
},
{
"transformatiON": "cONstant_propagatiON",
"resulting_cONditiON": "((`t1`.`c1` > `t1`.`c2`) and multiple equal(`t1`.`c1`, `t2`.`cc1`))"
},
{
"transformatiON": "trivial_cONditiON_removal",
"resulting_cONditiON": "((`t1`.`c1` > `t1`.`c2`) and multiple equal(`t1`.`c1`, `t2`.`cc1`))"
}
]
}
},
{
"substitute_generated_columns": {
}
},
{// 表依赖,因为是两张表做join,因此这里显示了2张表
"TABLE_dependencies": [
{
"TABLE": "`t1`",
"row_may_be_null": false,
"map_bit": 0,
"depends_ON_map_bits": [
]
},
{
"TABLE": "`t2`",
"row_may_be_null": false,
"map_bit": 1,
"depends_ON_map_bits": [
]
}
]
},
{
"ref_optimizer_key_uses": [ //这里就是keyuse_array的结果,实际用到了两个索引,c1和cc2的唯一索引
{
"TABLE": "`t1`",
"field": "c1",
"equals": "`t2`.`cc1`",
"null_rejecting": true
},
{
"TABLE": "`t2`",
"field": "cc1",
"equals": "`t1`.`c1`",
"null_rejecting": true
}
]
},
// 通过查看系统表可以查看到keyuse_array信息,但是Sargables数组信息没有显示。
// 这个通过debug代码发现过程中提取出了一个谓词组,就是c1>c2,field值为cc1列,arg_value值为c2列,num_VALUES为所有Sargable谓词数量,这里为1。
// 这个Sargable数组信息在后面函数update_sargable_FROM_cONst补充检查是否可以使用这里面的cc1索引加速查询。
struct SARGABLE_PARAM {
Field *field; // t2的cc1列
Item **arg_value; // t1的c2列
uINT num_VALUES; // 值为1,因为数组只有一个值
};
二、update_ref_and_keys 代码执行过程
update_ref_and_keys
函数里面通过add_key_fields
把查询 sql 的 cond 条件包含的索引信息添加到Key_use_array
,注意只有等于的条件才会通过add_key_field
添加key_field
。cond 条件分为两种:FUNC_ITEM
和COND_ITEM
,其中and_level
用于在merge_key_fields
时候把用不到的key_field
删掉。
实际代码执行过程:
bool JOIN::make_join_plan() {
// Build the key access informatiON, which is the basis for ref access.
//如果有查询条件或者查询语句是outer join的话,需要执行update_ref_and_keys获取所有sargables谓词。
if (where_cond || query_block->outer_join) {
if (update_ref_and_keys(thd, keyuse_array, join_tab, TABLEs, where_cONd,
~query_block->outer_join, query_block, &sargables))
return true;
}
}
// 这里keyuse_array和sargables都是最后生成的结果,即所需的Sargable谓词
static bool update_ref_and_keys(THD *thd, Key_use_array *keyuse_array,
JOIN_TAB *join_tab, uINT TABLEs, Item *cONd,
TABLE_map normal_TABLEs,
Query_block *query_block,
SARGABLE_PARAM **sargables) {
if (cONd) {
if (add_key_fields(thd, join, &end, &and_level, cONd, normal_TABLEs,
sargables))
return true;
}
/* fill keyuse_array with found key parts */
// 把上面找到的带有索引的field加入到keyuse_array数组
for (; field != end; field++) {
if (add_key_part(keyuse_array, field)) return true;//说明见下面
}
if (!keyuse_array->empty()) {
// 对找到的索引按照sort_keyuse_array条件进行比大小排序
std::sTABLE_sort(keyuse_array->begin(), keyuse_array->begin() + keyuse_array->size(),
sort_keyuse);
// 删除不用的索引列以及重复的索引列,这里最后装入的就是t1.c1和t2.cc1两个索引列。
}
}
bool add_key_fields(THD *thd, JOIN *join, Key_field **key_fields,
uINT *and_level, Item *cONd, TABLE_map usable_TABLEs,
SARGABLE_PARAM **sargables) {
List_iterator_fast<Item> li(*((Item_cONd *)cONd)->argument_list());
if (down_cast<Item_cONd *>(cONd)->functype() == Item_func::COND_AND_FUNC) {
//对AND条件的所有参数进行广度和深度遍历,找出涉及的表的列相关索引,and_level不变,也就是说同一层AND连接的Key_field的and_level相同。
while ((item = li++)) {
if (add_key_fields(thd, join, key_fields, and_level, item,
usable_TABLEs, sargables))
return true;
}
} else {
//对OR条件的所有参数进行广度和深度遍历,找出涉及的表的列相关索引,这里多一个merge_key_fields操作,用于对 OR 连接的谓词之间尽可能做 merge 操作
(*and_level)++;
add_key_fields();
merge_key_fields();
}
//按照不同函数的不同SELECT_optimize属性来抽取参数涉及的表列,见下表
auto optimize = cONd_func->SELECT_optimize(thd);
switch (optimize) {
case Item_func::OPTIMIZE_NONE:
break;
case Item_func::OPTIMIZE_KEY:
case Item_func::OPTIMIZE_OP:
case Item_func::OPTIMIZE_NULL:
case Item_func::OPTIMIZE_EQUAL:
}
static Key_field *merge_key_fields(Key_field *start, Key_field *new_fields,
Key_field *end, uint and_level) {
for (; new_fields != end; new_fields++) {
for (Key_field *old = start; old != first_free; old++) {
//如果当前or条件的item_field等于之前已经标记的条件的item_field
1、or条件的值不为常量:改变以下3个值
old->level = and_level;
old->optimize
old->null_rejecting
2、or条件的值等于之前已经标记的条件的值:改变以下3个值
old->level = and_level;
old->optimize
old->null_rejecting
3、or条件的值等于null,并且之前已经标记的条件的值为null并且为常量:改变以下3个值
old->level = and_level;
old->optimize = KEY_OPTIMIZE_REF_OR_NULL;
old->null_rejecting = false;
4、以上都不是,那么可以合并,把之前条件的key_field删除。
}
}
}
函数的 SELECT_optimize 属性见下表。
三、Key_field 举例说明
INSERT INTO t1 VALUES (5,5,'2024-03-25 16:44:00.123456');
INSERT INTO t2 VALUES (5,15);
例子1:SELECT * FROM t1 join t2 ON t1.c1=t2.cc1 and t2.cc1=t1.c1 or t1.c1=5;
转换为:where (((`t1`.`c1` = `) and (`t2`.`cc1` = `t1`.`c1`)) or (`t1`.`c1` = 5))
对于 t1.c1=t2.cc1 and t2.cc1 = t1.c1 or t1.c1=5
可以生成三个 Key_field:
1. Key_field(c1=cc1, and_level=1)
2. Key_field(cc1=c1, and_level=1)
3. Key_field(c1=5, and_level=2)
做合并之前,先赋值field="cc1=c1"
merge_key_fields()函数执行merge由 OR 连接的左右条件:
->对于 c1=cc1:
->判断c1=5是否可以合并
-> c1相等。可以合并
->将 c1=5 的Key_field删除,剩下c1=cc1,注意这里返回的end="cc1=c1"
->对于 cc1=c1:
->判断 c1=5 是否可以合并
->cc1不等于c1,不能合并
->将所有没有被合并的 Key_field 去掉
最终剩下2个 Key_field:
Key_field(c1=cc1, and_level=1, optimize=0, null_rejecting=true)
Key_field(cc1=c1, and_level=1, optimize=0, null_rejecting=true)
最后因为通过merge_key_fields算出来的field==end,因此不加入keyuse_array,注意只有or条件才会执行merge_key_fields。这里条件如果去掉or t1.c1=5这两个key_field就会加入keyuse_array
于是看到如下的trace,这里面的access_type全是scan方式,说明没有用索引提升查询性能。
"considered_execution_plans": [
{
"plan_prefix": [
],
"TABLE": "`t1`",
"best_access_path": {
"cONsidered_access_paths": [
{
"rows_to_scan": 4,
"filtering_effect": [
],
"final_filtering_effect": 1,
"access_type": "scan", 这里t1的扫描方式是索引扫描
"resulting_rows": 4,
"cost": 0.65,
"chosen": true
}
]
},
"rest_of_plan": [
{
"plan_prefix": [
"`t1`"
],
"TABLE": "`t2`",
"best_access_path": {
"cONsidered_access_paths": [
{
"rows_to_scan": 5,
"filtering_effect": [
],
"final_filtering_effect": 1,
"access_type": "scan", 这里t2的扫描方式是索引扫描
"using_join_cache": true,
"buffers_needed": 1,
"resulting_rows": 5,
"cost": 2.25005,
"chosen": true
}
]
},
-- 下面的结果中,type=index,表明选择了索引扫描,跟上面算出来的结论一致
greatsql> EXPLAIN SELECT * FROM t1 join t2 ON t1.c1=t2.cc1 and t2.cc1=t1.c1 or t1.c1=5;
+----+-------------+-------+------------+-------+---------------+--------+---------+------+------+----------+---------------------------------------------------------+
| id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra |
+----+-------------+-------+------------+-------+---------------+--------+---------+------+------+----------+---------------------------------------------------------+
| 1 | SIMPLE | t1 | NULL | index | PRIMARY | idx2 | 11 | NULL | 4 | 100.00 | Using index | 这里选择了索引扫描,跟上面算出来的结论一致
| 1 | SIMPLE | t2 | NULL | index | PRIMARY | idx2_1 | 5 | NULL | 5 | 100.00 | Using where; Using index; Using join buffer (hash join) |
+----+-------------+-------+------------+-------+---------------+--------+---------+------+------+----------+---------------------------------------------------------+
例子2:SELECT * FROM t1 join t2 ON t1.c1=t2.cc1 and t1.c2<5;
where条件被转换为:where ((`t1`.`c1` = `t2`.`cc1`) and (`t1`.`c2` < 5))
t1.c2<5不是等于条件,因此不生成Key_field,最后生成2个 Key_field:
1. Key_field(c1=cc1, and_level=0)
2. Key_field(cc1=c1, and_level=0)
因为没有OR条件因此不需要进行合并操作,最终剩下以上2个 Key_field:
Key_field(c1=cc1, and_level=0, optimize=0, null_rejecting=true)
Key_field(cc1=c1, and_level=0, optimize=0, null_rejecting=true)
这两个Key_field会加入keyuse_array,在后面Optimize_TABLE_order::find_best_ref被用于执行优化,让表的扫描方式为eq_ref,见下面。
{
"ref_optimizer_key_uses": [ 这里用到了2条包含等号的索引列
{
"TABLE": "`t1`",
"field": "c1",
"equals": "`t2`.`cc1`",
"null_rejecting": true
},
{
"TABLE": "`t2`",
"field": "cc1",
"equals": "`t1`.`c1`",
"null_rejecting": true
}
]
},
"cONsidered_executiON_plans": [
{
"plan_prefix": [
],
"TABLE": "`t1`",
"best_access_path": {
"cONsidered_access_paths": [
{
"access_type": "ref", 这种ref扫描不被选择
"INDEX": "PRIMARY",
"usable": false,
"chosen": false
},
{
"rows_to_scan": 3,
"filtering_effect": [
],
"final_filtering_effect": 1,
"access_type": "range", 第一次需要对于t1做范围扫描
"range_details": {
"used_INDEX": "PRIMARY"
},
"resulting_rows": 3,
"cost": 0.860732,
"chosen": true
}
]
"rest_of_plan": [
{
"plan_prefix": [
"`t1`"
],
"TABLE": "`t2`",
"best_access_path": {
"cONsidered_access_paths": [
{
"access_type": "eq_ref", 对于t2.cc1=t1.c1条件,因为t1.c1固定了因此这里t2的扫描方式可以用eq_ref的方式
"INDEX": "PRIMARY",
"rows": 1,
"cost": 1.05,
"chosen": true,
"cause": "clustered_pk_chosen_by_heuristics"
},
{
"access_type": "range", 这种范围扫描不被选择
"range_details": {
"used_INDEX": "PRIMARY"
},
"chosen": false,
"cause": "heuristic_INDEX_cheaper"
}
]
},
"cONditiON_filtering_pct": 100,
"rows_for_plan": 3,
"cost_for_plan": 1.91073,
"chosen": true
}
]
{
"plan_prefix": [
],
"TABLE": "`t2`",
"best_access_path": {
"cONsidered_access_paths": [
{
"access_type": "ref", 这种ref扫描不被选择
"INDEX": "PRIMARY",
"usable": false,
"chosen": false
},
{
"rows_to_scan": 4,
"filtering_effect": [
],
"final_filtering_effect": 1,
"access_type": "range", 对于t2表用到了范围扫描
"range_details": {
"used_INDEX": "PRIMARY"
},
"resulting_rows": 4,
"cost": 1.06082,
"chosen": true
}
]
},
"rest_of_plan": [
{
"plan_prefix": [
"`t2`"
],
"TABLE": "`t1`",
"best_access_path": {
"cONsidered_access_paths": [
{
"access_type": "eq_ref", 对于t1.c1=t2.cc1条件,因为t2.cc1固定了因此这里t1的扫描方式可以用eq_ref的方式
"INDEX": "PRIMARY",
"rows": 1,
"cost": 1.4,
"chosen": true,
"cause": "clustered_pk_chosen_by_heuristics"
},
{
"access_type": "range", 这种范围扫描不被选择
"range_details": {
"used_INDEX": "PRIMARY"
},
"chosen": false,
"cause": "heuristic_INDEX_cheaper"
}
]
},
-- t1表:type=range,这个扫描方式,下一期讲
-- t2表:type=eq_ref方式扫描
greatsql> EXPLAIN SELECT * FROM t1 join t2 ON t1.c1=t2.cc1 and t1.c1<5;
+----+-------------+-------+------------+--------+---------------+---------+---------+-----------+------+----------+-------------+
| id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra |
+----+-------------+-------+------------+--------+---------------+---------+---------+-----------+------+----------+-------------+
| 1 | SIMPLE | t1 | NULL | range | PRIMARY | PRIMARY | 4 | NULL | 3 | 100.00 | Using where |
| 1 | SIMPLE | t2 | NULL | eq_ref | PRIMARY | PRIMARY | 4 | db1.t1.c1 | 1 | 100.00 | NULL |
+----+-------------+-------+------------+--------+---------------+---------+---------+-----------+------+----------+-------------+
从上面的例子发现有唯一索引查询的时候,条件带有 or 的话是无法使用 Sargable 谓词提高效率的。如果是 and 条件的话,只有等于条件才会被判定为 Sargable 谓词。因此做 join 连接的时候尽量避免 or 条件的过滤。
-- 对比上面2个最后生成的执行计划,第一个预估cost=2.90,第二个预估cost=1.91,效率更高
greatsql> EXPLAIN FORMAT=TREE SELECT * FROM t1 join t2 ON t1.c1=t2.cc1 and t2.cc1=t1.c1 or t1.c1=5;
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| EXPLAIN |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| -> Filter: ((t2.cc1 = t1.c1) or (t1.c1 = 5)) (cost=2.90 rows=20)
-> Inner hash join (no cONditiON) (cost=2.90 rows=20)
-> INDEX scan ON t2 using idx2_1 (cost=0.19 rows=5)
-> Hash
-> INDEX scan ON t1 using idx2 (cost=0.65 rows=4)
|
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
greatsql> EXPLAIN FORMAT=TREE SELECT * FROM t1 join t2 ON t1.c1=t2.cc1 and t1.c1<5;
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| EXPLAIN |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| -> Nested loop inner join (cost=1.91 rows=3)
-> Filter: (t1.c1 < 5) (cost=0.86 rows=3)
-> INDEX range scan ON t1 using PRIMARY over (c1 < 5) (cost=0.86 rows=3)
-> Single-row INDEX lookup ON t2 using PRIMARY (cc1=t1.c1) (cost=0.28 rows=1)
|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
附录:join_type 表扫描方式
四、总结
从上面优化器最早的步骤我们认识了 Sargable 谓词的定义和判定方法,如果查询用到了 Sargable 谓词是可以进行 eq_ref 扫描方式的,有效提高了查询效率。通过实际例子发现,在做多表连接的时候用 OR 条件会降低执行效率,同时用唯一索引列作为连接条件的话会提高效率。因此实际写查询 sql 的时候,尽量用唯一索引作为连接条件,少用 OR 条件进行过滤。
GreatSQL
GreatSQL社区 2023-01-31 加入
GreatSQL是由万里数据库维护的MySQL分支,专注于提升MGR可靠性及性能,支持InnoDB并行查询特性,是适用于金融级应用的MySQL分支版本。 社区:https://greatsql.cn/ Gitee: https://gitee.com/GreatSQL/GreatSQL
评论