写点什么

【GreatSQL 优化器 -13】直方图

作者:GreatSQL
  • 2025-02-14
  • 本文字数:5784 字

    阅读完需:约 19 分钟

【GreatSQL 优化器-13】直方图

一、直方图介绍

GreatSQL 的优化器负责将 SQL 查询转换为尽可能高效的执行计划,但因为数据环境不断变化有可能导致优化器对查询数据了解不够充足,可能无法生成最优的执行计划进而影响查询效率,因此推出了直方图(histogram)功能来解决该问题。


直方图用于统计字段值的分布情况,向优化器提供统计信息。利用直方图,可以对一张表的一列数据做分布统计,估算 WHERE 条件中过滤字段的选择率,从而帮助优化器更准确地估计查询过程中的行数,选择更高效的查询计划。


直方图以灵活的 JSON 的格式存储。ANALYZE TABLE会基于表大小自动判断是否要进行取样操作。


ANALYZE TABLE也会基于表中列的数据分布情况以及 bucket 的数量来决定是否要建立等宽直方图(singleton)还是等高直方图(equi-height)。


下面用一个简单的例子来说明直方图是什么。


greatsql> CREATE TABLE t1 (c1 INT PRIMARY KEY, c2 INT,date1 DATETIME);greatsql> INSERT INTO t1 VALUES (1,10,'2021-03-25 16:44:00.123456'),(2,1,'2022-03-26 16:44:00.123456'),(3,4,'2023-03-27 16:44:00.123456'),(5,5,'2024-03-25 16:44:00.123456'),(7,null,'2020-03-25 16:44:00.123456'),(8,10,'2020-10-25 16:44:00.123456'),(11,16,'2023-03-25 16:44:00.123456');greatsql> CREATE TABLE t2 (cc1 INT PRIMARY KEY, cc2 INT);greatsql> INSERT INTO t2 VALUES (1,3),(2,1),(3,2),(4,3),(5,15);greatsql> CREATE TABLE t3 (ccc1 INT, ccc2 varchar(100));greatsql> INSERT INTO t3 VALUES (1,'aa1'),(2,'bb1'),(3,'cc1'),(4,'dd1'),(null,'ee');greatsql> CREATE INDEX idx1 ON t1(c2);greatsql> CREATE INDEX idx2 ON t1(c2,date1);greatsql> CREATE INDEX idx2_1 ON t2(cc2);greatsql> CREATE INDEX idx3_1 ON t3(ccc1);
系统自动创建buckets:greatsql> ANALYZE TABLE t1 UPDATE HISTOGRAM ON c2,date1 WITH 3 BUCKETS;greatsql> SELECT json_pretty(histogram)result from information_schema.column_statistics where table_name = 't1';| { "buckets": [ [ 1, 最小值 5, 最大值 0.42857142857142855, 频率 3 key个数 ], [ 10, 10, 0.7142857142857143, 1 ], [ 16, 16, 0.8571428571428571, 1 ] ], "data-type": "int", "null-values": 0.14285714285714285, "collation-id": 8, "last-updated": "2024-10-22 08:38:48.858099", "sampling-rate": 1.0, "histogram-type": "equi-height", "number-of-buckets-specified": 3} || { "buckets": [ [ "2020-03-25 16:44:00.000000", "2021-03-25 16:44:00.000000", 0.42857142857142855, 3 ], [ "2022-03-26 16:44:00.000000", "2023-03-27 16:44:00.000000", 0.8571428571428571, 3 ], [ "2024-03-25 16:44:00.000000", "2024-03-25 16:44:00.000000", 1.0, 1 ] ], "data-type": "datetime", "null-values": 0.0, "collation-id": 8, "last-updated": "2024-10-22 08:38:48.859681", "sampling-rate": 1.0, "histogram-type": "equi-height", "number-of-buckets-specified": 3} |
用户手动指定buckets:greatsql> ANALYZE TABLE t2 UPDATE HISTOGRAM ON cc2 USING DATA '{"buckets": [[1, 0.25], [2, 0.5], [3, 0.625], [15, 0.75]], "data-type": "int", "null-values": 0.25, "collation-id": 8, "sampling-rate": 1.0, "histogram-type": "singleton", "number-of-buckets-specified": 4}';greatsql> select json_pretty(histogram)result from information_schema.column_statistics where table_name = 't2';| { "buckets": [ [ 1, 值 0.25 值占总数百分比 ], [ 2, 0.5 ], [ 3, 0.625 ], [ 15, 0.75 ] ], "data-type": "int", "null-values": 0.25, "collation-id": 8, "last-updated": "2024-10-23 02:14:04.474196", "sampling-rate": 1.0, "histogram-type": "singleton", "number-of-buckets-specified": 4}
复制代码

二、update_histogram 代码解释

histogram.h/histogram.cc涉及直方图相关调用接口,等高直方图创建在equi_height.cc,等宽直方图创建在singleton.cc


bool update_histogram(THD *thd, Table_ref *table, const columns_set &columns,                      int num_buckets, LEX_STRING data, results_map &results) {  // UPDATE HISTOGRAM指定格式的直方图创建  if (data.str != nullptr) {    // Convert JSON to histogram    histograms::Histogram *histogram = Histogram::json_to_histogram();    // 直方图持久化    histogram->store_histogram(thd);  }  // Read data from the table into the Value_maps we have prepared.  // 根据随机抽样原则,从引擎抽样读取数据存入value_map,value_map结构为{唯一值,个数},抽样率计算见表三  if (fill_value_maps(resolved_fields, sample_percentage, tbl, value_maps))    return true;  // 针对每个指定列创建直方图  for (const Field *field : resolved_fields) {    // 按照下面表一规则创建直方图,把value_map的key值分配到每个桶,分配原则见函数build_histogram    value_maps.at(field->field_index())->build_histogram();  }}
// 等高直方图创建bool Equi_height<T>::build_histogram(const Value_map<T> &value_map, size_t num_buckets) { for (; freq_it != value_map.end(); ++freq_it) { 添加数据到桶的规则: 1、该数据不是key值的最后一条 2、剩余的key值个数>剩余空桶数量 3、添加数据进去不会导致桶大小溢出,因为不知道每个key分别有多少个对应value,这里bucket_max_values用总行数(扣除null值后)进行二分法后用贪婪算法算出来的,一旦发现桶不够用了马上转到下一次二分法重新装数据计算,二分法最多次数10次,因此算出来的桶的高度可能偏大。详细见FindBucketMaxValues函数 if (next != value_map.end() && distinct_values_remaining > empty_buckets_remaining && bucket_values + next->second <= bucket_max_values) { continue; } // 计算数据个数占总数的百分比,※注意这里的总个数包含null值 double cumulative_frequency = cumulative_values / static_cast<double>(total_values); if (m_buckets.push_back(bucket)) return true; }}
// 等宽直方图创建bool Singleton<T>::build_histogram(const Value_map<T> &value_map, size_t num_buckets) { const ha_rows total_count = value_map.get_num_null_values() + num_non_null_values; for (const auto &node : value_map) { cumulative_sum += node.second; // 按照数据个数求占总数据的百分比,※注意这里的总个数包含null值 const double cumulative_frequency = cumulative_sum / static_cast<double>(total_count); m_buckets.push_back(SingletonBucket<T>(node.first, cumulative_frequency)); } }
复制代码


表一:直方图类型创建规则



表二:直方图不支持以下场景



表三:涉及的系统变量



表四:抽样率计算


三、实际例子说明

接下来看几个例子来说明上面的代码。


greatsql> CREATE TABLE t4 (d1 INT, d2 int, d3 varchar(100));greatsql> INSERT INTO t4 VALUES (1,2,'aa1'),(2,1,'bb1'),(2,3,'cc1'),(3,3,'cc1'),(4,2,'ff1'),(4,4,'ert'),(4,2,'f5fg'),(null,2,'ee'),(5,30,'cc1'),(5,4,'fcc1'),(4,10,'cc1'),(6,4,'ccd1'),(null,1,'fee'),(1,2,'aa1'),(2,1,'bb1'),(2,3,'cc1'),(3,3,'cc1'),(4,2,'ff1'),(4,4,'ert'),(4,2,'f5fg'),(null,2,'ee'),(5,30,'cc1'),(5,4,'fcc1'),(4,10,'cc1'),(6,4,'ccd1'),(null,1,'fee'),(1,2,'aa1'),(2,1,'bb1'),(2,3,'cc1'),(3,3,'cc1'),(4,2,'ff1'),(4,4,'ert'),(4,2,'f5fg'),(null,2,'ee'),(5,30,'cc1'),(5,4,'fcc1'),(4,10,'cc1'),(6,4,'ccd1'),(null,1,'fee');
查看数据分布情况,发现d1存在数据倾斜。下面的格式即value_map的格式greatsql> SELECT d1,count(*) FROM t4 GROUP BY d1; +------+----------+| d1 | count(*) |+------+----------+| 1 | 3 || 2 | 6 || 3 | 3 || 4 | 12 || NULL | 6 || 5 | 6 || 6 | 3 |+------+----------+7 rows in set (0.00 sec)
复制代码


首先创建自动等高直方图


-- 先创建等高直方图,t4表扣掉null值一共33行,这里创建出来每个桶最多装12行数据。greatsql> ANALYZE TABLE t4 UPDATE HISTOGRAM ON d1 WITH 3 BUCKETS;greatsql> SELECT json_pretty(histogram)result FROM information_schema.column_statistics WHERE table_name = 't4';| {  "buckets": [    [      1,      3,      0.3076923076923077, 计算公式9/39 ※注意这里的总个数包含null值      3    ],    [      4,      4,      0.6153846153846154,      1    ],    [      5,      6,      0.8461538461538461,      2    ]  ],  "data-type": "int",  "null-values": 0.15384615384615385,  "collation-id": 8,  "last-updated": "2024-10-24 03:15:54.463774",  "sampling-rate": 1.0,  "histogram-type": "equi-height",  "number-of-buckets-specified": 3} |
-- 先创建等高直方图,t4表扣掉null值一共33行,这里创建出来每个桶最多装9行数据。greatsql> ANALYZE TABLE t4 UPDATE HISTOGRAM ON d1 WITH 5 BUCKETS;greatsql> SELECT json_pretty(histogram)result FROM information_schema.column_statistics WHERE table_name = 't4';| { "buckets": [ [ 1, 2, 0.23076923076923078, 2 ], [ 3, 3, 0.3076923076923077, 1 ], [ 4, 4, 0.6153846153846154, 1 ], [ 5, 5, 0.7692307692307693, 1 ], [ 6, 6, 0.8461538461538461, 1 ] ], "data-type": "int", "null-values": 0.15384615384615385, "collation-id": 8, "last-updated": "2024-10-24 06:42:40.102386", "sampling-rate": 1.0, "histogram-type": "equi-height", "number-of-buckets-specified": 5} |
复制代码


等高直方图不同桶装的最多数据个数bucket_max_values



接着创建自动等宽直方图


-- 先创建等高直方图,t4表扣掉null值一共33行,这里创建出来每个桶最多装12行数据。greatsql> ANALYZE TABLE t4 UPDATE HISTOGRAM ON d1 WITH 6 BUCKETS;greatsql> select json_pretty(histogram)result from information_schema.column_statistics where table_name = 't4';| {  "buckets": [    [      1,      0.07692307692307693 计算公式3/39 ※注意这里的总个数包含null值    ],    [      2,      0.23076923076923078    ],    [      3,      0.3076923076923077    ],    [      4,      0.6153846153846154    ],    [      5,      0.7692307692307693    ],    [      6,      0.8461538461538461    ]  ],  "data-type": "int",  "null-values": 0.15384615384615385,  "collation-id": 8,  "last-updated": "2024-10-24 06:53:37.256033",  "sampling-rate": 1.0,  "histogram-type": "singleton",  "number-of-buckets-specified": 6} |
复制代码



下面看一个sampling-rate小于 1 的例子


greatsql> CREATE TABLE t5 (d1 INT, d2 int, d3 varchar(100),d4 varchar(100),d5 varchar(100),d6 varchar(100));greatsql> SET sql_mode=ORACLE;greatsql> DELIMITER $$greatsql> CREATE or replace PROCEDURE p1() asBEGIN  for i in 1 .. 1000 loop    INSERT INTO t5 VALUES (i,rand()*1000,'aaaaaaaaaa'||i,'bbbb'||i,'cccccc'||i,'ddddddd'||i);  end loop;END;$$DELIMITER ;greatsql> call p1();-- 把系统变量histogram_generation_max_mem_size设置为最小值greatsql> SET @@session.histogram_generation_max_mem_size = 1000000;-- 为了让每个样本更大,这里创建多列直方图greatsql> ANALYZE TABLE t5 UPDATE HISTOGRAM ON d2,d3,d4,d5,d6 WITH 5 BUCKETS;greatsql> select json_pretty(histogram)result from information_schema.column_statistics where table_name = 't5';result: {  "buckets": [    [      1,      187,      0.2009987515605493,      119    ],    [      189,      373,      0.4019975031210986,      116    ],    [      374,      577,      0.602996254681648,      117    ],    [      578,      783,      0.8039950062421972,      112    ],    [      784,      998,      1.0,      112    ]  ],  "data-type": "int",  "null-values": 0.0,  "collation-id": 8,  "last-updated": "2024-10-24 07:44:31.520442",  "sampling-rate": 0.8741258741258742, 这里看到抽样率是87%  "histogram-type": "equi-height",  "number-of-buckets-specified": 5}
复制代码

四、总结

从上面直方图创建的步骤我们认识了直方图的类型和创建方法,包括自动和手动两种,以及等宽和等高直方图的区别,学会了查看直方图的桶个数和数据,如果表是一张大表的话,想让样本尽可能多的被抽样,那么系统变量histogram_generation_max_mem_size就设置大一点,这样精确度更高,当然相对的更占硬盘资源。下一节讲直方图的应用。


发布于: 刚刚阅读数: 5
用户头像

GreatSQL

关注

GreatSQL社区 2023-01-31 加入

GreatSQL是由万里数据库维护的MySQL分支,专注于提升MGR可靠性及性能,支持InnoDB并行查询特性,是适用于金融级应用的MySQL分支版本。 社区:https://greatsql.cn/ Gitee: https://gitee.com/GreatSQL/GreatSQL

评论

发布
暂无评论
【GreatSQL优化器-13】直方图_GreatSQL_InfoQ写作社区