马里亚德·韦里翁
select version();
version() |
-----------------------------------------+
10.4.24-MariaDB-1:10.4.24+maria~focal-log|
字符串
我有一个像下面这样的表sql。col a意味着版本。col b,c,d,e似乎像数据库,模式,表,字段。
CREATE TABLE tt (a int, b varchar(32), c varchar(64), d varchar(64), e varchar(64), f int)
CREATE INDEX tt_a_IDX USING BTREE ON tt (a,b,c,d,e);
型
表计数为510114
第一个问题是关于在特定版本中获取col e count时2个查询之间的差异。
sql1. select count(DISTINCT c,d,e),b from tt where a = 1 group by b;
sql2. select sum(count), b from (
select b,COUNT(DISTINCT e) as count from tt
where a = 1
GROUP BY b,c,d) tt group by b;
id|select_type|table|type|possible_keys|key |key_len|ref |rows |Extra |
--+-----------+-----+----+-------------+--------+-------+-----+------+------------------------+
1|SIMPLE |tt |ref |tt_a_IDX |tt_a_IDX|5 |const|253768|Using where; Using index|
id|select_type|table |type|possible_keys|key |key_len|ref |rows |Extra |
--+-----------+----------+----+-------------+--------+-------+-----+------+-------------------------------+
1|PRIMARY |<derived2>|ALL | | | | |253768|Using temporary; Using filesort|
2|DERIVED |tt |ref |tt_a_IDX |tt_a_IDX|5 |const|253768|Using where; Using index |
型
1平均花费4s,2仅花费数百ms;
为什么第二个sql更快?这是否意味着count(distinct mutil...)可以被“group by mutil and sum”替换为permance?
第二个问题是当我在group by condition中添加列a时,计划显示一个范围类型select。但实际上,它花费了更多的时间。
sql3. select sum(count), b from (
select b,COUNT(DISTINCT e) as count from tt
where a = 1
GROUP BY a,b,c,d) temp group by b;
id|select_type|table |type |possible_keys|key |key_len|ref|rows |Extra |
--+-----------+----------+-----+-------------+--------+-------+---+------+------------------------------------------------+
1|PRIMARY |<derived2>|ALL | | | | |253768|Using temporary; Using filesort |
2|DERIVED |tt |range|tt_a_IDX |tt_a_IDX|689 | |253768|Using where; Using index for group-by (scanning)|
型
平均成本约为3s;
q2.会发生什么?
由于key_len显示在计划中,我删除了索引,只在col a中创建一个新索引
explain select count(DISTINCT c,d,e),b from tt where a = 1 group by b ;
explain select sum(count), b from (
select b,COUNT(DISTINCT e) as count from tt
where a = 1
GROUP BY b,c,d) temp group by b ;
id|select_type|table|type|possible_keys|key |key_len|ref |rows |Extra |
--+-----------+-----+----+-------------+--------+-------+-----+------+---------------------------+
1|SIMPLE |tt |ref |tt_a_IDX |tt_a_IDX|5 |const|253768|Using where; Using filesort|
id|select_type|table |type|possible_keys|key |key_len|ref |rows |Extra |
--+-----------+----------+----+-------------+--------+-------+-----+------+-------------------------------+
1|PRIMARY |<derived2>|ALL | | | | |253768|Using temporary; Using filesort|
2|DERIVED |tt |ref |tt_a_IDX |tt_a_IDX|5 |const|253768|Using where; Using filesort |
型
在解释中,两者差别不大,但两次查询的平均开销分别为5s和4s
q3.这是否意味着索引(a,B,c,d,e)实际上起作用,而不是像计划显示(key_len)那样?
分析结果:
ANALYZE FORMAT=JSON select count(DISTINCT c,d,e),b
from tt where a = 1 group by b;
{
"query_block": {
"select_id": 1,
"r_loops": 1,
"r_total_time_ms": 3222.2,
"table": {
"table_name": "tt",
"access_type": "ref",
"possible_keys": ["tt_a_IDX"],
"key": "tt_a_IDX",
"key_length": "5",
"used_key_parts": ["a"],
"ref": ["const"],
"r_loops": 1,
"rows": 253768,
"r_rows": 510114,
"r_total_time_ms": 395.73,
"filtered": 100,
"r_filtered": 100,
"attached_condition": "tt.a <=> 1",
"using_index": true
}
}
}
ANALYZE FORMAT=JSON select sum(count), b
from
(
select b, COUNT(DISTINCT e) as count
from tt where a = 1 GROUP BY b, c, d
) temp
group by b;
{
"query_block": {
"select_id": 1,
"r_loops": 1,
"r_total_time_ms": 662.82,
"filesort": {
"sort_key": "tt.b",
"r_loops": 1,
"r_total_time_ms": 0.0086,
"r_limit": 200,
"r_used_priority_queue": false,
"r_output_rows": 16,
"r_buffer_size": "2Kb",
"temporary_table": {
"table": {
"table_name": "<derived2>",
"access_type": "ALL",
"r_loops": 1,
"rows": 253768,
"r_rows": 2652,
"r_total_time_ms": 0.1939,
"filtered": 100,
"r_filtered": 100,
"materialized": {
"query_block": {
"select_id": 2,
"r_loops": 1,
"r_total_time_ms": 661.61,
"table": {
"table_name": "tt",
"access_type": "ref",
"possible_keys": ["tt_a_IDX"],
"key": "tt_a_IDX",
"key_length": "5",
"used_key_parts": ["a"],
"ref": ["const"],
"r_loops": 1,
"rows": 253768,
"r_rows": 510114,
"r_total_time_ms": 245.18,
"filtered": 100,
"r_filtered": 100,
"attached_condition": "tt.a <=> 1",
"using_index": true
}
}
}
}
}
}
}
}
ANALYZE FORMAT=JSON select sum(count), b
from
(
select b, COUNT(DISTINCT e) as count
from tt where a = 1 GROUP BY a,b, c, d
) temp
group by b;
{
"query_block": {
"select_id": 1,
"r_loops": 1,
"r_total_time_ms": 3401.7,
"filesort": {
"sort_key": "tt.b",
"r_loops": 1,
"r_total_time_ms": 0.0093,
"r_limit": 200,
"r_used_priority_queue": false,
"r_output_rows": 16,
"r_buffer_size": "2Kb",
"temporary_table": {
"table": {
"table_name": "<derived2>",
"access_type": "ALL",
"r_loops": 1,
"rows": 253768,
"r_rows": 2652,
"r_total_time_ms": 0.406,
"filtered": 100,
"r_filtered": 100,
"materialized": {
"query_block": {
"select_id": 2,
"r_loops": 1,
"r_total_time_ms": 3400.5,
"table": {
"table_name": "tt",
"access_type": "range",
"possible_keys": ["tt_a_IDX"],
"key": "tt_a_IDX",
"key_length": "689",
"used_key_parts": ["a", "b", "c", "d", "e"],
"r_loops": 1,
"rows": 253768,
"r_rows": 510071,
"r_total_time_ms": 3091.2,
"filtered": 100,
"r_filtered": 100,
"attached_condition": "tt.a = 1",
"using_index_for_group_by": "scanning"
}
}
}
}
}
}
}
}
型
update 1:我尝试从mariadb导入数据到mysql 8。三个sql显示没有明显的区别。
update 2:解释mysql 8中的分析结果
EXPLAIN ANALYZE select count(DISTINCT c,d,e),b
from tt where a = 1 group by b;
-> Group aggregate: count(distinct tt.c,tt.d,tt.e) (cost=71893.42 rows=253768) (actual time=0.882..3453.691 rows=16 loops=1)
-> Covering index lookup on tt using tt_a_IDX (a=1) (cost=46516.62 rows=253768) (actual time=0.030..411.063 rows=510114 loops=1)
EXPLAIN ANALYZE select sum(count), b
from
(
select b, COUNT(DISTINCT e) as count
from tt where a = 1 GROUP BY b, c, d
) temp
group by b;
-> Table scan on <temporary> (actual time=1113.183..1113.185 rows=16 loops=1)
-> Aggregate using temporary table (actual time=1113.182..1113.182 rows=16 loops=1)
-> Table scan on temp (cost=97270.23..100444.82 rows=253768) (actual time=1111.180..1111.563 rows=2652 loops=1)
-> Materialize (cost=97270.22..97270.22 rows=253768) (actual time=1111.177..1111.177 rows=2652 loops=1)
-> Group aggregate: count(distinct tt.e) (cost=71893.42 rows=253768) (actual time=0.153..1109.965 rows=2652 loops=1)
-> Covering index lookup on tt using tt_a_IDX (a=1) (cost=46516.62 rows=253768) (actual time=0.043..398.208 rows=510114 loops=1)
EXPLAIN ANALYZE select sum(count), b
from
(
select b, COUNT(DISTINCT e) as count
from tt where a = 1 GROUP BY a,b, c, d
) temp
group by b;
-> Table scan on <temporary> (actual time=4762.307..4762.309 rows=16 loops=1)
-> Aggregate using temporary table (actual time=4762.306..4762.306 rows=16 loops=1)
-> Table scan on temp (cost=76130.41..79305.00 rows=253768) (actual time=4760.310..4760.708 rows=2652 loops=1)
-> Materialize (cost=76130.40..76130.40 rows=253768) (actual time=4760.308..4760.308 rows=2652 loops=1)
-> Group aggregate: count(distinct tt.e) (cost=50753.60 rows=253768) (actual time=0.388..4757.575 rows=2652 loops=1)
-> Filter: (tt.a = 1) (cost=25376.80 rows=253768) (actual time=0.055..4593.786 rows=510071 loops=1)
-> Covering index skip scan for deduplication on tt using tt_a_IDX over (a = 1) (cost=25376.80 rows=253768) (actual time=0.051..4535.554 rows=510071 loops=1)
型
更新3:数据分发
select count(distinct a) from tt;
count(distinct a)|
-----------------+
1|
mark: there is only one value for col a, which is 1.
select count(distinct a,b) from tt;
count(distinct a,b)|
-------------------+
16|
select count(distinct a,b,c) from tt;
count(distinct a,b,c)|
---------------------+
28|
select count(distinct a,b,c,d) from tt;
count(distinct a,b,c,d)|
-----------------------+
2652|
select count(distinct a,b,c,d,e) from tt;
count(distinct a,b,c,d,e)|
-------------------------+
510071|
select count(distinct a,b,c,d,e,f) from tt;
count(distinct a,b,c,d,e,f)|
---------------------------+
49680|
型
1条答案
按热度按时间wfauudbj1#
q1中查询
COUNT(DISTINCT ..)
的实现维护了遍历索引时遇到的元素的树,并在最后计算结果。当按顺序遍历这样的列表时,只需要考虑与前一个元素的差异。这可以更好地实现,我写了一个功能请求MDEV-32870。一个现有的功能请求MDEV-10922也存在,它适用于一个主键(它可以丢弃任何内部变量并视为
COUNT(*)
)。其他查询设法一次生成更小的集合,因此设法更快。