oracle,pg,mysql范围越界的选择率估算:
通过以下方式,防止突然变成1,导致笛卡尔积,当然,越界问题容易变成走索引(如果实际查的数据量大,则需要更新统计信息),另外越界的话还是可能估算
成1的,一般都是估算的很少行。
1)oracle按照比例衰减计算
10053有: Using prorated density: 2.2383e-04 of col #7 as selectivity of out-of-range/non-existent value pred
2)pg的等值越界还是按照等值公式计算选择率,不管是不是越界
范围越界,范围查询依赖于直方图和mcv,如果超过直方图{BANNED}最佳小{BANNED}最佳大值且不在mcv理,按照0.0001(0.01%)选择率计算,没有逐步衰减 ,见代码:selfuncs.c
3) mysql范围越界,越界按照0.1%计算,等值和范围一样
没有直方图,则按默认选择率,不管是否越界
索引的按照index dive,索引统计信息计算
ORACLE范围越界研究:
-
drop table t;
-
create table t as select * from dba_objects;
-
-
-- oracle按条件查询,范围越界按比例衰减计算,防止估算为1导致笛卡尔积等,衰减比例一般很小,那么就偏向于走索引,如果实际行数多,还得更新统计信息
-
-- 这个和分区表收集的行数是0的不同,那个直接估算为1
-
--2019-05-30 3:10:14 2025-03-10 8:36:19 无直方图
-
select COLUMN_NAME,NUM_DISTINCT,
-
RAW_TO_DATE(LOW_VALUE),
-
RAW_TO_DATE(HIGH_VALUE),
-
histogram
-
from dba_tab_col_statistics where table_name='T' and column_name='CREATED' AND OWNER=USER;
-
-
SELECT * FROM T WHERE CREATED > TO_DATE('2025-3-5','YYYY-MM-DD');
-
Elapsed: 00:00:00.11
-
-
Execution Plan
-
----------------------------------------------------------
-
Plan hash value: 1601196873
-
-
--------------------------------------------------------------------------
-
| Id | Operation | Name | Rows | Bytes | Cost (%CPU)| Time |
-
--------------------------------------------------------------------------
-
| 0 | SELECT STATEMENT | | 186 | 24552 | 399 (1)| 00:00:01 |
-
|* 1 | TABLE ACCESS FULL| T | 186 | 24552 | 399 (1)| 00:00:01 |
-
--------------------------------------------------------------------------
-
-
Predicate Information (identified by operation id):
-
---------------------------------------------------
-
-
1 - filter("CREATED">TO_DATE(' 2025-03-05 00:00:00', 'syyyy-mm-dd
-
hh24:mi:ss'))
-
-
-
SELECT * FROM T WHERE CREATED > TO_DATE('2025-3-10','YYYY-MM-DD');
-
Elapsed: 00:00:00.01
-
-
Execution Plan
-
----------------------------------------------------------
-
Plan hash value: 1601196873
-
-
--------------------------------------------------------------------------
-
| Id | Operation | Name | Rows | Bytes | Cost (%CPU)| Time |
-
--------------------------------------------------------------------------
-
| 0 | SELECT STATEMENT | | 43 | 5676 | 399 (1)| 00:00:01 |
-
|* 1 | TABLE ACCESS FULL| T | 43 | 5676 | 399 (1)| 00:00:01 |
-
--------------------------------------------------------------------------
-
-
Predicate Information (identified by operation id):
-
---------------------------------------------------
-
-
1 - filter("CREATED">TO_DATE(' 2025-03-10 00:00:00', 'syyyy-mm-dd
-
hh24:mi:ss'))
-
-
-- 越界,不会立马rows=1,按照逐步衰减计算
-
SELECT * FROM T WHERE CREATED > TO_DATE('2025-3-11','YYYY-MM-DD');
-
Elapsed: 00:00:00.01
-
-
Execution Plan
-
----------------------------------------------------------
-
Plan hash value: 1601196873
-
-
--------------------------------------------------------------------------
-
| Id | Operation | Name | Rows | Bytes | Cost (%CPU)| Time |
-
--------------------------------------------------------------------------
-
| 0 | SELECT STATEMENT | | 43 | 5676 | 399 (1)| 00:00:01 |
-
|* 1 | TABLE ACCESS FULL| T | 43 | 5676 | 399 (1)| 00:00:01 |
-
--------------------------------------------------------------------------
-
-
Predicate Information (identified by operation id):
-
---------------------------------------------------
-
-
1 - filter("CREATED">TO_DATE(' 2025-03-11 00:00:00', 'syyyy-mm-dd
-
hh24:mi:ss'))
-
-
--
-
SELECT * FROM T WHERE CREATED > TO_DATE('2025-3-20','YYYY-MM-DD');
-
-
Elapsed: 00:00:00.01
-
-
Execution Plan
-
----------------------------------------------------------
-
Plan hash value: 1601196873
-
-
--------------------------------------------------------------------------
-
| Id | Operation | Name | Rows | Bytes | Cost (%CPU)| Time |
-
--------------------------------------------------------------------------
-
| 0 | SELECT STATEMENT | | 42 | 5544 | 399 (1)| 00:00:01 |
-
|* 1 | TABLE ACCESS FULL| T | 42 | 5544 | 399 (1)| 00:00:01 |
-
--------------------------------------------------------------------------
-
-
Predicate Information (identified by operation id):
-
---------------------------------------------------
-
-
1 - filter("CREATED">TO_DATE(' 2025-03-20 00:00:00', 'syyyy-mm-dd
-
hh24:mi:ss'))
-
-
exec dbms_stats.gather_table_stats(ownname => user,tabname =>'t',method_opt=>'for columns created size skewonly',no_invalidate => false);
-
-
--还是按比例衰减计算,这样可以防止笛卡尔积
-
10053有: Using prorated density: 2.2383e-04 of col #7 as selectivity of out-of-range/non-existent value pred
-
-
SELECT * FROM T WHERE CREATED > TO_DATE('2026-9-20','YYYY-MM-DD');
-
-
-
Elapsed: 00:00:00.01
-
-
Execution Plan
-
----------------------------------------------------------
-
Plan hash value: 1601196873
-
-
--------------------------------------------------------------------------
-
| Id | Operation | Name | Rows | Bytes | Cost (%CPU)| Time |
-
--------------------------------------------------------------------------
-
| 0 | SELECT STATEMENT | | 16 | 2112 | 399 (1)| 00:00:01 |
-
|* 1 | TABLE ACCESS FULL| T | 16 | 2112 | 399 (1)| 00:00:01 |
-
--------------------------------------------------------------------------
-
-
Predicate Information (identified by operation id):
-
---------------------------------------------------
-
-
1 - filter("CREATED">TO_DATE(' 2026-09-20 00:00:00', 'syyyy-mm-dd
-
hh24:mi:ss'))
MySQL范围越界研究:
-
-- mysql范围越界,越界按照0.1%计算,等值和范围一样
-
-- 没有直方图,则按默认选择率,不管是否越界
-
explain select * from emp1 where birth_date>date'2025-1-1';
-
+----+-------------+-------+------------+------+---------------+------+---------+------+--------+----------+-------------+
-
| id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra |
-
+----+-------------+-------+------------+------+---------------+------+---------+------+--------+----------+-------------+
-
| 1 | SIMPLE | emp1 | NULL | ALL | NULL | NULL | NULL | NULL | 299478 | 33.33 | Using where |
-
+----+-------------+-------+------------+------+---------------+------+---------+------+--------+----------+-------------+
-
1 row in set, 1 warning (0.00 sec)
-
-
--收集直方图,越界按照0.1%计算
-
analyze table emp1 update histogram on birth_date;
-
explain select * from emp1 where birth_date>date'2025-1-1';
-
+----+-------------+-------+------------+------+---------------+------+---------+------+--------+----------+-------------+
-
| id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra |
-
+----+-------------+-------+------------+------+---------------+------+---------+------+--------+----------+-------------+
-
| 1 | SIMPLE | emp1 | NULL | ALL | NULL | NULL | NULL | NULL | 299478 | 0.10 | Using where |
-
+----+-------------+-------+------------+------+---------------+------+---------+------+--------+----------+-------------+
-
1 row in set, 1 warning (0.01 sec)
-
-
explain select * from emp1 where birth_date=date'2025-1-1';
-
+----+-------------+-------+------------+------+---------------+------+---------+------+--------+----------+-------------+
-
| id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra |
-
+----+-------------+-------+------------+------+---------------+------+---------+------+--------+----------+-------------+
-
| 1 | SIMPLE | emp1 | NULL | ALL | NULL | NULL | NULL | NULL | 299478 | 0.10 | Using where |
-
+----+-------------+-------+------------+------+------------
PG范围越界研究:
--pg里没有考虑等值越界,等值选择率计算依赖于mvc和n_distinct,在mcv里按照mcv,不在mcv里,按照去掉mcv和null_frac计算平均选择率,所以越界的可能不准
--范围越界的按照选择率0.0001计算
--如果导致执行计划问题,可能需要hints,一般按照不在mcv里的计算,等值条件返回的应该值比较小。
--所以虽然where id = 1200000越界了,但是计算公式还是和上面的id=101一样,按照不在mcv里的计算,估算返回行数=8
-
explain
-
select * from mcv_test mt
-
where id = 1200000;
-
QUERY PLAN
-
-------------------------------------------------------------
-
Seq Scan on mcv_test mt (cost=0.00..152.50 rows=8 width=8)
-
Filter: (id = 1200000)
-
(2 rows)
-
-
select * from pg_stats ps where ps.tablename='mcv_test' and ps.attname='id';
-
--100到1000,当然,这个也可以直接看出来
-
select min(hist_bounds),max(hist_bounds)
-
from pg_stats ps,unnest(histogram_bounds::text::int[]) as tmp(hist_bounds)
-
where ps.tablename='mcv_test' and ps.attname='id';
-
-
-
--范围越界,范围查询依赖于直方图和mcv,如果超过直方图{BANNED}最佳小{BANNED}最佳大值且不在mcv理,按照0.0001(0.01%)选择率计算,没有逐步衰减 ,见代码:selfuncs.c
-
-
--如果表很大,有的值没有采样到,也不在直方图和mcv范围内,说明直方图100不足,可以增加列statistics或者更新统计信息,特别是日期列
-
alter table mcv_test alter id set statistics 300; --这个是按照n*300随机采样行,一般100不足,设300到500也够了
-
-
-
---pg,设1000个值,100个则都在mcv里,越界按照0.01%计算,等值的还是按照等值公式计算
-
--无直方图,只按照MCV的,范围越界则按照1计算,等值按照1/num_distinct计算
-
--有直方图,越界按照0.0001计算选择率,像oracle会按照逐步衰减按比例计算,pg和mysql没有
-
create table ddate1 as
-
WITH recursive t(n,n2,dt,mon) AS (
-
SELECT 1, 2,'2023-6-1' ::timestamp,'2023-6-1'::timestamp
-
UNION ALL
-
SELECT t.n+1
-
,t.n2+2
-
,mon + INTERVAL '1 day'
-
,mon + INTERVAL '1 MONTH'
-
FROM t
-
WHERE t.n<1000
-
)
-
SELECT * FROM T;
-
-
--多执行几次,这个在MCV里
-
insert into ddate1 select * from ddate1;
-
-
analyze ddate1;
-
-
--32000行
-
select * from pg_class where relname='ddate1';
-
select * from pg_stats where tablename='ddate1' and attname='dt';
-
-
--1000
-
select n_distinct from pg_stats where tablename='ddate1' and attname='dt';
-
-
select count(*),max(dt),min(dt)
-
from (
-
select unnest(histogram_bounds::text::timestamp[]) dt from pg_stats where tablename='ddate1' and attname='dt'
-
);
-
count | max | min
-
-------+---------------------+---------------------
-
101 | 2106-08-02 00:00:00 | 2023-06-01 00:00:00
-
(1 row)
-
-
-
select min(dt),max(dt),count(*) from ddate1;
-
min | max | count
-
---------------------+---------------------+-------
-
2023-06-01 00:00:00 | 2106-08-02 00:00:00 | 32000
-
-
--pg的等值选择率在MCV按MCV估算,不在按照ndv估算,不考虑越界问题
-
explain select * from ddate1 where dt = '2023-6-1';
-
QUERY PLAN
-
---------------------------------------------------------------------
-
Seq Scan on ddate1 (cost=0.00..605.00 rows=32 width=24)
-
Filter: (dt = '2023-06-01 00:00:00'::timestamp without time zone)
-
(2 rows)
-
-
--范围按照直方图考虑越界,越界后选择率为默认的0.0001,比如下面的越界,估算行数=32000*0.0001=3,
-
--见代码:selfuncs.c
-
explain select * from ddate1 where dt < '2023-6-1';
-
QUERY PLAN
-
---------------------------------------------------------------------
-
Seq Scan on ddate1 (cost=0.00..605.00 rows=3 width=24)
-
Filter: (dt < '2023-06-01 00:00:00'::timestamp without time zone)
-
(2 rows)
-
-
-
explain select * from ddate1 where dt >= '2106-12-3';
-
QUERY PLAN
-
----------------------------------------------------------------------
-
Seq Scan on ddate1 (cost=0.00..605.00 rows=3 width=24)
-
Filter: (dt >= '2106-12-03 00:00:00'::timestamp without time zone)
-
(2 rows)