);
ID列被作为优化随机查询的选择列.唯一,有索引,非空是比较好的选择.
测试记录:
insert into tbl_user select generate_series(0,1000000),'Huang','Kyle','bsmart',28;
随机查询方法举例:
方法 1. 最简单的随机查询,查询出1条记录。
kyle=# select * from tbl_user order by random() limit 1;
id | firstname | lastname | corp | age
-------+-----------+----------+--------+-----
18205 | Huang | Kyle | bsmart | 28
(1 row)
Time: 595.469 ms
执行计划:
kyle=# explain select * from tbl_user order by random() limit 1;
QUERY PLAN
-------------------------------------------------------------------------------
Limit (cost=24853.02..24853.02 rows=1 width=28)
-> Sort (cost=24853.02..27353.02 rows=1000001 width=28)
Sort Key: (random())
-> Seq Scan on tbl_user (cost=0.00..19853.01 rows=1000001 width=28)
(4 rows)
Time: 22.317 ms
简单是简单,缺点也很明显,这种随机查询在大表上跑数据库肯定是吃不消的。
以下列举利用索引列进行优化的方法。
方法 2. 随机取出n条记录,以下取出5条随机记录
kyle=# SELECT * from tbl_user
kyle-# where id in
kyle-# (select floor(random()*(max_id-min_id))::int
kyle(# + min_id
kyle(# from generate_series(1,5),
kyle(# (select max(id) as max_id,
kyle(# min(id) as min_id
kyle(# from tbl_user) s1
kyle(# )
kyle-# limit 5;
id | firstname | lastname | corp | age
--------+-----------+----------+--------+-----
451661 | Huang | Kyle | bsmart | 28
99977 | Huang | Kyle | bsmart | 28
197446 | Huang | Kyle | bsmart | 28
268740 | Huang | Kyle | bsmart | 28
854540 | Huang | Kyle | bsmart | 28
(5 rows)
Time: 1.599 ms
执行计划:
kyle=# explain SELECT * from tbl_user
(select floor(random()*(max_id-min_id))::int
+ min_id
from generate_series(1,5),
(select max(id) as max_id,
min(id) as min_id
from tbl_user) s1
)
limit 5;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------
Limit (cost=50.08..50.09 rows=5 width=28)
-> Nested Loop (cost=50.08..832.26 rows=500000 width=28)
-> HashAggregate (cost=50.08..52.08 rows=200 width=8)
-> Nested Loop (cost=0.06..37.58 rows=1000 width=16)
-> Result (cost=0.06..0.07 rows=1 width=0)
InitPlan 1 (returns $0)
-> Limit (cost=0.00..0.03 rows=1 width=8)
-> Index Scan Backward using tbl_user_pkey on tbl_user (cost=0.00..27844.32 rows=1000001 width=8)
Index Cond: (id IS NOT NULL)
InitPlan 2 (returns $1)
-> Limit (cost=0.00..0.03 rows=1 width=8)
-> Index Scan using tbl_user_pkey on tbl_user (cost=0.00..27844.32 rows=1000001 width=8)
Index Cond: (id IS NOT NULL)
-> Function Scan on generate_series (cost=0.00..10.00 rows=1000 width=0)
-> Index Scan using tbl_user_pkey on tbl_user (cost=0.00..3.89 rows=1 width=28)
Index Cond: (public.tbl_user.id = (((floor((random() * ((($0) - ($1)))::double precision)))::integer + ($1))))
(16 rows)
Time: 1.789 ms
方法 3. 取出N条连续的随机记录.(此处用到函数)
kyle=# create or replace function f_get_random (i_range int) returns setof record as $BODY$
declare
kyle$# declare
kyle$# v_result record;
kyle$# v_max_id int;
kyle$# v_min_id int;
kyle$# v_random numeric;
kyle$# begin
kyle$# select random() into v_random;
kyle$# select max(id),min(id) into v_max_id,v_min_id from tbl_user;
kyle$# for v_result in select * from tbl_user where id between (v_min_id+(v_random*(v_max_id-v_min_id))::int) and (v_min_id+(v_random*(v_max_id-v_min_id))::int+i_range)
kyle$# loop
kyle$# return next v_result;
kyle$# end loop;
kyle$# return;
kyle$# end
kyle$# $BODY$ language plpgsql;
CREATE FUNCTION
Time: 86.076 ms
以下举例取出10条连续的随机记录
kyle=# select * from f_get_random(9) as (id bigint,firstname varchar(32),lastname varchar(32),corp varchar(32),age smallint);
id | firstname | lastname | corp | age
--------+-----------+----------+--------+-----
716349 | Huang | Kyle | bsmart | 28
716350 | Huang | Kyle | bsmart | 28
716351 | Huang | Kyle | bsmart | 28
716352 | Huang | Kyle | bsmart | 28
716353 | Huang | Kyle | bsmart | 28
716354 | Huang | Kyle | bsmart | 28
716355 | Huang | Kyle | bsmart | 28
716356 | Huang | Kyle | bsmart | 28
716357 | Huang | Kyle | bsmart | 28
716358 | Huang | Kyle | bsmart | 28
(10 rows)
Time: 7.023 ms
执行计划:
kyle=# explain select * from tbl_user where id between 694686 and 694695;
QUERY PLAN
-------------------------------------------------------------------------------
Index Scan using tbl_user_pkey on tbl_user (cost=0.00..4.48 rows=9 width=28)
Index Cond: ((id >= 694686) AND (id <= 694695))
(2 rows)
Time: 0.841 ms
kyle=# explain select max(id),min(id) from tbl_user;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------
Result (cost=0.06..0.07 rows=1 width=0)
InitPlan 1 (returns $0)
-> Limit (cost=0.00..0.03 rows=1 width=8)
-> Index Scan Backward using tbl_user_pkey on tbl_user (cost=0.00..27844.32 rows=1000001 width=8)
Index Cond: (id IS NOT NULL)
InitPlan 2 (returns $1)
-> Limit (cost=0.00..0.03 rows=1 width=8)
-> Index Scan using tbl_user_pkey on tbl_user (cost=0.00..27844.32 rows=1000001 width=8)
Index Cond: (id IS NOT NULL)
(9 rows)
Time: 0.971 ms
其他的方法不再一一列举,方法2和方法3可以满足大多数的需求了。开销和运行时间均比方法1下降1000倍以上.
注意事项:
1. 索引列的类型和查询条件的类型必须匹配.
2. random() 取值范围 0.0 到 1.0
3. id between x and y 的写法等同于 id>= x and id<=y .
id BETWEEN SYMMETRIC x and y 的写法等同于 (id >= x and id <= y) or (id >= y and id <= x)
因此两者的执行计划是完全不一样的,如下:
kyle=# explain select * from tbl_user where id BETWEEN SYMMETRIC 3 and 2;
QUERY PLAN
----------------------------------------------------------------------------------
Bitmap Heap Scan on tbl_user (cost=4.63..6.65 rows=1 width=28)
Recheck Cond: (((id >= 3) AND (id <= 2)) OR ((id >= 2) AND (id <= 3)))
-> BitmapOr (cost=4.63..4.63 rows=1 width=0)
-> Bitmap Index Scan on tbl_user_pkey (cost=0.00..2.31 rows=1 width=0)
Index Cond: ((id >= 3) AND (id <= 2))
-> Bitmap Index Scan on tbl_user_pkey (cost=0.00..2.31 rows=1 width=0)
Index Cond: ((id >= 2) AND (id <= 3))
(7 rows)
Time: 1.180 ms
explain select * from tbl_user where id BETWEEN 2 and 3;
QUERY PLAN
-------------------------------------------------------------------------------
Index Scan using tbl_user_pkey on tbl_user (cost=0.00..4.32 rows=1 width=28)
Index Cond: ((id >= 2) AND (id <= 3))
(2 rows)
Time: 0.931 ms