PostgreSQL

Optimizer

Scenario

CREATE TABLE a (id int); // one trillion rows
CREATE TABLE b (id int); // two trillion rows
CREATE TABLE c (id int); // five trillion rows

CREATE INDEX idx_a ON a (id);
CREATE INDEX idx_b ON b (id);
CREATE INDEX idx_c ON c (id);
SELECT * FROM a, b WHERE a.id = b.id AND b.id = 14

Query

Nest Loop

for (x in table1) {
  for (y in table2) {
    if (x.id === y.id) return true;
    
    return false;
  }
}

O(n^2)

Heavy memory footprint

Hash Join

(Hash small table key, reduce memory footprint)

Merge Join

Sorted Index Required

1

2

5

8

9

11

14

20

3

5

7

9

14

25

30

40

O(n * log(n))

EXPLAIN SELECT * FROM a, b WHERE a.id = b.id AND b.id = 14

Optimize

SELECT * FROM a WHERE a.id = 14
SELECT * FROM b WHERE b.id = 14
    ---------------- Query Plan ----------------
Nested Loop (cost=.......)
  -> Index Only Scan using idx_a on a (cost=.......)
          Index Cond: (id = 14)
  -> Index Only Scan using idx_b on b (cost=.......)
          Index Cond: (id = 14)

Constant

EXPLAIN SELECT * FROM a WHERE id = 5 + 8;

    ---------------- Query Plan ----------------
Index Only Scan using idx_a on a (cost=.......)
    Index Cond: (id = 13)
EXPLAIN SELECT * FROM a WHERE id + 5 = 13;

    ---------------- Query Plan ----------------
Seq Scan on a (cost=.......)
    Filter: ((id + 5) = 13)

But

Functions

CREATE OR REPLACE FUNCTION log2(int)
RETURNS numeric AS
$$
  SELECT log(2, $1);
$$
LANGUAGE 'sql' IMMUTABLE;
CREATE FUNCTION
SELECT log2(1024);
       
       log2
---------------------
10.0000000000000000
(1 row)
CREATE INDEX idx_log2 ON a (log2(id));
EXPLAIN SELECT * FROM a WHERE log2(id) = 10;

    ---------------- Query Plan ----------------
Index Scan using idx_log2 on a (cost=.......)
	Index Cond: (log('2'::numeric, (id)::numeric) = '10'::numeric)
EXPLAIN SELECT * FROM a WHERE log(2, id) = 10;

    ---------------- Query Plan ----------------
Index Scan using idx_log2 on a (cost=.......)
	Index Cond: (log('2'::numeric, (id)::numeric) = '10'::numeric)

Also works on

Cut Joins

CREATE TABLE x (id int, PRIMARY KEY (id));
CREATE TABLE y (id int, PRIMARY KEY (id));
EXPLAIN SELECT * FROM x LEFT JOIN y ON (x.id = y.id) WHERE x.id = 12;

    ---------------- Query Plan ----------------
Nested Loop Left Join (cost...)
	Join Filter: (x.id = y.id)
    -> Index Only Scan using x_pkey on x (cost...)
    	Index Cond: (id = 12)
    -> Index Only Scan using y_pkey on y (cost...)
    	Index Cond: (id = 12)

Reduced

EXPLAIN SELECT x.* FROM x LEFT JOIN y ON (x.id = y.id) WHERE x.id = 12;

    ---------------- Query Plan ----------------
Nested Loop Left Join (cost...)
	Join Filter: (x.id = y.id)
    -> Index Only Scan using x_pkey on x (cost...)
    	Index Cond: (id = 12)

Union

EXPLAIN SELECT * 
FROM (
  SELECT id AS xid FROM a
  UNION ALL
  SELECT id FROM b
) AS union_tbl
WHERE xid = 4;

    ---------------- Query Plan ----------------
Append (cost...)
  -> Index Only Scan using idx_a on a (cost...)
    	Index Cond: (id = 4)
  -> Index Only Scan using idx_b on b (cost...)
    	Index Cond: (id = 4)

Union

EXPLAIN SELECT * 
FROM (
  SELECT id AS xid FROM a
  UNION
  SELECT id FROM b
) AS union_tbl
WHERE xid = 4;

  	---------------- Query Plan ----------------
Unique (cost...)
  -> Sort (cost...)
  	 Sort Key: a.id
	 -> Append (cost...)
		 -> Index Only Scan using idx_a on a (cost...)
			 Index Cond: (id = 4)
		 -> Index Only Scan using idx_b on b (cost...)
			 Index Cond: (id = 4)

Find Issues

Analyze

EXPLAIN (analyze, verbose, costs, timing, buffers)
SELECT * FROM a ORDER BY random();

--------------------- Query Plan ---------------------
Sort (cost=2935...2955)
	(actual time=4.364..16.162, rows...)
  Output: id, (random())
  Sort Key: (random())
  Sort Method: quicksort Memory: 913kB
  Buffers: shared hit=49
  -> Seq Scan on public.a
  		(cost=0.00..182.00 rows...)
        (actual time=0.014..3.42 rows...)
        Output: id, random()
        Buffers: shared hit=49
Planning time: 0.041 ms
Execution time: 8.235 ms

Seq scan: 3.42 ms

Sort: 12.742(16.162-3.42) ms

Made with Slides.com