Created at: 2024-12-02
Sometimes it sounds like a good idea to denormalise a couple of columns so that an index lookup can be performed faster.
But is it actually faster?
Imagine the following table:
CREATE TABLE normalised (
    id SERIAL PRIMARY KEY,
    foo varchar,
    bar varchar
);
CREATE INDEX idx_foo_bar ON normalised (foo, bar);
And its denormalised version, where foo and bar are stored in foobar as foo:bar.
CREATE TABLE denormalised (
    id SERIAL PRIMARY KEY,
    foo varchar,
    bar varchar,
    foobar varchar
);
CREATE INDEX idx_foobar ON denormalised (foobar);
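As an aside that is not part of the benchmark below: if you do go down the denormalised route on PostgreSQL 12 or newer, one way to keep foobar from drifting out of sync with foo and bar is a stored generated column. A minimal sketch, using a hypothetical table name:

-- Hypothetical variant: Postgres maintains the combined value itself on INSERT/UPDATE.
CREATE TABLE denormalised_generated (
    id SERIAL PRIMARY KEY,
    foo varchar,
    bar varchar,
    foobar varchar GENERATED ALWAYS AS (foo || ':' || bar) STORED
);
CREATE INDEX idx_foobar_generated ON denormalised_generated (foobar);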
The normalised version is actually faster for queries like:
SELECT foo, bar FROM normalised WHERE foo = '{foo}' AND bar = '{bar}';
compared to the denormalised version:
SELECT foobar FROM denormalised WHERE foobar = '{foo}:{bar}';
The normalised version reads fewer buffers to perform the lookup and is slightly faster.
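If you want to check the buffer numbers yourself without running the full benchmark below, a single EXPLAIN with the BUFFERS option is enough. A minimal sketch, with placeholder values:

-- Compare the "Buffers: shared hit=... read=..." lines of the two plans.
EXPLAIN (ANALYZE, BUFFERS)
SELECT foo, bar FROM normalised WHERE foo = '42' AND bar = '42';

EXPLAIN (ANALYZE, BUFFERS)
SELECT foobar FROM denormalised WHERE foobar = '42:42';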
I built a benchmark script to test this scenario against tables with 10 million rows each. Besides the exact-match lookups above, it also benchmarks queries that filter on foo alone, which the denormalised table has to answer with a prefix LIKE:
import time
import re
import random
import psycopg2
# Change these before running locally.
DB_NAME = "composed_idx"
USER = "marcelo.fernandes"
PASSWORD = ""
HOST = "localhost"
PORT = 5415
NUM_OF_ROWS = 10_000_000
INSERT_ROWS_PER_BATCH = 1_000_000
NUM_OF_QUERIES = 30_000
# Uncomment these on local dev for tests to run fast.
# NUM_OF_ROWS = 10_000
# INSERT_ROWS_PER_BATCH = 1000
# NUM_OF_QUERIES = 1
def get_cursor_and_connection():
    conn = psycopg2.connect(
        dbname=DB_NAME,
        user=USER,
        password=PASSWORD,
        host=HOST,
        port=PORT,
    )
    conn.autocommit = True
    return conn.cursor(), conn
def create_normalised_table(cursor):
    print("- Creating normalised table...")
    cursor.execute("""
        -- Idempotency for convenience.
        DROP TABLE IF EXISTS normalised;
        CREATE TABLE normalised (
            id SERIAL PRIMARY KEY,
            foo varchar,
            bar varchar
        );
        -- Disable autovacuum to not interfere with results.
        ALTER TABLE normalised SET (autovacuum_enabled = false);
        CREATE INDEX idx_foo_bar ON normalised (foo, bar);
    """)
def create_denormalised_table(cursor):
    print("- Creating denormalised table...")
    cursor.execute("""
        -- Idempotency for convenience.
        DROP TABLE IF EXISTS denormalised;
        CREATE TABLE denormalised (
            id SERIAL PRIMARY KEY,
            foo varchar,
            bar varchar,
            foobar varchar UNIQUE
        );
        -- Disable autovacuum to not interfere with results.
        ALTER TABLE denormalised SET (autovacuum_enabled = false);
        CREATE INDEX idx_foobar ON denormalised (foobar);
    """)
def vacuum_tables(cursor):
    print("- Vacuuming normalised table...")
    start_time = time.time()
    cursor.execute("VACUUM ANALYZE normalised;")
    duration = time.time() - start_time
    print(f"- Vacuum (normalised) took: {duration:.2f} seconds.")
    print("- Vacuuming denormalised table...")
    start_time = time.time()
    cursor.execute("VACUUM ANALYZE denormalised;")
    duration = time.time() - start_time
    print(f"- Vacuum (denormalised) took: {duration:.2f} seconds.")
def populate_normalised_table(cursor):
    print("- Populating (normalised) table...")
    start_time = time.time()
    for batch in range(int(NUM_OF_ROWS / INSERT_ROWS_PER_BATCH)):
        rows = []
        for i in range(INSERT_ROWS_PER_BATCH):
            val = i + (batch * INSERT_ROWS_PER_BATCH)
            rows.append(f"('{val}', '{val}')")
        values = ", ".join(rows)
        cursor.execute(f"INSERT INTO normalised (foo, bar) VALUES {values};")
    duration = time.time() - start_time
    print(f"- Populating (normalised) took: {duration:.2f} seconds.")
def populate_denormalised_table(cursor):
    print("- Populating (denormalised) table...")
    start_time = time.time()
    for batch in range(int(NUM_OF_ROWS / INSERT_ROWS_PER_BATCH)):
        rows = []
        for i in range(INSERT_ROWS_PER_BATCH):
            val = i + (batch * INSERT_ROWS_PER_BATCH)
            rows.append(f"('{val}', '{val}', '{val}:{val}')")
        values = ", ".join(rows)
        cursor.execute(f"INSERT INTO denormalised (foo, bar, foobar) VALUES {values};")
    duration = time.time() - start_time
    print(f"- Populating (denormalised) took: {duration:.2f} seconds.")
def benchmark_normalised_queries(cursor, values):
    print(f"- Benchmarking {NUM_OF_QUERIES:,} queries against normalised table...")
    explain_outputs = []
    for i in range(NUM_OF_QUERIES):
        foo = bar = values[i]
        query = f"SELECT foo, bar FROM normalised where foo='{foo}' and bar ='{bar}';"
        cursor.execute(f"EXPLAIN (ANALYSE, BUFFERS, costs off, summary off) {query}")
        explain_outputs.append(cursor.fetchall())
    print_explain_info(explain_outputs)
def benchmark_denormalised_queries(cursor, values):
    print(f"- Benchmarking {NUM_OF_QUERIES:,} queries against denormalised table...")
    explain_outputs = []
    for i in range(NUM_OF_QUERIES):
        foo = bar = values[i]
        query = f"SELECT foobar FROM denormalised where foobar='{foo}:{bar}';"
        cursor.execute(f"EXPLAIN (ANALYSE, BUFFERS, costs off, summary off) {query}")
        explain_outputs.append(cursor.fetchall())
    print_explain_info(explain_outputs)
def benchmark_normalised_queries_foo_only(cursor, values):
    print(f"- Benchmarking {NUM_OF_QUERIES:,} foo-only queries against normalised table...")
    explain_outputs = []
    for i in range(NUM_OF_QUERIES):
        foo = bar = values[i]
        query = f"SELECT foo, bar FROM normalised where foo='{foo}';"
        cursor.execute(f"EXPLAIN (ANALYSE, BUFFERS, costs off, summary off) {query}")
        explain_outputs.append(cursor.fetchall())
    print_explain_info(explain_outputs)
def benchmark_denormalised_queries_foo_only(cursor, values):
    print(f"- Benchmarking {NUM_OF_QUERIES:,} foo-only queries against denormalised table...")
    explain_outputs = []
    for i in range(NUM_OF_QUERIES):
        foo = bar = values[i]
        query = f"SELECT foobar FROM denormalised where foobar LIKE '{foo}:%';"
        cursor.execute(f"EXPLAIN (ANALYSE, BUFFERS, costs off, summary off) {query}")
        explain_outputs.append(cursor.fetchall())
    print_explain_info(explain_outputs)
def print_explain_info(explain_outputs):
    shared_hit = 0
    shared_read = 0
    shared_written = 0
    total_timing = (0, 0)
    for explain_output in explain_outputs:
        found_buffers = False
        for line in explain_output:
            line = line[0]
            # The Buffers line of the top plan node is indented with two spaces.
            if line.startswith("  Buffers:") and found_buffers is False:
                found_buffers = True
                # Look for buffer stats in the output using regular expressions.
                match = re.search(r"shared hit=(\d+)", line)
                if match:
                    shared_hit += int(match.group(1))
                match = re.search(r"read=(\d+)", line)
                if match:
                    shared_read += int(match.group(1))
                match = re.search(r"written=(\d+)", line)
                if match:
                    shared_written += int(match.group(1))
        timing = explain_output[0][0].split("actual time=")[1].split(" ")[0].split("..")
        total_timing = (total_timing[0] + float(timing[0]), total_timing[1] + float(timing[1]))
    size = len(explain_outputs)
    print(f" - average timing = {total_timing[0]/size}..{total_timing[1]/size}")
    print(f" - average buffers hit {shared_hit/size}")
    print(f" - average buffers read {shared_read/size}")
    print(f" - average buffers written {shared_written/size}")
def run_tests():
    print(
        f"\nReport details:\n"
        f" - rows in each table: {NUM_OF_ROWS:,}\n"
        f" - number of queries to benchmark: {NUM_OF_QUERIES:,}\n"
    )
    cursor, conn = get_cursor_and_connection()
    create_normalised_table(cursor)
    populate_normalised_table(cursor)
    create_denormalised_table(cursor)
    populate_denormalised_table(cursor)
    vacuum_tables(cursor)
    # randint is inclusive on both ends, so cap at NUM_OF_ROWS - 1 to only
    # pick values that actually exist in the tables.
    query_values = [random.randint(0, NUM_OF_ROWS - 1) for _ in range(NUM_OF_QUERIES)]
    for i in range(2):
        # For a fair measurement, we need to run a second time to see how
        # caching would affect the results.
        benchmark_normalised_queries(cursor, query_values)
        benchmark_denormalised_queries(cursor, query_values)
        benchmark_normalised_queries_foo_only(cursor, query_values)
        benchmark_denormalised_queries_foo_only(cursor, query_values)
    cursor.close()
    conn.close()


if __name__ == "__main__":
    run_tests()
Here is the resulting report:
Report details:
- rows in each table: 10,000,000
- number of queries to benchmark: 30,000
- Creating normalised table...
- Populating (normalised) table...
- Populating (normalised) took: 136.66 seconds.
- Creating denormalised table...
- Populating (denormalised) table...
- Populating (denormalised) took: 256.72 seconds.
- Vacuuming normalised table...
- Vacuum (normalised) took: 1.37 seconds.
- Vacuuming denormalised table...
- Vacuum (denormalised) took: 1.61 seconds.
- Benchmarking 30,000 queries against normalised table...
- average timing = 0.08297973333333454..0.08315866666666841
- average buffers hit 3.1326666666666667
- average buffers read 0.8673333333333333
- average buffers written 0.3596
- Benchmarking 30,000 queries against denormalised table...
- average timing = 0.09937203333333515..0.09953386666666716
- average buffers hit 4.157966666666667
- average buffers read 0.8420333333333333
- average buffers written 0.03643333333333333
- Benchmarking 30,000 foo-only queries against normalised table...
- average timing = 0.031196766666643467..0.03136039999997724
- average buffers hit 3.1750333333333334
- average buffers read 0.8249666666666666
- average buffers written 0.0
- Benchmarking 30,000 foo-only queries against denormalised table...
- average timing = 373.48803086666567..448.2494357000009
- average buffers hit 16117.526433333333
- average buffers read 66226.47356666667
- average buffers written 0.0
- Benchmarking 30,000 queries against normalised table...
- average timing = 0.04304369999998034..0.04320683333331343
- average buffers hit 3.1661
- average buffers read 0.8339
- average buffers written 0.0
- Benchmarking 30,000 queries against denormalised table...
- average timing = 0.08817753333332631..0.08833266666665922
- average buffers hit 4.155433333333334
- average buffers read 0.8445666666666667
- average buffers written 0.0
- Benchmarking 30,000 foo-only queries against normalised table...
- average timing = 0.018202833333331735..0.018360799999997398
- average buffers hit 3.1735333333333333
- average buffers read 0.8264666666666667
- average buffers written 0.0
- Benchmarking 30,000 foo-only queries against denormalised table...
- average timing = 350.0887793666639..421.00194770000036
- average buffers hit 16121.111933333334
- average buffers read 66222.88806666667
- average buffers written 0.0
As you can see, the normalised version performs better in every case. The difference is small for the exact-match lookups, but enormous for the foo-only queries: the composite index can answer a prefix lookup on foo alone, while the LIKE '{foo}:%' pattern on the denormalised table cannot use a plain B-tree index (that would require text_pattern_ops or the C collation) and ends up scanning the whole table, which is where the tens of thousands of buffer reads per query come from.
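Note that the script also declares foobar as UNIQUE, so the denormalised table carries two indexes on that column; the extra index maintenance is likely part of why it takes almost twice as long to populate. If you want to dig into where the extra buffer access per exact-match lookup comes from, comparing the on-disk size of the indexes is a reasonable first check. A minimal sketch, using the table names from the script:

-- List every index on the two tables along with its size on disk.
SELECT relname, indexrelname, pg_size_pretty(pg_relation_size(indexrelid))
FROM pg_stat_user_indexes
WHERE relname IN ('normalised', 'denormalised');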