From 39c5fbb6335b79ab77b06f572e11e71db3e1e43a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fredrik=20Adel=C3=B6w?= <freben@gmail.com>
Date: Tue, 19 May 2026 23:11:19 +0200
Subject: [PATCH] Add extended statistics on search(key, value) for planner
 accuracy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The search table is an EAV schema where key and value are highly
correlated, but standard single-column statistics can't model this.
This caused the planner to underestimate compound filter queries by
up to 1300x, choosing materialize-then-sort plans instead of LIMIT
short-circuit index scans.

CREATE STATISTICS captures the (key, value) correlation via
dependencies, ndistinct, and most-common-values metadata. The planner
then correctly estimates row counts and chooses the index-driven plan.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Fredrik Adelöw <freben@gmail.com>
---
 .changeset/search-extended-statistics.md      |  5 ++
 ...260519000000_search_extended_statistics.js | 87 +++++++++++++++++++
 2 files changed, 92 insertions(+)
 create mode 100644 .changeset/search-extended-statistics.md
 create mode 100644 plugins/catalog-backend/migrations/20260519000000_search_extended_statistics.js

diff --git a/.changeset/search-extended-statistics.md b/.changeset/search-extended-statistics.md
new file mode 100644
index 0000000000..88c913802e
--- /dev/null
+++ b/.changeset/search-extended-statistics.md
@@ -0,0 +1,5 @@
+---
+'@backstage/plugin-catalog-backend': patch
+---
+
+Added extended multi-column statistics on `(key, value)` in the `search` table (PostgreSQL only). This tells the query planner about the correlation between the `key` and `value` columns, fixing severe row count estimation errors on compound filter queries. Without this, the planner could choose to materialize and sort thousands of rows instead of using the LIMIT short-circuit index scan — causing 10-40x slower catalog list views when multiple filters are active.
diff --git a/plugins/catalog-backend/migrations/20260519000000_search_extended_statistics.js b/plugins/catalog-backend/migrations/20260519000000_search_extended_statistics.js
new file mode 100644
index 0000000000..ede68413cb
--- /dev/null
+++ b/plugins/catalog-backend/migrations/20260519000000_search_extended_statistics.js
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2026 The Backstage Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// @ts-check
+
+/**
+ * Creates extended multi-column statistics on (key, value) in the search
+ * table. These statistics capture the correlation between `key` and `value`
+ * columns, which the planner cannot infer from standard single-column
+ * statistics alone. Without them, compound filter queries like
+ * `WHERE key='kind' AND value='component'` get wildly underestimated
+ * (e.g. 13 estimated vs 17,000 actual rows), causing the planner to
+ * choose materialize-then-sort plans instead of LIMIT-short-circuit
+ * index scans.
+ *
+ * ## What this creates
+ *
+ * On PostgreSQL 12+ (the `mcv` statistics kind was added in version 12):
+ *   CREATE STATISTICS search_key_value_stats (dependencies, ndistinct, mcv)
+ *     ON key, value FROM search;
+ *
+ * - `dependencies`: how strongly each column functionally determines the other
+ * - `ndistinct`: number of distinct (key, value) combinations
+ * - `mcv`: most common (key, value) pairs with their actual frequencies
+ *
+ * ## Cost
+ *
+ * - **Creation**: `CREATE STATISTICS` is metadata-only (instant).
+ * - **ANALYZE**: reads a sample of the table (~30k rows by default) to
+ *   compute the statistics. Takes 2-4 seconds on a 14M-row table. This
+ *   happens once on migration and then automatically during regular
+ *   autovacuum analyze cycles.
+ * - **Storage**: a few KB in `pg_statistic_ext_data` — negligible.
+ * - **Maintenance**: autovacuum refreshes the statistics during its
+ *   regular `ANALYZE` passes, just like single-column statistics.
+ *   No manual maintenance needed.
+ *
+ * MySQL and SQLite do not support extended statistics; this migration
+ * is a no-op on those engines.
+ */
+
+/**
+ * @param {import('knex').Knex} knex
+ */
+exports.up = async function up(knex) {
+  if (!knex.client.config.client.includes('pg')) {
+    return;
+  }
+
+  const exists = await knex.raw(
+    `SELECT 1 FROM pg_statistic_ext WHERE stxname = 'search_key_value_stats'`,
+  );
+
+  if (exists.rows.length > 0) {
+    return;
+  }
+
+  await knex.raw(
+    `CREATE STATISTICS search_key_value_stats (dependencies, ndistinct, mcv) ON key, value FROM search`,
+  );
+
+  await knex.raw(`ANALYZE search`);
+};
+
+/**
+ * @param {import('knex').Knex} knex
+ */
+exports.down = async function down(knex) {
+  if (!knex.client.config.client.includes('pg')) {
+    return;
+  }
+
+  await knex.raw(`DROP STATISTICS IF EXISTS search_key_value_stats`);
+};