From 39c5fbb6335b79ab77b06f572e11e71db3e1e43a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fredrik=20Adel=C3=B6w?= Date: Tue, 19 May 2026 23:11:19 +0200 Subject: [PATCH] Add extended statistics on search(key, value) for planner accuracy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The search table is an EAV schema where key and value are highly correlated, but standard single-column statistics can't model this. This caused the planner to underestimate compound filter queries by up to 1300x, choosing materialize-then-sort plans instead of LIMIT short-circuit index scans. CREATE STATISTICS captures the (key, value) correlation via dependencies, ndistinct, and most-common-values metadata. The planner then correctly estimates row counts and chooses the index-driven plan. Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Fredrik Adelöw --- .changeset/search-extended-statistics.md | 5 ++ ...260519000000_search_extended_statistics.js | 87 +++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 .changeset/search-extended-statistics.md create mode 100644 plugins/catalog-backend/migrations/20260519000000_search_extended_statistics.js diff --git a/.changeset/search-extended-statistics.md b/.changeset/search-extended-statistics.md new file mode 100644 index 0000000000..88c913802e --- /dev/null +++ b/.changeset/search-extended-statistics.md @@ -0,0 +1,5 @@ +--- +'@backstage/plugin-catalog-backend': patch +--- + +Added extended multi-column statistics on `(key, value)` in the `search` table (PostgreSQL only). This tells the query planner about the correlation between the `key` and `value` columns, fixing severe row count estimation errors on compound filter queries. Without this, the planner could choose to materialize and sort thousands of rows instead of using the LIMIT short-circuit index scan — causing 10-40x slower catalog list views when multiple filters are active. diff --git a/plugins/catalog-backend/migrations/20260519000000_search_extended_statistics.js b/plugins/catalog-backend/migrations/20260519000000_search_extended_statistics.js new file mode 100644 index 0000000000..ede68413cb --- /dev/null +++ b/plugins/catalog-backend/migrations/20260519000000_search_extended_statistics.js @@ -0,0 +1,87 @@ +/* + * Copyright 2026 The Backstage Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// @ts-check + +/** + * Creates extended multi-column statistics on (key, value) in the search + * table. These statistics capture the correlation between `key` and `value` + * columns, which the planner cannot infer from standard single-column + * statistics alone. Without them, compound filter queries like + * `WHERE key='kind' AND value='component'` get wildly underestimated + * (e.g. 13 estimated vs 17,000 actual rows), causing the planner to + * choose materialize-then-sort plans instead of LIMIT-short-circuit + * index scans. + * + * ## What this creates + * + * On PostgreSQL 10+: + * CREATE STATISTICS search_key_value_stats (dependencies, ndistinct, mcv) + * ON key, value FROM search; + * + * - `dependencies`: tells the planner that `value` depends on `key` + * - `ndistinct`: number of distinct (key, value) combinations + * - `mcv`: most common (key, value) pairs with their actual frequencies + * + * ## Cost + * + * - **Creation**: `CREATE STATISTICS` is metadata-only (instant). + * - **ANALYZE**: reads a sample of the table (~30k rows by default) to + * compute the statistics. Takes 2-4 seconds on a 14M-row table. This + * happens once on migration and then automatically during regular + * autovacuum analyze cycles. + * - **Storage**: a few KB in `pg_statistic_ext_data` — negligible. + * - **Maintenance**: autovacuum refreshes the statistics during its + * regular `ANALYZE` passes, just like single-column statistics. + * No manual maintenance needed. + * + * MySQL and SQLite do not support extended statistics; this migration + * is a no-op on those engines. + */ + +/** + * @param {import('knex').Knex} knex + */ +exports.up = async function up(knex) { + if (!knex.client.config.client.includes('pg')) { + return; + } + + const exists = await knex.raw( + `SELECT 1 FROM pg_statistic_ext WHERE stxname = 'search_key_value_stats'`, + ); + + if (exists.rows.length > 0) { + return; + } + + await knex.raw( + `CREATE STATISTICS search_key_value_stats (dependencies, ndistinct, mcv) ON key, value FROM search`, + ); + + await knex.raw(`ANALYZE search`); +}; + +/** + * @param {import('knex').Knex} knex + */ +exports.down = async function down(knex) { + if (!knex.client.config.client.includes('pg')) { + return; + } + + await knex.raw(`DROP STATISTICS IF EXISTS search_key_value_stats`); +};