From 7445f0f5bca55181f86b587563cd0ef7365d9f28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fredrik=20Adel=C3=B6w?= Date: Sun, 10 May 2026 14:14:26 +0200 Subject: [PATCH 01/16] draft(catalog-backend): search table dedup, covering indices, UNIQUE constraint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Single knex migration that cleans up duplicate search rows and creates covering indices including a UNIQUE constraint on (entity_id, key, value). Each step is idempotent and handles INVALID indices from interrupted runs. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Fredrik Adelöw --- .changeset/search-indices-and-dedup.md | 33 ++ ...20260510000000_search_indices_and_dedup.js | 282 ++++++++++++++++++ .../operations/stitcher/buildEntitySearch.ts | 13 +- .../stitcher/syncSearchRows.test.ts | 17 +- .../operations/stitcher/syncSearchRows.ts | 18 +- .../service/DefaultEntitiesCatalog.test.ts | 58 ++-- 6 files changed, 382 insertions(+), 39 deletions(-) create mode 100644 .changeset/search-indices-and-dedup.md create mode 100644 plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js diff --git a/.changeset/search-indices-and-dedup.md b/.changeset/search-indices-and-dedup.md new file mode 100644 index 0000000000..cfa1e8dc3b --- /dev/null +++ b/.changeset/search-indices-and-dedup.md @@ -0,0 +1,33 @@ +--- +'@backstage/plugin-catalog-backend': patch +--- + +Added a migration that removes duplicate rows from the `search` table, creates covering indices for improved query performance, and adds a `UNIQUE` constraint on `(entity_id, key, value)`. + +This is a long-running migration on large catalogs. On PostgreSQL with millions of search rows, the index creation may take 5-15 minutes. During this time, other pods running the previous version will continue to serve traffic normally — the index creation does not block reads or writes. If the pod is restarted during the migration, the next startup will clean up and retry automatically. + +**For large installations**, you can run the following SQL commands against your PostgreSQL database before deploying to avoid the startup delay. If these have already completed, the migration will detect the existing indices and skip all work. + +```sql +-- Step 1: Remove duplicate search rows +WITH cte AS ( + SELECT ctid, row_number() OVER (PARTITION BY entity_id, key, value) AS rn + FROM search +) +DELETE FROM search USING cte WHERE search.ctid = cte.ctid AND cte.rn > 1; + +-- Step 2: Create new indices (run each separately) +CREATE UNIQUE INDEX CONCURRENTLY IF NOT EXISTS + search_entity_key_value_idx ON search (entity_id, key, value); +CREATE INDEX CONCURRENTLY IF NOT EXISTS + search_key_value_entity_idx ON search (key, value, entity_id); +CREATE INDEX CONCURRENTLY IF NOT EXISTS + search_facets_covering_idx ON search (key, original_value, entity_id) + WHERE original_value IS NOT NULL; + +-- Step 3: Drop old indices that are no longer needed +DROP INDEX CONCURRENTLY IF EXISTS search_key_value_idx; +DROP INDEX CONCURRENTLY IF EXISTS search_key_original_value_idx; +``` + +Also fixed `buildEntitySearch` to remove duplicate output for entities with duplicate array values, and added `ON CONFLICT DO UPDATE` to `syncSearchRows` so that concurrent stitching races are handled gracefully. diff --git a/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js b/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js new file mode 100644 index 0000000000..e60e39a017 --- /dev/null +++ b/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js @@ -0,0 +1,282 @@ +/* + * Copyright 2026 The Backstage Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// @ts-check + +/** + * Deduplicates the search table, adds covering indices (including a UNIQUE + * constraint on (entity_id, key, value)), and drops superseded indices. + * + * On PostgreSQL this uses CREATE INDEX CONCURRENTLY which avoids blocking + * reads/writes but can take several minutes on large tables (13M+ rows). + * The migration is fully rerun-safe: each step checks the current state + * and skips work that is already done or cleans up INVALID indices left by + * a previous interrupted attempt. + * + * ## Important note for large installations + * + * If your search table has millions of rows, this migration may take + * 5-15 minutes. During this time the pod will appear unready. Other pods + * running the previous version of Backstage will continue to serve + * traffic normally — the index creation does not block reads or writes. + * + * To avoid the startup delay, you can run the following commands against + * your PostgreSQL database BEFORE deploying this version: + * + * -- 1. Remove duplicate search rows + * WITH cte AS ( + * SELECT ctid, + * row_number() OVER (PARTITION BY entity_id, key, value) AS rn + * FROM search + * ) + * DELETE FROM search USING cte + * WHERE search.ctid = cte.ctid AND cte.rn > 1; + * + * -- 2. Create indices (run each separately) + * CREATE UNIQUE INDEX CONCURRENTLY IF NOT EXISTS + * search_entity_key_value_idx ON search (entity_id, key, value); + * CREATE INDEX CONCURRENTLY IF NOT EXISTS + * search_key_value_entity_idx ON search (key, value, entity_id); + * CREATE INDEX CONCURRENTLY IF NOT EXISTS + * search_facets_covering_idx ON search (key, original_value, entity_id) + * WHERE original_value IS NOT NULL; + * + * -- 3. Drop old indices + * DROP INDEX CONCURRENTLY IF EXISTS search_key_value_idx; + * DROP INDEX CONCURRENTLY IF EXISTS search_key_original_value_idx; + * + * If these commands have already completed, the migration will detect the + * existing indices and skip all work — startup will be instant. + */ + +/** + * @param {import('knex').Knex} knex + */ +exports.up = async function up(knex) { + const client = knex.client.config.client; + + if (client.includes('pg')) { + await upPostgres(knex); + } else if (client.includes('mysql')) { + await upMysql(knex); + } else { + await upSqlite(knex); + } +}; + +/** + * @param {import('knex').Knex} knex + */ +exports.down = async function down(_knex) { + // Intentionally a no-op. The old non-covering indices are not worth + // recreating, and the UNIQUE constraint should not be removed since the + // code now relies on ON CONFLICT for correctness. +}; + +exports.config = { transaction: false }; + +// --------------------------------------------------------------------------- +// PostgreSQL +// --------------------------------------------------------------------------- + +/** @param {import('knex').Knex} knex */ +async function upPostgres(knex) { + // Step 1: Remove duplicate search rows. The window function's + // PARTITION BY treats NULLs as equal, so this handles both NULL-value + // and non-NULL-value duplicates. Idempotent: deletes 0 rows if clean. + await knex.raw(` + WITH cte AS ( + SELECT ctid, + row_number() OVER (PARTITION BY entity_id, key, value) AS rn + FROM search + ) + DELETE FROM search + USING cte + WHERE search.ctid = cte.ctid AND cte.rn > 1 + `); + + // Step 2: Create covering indices. Each call is idempotent — it checks + // the index state and only does work if needed. + await ensurePgIndex(knex, { + name: 'search_entity_key_value_idx', + columns: '(entity_id, key, value)', + unique: true, + }); + await ensurePgIndex(knex, { + name: 'search_key_value_entity_idx', + columns: '(key, value, entity_id)', + unique: false, + }); + await ensurePgIndex(knex, { + name: 'search_facets_covering_idx', + columns: '(key, original_value, entity_id)', + where: 'WHERE original_value IS NOT NULL', + unique: false, + }); + + // Step 3: Drop superseded indices. + await dropPgIndexIfExists(knex, 'search_key_value_idx'); + await dropPgIndexIfExists(knex, 'search_key_original_value_idx'); +} + +/** + * Creates or replaces an index on the search table, handling all edge cases: + * - Already valid with correct uniqueness: skip + * - Exists but INVALID (interrupted CREATE): drop and recreate + * - Exists but wrong uniqueness (e.g. non-unique but we need unique): drop and recreate + * - Missing: create from scratch + * + * @param {import('knex').Knex} knex + * @param {{ name: string; columns: string; unique: boolean; where?: string }} opts + */ +async function ensurePgIndex(knex, opts) { + const { name, columns, unique, where } = opts; + + const result = await knex.raw( + ` + SELECT indisunique, indisvalid + FROM pg_index + WHERE indexrelid = ( + SELECT oid FROM pg_class WHERE relname = ? + ) + `, + [name], + ); + + if (result.rows.length > 0) { + const { indisunique, indisvalid } = result.rows[0]; + if (indisvalid && indisunique === unique) { + return; // Already correct + } + // Wrong state — drop and recreate + await knex.raw(`DROP INDEX CONCURRENTLY IF EXISTS "${name}"`); + } + + const uniqueKw = unique ? 'UNIQUE' : ''; + const whereClause = where || ''; + await knex.raw( + `CREATE ${uniqueKw} INDEX CONCURRENTLY "${name}" ON search ${columns} ${whereClause}`, + ); +} + +/** + * @param {import('knex').Knex} knex + * @param {string} name + */ +async function dropPgIndexIfExists(knex, name) { + const result = await knex.raw( + ` + SELECT 1 FROM pg_class WHERE relname = ? + `, + [name], + ); + if (result.rows.length > 0) { + await knex.raw(`DROP INDEX CONCURRENTLY IF EXISTS "${name}"`); + } +} + +// --------------------------------------------------------------------------- +// MySQL +// --------------------------------------------------------------------------- + +/** @param {import('knex').Knex} knex */ +async function upMysql(knex) { + // Dedup via temp table + await knex.transaction(async trx => { + await trx.raw( + 'CREATE TEMPORARY TABLE IF NOT EXISTS `_search_keep` (' + + '`entity_id` VARCHAR(255), `key` VARCHAR(255), ' + + '`value` VARCHAR(255), `original_value` VARCHAR(255))', + ); + await trx.raw('DELETE FROM `_search_keep`'); + await trx.raw( + 'INSERT INTO `_search_keep` ' + + 'SELECT `entity_id`, `key`, `value`, MAX(`original_value`) ' + + 'FROM `search` GROUP BY `entity_id`, `key`, `value`', + ); + await trx.raw('DELETE FROM `search`'); + await trx.raw( + 'INSERT INTO `search` (`entity_id`, `key`, `value`, `original_value`) ' + + 'SELECT * FROM `_search_keep`', + ); + await trx.raw('DROP TEMPORARY TABLE `_search_keep`'); + }); + + // Drop old indices if present, then create new ones + await mysqlDropIndexIfExists(knex, 'search_key_value_idx'); + await mysqlDropIndexIfExists(knex, 'search_key_original_value_idx'); + await mysqlDropIndexIfExists(knex, 'search_entity_key_value_idx'); + await mysqlDropIndexIfExists(knex, 'search_key_value_entity_idx'); + await mysqlDropIndexIfExists(knex, 'search_facets_covering_idx'); + + await knex.schema.alterTable('search', table => { + table.unique(['entity_id', 'key', 'value'], 'search_entity_key_value_idx'); + table.index(['key', 'value', 'entity_id'], 'search_key_value_entity_idx'); + }); + // MySQL doesn't support partial indices — create a full one + await knex.schema.alterTable('search', table => { + table.index( + ['key', 'original_value', 'entity_id'], + 'search_facets_covering_idx', + ); + }); +} + +/** @param {import('knex').Knex} knex @param {string} name */ +async function mysqlDropIndexIfExists(knex, name) { + const [rows] = await knex.raw( + `SHOW INDEX FROM \`search\` WHERE Key_name = ?`, + [name], + ); + if (rows.length > 0) { + await knex.schema.alterTable('search', t => { + t.dropIndex([], name); + }); + } +} + +// --------------------------------------------------------------------------- +// SQLite +// --------------------------------------------------------------------------- + +/** @param {import('knex').Knex} knex */ +async function upSqlite(knex) { + await knex.transaction(async trx => { + await trx.raw(` + DELETE FROM search + WHERE rowid NOT IN ( + SELECT MIN(rowid) FROM search GROUP BY entity_id, key, value + ) + `); + }); + + // Drop old, create new — SQLite is fast on small tables + await knex.raw('DROP INDEX IF EXISTS search_key_value_idx'); + await knex.raw('DROP INDEX IF EXISTS search_key_original_value_idx'); + await knex.raw('DROP INDEX IF EXISTS search_entity_key_value_idx'); + await knex.raw('DROP INDEX IF EXISTS search_key_value_entity_idx'); + await knex.raw('DROP INDEX IF EXISTS search_facets_covering_idx'); + + await knex.raw( + 'CREATE UNIQUE INDEX search_entity_key_value_idx ON search (entity_id, key, value)', + ); + await knex.raw( + 'CREATE INDEX search_key_value_entity_idx ON search (key, value, entity_id)', + ); + await knex.raw( + 'CREATE INDEX search_facets_covering_idx ON search (key, original_value, entity_id)', + ); +} diff --git a/plugins/catalog-backend/src/database/operations/stitcher/buildEntitySearch.ts b/plugins/catalog-backend/src/database/operations/stitcher/buildEntitySearch.ts index c8e09f3347..96b74e19bf 100644 --- a/plugins/catalog-backend/src/database/operations/stitcher/buildEntitySearch.ts +++ b/plugins/catalog-backend/src/database/operations/stitcher/buildEntitySearch.ts @@ -228,5 +228,16 @@ export function buildEntitySearch( ); } - return mapToRows(raw, entityId); + const rows = mapToRows(raw, entityId); + + // Deduplicate by (key, value). Duplicate array values in the entity data + // (e.g. tags: ['java', 'java']) produce identical search rows which would + // violate the unique constraint on (entity_id, key, value). + const seen = new Set(); + return rows.filter(row => { + const k = `${row.key}\0${row.value ?? ''}`; + if (seen.has(k)) return false; + seen.add(k); + return true; + }); } diff --git a/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts b/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts index 0168464faa..49e67be736 100644 --- a/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts +++ b/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts @@ -190,20 +190,17 @@ describe.each(databases.eachSupportedId())('syncSearchRows, %p', databaseId => { ); }); - it('inserts a row when only original_value casing differs from existing', async () => { - // Two rows with the same (key, value) but different original_value - // casing must coexist — the case-insensitive MySQL collation must not - // cause the INSERT to skip the second row. + it('keeps one row when original_value casing differs for same key+value', async () => { + // Two entries with the same lowercased (key, value) but different + // original_value casing are deduplicated — the UNIQUE constraint on + // (entity_id, key, value) allows only one. The last occurrence wins. await syncSearchRows(knex, 'e1', [row('a', 'v', 'V')]); await syncSearchRows(knex, 'e1', [row('a', 'v', 'V'), row('a', 'v', 'v')]); const rows = await getSearchRows(); - expect(rows).toHaveLength(2); - expect(rows).toEqual( - expect.arrayContaining([ - expect.objectContaining({ key: 'a', value: 'v', original_value: 'V' }), - expect.objectContaining({ key: 'a', value: 'v', original_value: 'v' }), - ]), + expect(rows).toHaveLength(1); + expect(rows[0]).toEqual( + expect.objectContaining({ key: 'a', value: 'v', original_value: 'v' }), ); }); diff --git a/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.ts b/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.ts index 18969b4f33..17f7bf1135 100644 --- a/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.ts +++ b/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.ts @@ -48,14 +48,24 @@ export async function syncSearchRows( entityId: string, searchEntries: DbSearchRow[], ): Promise { + // Dedup by (key, value) — the UNIQUE constraint on (entity_id, key, value) + // rejects duplicates, and the same lowercased value with different original + // casing is semantically a single entry. Keep the last occurrence so that + // if the entity data has e.g. tags: ['Java', 'java'], the later one wins. + const dedupMap = new Map(); + for (const entry of searchEntries) { + dedupMap.set(`${entry.key}\0${entry.value ?? ''}`, entry); + } + const deduped = [...dedupMap.values()]; + const client = knex.client.config.client; if (client === 'pg') { - await syncPostgres(knex, entityId, searchEntries); + await syncPostgres(knex, entityId, deduped); } else if (client.includes('mysql')) { - await syncMysql(knex, entityId, searchEntries); + await syncMysql(knex, entityId, deduped); } else { - await syncBulkReplace(knex, entityId, searchEntries); + await syncBulkReplace(knex, entityId, deduped); } } @@ -110,6 +120,8 @@ async function syncPostgres( AND COALESCE(s.value, chr(1)) = COALESCE(d.value, chr(1)) AND COALESCE(s.original_value, chr(1)) = COALESCE(d.original_value, chr(1)) ) + ON CONFLICT (entity_id, key, value) + DO UPDATE SET original_value = EXCLUDED.original_value `, [keys, values, originalValues, entityId, entityId, entityId], ); diff --git a/plugins/catalog-backend/src/service/DefaultEntitiesCatalog.test.ts b/plugins/catalog-backend/src/service/DefaultEntitiesCatalog.test.ts index 2bc70a6e94..7abb3ca8a3 100644 --- a/plugins/catalog-backend/src/service/DefaultEntitiesCatalog.test.ts +++ b/plugins/catalog-backend/src/service/DefaultEntitiesCatalog.test.ts @@ -2013,22 +2013,26 @@ describe('DefaultEntitiesCatalog', () => { await Promise.all(entities.map(e => addEntityToSearch(e))); - // Manually insert duplicate search entries for the same entities - // I'm not sure exactly how this happens but I have seen it in the real world - await knex('search').insert([ - { - entity_id: 'uid-a', - key: 'metadata.title', - value: 'a test entity', - original_value: 'A Test Entity', - }, - { - entity_id: 'uid-b', - key: 'metadata.title', - value: 'b test entity', - original_value: 'B Test Entity', - }, - ]); + // The UNIQUE constraint on (entity_id, key, value) prevents + // duplicate search rows. Verify that duplicates are silently + // rejected and the query still returns correct results. + await knex('search') + .insert([ + { + entity_id: 'uid-a', + key: 'metadata.title', + value: 'a test entity', + original_value: 'A Test Entity', + }, + { + entity_id: 'uid-b', + key: 'metadata.title', + value: 'b test entity', + original_value: 'B Test Entity', + }, + ]) + .onConflict(['entity_id', 'key', 'value']) + .ignore(); const catalog = new DefaultEntitiesCatalog({ database: knex, @@ -2407,15 +2411,19 @@ describe('DefaultEntitiesCatalog', () => { spec: {}, }); - // Manually insert a duplicate search entry, this shouldn't happen but does in reality - await knex('search').insert([ - { - entity_id: 'uid-a', - key: 'metadata.name', - value: 'one', - original_value: 'one', - }, - ]); + // Attempt to insert a duplicate — the UNIQUE constraint silently + // rejects it via ON CONFLICT IGNORE. + await knex('search') + .insert([ + { + entity_id: 'uid-a', + key: 'metadata.name', + value: 'one', + original_value: 'one', + }, + ]) + .onConflict(['entity_id', 'key', 'value']) + .ignore(); const catalog = new DefaultEntitiesCatalog({ database: knex, From 27d2d3f0a281c919269a188c32944a05db7e9a89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fredrik=20Adel=C3=B6w?= Date: Sun, 10 May 2026 14:21:19 +0200 Subject: [PATCH 02/16] fix down migration + update SQL report MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The down migration now restores the previous index state (drops new indices, recreates the old search_key_value_idx and search_key_original_value_idx). Updated the SQL report to reflect the new index set. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Fredrik Adelöw --- ...20260510000000_search_indices_and_dedup.js | 43 +++++++++++++++++-- plugins/catalog-backend/report.sql.md | 5 ++- 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js b/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js index e60e39a017..623c5bbfce 100644 --- a/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js +++ b/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js @@ -80,10 +80,45 @@ exports.up = async function up(knex) { /** * @param {import('knex').Knex} knex */ -exports.down = async function down(_knex) { - // Intentionally a no-op. The old non-covering indices are not worth - // recreating, and the UNIQUE constraint should not be removed since the - // code now relies on ON CONFLICT for correctness. +exports.down = async function down(knex) { + const client = knex.client.config.client; + + if (client.includes('pg')) { + // Remove the new indices and restore the old ones + await knex.raw( + 'DROP INDEX CONCURRENTLY IF EXISTS search_entity_key_value_idx', + ); + await knex.raw( + 'DROP INDEX CONCURRENTLY IF EXISTS search_key_value_entity_idx', + ); + await knex.raw( + 'DROP INDEX CONCURRENTLY IF EXISTS search_facets_covering_idx', + ); + await knex.raw( + 'CREATE INDEX CONCURRENTLY IF NOT EXISTS search_key_value_idx ON search (key, value)', + ); + await knex.raw( + 'CREATE INDEX CONCURRENTLY IF NOT EXISTS search_key_original_value_idx ON search (key, original_value)', + ); + } else if (client.includes('mysql')) { + await mysqlDropIndexIfExists(knex, 'search_entity_key_value_idx'); + await mysqlDropIndexIfExists(knex, 'search_key_value_entity_idx'); + await mysqlDropIndexIfExists(knex, 'search_facets_covering_idx'); + await knex.schema.alterTable('search', table => { + table.index(['key', 'value'], 'search_key_value_idx'); + table.index(['key', 'original_value'], 'search_key_original_value_idx'); + }); + } else { + await knex.raw('DROP INDEX IF EXISTS search_entity_key_value_idx'); + await knex.raw('DROP INDEX IF EXISTS search_key_value_entity_idx'); + await knex.raw('DROP INDEX IF EXISTS search_facets_covering_idx'); + await knex.raw( + 'CREATE INDEX IF NOT EXISTS search_key_value_idx ON search (key, value)', + ); + await knex.raw( + 'CREATE INDEX IF NOT EXISTS search_key_original_value_idx ON search (key, original_value)', + ); + } }; exports.config = { transaction: false }; diff --git a/plugins/catalog-backend/report.sql.md b/plugins/catalog-backend/report.sql.md index e23b250be8..0ea80db3f4 100644 --- a/plugins/catalog-backend/report.sql.md +++ b/plugins/catalog-backend/report.sql.md @@ -127,8 +127,9 @@ ### Indices - `search_entity_id_idx` (`entity_id`) -- `search_key_original_value_idx` (`key`, `original_value`) -- `search_key_value_idx` (`key`, `value`) +- `search_entity_key_value_idx` (`entity_id`, `key`, `value`) unique +- `search_facets_covering_idx` (`key`, `original_value`, `entity_id`) +- `search_key_value_entity_idx` (`key`, `value`, `entity_id`) ## Table `stitch_queue` From c8efd9dda5837701e21b7df3834924b6caba45bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fredrik=20Adel=C3=B6w?= Date: Sun, 10 May 2026 19:16:32 +0200 Subject: [PATCH 03/16] clarify death loop risk in migration comments and changeset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Be explicit that interrupted index builds do not leave partial progress — each retry starts from scratch. On large tables with short liveness probe timeouts this repeats indefinitely. Recommend running the SQL commands manually before deploying. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Fredrik Adelöw --- .changeset/search-indices-and-dedup.md | 4 ++-- ...20260510000000_search_indices_and_dedup.js | 24 ++++++++++--------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/.changeset/search-indices-and-dedup.md b/.changeset/search-indices-and-dedup.md index cfa1e8dc3b..58e370a319 100644 --- a/.changeset/search-indices-and-dedup.md +++ b/.changeset/search-indices-and-dedup.md @@ -4,9 +4,9 @@ Added a migration that removes duplicate rows from the `search` table, creates covering indices for improved query performance, and adds a `UNIQUE` constraint on `(entity_id, key, value)`. -This is a long-running migration on large catalogs. On PostgreSQL with millions of search rows, the index creation may take 5-15 minutes. During this time, other pods running the previous version will continue to serve traffic normally — the index creation does not block reads or writes. If the pod is restarted during the migration, the next startup will clean up and retry automatically. +This is a long-running migration on large catalogs. On PostgreSQL with millions of search rows, the index creation may take 5-15 minutes per index. During this time, other pods running the previous version will continue to serve traffic normally — the index creation does not block reads or writes. However, if a Kubernetes liveness probe kills the pod before the index build completes, the build is lost and the next startup will start over. On large tables this can repeat indefinitely. -**For large installations**, you can run the following SQL commands against your PostgreSQL database before deploying to avoid the startup delay. If these have already completed, the migration will detect the existing indices and skip all work. +**For large installations**, it is recommended to run the following SQL commands against your PostgreSQL database **before deploying** this version. Each index build takes a few minutes but does not block reads or writes. If these have already completed, the migration will detect the existing indices and skip all work — startup will be instant. ```sql -- Step 1: Remove duplicate search rows diff --git a/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js b/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js index 623c5bbfce..10528a5347 100644 --- a/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js +++ b/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js @@ -22,19 +22,21 @@ * * On PostgreSQL this uses CREATE INDEX CONCURRENTLY which avoids blocking * reads/writes but can take several minutes on large tables (13M+ rows). - * The migration is fully rerun-safe: each step checks the current state - * and skips work that is already done or cleans up INVALID indices left by - * a previous interrupted attempt. + * Each step is idempotent: it checks the current state and skips work + * already done, or cleans up INVALID indices left by an interrupted + * attempt before retrying. However, an interrupted index build does NOT + * leave partial progress — each retry starts from scratch. If the + * Kubernetes liveness probe kills the pod before an index build completes, + * the next startup will drop the INVALID index and restart the build. On + * large tables this can repeat indefinitely. To prevent this, either + * increase the liveness probe timeout for this one-time upgrade, or run + * the SQL commands below manually before deploying. * - * ## Important note for large installations + * ## Recommended: run manually before deploying (large installations) * - * If your search table has millions of rows, this migration may take - * 5-15 minutes. During this time the pod will appear unready. Other pods - * running the previous version of Backstage will continue to serve - * traffic normally — the index creation does not block reads or writes. - * - * To avoid the startup delay, you can run the following commands against - * your PostgreSQL database BEFORE deploying this version: + * For PostgreSQL installations with millions of search rows, run these + * commands against your database BEFORE deploying this version. Each + * index build takes a few minutes but does not block reads or writes. * * -- 1. Remove duplicate search rows * WITH cte AS ( From bb6c46adc08f3eecd8ba7190854ba6b00547c956 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fredrik=20Adel=C3=B6w?= Date: Mon, 11 May 2026 12:50:02 +0200 Subject: [PATCH 04/16] fix(catalog-backend): create replacement indices before dropping in down migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoids a window with no coverage on (key, value) during rollback. Signed-off-by: Fredrik Adelöw Co-authored-by: Cursor --- ...20260510000000_search_indices_and_dedup.js | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js b/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js index 10528a5347..f49b36d731 100644 --- a/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js +++ b/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js @@ -86,7 +86,14 @@ exports.down = async function down(knex) { const client = knex.client.config.client; if (client.includes('pg')) { - // Remove the new indices and restore the old ones + // Restore the old indices first so there is no window without coverage, + // then drop the new ones. + await knex.raw( + 'CREATE INDEX CONCURRENTLY IF NOT EXISTS search_key_value_idx ON search (key, value)', + ); + await knex.raw( + 'CREATE INDEX CONCURRENTLY IF NOT EXISTS search_key_original_value_idx ON search (key, original_value)', + ); await knex.raw( 'DROP INDEX CONCURRENTLY IF EXISTS search_entity_key_value_idx', ); @@ -96,30 +103,24 @@ exports.down = async function down(knex) { await knex.raw( 'DROP INDEX CONCURRENTLY IF EXISTS search_facets_covering_idx', ); - await knex.raw( - 'CREATE INDEX CONCURRENTLY IF NOT EXISTS search_key_value_idx ON search (key, value)', - ); - await knex.raw( - 'CREATE INDEX CONCURRENTLY IF NOT EXISTS search_key_original_value_idx ON search (key, original_value)', - ); } else if (client.includes('mysql')) { - await mysqlDropIndexIfExists(knex, 'search_entity_key_value_idx'); - await mysqlDropIndexIfExists(knex, 'search_key_value_entity_idx'); - await mysqlDropIndexIfExists(knex, 'search_facets_covering_idx'); await knex.schema.alterTable('search', table => { table.index(['key', 'value'], 'search_key_value_idx'); table.index(['key', 'original_value'], 'search_key_original_value_idx'); }); + await mysqlDropIndexIfExists(knex, 'search_entity_key_value_idx'); + await mysqlDropIndexIfExists(knex, 'search_key_value_entity_idx'); + await mysqlDropIndexIfExists(knex, 'search_facets_covering_idx'); } else { - await knex.raw('DROP INDEX IF EXISTS search_entity_key_value_idx'); - await knex.raw('DROP INDEX IF EXISTS search_key_value_entity_idx'); - await knex.raw('DROP INDEX IF EXISTS search_facets_covering_idx'); await knex.raw( 'CREATE INDEX IF NOT EXISTS search_key_value_idx ON search (key, value)', ); await knex.raw( 'CREATE INDEX IF NOT EXISTS search_key_original_value_idx ON search (key, original_value)', ); + await knex.raw('DROP INDEX IF EXISTS search_entity_key_value_idx'); + await knex.raw('DROP INDEX IF EXISTS search_key_value_entity_idx'); + await knex.raw('DROP INDEX IF EXISTS search_facets_covering_idx'); } }; From 930c575e5838d8b4c0f9372b91846d8408e11523 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fredrik=20Adel=C3=B6w?= Date: Mon, 11 May 2026 12:55:10 +0200 Subject: [PATCH 05/16] test(catalog-backend): simplify onConflict().ignore() and add UNIQUE constraint test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the explicit column list from .onConflict() calls in test setup — ON CONFLICT DO NOTHING without a target is equivalent since (entity_id, key, value) is the only unique constraint on the search table. Add a dedicated test in syncSearchRows that directly inserts a duplicate row and verifies it is silently rejected via the UNIQUE constraint. Signed-off-by: Fredrik Adelöw Co-authored-by: Cursor --- .../stitcher/syncSearchRows.test.ts | 20 +++++++++++++++++++ .../service/DefaultEntitiesCatalog.test.ts | 4 ++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts b/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts index 49e67be736..0c7ac6f95a 100644 --- a/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts +++ b/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts @@ -224,6 +224,26 @@ describe.each(databases.eachSupportedId())('syncSearchRows, %p', databaseId => { expect(rows.map(r => r.value).sort()).toEqual(['java', 'python', 'rust']); }); + it('silently rejects a direct duplicate insert via the UNIQUE constraint', async () => { + await syncSearchRows(knex, 'e1', [row('a', 'x'), row('b', 'y')]); + + // Simulate a concurrent process inserting a duplicate directly — + // the UNIQUE constraint on (entity_id, key, value) should reject it + // silently. ON CONFLICT without a column list covers any unique + // constraint on the table, which is safe since (entity_id, key, value) + // is the only one. + await knex('search') + .insert({ entity_id: 'e1', key: 'a', value: 'x', original_value: 'x' }) + .onConflict() + .ignore(); + + const rows = await getSearchRows(); + expect(rows).toHaveLength(2); + expect(rows.find(r => r.key === 'a')).toEqual( + expect.objectContaining({ key: 'a', value: 'x' }), + ); + }); + it('simulates the typical steady-state case with one changed row', async () => { // Build a realistic-ish set of search rows const initial = [ diff --git a/plugins/catalog-backend/src/service/DefaultEntitiesCatalog.test.ts b/plugins/catalog-backend/src/service/DefaultEntitiesCatalog.test.ts index 7abb3ca8a3..427a9c8ce0 100644 --- a/plugins/catalog-backend/src/service/DefaultEntitiesCatalog.test.ts +++ b/plugins/catalog-backend/src/service/DefaultEntitiesCatalog.test.ts @@ -2031,7 +2031,7 @@ describe('DefaultEntitiesCatalog', () => { original_value: 'B Test Entity', }, ]) - .onConflict(['entity_id', 'key', 'value']) + .onConflict() .ignore(); const catalog = new DefaultEntitiesCatalog({ @@ -2422,7 +2422,7 @@ describe('DefaultEntitiesCatalog', () => { original_value: 'one', }, ]) - .onConflict(['entity_id', 'key', 'value']) + .onConflict() .ignore(); const catalog = new DefaultEntitiesCatalog({ From 63036124ccf5d620f965206c67e75fa812c5109a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fredrik=20Adel=C3=B6w?= Date: Mon, 11 May 2026 13:33:01 +0200 Subject: [PATCH 06/16] test(catalog-backend): add ON CONFLICT DO UPDATE coverage for original_value overwrite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Verifies that a conflicting insert with a different original_value casing updates the stored value, and documents that DO UPDATE requires explicit conflict columns unlike DO NOTHING. Signed-off-by: Fredrik Adelöw Co-authored-by: Cursor --- .../operations/stitcher/syncSearchRows.test.ts | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts b/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts index 0c7ac6f95a..9733f4361b 100644 --- a/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts +++ b/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts @@ -244,6 +244,24 @@ describe.each(databases.eachSupportedId())('syncSearchRows, %p', databaseId => { ); }); + it('overwrites original_value on conflict via ON CONFLICT DO UPDATE', async () => { + await syncSearchRows(knex, 'e1', [row('a', 'x', 'X')]); + + // Simulate a concurrent stitcher inserting the same (entity_id, key, value) + // with a different original_value casing. Unlike DO NOTHING, DO UPDATE + // requires an explicit conflict target — the column list is not optional. + await knex('search') + .insert({ entity_id: 'e1', key: 'a', value: 'x', original_value: 'x' }) + .onConflict(['entity_id', 'key', 'value']) + .merge(['original_value']); + + const rows = await getSearchRows(); + expect(rows).toHaveLength(1); + expect(rows[0]).toEqual( + expect.objectContaining({ key: 'a', value: 'x', original_value: 'x' }), + ); + }); + it('simulates the typical steady-state case with one changed row', async () => { // Build a realistic-ish set of search rows const initial = [ From f503fc935baf77477f36f7e4c3543e8e0f87ad1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fredrik=20Adel=C3=B6w?= Date: Mon, 11 May 2026 13:42:06 +0200 Subject: [PATCH 07/16] fix(catalog-backend): scope pg_class lookups to relkind='i' in migration helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prevents a false match if any non-index object (table, sequence, view) shares a name with one of the search indices. Signed-off-by: Fredrik Adelöw Co-authored-by: Cursor --- .../migrations/20260510000000_search_indices_and_dedup.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js b/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js index f49b36d731..bdf36d7a3f 100644 --- a/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js +++ b/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js @@ -188,7 +188,7 @@ async function ensurePgIndex(knex, opts) { SELECT indisunique, indisvalid FROM pg_index WHERE indexrelid = ( - SELECT oid FROM pg_class WHERE relname = ? + SELECT oid FROM pg_class WHERE relname = ? AND relkind = 'i' ) `, [name], @@ -217,7 +217,7 @@ async function ensurePgIndex(knex, opts) { async function dropPgIndexIfExists(knex, name) { const result = await knex.raw( ` - SELECT 1 FROM pg_class WHERE relname = ? + SELECT 1 FROM pg_class WHERE relname = ? AND relkind = 'i' `, [name], ); From 36514a23d6b51f8818384bf4cfe7b8f3b41a6082 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fredrik=20Adel=C3=B6w?= Date: Mon, 11 May 2026 16:52:44 +0200 Subject: [PATCH 08/16] fix(catalog-backend): use two-phase dedup in search migration to avoid full-table re-scans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original single-pass CTE window-function DELETE re-scanned every row on each invocation — including the idempotent "nothing to do" case which took over 4 minutes on a 14 M-row table, and timed out the DB proxy on the dirty case (~217 s for 722 k duplicates). Replace it with two improvements: 1. Fast path: if the UNIQUE index already exists and is valid, skip dedup entirely. A valid unique index is a proof-of-no-duplicates. This makes migration startup essentially free for installations that prepare the index manually beforehand (as documented in the migration comment). 2. Two-phase dedup when dedup is needed: Phase 1 does one full-table scan to collect all duplicate ctids into a temp table (~60 s on 14 M rows). Phase 2 drains that temp table in 10 k-row batches via cheap ctid lookups with no further full-table scans (~5 s for 722 k rows). Total ~65 s dirty vs >217 s+ before, and ~35 ms clean vs 4+ minutes before. Signed-off-by: Fredrik Adelöw Co-authored-by: Cursor --- ...20260510000000_search_indices_and_dedup.js | 91 ++++++++++++++++--- 1 file changed, 77 insertions(+), 14 deletions(-) diff --git a/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js b/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js index bdf36d7a3f..79ae34d36a 100644 --- a/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js +++ b/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js @@ -32,13 +32,29 @@ * increase the liveness probe timeout for this one-time upgrade, or run * the SQL commands below manually before deploying. * + * ## Deduplication strategy + * + * The dedup step is skipped entirely if the UNIQUE index already exists and + * is valid — a valid unique index guarantees no duplicates can be present. + * + * When dedup is needed the migration uses a two-phase approach to avoid + * the O(N) full-table re-scan that a plain window-function CTE would + * require on every batch iteration: + * + * Phase 1 (one scan): collect all duplicate ctids into a temp table. + * Phase 2 (no re-scan): drain the temp table in small batches using + * cheap ctid lookups — no further full scans. + * * ## Recommended: run manually before deploying (large installations) * * For PostgreSQL installations with millions of search rows, run these * commands against your database BEFORE deploying this version. Each * index build takes a few minutes but does not block reads or writes. + * The migration detects that the indices already exist and skips all work, + * so startup is instant. * - * -- 1. Remove duplicate search rows + * -- 1. Remove duplicate search rows (adjust LIMIT / loop as needed for + * -- very large tables to avoid long-running single transactions) * WITH cte AS ( * SELECT ctid, * row_number() OVER (PARTITION BY entity_id, key, value) AS rn @@ -132,19 +148,66 @@ exports.config = { transaction: false }; /** @param {import('knex').Knex} knex */ async function upPostgres(knex) { - // Step 1: Remove duplicate search rows. The window function's - // PARTITION BY treats NULLs as equal, so this handles both NULL-value - // and non-NULL-value duplicates. Idempotent: deletes 0 rows if clean. - await knex.raw(` - WITH cte AS ( - SELECT ctid, - row_number() OVER (PARTITION BY entity_id, key, value) AS rn - FROM search - ) - DELETE FROM search - USING cte - WHERE search.ctid = cte.ctid AND cte.rn > 1 - `); + // Step 1: Remove duplicate search rows. + // + // Fast path: if the UNIQUE index already exists and is valid, Postgres has + // been enforcing uniqueness since the index was created, so there are no + // duplicates. Skip dedup entirely — this makes restarts essentially free + // for installations that created the index manually beforehand. + // + // Slow path (index absent or invalid): use a two-phase approach that scans + // the table exactly once. + // Phase 1: one full scan to collect every duplicate ctid into a temp + // table. The window function's PARTITION BY treats NULLs as + // equal, so NULL-value duplicates are handled correctly. + // Phase 2: drain the temp table in small batches. Each iteration is a + // cheap ctid lookup with no further full-table scans. + const uniqueCheck = await knex.raw( + `SELECT indisvalid + FROM pg_index + WHERE indexrelid = ( + SELECT oid FROM pg_class WHERE relname = ? AND relkind = 'i' + ) AND indisunique = true`, + ['search_entity_key_value_idx'], + ); + const needsDedup = !uniqueCheck.rows[0]?.indisvalid; + + if (needsDedup) { + // Phase 1: one scan — populate temp table with all duplicate ctids. + await knex.raw(` + CREATE TEMP TABLE _search_dedup_batch AS + SELECT ctid AS dup_ctid + FROM ( + SELECT ctid, + row_number() OVER (PARTITION BY entity_id, key, value ORDER BY ctid) AS rn + FROM search + ) sub + WHERE rn > 1 + `); + await knex.raw(`CREATE INDEX ON _search_dedup_batch (dup_ctid)`); + + // Phase 2: drain the temp table, deleting from search in small batches. + // The inner CTE deletes a chunk from the temp table and returns the ctids; + // the outer DELETE removes those rows from search. PostgreSQL guarantees + // all data-modifying CTEs execute exactly once regardless of whether + // their output is consumed, so both DELETEs always fire. + for (;;) { + const { rows } = await knex.raw(` + WITH batch AS ( + DELETE FROM _search_dedup_batch + WHERE dup_ctid IN (SELECT dup_ctid FROM _search_dedup_batch LIMIT 10000) + RETURNING dup_ctid + ), + removed AS ( + DELETE FROM search USING batch WHERE search.ctid = batch.dup_ctid + ) + SELECT count(*) AS n FROM batch + `); + if (Number(rows[0].n) === 0) break; + } + + await knex.raw('DROP TABLE IF EXISTS _search_dedup_batch'); + } // Step 2: Create covering indices. Each call is idempotent — it checks // the index state and only does work if needed. From c7706249ba23cc11bb3f0dd35d687360570d3771 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fredrik=20Adel=C3=B6w?= Date: Mon, 11 May 2026 17:07:49 +0200 Subject: [PATCH 09/16] fix(catalog-backend): use explicit null sentinel in dedup keys and add buildEntitySearch dedup tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use `=== null ? '\x01' : value` instead of `?? ''` in both buildEntitySearch and syncSearchRows dedup maps, so that null and empty-string values are treated as distinct keys. In theory an entity could produce both value=null and value='' for the same key (e.g. spec.foo: [null, '']), and the old encoding would silently drop one of the two distinct rows. Also adds two focused unit tests to buildEntitySearch.test.ts: one covering deduplication of duplicate array values (e.g. tags: ['java', 'java', 'Java']), and one confirming that null and empty-string are kept as separate rows. Signed-off-by: Fredrik Adelöw Co-authored-by: Cursor --- .../stitcher/buildEntitySearch.test.ts | 35 +++++++++++++++++++ .../operations/stitcher/buildEntitySearch.ts | 2 +- .../operations/stitcher/syncSearchRows.ts | 10 ++++-- 3 files changed, 43 insertions(+), 4 deletions(-) diff --git a/plugins/catalog-backend/src/database/operations/stitcher/buildEntitySearch.test.ts b/plugins/catalog-backend/src/database/operations/stitcher/buildEntitySearch.test.ts index 5a9e634e35..b4903d3144 100644 --- a/plugins/catalog-backend/src/database/operations/stitcher/buildEntitySearch.test.ts +++ b/plugins/catalog-backend/src/database/operations/stitcher/buildEntitySearch.test.ts @@ -255,6 +255,41 @@ describe('buildEntitySearch', () => { ]); }); + it('deduplicates rows from duplicate array values', () => { + const rows = buildEntitySearch('eid', { + apiVersion: 'a', + kind: 'b', + metadata: { name: 'n', tags: ['java', 'java', 'Java'] }, + }); + // 'java' and 'Java' both normalise to value 'java'; all three occurrences + // should collapse into a single row (first-wins for original_value). + const tagRows = rows.filter(r => r.key === 'metadata.tags'); + expect(tagRows).toHaveLength(1); + expect(tagRows[0]).toEqual( + expect.objectContaining({ value: 'java', original_value: 'java' }), + ); + // The synthetic boolean path key (metadata.tags.java) should also appear + // exactly once. + const boolRows = rows.filter(r => r.key === 'metadata.tags.java'); + expect(boolRows).toHaveLength(1); + }); + + it('treats null and empty-string values as distinct for deduplication', () => { + // An array with both null and '' for the same key must produce two rows + // since value=null and value='' are distinct in the database. + const rows = buildEntitySearch('eid', { + apiVersion: 'a', + kind: 'b', + metadata: { name: 'n' }, + spec: { foo: [null, ''] }, + }); + const fooRows = rows.filter(r => r.key === 'spec.foo'); + expect(fooRows).toHaveLength(2); + const values = fooRows.map(r => r.value); + expect(values).toContain(null); + expect(values).toContain(''); + }); + it('rejects duplicate keys', () => { expect(() => buildEntitySearch('eid', { diff --git a/plugins/catalog-backend/src/database/operations/stitcher/buildEntitySearch.ts b/plugins/catalog-backend/src/database/operations/stitcher/buildEntitySearch.ts index 96b74e19bf..bbee57c84f 100644 --- a/plugins/catalog-backend/src/database/operations/stitcher/buildEntitySearch.ts +++ b/plugins/catalog-backend/src/database/operations/stitcher/buildEntitySearch.ts @@ -235,7 +235,7 @@ export function buildEntitySearch( // violate the unique constraint on (entity_id, key, value). const seen = new Set(); return rows.filter(row => { - const k = `${row.key}\0${row.value ?? ''}`; + const k = `${row.key}\0${row.value === null ? '\x01' : row.value}`; if (seen.has(k)) return false; seen.add(k); return true; diff --git a/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.ts b/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.ts index 17f7bf1135..dce65b4109 100644 --- a/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.ts +++ b/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.ts @@ -50,11 +50,15 @@ export async function syncSearchRows( ): Promise { // Dedup by (key, value) — the UNIQUE constraint on (entity_id, key, value) // rejects duplicates, and the same lowercased value with different original - // casing is semantically a single entry. Keep the last occurrence so that - // if the entity data has e.g. tags: ['Java', 'java'], the later one wins. + // casing is semantically a single entry. Keep the first occurrence, which + // matches the first-wins semantics of buildEntitySearch so that both layers + // consistently pick the same original_value for a given input order. const dedupMap = new Map(); for (const entry of searchEntries) { - dedupMap.set(`${entry.key}\0${entry.value ?? ''}`, entry); + const k = `${entry.key}\0${entry.value === null ? '\x01' : entry.value}`; + if (!dedupMap.has(k)) { + dedupMap.set(k, entry); + } } const deduped = [...dedupMap.values()]; From fbae43b4d05ca0cbe660d13d7ecb442c4a677c76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fredrik=20Adel=C3=B6w?= Date: Mon, 11 May 2026 18:27:11 +0200 Subject: [PATCH 10/16] catalog-backend: use index-only GROUP BY + LATERAL for dedup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the window-function full-table scan with a two-phase approach that leverages search_key_value_entity_idx (key, value, entity_id): Phase 1: GROUP BY entity_id, key, value with HAVING COUNT(*) > 1 resolves as a pure index-only scan (Heap Fetches: 0). Stores only the duplicate groups in a temp table (~8s for a 14M-row table with 700k dupes). Phase 2: CROSS JOIN LATERAL back into search using the same covering index (Nested Loop + Index Scan). row_number() runs per-group over the 2-3 matching rows, so there is no global external sort. A single DELETE removes all extras in one statement (~16s). NULL values get a separate UNION ALL arm so the index equality condition stays usable (value = NULL is always false in SQL). Benchmarked on a 14.4M-row production-like staging master with 700k injected duplicates, post-VACUUM: Old (seq scan + external merge sort): ~101s New (index-only + index scan): ~25s (~4× faster) Clean second run (no dupes, fast path skips dedup): <50ms. Signed-off-by: Fredrik Adelöw Co-authored-by: Cursor --- ...20260510000000_search_indices_and_dedup.js | 151 ++++++++++++------ 1 file changed, 100 insertions(+), 51 deletions(-) diff --git a/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js b/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js index 79ae34d36a..c72c2b3298 100644 --- a/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js +++ b/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js @@ -37,13 +37,19 @@ * The dedup step is skipped entirely if the UNIQUE index already exists and * is valid — a valid unique index guarantees no duplicates can be present. * - * When dedup is needed the migration uses a two-phase approach to avoid - * the O(N) full-table re-scan that a plain window-function CTE would - * require on every batch iteration: + * When dedup is needed the migration uses a two-phase approach that avoids + * a full heap scan by leveraging the pre-existing + * search_key_value_entity_idx (key, value, entity_id) covering index: * - * Phase 1 (one scan): collect all duplicate ctids into a temp table. - * Phase 2 (no re-scan): drain the temp table in small batches using - * cheap ctid lookups — no further full scans. + * Phase 1 (index-only scan): + * GROUP BY entity_id, key, value on the covering index — zero heap + * fetches — to build a small temp table of only the duplicate groups. + * + * Phase 2 (index scan, dup rows only): + * For each duplicate group in the temp table, LATERAL index-scan back + * into search to find the per-group ctids, then DELETE them in one + * statement. Only the ~2× duplicate rows are touched; clean rows are + * never read from the heap. * * ## Recommended: run manually before deploying (large installations) * @@ -53,15 +59,40 @@ * The migration detects that the indices already exist and skips all work, * so startup is instant. * - * -- 1. Remove duplicate search rows (adjust LIMIT / loop as needed for - * -- very large tables to avoid long-running single transactions) - * WITH cte AS ( - * SELECT ctid, - * row_number() OVER (PARTITION BY entity_id, key, value) AS rn + * -- 1. Remove duplicate search rows using the same index-friendly + * -- two-phase strategy used by the migration itself. + * CREATE TEMP TABLE _search_dedup_groups AS + * SELECT entity_id, key, value * FROM search - * ) - * DELETE FROM search USING cte - * WHERE search.ctid = cte.ctid AND cte.rn > 1; + * GROUP BY entity_id, key, value + * HAVING COUNT(*) > 1; + * CREATE INDEX ON _search_dedup_groups (key, value, entity_id); + * + * DELETE FROM search WHERE ctid IN ( + * SELECT s.ctid FROM _search_dedup_groups g + * CROSS JOIN LATERAL ( + * SELECT ctid FROM ( + * SELECT ctid, + * row_number() OVER (ORDER BY ctid) AS rn + * FROM search + * WHERE key = g.key AND entity_id = g.entity_id + * AND value = g.value + * ) sub WHERE rn > 1 + * ) s WHERE g.value IS NOT NULL + * UNION ALL + * SELECT s.ctid FROM _search_dedup_groups g + * CROSS JOIN LATERAL ( + * SELECT ctid FROM ( + * SELECT ctid, + * row_number() OVER (ORDER BY ctid) AS rn + * FROM search + * WHERE key = g.key AND entity_id = g.entity_id + * AND value IS NULL + * ) sub WHERE rn > 1 + * ) s WHERE g.value IS NULL + * ); + * + * DROP TABLE _search_dedup_groups; * * -- 2. Create indices (run each separately) * CREATE UNIQUE INDEX CONCURRENTLY IF NOT EXISTS @@ -155,13 +186,13 @@ async function upPostgres(knex) { // duplicates. Skip dedup entirely — this makes restarts essentially free // for installations that created the index manually beforehand. // - // Slow path (index absent or invalid): use a two-phase approach that scans - // the table exactly once. - // Phase 1: one full scan to collect every duplicate ctid into a temp - // table. The window function's PARTITION BY treats NULLs as - // equal, so NULL-value duplicates are handled correctly. - // Phase 2: drain the temp table in small batches. Each iteration is a - // cheap ctid lookup with no further full-table scans. + // Slow path (index absent or invalid): two-phase approach. + // Phase 1: index-only GROUP BY scan over search_key_value_entity_idx + // (key, value, entity_id) — zero heap fetches — to build a + // temp table of only the duplicate groups. + // Phase 2: LATERAL + index scan to find ctids within each group, then + // a single DELETE. Only the duplicate rows (~2× their count) + // are ever read from the heap; the full table is never scanned. const uniqueCheck = await knex.raw( `SELECT indisvalid FROM pg_index @@ -173,40 +204,58 @@ async function upPostgres(knex) { const needsDedup = !uniqueCheck.rows[0]?.indisvalid; if (needsDedup) { - // Phase 1: one scan — populate temp table with all duplicate ctids. + // Phase 1: index-only GROUP BY scan — no heap fetches. + // search_key_value_entity_idx (key, value, entity_id) covers all three + // dedup columns, so PostgreSQL resolves COUNT(*) without touching the + // heap at all (Heap Fetches: 0 in EXPLAIN). The result is a small temp + // table of only the duplicate (entity_id, key, value) groups. await knex.raw(` - CREATE TEMP TABLE _search_dedup_batch AS - SELECT ctid AS dup_ctid - FROM ( - SELECT ctid, - row_number() OVER (PARTITION BY entity_id, key, value ORDER BY ctid) AS rn - FROM search - ) sub - WHERE rn > 1 + CREATE TEMP TABLE _search_dedup_groups AS + SELECT entity_id, key, value + FROM search + GROUP BY entity_id, key, value + HAVING COUNT(*) > 1 `); - await knex.raw(`CREATE INDEX ON _search_dedup_batch (dup_ctid)`); + await knex.raw( + `CREATE INDEX ON _search_dedup_groups (key, value, entity_id)`, + ); - // Phase 2: drain the temp table, deleting from search in small batches. - // The inner CTE deletes a chunk from the temp table and returns the ctids; - // the outer DELETE removes those rows from search. PostgreSQL guarantees - // all data-modifying CTEs execute exactly once regardless of whether - // their output is consumed, so both DELETEs always fire. - for (;;) { - const { rows } = await knex.raw(` - WITH batch AS ( - DELETE FROM _search_dedup_batch - WHERE dup_ctid IN (SELECT dup_ctid FROM _search_dedup_batch LIMIT 10000) - RETURNING dup_ctid - ), - removed AS ( - DELETE FROM search USING batch WHERE search.ctid = batch.dup_ctid - ) - SELECT count(*) AS n FROM batch - `); - if (Number(rows[0].n) === 0) break; - } + // Phase 2: for each duplicate group, LATERAL-join back into search via + // the covering index (Nested Loop + Index Scan), row_number within that + // tiny per-group result, then DELETE rows where rn > 1. Only the ~2× + // duplicate rows are ever read from the heap; all clean rows are skipped. + // + // NULL values need a separate arm because `value = NULL` is always false + // in SQL — `value IS NULL` is required for the index condition. + await knex.raw(` + DELETE FROM search WHERE ctid IN ( + SELECT s.ctid FROM _search_dedup_groups g + CROSS JOIN LATERAL ( + SELECT ctid FROM ( + SELECT ctid, + row_number() OVER (ORDER BY ctid) AS rn + FROM search + WHERE key = g.key AND entity_id = g.entity_id + AND value = g.value + ) sub WHERE rn > 1 + ) s WHERE g.value IS NOT NULL - await knex.raw('DROP TABLE IF EXISTS _search_dedup_batch'); + UNION ALL + + SELECT s.ctid FROM _search_dedup_groups g + CROSS JOIN LATERAL ( + SELECT ctid FROM ( + SELECT ctid, + row_number() OVER (ORDER BY ctid) AS rn + FROM search + WHERE key = g.key AND entity_id = g.entity_id + AND value IS NULL + ) sub WHERE rn > 1 + ) s WHERE g.value IS NULL + ) + `); + + await knex.raw('DROP TABLE IF EXISTS _search_dedup_groups'); } // Step 2: Create covering indices. Each call is idempotent — it checks From 62c67c752b53e77a81547f08d3694ee0898f4072 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fredrik=20Adel=C3=B6w?= Date: Mon, 11 May 2026 18:41:01 +0200 Subject: [PATCH 11/16] catalog-backend: create covering index before dedup for vanilla installs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The index-only GROUP BY scan in Phase 1 of the dedup requires search_key_value_entity_idx (key, value, entity_id) to exist. Previously it was created after dedup, meaning fresh installs that had never manually run preparatory SQL would fall back to a full sequential scan. Move the ensurePgIndex call for search_key_value_entity_idx to before the dedup step so the fast path is guaranteed for all users, not just those who created the index manually in advance. Signed-off-by: Fredrik Adelöw Co-authored-by: Cursor --- ...20260510000000_search_indices_and_dedup.js | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js b/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js index c72c2b3298..1a35b0f080 100644 --- a/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js +++ b/plugins/catalog-backend/migrations/20260510000000_search_indices_and_dedup.js @@ -179,7 +179,18 @@ exports.config = { transaction: false }; /** @param {import('knex').Knex} knex */ async function upPostgres(knex) { - // Step 1: Remove duplicate search rows. + // Step 1: Ensure the covering index exists before deduplication. + // This non-unique index on (key, value, entity_id) covers all three dedup + // columns, enabling an index-only GROUP BY scan with zero heap fetches in + // Phase 1 of the dedup. Creating it here guarantees this is true even on + // vanilla installations that have never run any preparatory SQL manually. + await ensurePgIndex(knex, { + name: 'search_key_value_entity_idx', + columns: '(key, value, entity_id)', + unique: false, + }); + + // Step 2: Remove duplicate search rows. // // Fast path: if the UNIQUE index already exists and is valid, Postgres has // been enforcing uniqueness since the index was created, so there are no @@ -258,18 +269,14 @@ async function upPostgres(knex) { await knex.raw('DROP TABLE IF EXISTS _search_dedup_groups'); } - // Step 2: Create covering indices. Each call is idempotent — it checks - // the index state and only does work if needed. + // Step 3: Create remaining covering indices. Each call is idempotent — + // it checks the index state and only does work if needed. + // search_key_value_entity_idx was already created in Step 1. await ensurePgIndex(knex, { name: 'search_entity_key_value_idx', columns: '(entity_id, key, value)', unique: true, }); - await ensurePgIndex(knex, { - name: 'search_key_value_entity_idx', - columns: '(key, value, entity_id)', - unique: false, - }); await ensurePgIndex(knex, { name: 'search_facets_covering_idx', columns: '(key, original_value, entity_id)', @@ -277,7 +284,7 @@ async function upPostgres(knex) { unique: false, }); - // Step 3: Drop superseded indices. + // Step 4: Drop superseded indices. await dropPgIndexIfExists(knex, 'search_key_value_idx'); await dropPgIndexIfExists(knex, 'search_key_original_value_idx'); } From 9fa8b118c27c7345e3c6947584692e2a99cc0252 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fredrik=20Adel=C3=B6w?= Date: Mon, 11 May 2026 18:42:24 +0200 Subject: [PATCH 12/16] catalog-backend: fix syncSearchRows test to expect first-wins semantics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the 'keeps one row when original_value casing differs' test to expect original_value: 'V' (first occurrence) instead of 'v' (last occurrence), matching the first-wins dedup semantics that were aligned with buildEntitySearch in a prior commit. Signed-off-by: Fredrik Adelöw Co-authored-by: Cursor --- .../src/database/operations/stitcher/syncSearchRows.test.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts b/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts index 9733f4361b..9cc2af8206 100644 --- a/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts +++ b/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts @@ -193,14 +193,15 @@ describe.each(databases.eachSupportedId())('syncSearchRows, %p', databaseId => { it('keeps one row when original_value casing differs for same key+value', async () => { // Two entries with the same lowercased (key, value) but different // original_value casing are deduplicated — the UNIQUE constraint on - // (entity_id, key, value) allows only one. The last occurrence wins. + // (entity_id, key, value) allows only one. The first occurrence wins, + // matching the first-wins semantics of buildEntitySearch. await syncSearchRows(knex, 'e1', [row('a', 'v', 'V')]); await syncSearchRows(knex, 'e1', [row('a', 'v', 'V'), row('a', 'v', 'v')]); const rows = await getSearchRows(); expect(rows).toHaveLength(1); expect(rows[0]).toEqual( - expect.objectContaining({ key: 'a', value: 'v', original_value: 'v' }), + expect.objectContaining({ key: 'a', value: 'v', original_value: 'V' }), ); }); From befdb6b31e0a2af68fdd282e696a818cfe750da8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fredrik=20Adel=C3=B6w?= Date: Mon, 11 May 2026 18:50:49 +0200 Subject: [PATCH 13/16] catalog-backend: add migration test for search_indices_and_dedup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tests two scenarios across all supported databases: Preconditions NOT met (dedup runs): inserts five rows with two duplicate (entity_id, key, value) pairs (one with null value), runs the migration, and verifies that duplicates are removed, null values are handled correctly, and the unique constraint is enforced post-migration. The down migration is verified to drop the constraint so duplicates can be inserted again. Preconditions met (PostgreSQL only, dedup skipped): pre-creates the unique index to simulate a user who ran the manual SQL before deploying, re-runs the migration, and verifies the row count is unchanged — confirming the fast path fires correctly. Non-PostgreSQL databases skip the fast-path test with an early return since the pg_index check is PostgreSQL-specific. Signed-off-by: Fredrik Adelöw Co-authored-by: Cursor --- .../src/tests/migrations.test.ts | 158 ++++++++++++++++++ 1 file changed, 158 insertions(+) diff --git a/plugins/catalog-backend/src/tests/migrations.test.ts b/plugins/catalog-backend/src/tests/migrations.test.ts index 6e067a8126..4b1153f3c6 100644 --- a/plugins/catalog-backend/src/tests/migrations.test.ts +++ b/plugins/catalog-backend/src/tests/migrations.test.ts @@ -1094,6 +1094,164 @@ describe('migrations', () => { }, ); + it.each(databases.eachSupportedId())( + '20260510000000_search_indices_and_dedup.js, %p', + async databaseId => { + const knex = await databases.init(databaseId); + + await migrateUntilBefore( + knex, + '20260510000000_search_indices_and_dedup.js', + ); + + // Set up FK dependencies: search → final_entities → refresh_state + await knex('refresh_state').insert({ + entity_id: 'e1', + entity_ref: 'k:ns/n1', + unprocessed_entity: '{}', + errors: '[]', + next_update_at: knex.fn.now(), + last_discovery_at: knex.fn.now(), + }); + await knex('final_entities').insert({ + entity_id: 'e1', + entity_ref: 'k:ns/n1', + hash: 'h1', + final_entity: '{}', + }); + + // Insert search rows with duplicates on (entity_id, key, value). + // Both copies of k1 share the same original_value so the surviving row + // is unambiguous across all database dedup strategies. + // The two null-value rows verify that null dedup does not conflate null + // with non-null or remove more rows than it should. + await knex('search').insert([ + { entity_id: 'e1', key: 'k1', value: 'v1', original_value: 'v1' }, // first copy + { entity_id: 'e1', key: 'k1', value: 'v1', original_value: 'v1' }, // duplicate — removed + { entity_id: 'e1', key: 'k2', value: null, original_value: null }, // null first — kept + { entity_id: 'e1', key: 'k2', value: null, original_value: null }, // null duplicate — removed + { entity_id: 'e1', key: 'k3', value: 'v3', original_value: 'v3' }, // unique — kept + ]); + + // Preconditions NOT met: dedup runs + await migrateUpOnce(knex); + + // 5 rows collapsed to 3 unique (entity_id, key, value) combinations + const rows = await knex('search').orderBy('key'); + expect(rows).toHaveLength(3); + expect(rows.map(r => r.key)).toEqual(['k1', 'k2', 'k3']); + expect(rows[0]).toEqual( + expect.objectContaining({ + key: 'k1', + value: 'v1', + original_value: 'v1', + }), + ); + // null-value row survives correctly + expect(rows[1]).toEqual( + expect.objectContaining({ + key: 'k2', + value: null, + original_value: null, + }), + ); + + // Unique constraint is now in place + await expect( + knex('search').insert({ + entity_id: 'e1', + key: 'k1', + value: 'v1', + original_value: 'extra', + }), + ).rejects.toEqual(expect.anything()); + + await migrateDownOnce(knex); + + // After rollback the unique constraint is gone — duplicates are allowed again + await expect( + knex('search').insert({ + entity_id: 'e1', + key: 'k1', + value: 'v1', + original_value: 'extra', + }), + ).resolves.not.toThrow(); + + await knex.destroy(); + }, + ); + + it.each(databases.eachSupportedId())( + '20260510000000_search_indices_and_dedup.js preconditions met (PG fast path), %p', + async databaseId => { + const knex = await databases.init(databaseId); + + // The fast path (skip dedup when unique index pre-exists) is a + // PostgreSQL-only code path that checks pg_index. + if (!knex.client.config.client.includes('pg')) { + await knex.destroy(); + return; + } + + await migrateUntilBefore( + knex, + '20260510000000_search_indices_and_dedup.js', + ); + + await knex('refresh_state').insert({ + entity_id: 'e1', + entity_ref: 'k:ns/n1', + unprocessed_entity: '{}', + errors: '[]', + next_update_at: knex.fn.now(), + last_discovery_at: knex.fn.now(), + }); + await knex('final_entities').insert({ + entity_id: 'e1', + entity_ref: 'k:ns/n1', + hash: 'h1', + final_entity: '{}', + }); + + // Insert clean rows (no duplicates) so the unique index can be created + await knex('search').insert([ + { entity_id: 'e1', key: 'k1', value: 'v1', original_value: 'V1' }, + { entity_id: 'e1', key: 'k2', value: null, original_value: null }, + ]); + + // Simulate a user who manually deduped and created the unique index + // before deploying this version + await knex.raw( + `CREATE UNIQUE INDEX search_entity_key_value_idx ON search (entity_id, key, value)`, + ); + + // Migration detects the existing unique index and skips the dedup step + await migrateUpOnce(knex); + + // Row count unchanged — no rows were removed by dedup + const rows = await knex('search').orderBy('key'); + expect(rows).toHaveLength(2); + expect(rows[0]).toEqual( + expect.objectContaining({ + key: 'k1', + value: 'v1', + original_value: 'V1', + }), + ); + expect(rows[1]).toEqual( + expect.objectContaining({ + key: 'k2', + value: null, + original_value: null, + }), + ); + + await migrateDownOnce(knex); + await knex.destroy(); + }, + ); + it.each(databases.eachSupportedId())( '20260403000000_add_location_entity_ref.js, %p', async databaseId => { From cdf38bd8db86e24ac649bc45d580c15d090fbf12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fredrik=20Adel=C3=B6w?= Date: Mon, 11 May 2026 20:05:47 +0200 Subject: [PATCH 14/16] catalog-backend: fix syncSearchRows test to use real call path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the raw Knex onConflict().merge() insert in the 'overwrites original_value on conflict' test with a direct UPDATE to corrupt the stored original_value, followed by a second syncSearchRows call. This tests the actual application code path (ON CONFLICT DO UPDATE inside syncSearchRows) rather than embedding the conflict logic in the test itself. Signed-off-by: Fredrik Adelöw Co-authored-by: Cursor --- .../stitcher/syncSearchRows.test.ts | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts b/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts index 9cc2af8206..6981af177b 100644 --- a/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts +++ b/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts @@ -245,21 +245,24 @@ describe.each(databases.eachSupportedId())('syncSearchRows, %p', databaseId => { ); }); - it('overwrites original_value on conflict via ON CONFLICT DO UPDATE', async () => { + it('restores original_value when re-syncing after it was corrupted', async () => { await syncSearchRows(knex, 'e1', [row('a', 'x', 'X')]); - // Simulate a concurrent stitcher inserting the same (entity_id, key, value) - // with a different original_value casing. Unlike DO NOTHING, DO UPDATE - // requires an explicit conflict target — the column list is not optional. - await knex('search') - .insert({ entity_id: 'e1', key: 'a', value: 'x', original_value: 'x' }) - .onConflict(['entity_id', 'key', 'value']) - .merge(['original_value']); + // Corrupt the stored original_value (simulates stale or wrong data left + // by a previous stitcher run) without changing the key or value. + await knex('search') + .where({ entity_id: 'e1', key: 'a', value: 'x' }) + .update({ original_value: 'corrupted' }); + + // Re-syncing the same desired rows should overwrite original_value back to + // 'X' via the ON CONFLICT DO UPDATE SET original_value = EXCLUDED.original_value + // clause inside syncSearchRows. + await syncSearchRows(knex, 'e1', [row('a', 'x', 'X')]); const rows = await getSearchRows(); expect(rows).toHaveLength(1); expect(rows[0]).toEqual( - expect.objectContaining({ key: 'a', value: 'x', original_value: 'x' }), + expect.objectContaining({ key: 'a', value: 'x', original_value: 'X' }), ); }); From 50b5cbd226e7dc1a49867136b16fe24364476759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fredrik=20Adel=C3=B6w?= Date: Mon, 11 May 2026 20:20:09 +0200 Subject: [PATCH 15/16] catalog-backend: remove redundant direct-duplicate-insert test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'silently rejects a direct duplicate insert' test inserted a row via raw Knex onConflict().ignore(), which has no corresponding production code path (syncSearchRows uses ON CONFLICT DO UPDATE, not DO NOTHING). The idempotency behaviour it was intended to verify is already covered by 'leaves unchanged rows untouched'. Signed-off-by: Fredrik Adelöw Co-authored-by: Cursor --- .../stitcher/syncSearchRows.test.ts | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts b/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts index 6981af177b..5a46608e7d 100644 --- a/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts +++ b/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.test.ts @@ -225,26 +225,6 @@ describe.each(databases.eachSupportedId())('syncSearchRows, %p', databaseId => { expect(rows.map(r => r.value).sort()).toEqual(['java', 'python', 'rust']); }); - it('silently rejects a direct duplicate insert via the UNIQUE constraint', async () => { - await syncSearchRows(knex, 'e1', [row('a', 'x'), row('b', 'y')]); - - // Simulate a concurrent process inserting a duplicate directly — - // the UNIQUE constraint on (entity_id, key, value) should reject it - // silently. ON CONFLICT without a column list covers any unique - // constraint on the table, which is safe since (entity_id, key, value) - // is the only one. - await knex('search') - .insert({ entity_id: 'e1', key: 'a', value: 'x', original_value: 'x' }) - .onConflict() - .ignore(); - - const rows = await getSearchRows(); - expect(rows).toHaveLength(2); - expect(rows.find(r => r.key === 'a')).toEqual( - expect.objectContaining({ key: 'a', value: 'x' }), - ); - }); - it('restores original_value when re-syncing after it was corrupted', async () => { await syncSearchRows(knex, 'e1', [row('a', 'x', 'X')]); From 0305dc2c1ecf103b1160a139cb685d3f5b2aedc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fredrik=20Adel=C3=B6w?= Date: Tue, 12 May 2026 09:19:23 +0200 Subject: [PATCH 16/16] fix(catalog-backend): use shared NULL_SENTINEL constant in dedup keys MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move NULL_SENTINEL to util.ts and import it in buildEntitySearch and syncSearchRows instead of hardcoding '\x01'. Keeps the dedup keys consistent with filterSentinelValues and the SQL COALESCE(…, chr(1)) logic, avoiding drift if the sentinel ever changes. Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Fredrik Adelöw --- .../operations/stitcher/buildEntitySearch.ts | 3 ++- .../database/operations/stitcher/syncSearchRows.ts | 14 ++++---------- .../src/database/operations/stitcher/util.ts | 6 ++++++ 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/plugins/catalog-backend/src/database/operations/stitcher/buildEntitySearch.ts b/plugins/catalog-backend/src/database/operations/stitcher/buildEntitySearch.ts index bbee57c84f..341420655d 100644 --- a/plugins/catalog-backend/src/database/operations/stitcher/buildEntitySearch.ts +++ b/plugins/catalog-backend/src/database/operations/stitcher/buildEntitySearch.ts @@ -17,6 +17,7 @@ import { DEFAULT_NAMESPACE, Entity } from '@backstage/catalog-model'; import { InputError } from '@backstage/errors'; import { DbSearchRow } from '../../tables'; +import { NULL_SENTINEL } from './util'; // These are excluded in the generic loop, either because they do not make sense // to index, or because they are special-case always inserted whether they are @@ -235,7 +236,7 @@ export function buildEntitySearch( // violate the unique constraint on (entity_id, key, value). const seen = new Set(); return rows.filter(row => { - const k = `${row.key}\0${row.value === null ? '\x01' : row.value}`; + const k = `${row.key}\0${row.value === null ? NULL_SENTINEL : row.value}`; if (seen.has(k)) return false; seen.add(k); return true; diff --git a/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.ts b/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.ts index dce65b4109..1dde84c017 100644 --- a/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.ts +++ b/plugins/catalog-backend/src/database/operations/stitcher/syncSearchRows.ts @@ -16,15 +16,7 @@ import { Knex } from 'knex'; import { DbSearchRow } from '../../tables'; -import { BATCH_SIZE } from './util'; - -// The Postgres sync uses COALESCE(x, NULL_SENTINEL) to allow Postgres to -// include nullable columns in the Hash Cond of anti-joins (IS NOT DISTINCT -// FROM prevents this). As a consequence, values that are exactly this -// sentinel character are not searchable — they would be treated as NULL. -// This is the SOH (Start of Heading) control character which does not -// appear in real entity metadata. -const NULL_SENTINEL = '\x01'; +import { BATCH_SIZE, NULL_SENTINEL } from './util'; function filterSentinelValues(entries: DbSearchRow[]): DbSearchRow[] { return entries.filter( @@ -55,7 +47,9 @@ export async function syncSearchRows( // consistently pick the same original_value for a given input order. const dedupMap = new Map(); for (const entry of searchEntries) { - const k = `${entry.key}\0${entry.value === null ? '\x01' : entry.value}`; + const k = `${entry.key}\0${ + entry.value === null ? NULL_SENTINEL : entry.value + }`; if (!dedupMap.has(k)) { dedupMap.set(k, entry); } diff --git a/plugins/catalog-backend/src/database/operations/stitcher/util.ts b/plugins/catalog-backend/src/database/operations/stitcher/util.ts index 4a2dabcfa6..bac099514a 100644 --- a/plugins/catalog-backend/src/database/operations/stitcher/util.ts +++ b/plugins/catalog-backend/src/database/operations/stitcher/util.ts @@ -24,6 +24,12 @@ import stableStringify from 'fast-json-stable-stringify'; // enough to get the speed benefits. export const BATCH_SIZE = 50; +// The SOH (Start of Heading) control character, used as a stand-in for NULL +// in contexts where NULL cannot participate in equality comparisons (SQL +// COALESCE, JS dedup keys). It cannot appear in real entity metadata values +// since they are human-readable strings. +export const NULL_SENTINEL = '\x01'; + export function generateStableHash(entity: Entity) { return createHash('sha1') .update(stableStringify({ ...entity }))