diff --git a/.changeset/chilly-hotels-walk.md b/.changeset/chilly-hotels-walk.md new file mode 100644 index 0000000000..e9b1ff66ba --- /dev/null +++ b/.changeset/chilly-hotels-walk.md @@ -0,0 +1,5 @@ +--- +'@backstage/plugin-search-backend-node': minor +--- + +Improved how the search tokenizer breaks apart entity names: it now splits on any non-alphanumeric character diff --git a/plugins/search-backend-node/src/engines/LunrSearchEngineIndexer.test.ts b/plugins/search-backend-node/src/engines/LunrSearchEngineIndexer.test.ts index ff76f0fa7c..4eb7ef9227 100644 --- a/plugins/search-backend-node/src/engines/LunrSearchEngineIndexer.test.ts +++ b/plugins/search-backend-node/src/engines/LunrSearchEngineIndexer.test.ts @@ -106,4 +106,26 @@ describe('LunrSearchEngineIndexer', () => { ...[lunr.trimmer, lunr.stopWordFilter, lunr.stemmer], ); }); + + it('should tokenize input on non-alphanumeric characters', () => { + const input = + 'Tokenize_test string, entity-name. Unicode support also!三 stjärna عربي'; + const expectedTokens = [ + 'tokenize', + 'test', + 'string', + 'entity', + 'name', + 'unicode', + 'support', + 'also', + '三', + 'stjärna', + 'عربي', + ]; + + const tokens = lunr.tokenizer(input).map(token => token.toString()); + + expect(tokens).toEqual(expectedTokens); + }); }); diff --git a/plugins/search-backend-node/src/engines/LunrSearchEngineIndexer.ts b/plugins/search-backend-node/src/engines/LunrSearchEngineIndexer.ts index e723957077..ac7f7b1c0d 100644 --- a/plugins/search-backend-node/src/engines/LunrSearchEngineIndexer.ts +++ b/plugins/search-backend-node/src/engines/LunrSearchEngineIndexer.ts @@ -29,8 +29,8 @@ export class LunrSearchEngineIndexer extends BatchSearchEngineIndexer { constructor() { super({ batchSize: 1000 }); - this.builder = new lunr.Builder(); + this.builder.tokenizer.separator = /[^\p{L}\p{N}]+/u; this.builder.pipeline.add(lunr.trimmer, lunr.stopWordFilter, lunr.stemmer); this.builder.searchPipeline.add(lunr.stemmer); this.builder.metadataWhitelist = ['position'];