Changing tokenizer separator for search indexing

Signed-off-by: Jordan Snow <jordans@spotify.com>
This commit is contained in:
Jordan Snow
2025-11-20 16:23:32 -05:00
parent dd54fffedc
commit 4d3ddb99b6
3 changed files with 28 additions and 1 deletions
+5
View File
@@ -0,0 +1,5 @@
---
'@backstage/plugin-search-backend-node': minor
---
Improved how the search tokenizer breaks apart entity names: text is now split on any run of characters that are not Unicode letters or numbers.
@@ -106,4 +106,26 @@ describe('LunrSearchEngineIndexer', () => {
...[lunr.trimmer, lunr.stopWordFilter, lunr.stemmer],
);
});
it('should tokenize input on non-alphanumeric characters', () => {
  // Sample text mixing underscore, comma, hyphen, period, exclamation mark and
  // whitespace delimiters with Latin, CJK, accented and Arabic words.
  const sampleText =
    'Tokenize_test string, entity-name. Unicode support also!三 stjärna عربي';

  // NOTE(review): this exercises the module-level `lunr.tokenizer`, not an
  // indexer's builder. Presumably it only sees the custom separator because an
  // indexer has been constructed before this test runs — confirm against the
  // surrounding setup.
  const actualTokens = [];
  for (const token of lunr.tokenizer(sampleText)) {
    actualTokens.push(token.toString());
  }

  // Expected: one lower-cased token per word, with every delimiter dropped.
  expect(actualTokens).toEqual([
    'tokenize',
    'test',
    'string',
    'entity',
    'name',
    'unicode',
    'support',
    'also',
    '三',
    'stjärna',
    'عربي',
  ]);
});
});
@@ -29,8 +29,8 @@ export class LunrSearchEngineIndexer extends BatchSearchEngineIndexer {
constructor() {
super({ batchSize: 1000 });
this.builder = new lunr.Builder();
this.builder.tokenizer.separator = /[^\p{L}\p{N}]+/u;
this.builder.pipeline.add(lunr.trimmer, lunr.stopWordFilter, lunr.stemmer);
this.builder.searchPipeline.add(lunr.stemmer);
this.builder.metadataWhitelist = ['position'];