Changing tokenizer separator for search indexing
Signed-off-by: Jordan Snow <jordans@spotify.com>
This commit is contained in:
@@ -0,0 +1,5 @@
|
||||
---
|
||||
'@backstage/plugin-search-backend-node': minor
|
||||
---
|
||||
|
||||
Improved the way the search tokenizer breaks apart entity names
|
||||
@@ -106,4 +106,26 @@ describe('LunrSearchEngineIndexer', () => {
|
||||
...[lunr.trimmer, lunr.stopWordFilter, lunr.stemmer],
|
||||
);
|
||||
});
|
||||
|
||||
// Checks that lunr.tokenizer splits a string on underscores, punctuation,
// hyphens and whitespace, lower-cases every token, and keeps non-ASCII
// letters (CJK, Latin with diacritics, Arabic) intact as tokens.
// NOTE(review): this presumably relies on the tokenizer separator having been
// configured by an indexer instance created in the suite setup, which is not
// visible in this hunk — confirm.
it('should tokenize input on non-alphanumeric characters', () => {
|
||||
const input =
|
||||
'Tokenize_test string, entity-name. Unicode support also!三 stjärna عربي';
|
||||
const expectedTokens = [
|
||||
'tokenize',
|
||||
'test',
|
||||
'string',
|
||||
'entity',
|
||||
'name',
|
||||
'unicode',
|
||||
'support',
|
||||
'also',
|
||||
'三',
|
||||
'stjärna',
|
||||
'عربي',
|
||||
];
|
||||
|
||||
// Convert each returned token to a plain string so the result can be
// compared against the string array above.
const tokens = lunr.tokenizer(input).map(token => token.toString());
|
||||
|
||||
expect(tokens).toEqual(expectedTokens);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -29,8 +29,8 @@ export class LunrSearchEngineIndexer extends BatchSearchEngineIndexer {
|
||||
|
||||
constructor() {
|
||||
super({ batchSize: 1000 });
|
||||
|
||||
this.builder = new lunr.Builder();
|
||||
this.builder.tokenizer.separator = /[^\p{L}\p{N}]+/u;
|
||||
this.builder.pipeline.add(lunr.trimmer, lunr.stopWordFilter, lunr.stemmer);
|
||||
this.builder.searchPipeline.add(lunr.stemmer);
|
||||
this.builder.metadataWhitelist = ['position'];
|
||||
|
||||
Reference in New Issue
Block a user