Add support for pattern replace filter in normalizers (#96588)

This change adds support for using `pattern_replace` token filters in custom normalizers.

Closes #83005
This commit is contained in:
Marantidis Kiriakos 2023-06-10 01:32:39 +03:00 committed by GitHub
parent 4df6911ec7
commit a8cf4d6006
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 90 additions and 2 deletions

View file

@ -0,0 +1,5 @@
pr: 96588
summary: Support for pattern_replace filter in keyword normalizer
area: Search
type: enhancement
issues: []

View file

@ -9,7 +9,7 @@ allowed, but not a stemming filter, which needs to look at the keyword as a
whole. The current list of filters that can be used in a normalizer is
following: `arabic_normalization`, `asciifolding`, `bengali_normalization`,
`cjk_width`, `decimal_digit`, `elision`, `german_normalization`,
`hindi_normalization`, `indic_normalization`, `lowercase`, `pattern_replace`,
`persian_normalization`, `scandinavian_folding`, `serbian_normalization`,
`sorani_normalization`, `uppercase`.

View file

@ -15,10 +15,11 @@ import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.NormalizingTokenFilterFactory;

import java.util.regex.Pattern;

public class PatternReplaceTokenFilterFactory extends AbstractTokenFilterFactory implements NormalizingTokenFilterFactory {

    private final Pattern pattern;
    private final String replacement;

View file

@ -0,0 +1,41 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
package org.elasticsearch.analysis.common;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.ESTokenStreamTestCase;
import java.io.IOException;
public class PatternReplaceTokenFilterTests extends ESTokenStreamTestCase {

    /**
     * Verifies that a {@code pattern_replace} token filter can be used inside a
     * custom normalizer: a filter configured to strip runs of zeros must be
     * applied both on the token-stream path and on the byte-normalization path.
     */
    public void testNormalizer() throws IOException {
        // Custom normalizer "my_normalizer" backed by a pattern_replace filter
        // that removes every run of '0' characters.
        Settings normalizerSettings = Settings.builder()
            .putList("index.analysis.normalizer.my_normalizer.filter", "replace_zeros")
            .put("index.analysis.filter.replace_zeros.type", "pattern_replace")
            .put("index.analysis.filter.replace_zeros.pattern", "0+")
            .put("index.analysis.filter.replace_zeros.replacement", "")
            .put("index.analysis.filter.replace_zeros.all", true)
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(normalizerSettings, new CommonAnalysisPlugin());

        // It must be registered as a normalizer, not as a regular analyzer.
        assertNull(analysis.indexAnalyzers.get("my_normalizer"));
        NamedAnalyzer myNormalizer = analysis.indexAnalyzers.getNormalizer("my_normalizer");
        assertNotNull(myNormalizer);
        assertEquals("my_normalizer", myNormalizer.name());

        // Leading zeros are stripped by the filter on both code paths.
        assertTokenStreamContents(myNormalizer.tokenStream("foo", "0000111"), new String[] { "111" });
        assertEquals(new BytesRef("111"), myNormalizer.normalize("foo", "0000111"));
    }
}

View file

@ -1683,3 +1683,44 @@
- length: { tokens: 6 }
- match: { tokens.0.token: the }
- match: { tokens.1.token: THE }
---
"pattern_replace_filter":
  - do:
      indices.create:
        index: test
        body:
          settings:
            analysis:
              normalizer:
                my_normalizer:
                  type: custom
                  filter: ["replace_zeros"]
              filter:
                replace_zeros:
                  type: pattern_replace
                  pattern: "0+"
                  replacement: ""
                  all: true
          mappings:
            properties:
              pagerank:
                type: keyword
                normalizer: my_normalizer
  - do:
      index:
        index: test
        id: "1"
        body: { pagerank: "000000111"}
  - do:
      indices.refresh:
        index: [ test ]
  - do:
      search:
        index: test
        q: pagerank:111
  - match: {hits.total.value: 1}