mirror of
https://github.com/elastic/elasticsearch.git
synced 2025-06-29 01:44:36 -04:00
Add support for pattern replace filter in normalizers (#96588)
This change adds support for using `pattern_replace` token filters in custom normalizers. Closes #83005
This commit is contained in:
parent
4df6911ec7
commit
a8cf4d6006
5 changed files with 90 additions and 2 deletions
5
docs/changelog/96588.yaml
Normal file
5
docs/changelog/96588.yaml
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
pr: 96588
|
||||||
|
summary: Support for pattern_replace filter in keyword normalizer
|
||||||
|
area: Search
|
||||||
|
type: enhancement
|
||||||
|
issues: []
|
|
@ -9,7 +9,7 @@ allowed, but not a stemming filter, which needs to look at the keyword as a
|
||||||
whole. The current list of filters that can be used in a normalizer is
|
whole. The current list of filters that can be used in a normalizer is
|
||||||
following: `arabic_normalization`, `asciifolding`, `bengali_normalization`,
|
following: `arabic_normalization`, `asciifolding`, `bengali_normalization`,
|
||||||
`cjk_width`, `decimal_digit`, `elision`, `german_normalization`,
|
`cjk_width`, `decimal_digit`, `elision`, `german_normalization`,
|
||||||
`hindi_normalization`, `indic_normalization`, `lowercase`,
|
`hindi_normalization`, `indic_normalization`, `lowercase`, `pattern_replace`,
|
||||||
`persian_normalization`, `scandinavian_folding`, `serbian_normalization`,
|
`persian_normalization`, `scandinavian_folding`, `serbian_normalization`,
|
||||||
`sorani_normalization`, `uppercase`.
|
`sorani_normalization`, `uppercase`.
|
||||||
|
|
||||||
|
|
|
@ -15,10 +15,11 @@ import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.env.Environment;
|
import org.elasticsearch.env.Environment;
|
||||||
import org.elasticsearch.index.IndexSettings;
|
import org.elasticsearch.index.IndexSettings;
|
||||||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
||||||
|
import org.elasticsearch.index.analysis.NormalizingTokenFilterFactory;
|
||||||
|
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
public class PatternReplaceTokenFilterFactory extends AbstractTokenFilterFactory {
|
public class PatternReplaceTokenFilterFactory extends AbstractTokenFilterFactory implements NormalizingTokenFilterFactory {
|
||||||
|
|
||||||
private final Pattern pattern;
|
private final Pattern pattern;
|
||||||
private final String replacement;
|
private final String replacement;
|
||||||
|
|
|
@ -0,0 +1,41 @@
|
||||||
|
/*
|
||||||
|
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||||
|
* or more contributor license agreements. Licensed under the Elastic License
|
||||||
|
* 2.0 and the Server Side Public License, v 1; you may not use this file except
|
||||||
|
* in compliance with, at your election, the Elastic License 2.0 or the Server
|
||||||
|
* Side Public License, v 1.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.analysis.common;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
import org.elasticsearch.env.Environment;
|
||||||
|
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
|
||||||
|
import org.elasticsearch.index.analysis.NamedAnalyzer;
|
||||||
|
import org.elasticsearch.test.ESTestCase;
|
||||||
|
import org.elasticsearch.test.ESTokenStreamTestCase;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
public class PatternReplaceTokenFilterTests extends ESTokenStreamTestCase {
|
||||||
|
|
||||||
|
public void testNormalizer() throws IOException {
|
||||||
|
Settings settings = Settings.builder()
|
||||||
|
.putList("index.analysis.normalizer.my_normalizer.filter", "replace_zeros")
|
||||||
|
.put("index.analysis.filter.replace_zeros.type", "pattern_replace")
|
||||||
|
.put("index.analysis.filter.replace_zeros.pattern", "0+")
|
||||||
|
.put("index.analysis.filter.replace_zeros.replacement", "")
|
||||||
|
.put("index.analysis.filter.replace_zeros.all", true)
|
||||||
|
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||||
|
.build();
|
||||||
|
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
|
||||||
|
assertNull(analysis.indexAnalyzers.get("my_normalizer"));
|
||||||
|
NamedAnalyzer normalizer = analysis.indexAnalyzers.getNormalizer("my_normalizer");
|
||||||
|
assertNotNull(normalizer);
|
||||||
|
assertEquals("my_normalizer", normalizer.name());
|
||||||
|
assertTokenStreamContents(normalizer.tokenStream("foo", "0000111"), new String[] { "111" });
|
||||||
|
assertEquals(new BytesRef("111"), normalizer.normalize("foo", "0000111"));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1683,3 +1683,44 @@
|
||||||
- length: { tokens: 6 }
|
- length: { tokens: 6 }
|
||||||
- match: { tokens.0.token: the }
|
- match: { tokens.0.token: the }
|
||||||
- match: { tokens.1.token: THE }
|
- match: { tokens.1.token: THE }
|
||||||
|
|
||||||
|
---
|
||||||
|
"pattern_replace_filter":
|
||||||
|
- do:
|
||||||
|
indices.create:
|
||||||
|
index: test
|
||||||
|
body:
|
||||||
|
settings:
|
||||||
|
analysis:
|
||||||
|
normalizer:
|
||||||
|
my_normalizer:
|
||||||
|
type: custom
|
||||||
|
filter: ["replace_zeros"]
|
||||||
|
filter:
|
||||||
|
replace_zeros:
|
||||||
|
type: pattern_replace
|
||||||
|
pattern: "0+"
|
||||||
|
replacement: ""
|
||||||
|
all: true
|
||||||
|
mappings:
|
||||||
|
properties:
|
||||||
|
pagerank:
|
||||||
|
type: keyword
|
||||||
|
normalizer: my_normalizer
|
||||||
|
|
||||||
|
- do:
|
||||||
|
index:
|
||||||
|
index: test
|
||||||
|
id: "1"
|
||||||
|
body: { pagerank: "000000111"}
|
||||||
|
|
||||||
|
- do:
|
||||||
|
indices.refresh:
|
||||||
|
index: [ test ]
|
||||||
|
|
||||||
|
- do:
|
||||||
|
search:
|
||||||
|
index: test
|
||||||
|
q: pagerank:111
|
||||||
|
|
||||||
|
- match: {hits.total.value: 1}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue