Add support for pattern replace filter in normalizers (#96588)

This change adds support for using `pattern_replace` token filters in custom normalizers.

Closes #83005
This commit is contained in:
Marantidis Kiriakos 2023-06-10 01:32:39 +03:00 committed by GitHub
parent 4df6911ec7
commit a8cf4d6006
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 90 additions and 2 deletions

View file

@ -0,0 +1,5 @@
pr: 96588
summary: Support for pattern_replace filter in keyword normalizer
area: Search
type: enhancement
issues: []

View file

@ -9,7 +9,7 @@ allowed, but not a stemming filter, which needs to look at the keyword as a
whole. The current list of filters that can be used in a normalizer is
following: `arabic_normalization`, `asciifolding`, `bengali_normalization`,
`cjk_width`, `decimal_digit`, `elision`, `german_normalization`,
`hindi_normalization`, `indic_normalization`, `lowercase`, `pattern_replace`,
`persian_normalization`, `scandinavian_folding`, `serbian_normalization`,
`sorani_normalization`, `uppercase`.

View file

@ -15,10 +15,11 @@ import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.NormalizingTokenFilterFactory;

import java.util.regex.Pattern;

public class PatternReplaceTokenFilterFactory extends AbstractTokenFilterFactory implements NormalizingTokenFilterFactory {

    private final Pattern pattern;
    private final String replacement;

View file

@ -0,0 +1,41 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
package org.elasticsearch.analysis.common;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.ESTokenStreamTestCase;
import java.io.IOException;
public class PatternReplaceTokenFilterTests extends ESTokenStreamTestCase {

    /**
     * Verifies that a {@code pattern_replace} token filter can be used inside a
     * custom normalizer: a filter configured to strip runs of zeros must be
     * applied both on the token-stream path and on the byte-normalization path.
     */
    public void testNormalizer() throws IOException {
        // Custom normalizer "my_normalizer" backed by a pattern_replace filter
        // that removes every run of '0' characters.
        Settings normalizerSettings = Settings.builder()
            .putList("index.analysis.normalizer.my_normalizer.filter", "replace_zeros")
            .put("index.analysis.filter.replace_zeros.type", "pattern_replace")
            .put("index.analysis.filter.replace_zeros.pattern", "0+")
            .put("index.analysis.filter.replace_zeros.replacement", "")
            .put("index.analysis.filter.replace_zeros.all", true)
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(normalizerSettings, new CommonAnalysisPlugin());

        // It must be registered as a normalizer, not as a regular analyzer.
        assertNull(analysis.indexAnalyzers.get("my_normalizer"));
        NamedAnalyzer myNormalizer = analysis.indexAnalyzers.getNormalizer("my_normalizer");
        assertNotNull(myNormalizer);
        assertEquals("my_normalizer", myNormalizer.name());

        // Leading zeros are stripped by the filter on both code paths.
        assertTokenStreamContents(myNormalizer.tokenStream("foo", "0000111"), new String[] { "111" });
        assertEquals(new BytesRef("111"), myNormalizer.normalize("foo", "0000111"));
    }
}

View file

@ -1683,3 +1683,44 @@
- length: { tokens: 6 }
- match: { tokens.0.token: the }
- match: { tokens.1.token: THE }
---
"pattern_replace_filter":
  - do:
      indices.create:
        index: test
        body:
          settings:
            analysis:
              normalizer:
                my_normalizer:
                  type: custom
                  filter: ["replace_zeros"]
              filter:
                replace_zeros:
                  type: pattern_replace
                  pattern: "0+"
                  replacement: ""
                  all: true
          mappings:
            properties:
              pagerank:
                type: keyword
                normalizer: my_normalizer
  - do:
      index:
        index: test
        id: "1"
        body: { pagerank: "000000111"}
  - do:
      indices.refresh:
        index: [ test ]
  - do:
      search:
        index: test
        q: pagerank:111
  - match: {hits.total.value: 1}