mirror of
https://github.com/elastic/elasticsearch.git
synced 2025-06-28 17:34:17 -04:00
Provide access to new settings for HyphenationCompoundWordTokenFilter (#115585)
Allow the new flags added in Lucene in the HyphenationCompoundWordTokenFilter Adds access to the two new flags no_sub_matches and no_overlapping_matches. Lucene issue: https://github.com/apache/lucene/issues/9231
This commit is contained in:
parent
99689281e0
commit
c804953105
7 changed files with 1295 additions and 11 deletions
6
docs/changelog/115585.yaml
Normal file
6
docs/changelog/115585.yaml
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
pr: 115585
|
||||||
|
summary: Adds access to flags no_sub_matches and no_overlapping_matches to hyphenation-decompounder-tokenfilter
|
||||||
|
area: Search
|
||||||
|
type: enhancement
|
||||||
|
issues:
|
||||||
|
- 97849
|
|
@ -111,6 +111,18 @@ output. Defaults to `5`.
|
||||||
(Optional, Boolean)
|
(Optional, Boolean)
|
||||||
If `true`, only include the longest matching subword. Defaults to `false`.
|
If `true`, only include the longest matching subword. Defaults to `false`.
|
||||||
|
|
||||||
|
`no_sub_matches`::
|
||||||
|
(Optional, Boolean)
|
||||||
|
If `true`, do not match sub tokens in tokens that are in the word list.
|
||||||
|
Defaults to `false`.
|
||||||
|
|
||||||
|
`no_overlapping_matches`::
|
||||||
|
(Optional, Boolean)
|
||||||
|
If `true`, do not allow overlapping tokens.
|
||||||
|
Defaults to `false`.
|
||||||
|
|
||||||
|
Typically users will only want to include one of the three flags as enabling `no_overlapping_matches` is the most restrictive and `no_sub_matches` is more restrictive than `only_longest_match`. When enabling a more restrictive option the state of the less restrictive flags does not have any effect.
|
||||||
|
|
||||||
[[analysis-hyp-decomp-tokenfilter-customize]]
|
[[analysis-hyp-decomp-tokenfilter-customize]]
|
||||||
==== Customize and add to an analyzer
|
==== Customize and add to an analyzer
|
||||||
|
|
||||||
|
|
|
@ -28,6 +28,8 @@ import java.nio.file.Path;
|
||||||
*/
|
*/
|
||||||
public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundWordTokenFilterFactory {
|
public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundWordTokenFilterFactory {
|
||||||
|
|
||||||
|
private final boolean noSubMatches;
|
||||||
|
private final boolean noOverlappingMatches;
|
||||||
private final HyphenationTree hyphenationTree;
|
private final HyphenationTree hyphenationTree;
|
||||||
|
|
||||||
HyphenationCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
|
HyphenationCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
|
||||||
|
@ -46,6 +48,9 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new IllegalArgumentException("Exception while reading hyphenation_patterns_path.", e);
|
throw new IllegalArgumentException("Exception while reading hyphenation_patterns_path.", e);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
noSubMatches = settings.getAsBoolean("no_sub_matches", false);
|
||||||
|
noOverlappingMatches = settings.getAsBoolean("no_overlapping_matches", false);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -57,7 +62,9 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW
|
||||||
minWordSize,
|
minWordSize,
|
||||||
minSubwordSize,
|
minSubwordSize,
|
||||||
maxSubwordSize,
|
maxSubwordSize,
|
||||||
onlyLongestMatch
|
onlyLongestMatch,
|
||||||
|
noSubMatches,
|
||||||
|
noOverlappingMatches
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,6 +31,9 @@ import org.elasticsearch.test.IndexSettingsModule;
|
||||||
import org.hamcrest.MatcherAssert;
|
import org.hamcrest.MatcherAssert;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -42,6 +45,7 @@ import static org.hamcrest.Matchers.hasItems;
|
||||||
import static org.hamcrest.Matchers.instanceOf;
|
import static org.hamcrest.Matchers.instanceOf;
|
||||||
|
|
||||||
public class CompoundAnalysisTests extends ESTestCase {
|
public class CompoundAnalysisTests extends ESTestCase {
|
||||||
|
|
||||||
public void testDefaultsCompoundAnalysis() throws Exception {
|
public void testDefaultsCompoundAnalysis() throws Exception {
|
||||||
Settings settings = getJsonSettings();
|
Settings settings = getJsonSettings();
|
||||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
|
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
|
||||||
|
@ -63,6 +67,44 @@ public class CompoundAnalysisTests extends ESTestCase {
|
||||||
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
|
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testHyphenationDecompoundingAnalyzerOnlyLongestMatch() throws Exception {
|
||||||
|
Settings[] settingsArr = new Settings[] { getJsonSettings(), getYamlSettings() };
|
||||||
|
for (Settings settings : settingsArr) {
|
||||||
|
List<String> terms = analyze(settings, "hyphenationDecompoundingAnalyzerOnlyLongestMatch", "kaffeemaschine fussballpumpe");
|
||||||
|
MatcherAssert.assertThat(
|
||||||
|
terms,
|
||||||
|
hasItems("kaffeemaschine", "kaffee", "fee", "maschine", "fussballpumpe", "fussball", "ballpumpe", "pumpe")
|
||||||
|
);
|
||||||
|
}
|
||||||
|
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* For example given a word list of: ["kaffee", "fee", "maschine"]
|
||||||
|
* no_sub_matches should prevent the token "fee" as a token in "kaffeemaschine".
|
||||||
|
*/
|
||||||
|
public void testHyphenationDecompoundingAnalyzerNoSubMatches() throws Exception {
|
||||||
|
Settings[] settingsArr = new Settings[] { getJsonSettings(), getYamlSettings() };
|
||||||
|
for (Settings settings : settingsArr) {
|
||||||
|
List<String> terms = analyze(settings, "hyphenationDecompoundingAnalyzerNoSubMatches", "kaffeemaschine fussballpumpe");
|
||||||
|
MatcherAssert.assertThat(terms, hasItems("kaffeemaschine", "kaffee", "maschine", "fussballpumpe", "fussball", "ballpumpe"));
|
||||||
|
}
|
||||||
|
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* For example given a word list of: ["fuss", "fussball", "ballpumpe", "ball", "pumpe"]
|
||||||
|
* no_overlapping_matches should prevent the token "ballpumpe" as a token in "fussballpumpe".
|
||||||
|
*/
|
||||||
|
public void testHyphenationDecompoundingAnalyzerNoOverlappingMatches() throws Exception {
|
||||||
|
Settings[] settingsArr = new Settings[] { getJsonSettings(), getYamlSettings() };
|
||||||
|
for (Settings settings : settingsArr) {
|
||||||
|
List<String> terms = analyze(settings, "hyphenationDecompoundingAnalyzerNoOverlappingMatches", "kaffeemaschine fussballpumpe");
|
||||||
|
MatcherAssert.assertThat(terms, hasItems("kaffeemaschine", "kaffee", "maschine", "fussballpumpe", "fussball", "pumpe"));
|
||||||
|
}
|
||||||
|
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
|
||||||
|
}
|
||||||
|
|
||||||
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
|
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
|
||||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
|
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
|
||||||
AnalysisModule analysisModule = createAnalysisModule(settings);
|
AnalysisModule analysisModule = createAnalysisModule(settings);
|
||||||
|
@ -92,20 +134,25 @@ public class CompoundAnalysisTests extends ESTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
private Settings getJsonSettings() throws IOException {
|
private Settings getJsonSettings() throws IOException {
|
||||||
String json = "/org/elasticsearch/analysis/common/test1.json";
|
return getSettings("/org/elasticsearch/analysis/common/test1.json");
|
||||||
return Settings.builder()
|
|
||||||
.loadFromStream(json, getClass().getResourceAsStream(json), false)
|
|
||||||
.put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current())
|
|
||||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
|
||||||
.build();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private Settings getYamlSettings() throws IOException {
|
private Settings getYamlSettings() throws IOException {
|
||||||
String yaml = "/org/elasticsearch/analysis/common/test1.yml";
|
return getSettings("/org/elasticsearch/analysis/common/test1.yml");
|
||||||
|
}
|
||||||
|
|
||||||
|
private Settings getSettings(String filePath) throws IOException {
|
||||||
|
String hypenationRulesFileName = "de_DR.xml";
|
||||||
|
InputStream hypenationRules = getClass().getResourceAsStream(hypenationRulesFileName);
|
||||||
|
Path home = createTempDir();
|
||||||
|
Path config = home.resolve("config");
|
||||||
|
Files.createDirectory(config);
|
||||||
|
Files.copy(hypenationRules, config.resolve(hypenationRulesFileName));
|
||||||
|
|
||||||
return Settings.builder()
|
return Settings.builder()
|
||||||
.loadFromStream(yaml, getClass().getResourceAsStream(yaml), false)
|
.loadFromStream(filePath, getClass().getResourceAsStream(filePath), false)
|
||||||
.put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current())
|
.put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current())
|
||||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
.put(Environment.PATH_HOME_SETTING.getKey(), home.toString())
|
||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -21,6 +21,51 @@
|
||||||
"dict_dec":{
|
"dict_dec":{
|
||||||
"type":"dictionary_decompounder",
|
"type":"dictionary_decompounder",
|
||||||
"word_list":["donau", "dampf", "schiff", "spargel", "creme", "suppe"]
|
"word_list":["donau", "dampf", "schiff", "spargel", "creme", "suppe"]
|
||||||
|
},
|
||||||
|
"hyphenation_dec_only_longest_match": {
|
||||||
|
"type": "hyphenation_decompounder",
|
||||||
|
"hyphenation_patterns_path": "de_DR.xml",
|
||||||
|
"word_list": [
|
||||||
|
"fuss",
|
||||||
|
"fussball",
|
||||||
|
"ballpumpe",
|
||||||
|
"ball",
|
||||||
|
"pumpe",
|
||||||
|
"kaffee",
|
||||||
|
"fee",
|
||||||
|
"maschine"
|
||||||
|
],
|
||||||
|
"only_longest_match": true
|
||||||
|
},
|
||||||
|
"hyphenation_dec_no_sub_matches": {
|
||||||
|
"type": "hyphenation_decompounder",
|
||||||
|
"hyphenation_patterns_path": "de_DR.xml",
|
||||||
|
"word_list": [
|
||||||
|
"fuss",
|
||||||
|
"fussball",
|
||||||
|
"ballpumpe",
|
||||||
|
"ball",
|
||||||
|
"pumpe",
|
||||||
|
"kaffee",
|
||||||
|
"fee",
|
||||||
|
"maschine"
|
||||||
|
],
|
||||||
|
"no_sub_matches": true
|
||||||
|
},
|
||||||
|
"hyphenation_dec_no_overlapping_matches": {
|
||||||
|
"type": "hyphenation_decompounder",
|
||||||
|
"hyphenation_patterns_path": "de_DR.xml",
|
||||||
|
"word_list": [
|
||||||
|
"fuss",
|
||||||
|
"fussball",
|
||||||
|
"ballpumpe",
|
||||||
|
"ball",
|
||||||
|
"pumpe",
|
||||||
|
"kaffee",
|
||||||
|
"fee",
|
||||||
|
"maschine"
|
||||||
|
],
|
||||||
|
"no_overlapping_matches": true
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"analyzer":{
|
"analyzer":{
|
||||||
|
@ -47,6 +92,18 @@
|
||||||
"decompoundingAnalyzer":{
|
"decompoundingAnalyzer":{
|
||||||
"tokenizer":"standard",
|
"tokenizer":"standard",
|
||||||
"filter":["dict_dec"]
|
"filter":["dict_dec"]
|
||||||
|
},
|
||||||
|
"hyphenationDecompoundingAnalyzerOnlyLongestMatch":{
|
||||||
|
"tokenizer":"standard",
|
||||||
|
"filter":["hyphenation_dec_only_longest_match"]
|
||||||
|
},
|
||||||
|
"hyphenationDecompoundingAnalyzerNoSubMatches": {
|
||||||
|
"tokenizer":"standard",
|
||||||
|
"filter":["hyphenation_dec_no_sub_matches"]
|
||||||
|
},
|
||||||
|
"hyphenationDecompoundingAnalyzerNoOverlappingMatches":{
|
||||||
|
"tokenizer":"standard",
|
||||||
|
"filter":["hyphenation_dec_no_overlapping_matches"]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,6 +15,21 @@ index :
|
||||||
dict_dec :
|
dict_dec :
|
||||||
type : dictionary_decompounder
|
type : dictionary_decompounder
|
||||||
word_list : [donau, dampf, schiff, spargel, creme, suppe]
|
word_list : [donau, dampf, schiff, spargel, creme, suppe]
|
||||||
|
hyphenation_dec_only_longest_match :
|
||||||
|
type : hyphenation_decompounder
|
||||||
|
hyphenation_patterns_path : de_DR.xml
|
||||||
|
word_list : [fuss, fussball, ballpumpe, ball, pumpe, kaffee, fee, maschine]
|
||||||
|
only_longest_match : true
|
||||||
|
hyphenation_dec_no_sub_matches :
|
||||||
|
type : hyphenation_decompounder
|
||||||
|
hyphenation_patterns_path : de_DR.xml
|
||||||
|
word_list : [fuss, fussball, ballpumpe, ball, pumpe, kaffee, fee, maschine]
|
||||||
|
no_sub_matches : true
|
||||||
|
hyphenation_dec_no_overlapping_matches :
|
||||||
|
type : hyphenation_decompounder
|
||||||
|
hyphenation_patterns_path : de_DR.xml
|
||||||
|
word_list : [fuss, fussball, ballpumpe, ball, pumpe, kaffee, fee, maschine]
|
||||||
|
no_overlapping_matches: true
|
||||||
analyzer :
|
analyzer :
|
||||||
standard :
|
standard :
|
||||||
type : standard
|
type : standard
|
||||||
|
@ -37,3 +52,13 @@ index :
|
||||||
decompoundingAnalyzer :
|
decompoundingAnalyzer :
|
||||||
tokenizer : standard
|
tokenizer : standard
|
||||||
filter : [dict_dec]
|
filter : [dict_dec]
|
||||||
|
hyphenationDecompoundingAnalyzerOnlyLongestMatch :
|
||||||
|
tokenizer : standard
|
||||||
|
filter : [hyphenation_dec_only_longest_match]
|
||||||
|
hyphenationDecompoundingAnalyzerNoSubMatches:
|
||||||
|
tokenizer: standard
|
||||||
|
filter : [hyphenation_dec_no_sub_matches]
|
||||||
|
hyphenationDecompoundingAnalyzerNoOverlappingMatches:
|
||||||
|
tokenizer: standard
|
||||||
|
filter : [hyphenation_dec_no_overlapping_matches]
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue