Provide access to new settings for HyphenationCompoundWordTokenFilter (#115585)

Allow the new flags added in Lucene in the HyphenationCompoundWordTokenFilter

Adds access to the two new flags no_sub_matches and no_overlapping_matches.

Lucene issue: https://github.com/apache/lucene/issues/9231
This commit is contained in:
Peter Straßer 2024-11-18 17:38:49 +01:00 committed by GitHub
parent 99689281e0
commit c804953105
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 1295 additions and 11 deletions

View file

@ -0,0 +1,6 @@
pr: 115459
summary: Adds access to flags no_sub_matches and no_overlapping_matches to hyphenation-decompounder-tokenfilter
area: Search
type: enhancement
issues:
- 97849

View file

@ -111,6 +111,18 @@ output. Defaults to `5`.
(Optional, Boolean) (Optional, Boolean)
If `true`, only include the longest matching subword. Defaults to `false`. If `true`, only include the longest matching subword. Defaults to `false`.
`no_sub_matches`::
(Optional, Boolean)
If `true`, do not match sub tokens in tokens that are in the word list.
Defaults to `false`.
`no_overlapping_matches`::
(Optional, Boolean)
If `true`, do not allow overlapping tokens.
Defaults to `false`.
Typically, users will only want to enable one of the three flags, because `no_overlapping_matches` is the most restrictive and `no_sub_matches` is more restrictive than `only_longest_match`. When a more restrictive option is enabled, the state of the less restrictive options has no effect.
[[analysis-hyp-decomp-tokenfilter-customize]] [[analysis-hyp-decomp-tokenfilter-customize]]
==== Customize and add to an analyzer ==== Customize and add to an analyzer

View file

@ -28,6 +28,8 @@ import java.nio.file.Path;
*/ */
public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundWordTokenFilterFactory { public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundWordTokenFilterFactory {
private final boolean noSubMatches;
private final boolean noOverlappingMatches;
private final HyphenationTree hyphenationTree; private final HyphenationTree hyphenationTree;
HyphenationCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { HyphenationCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
@ -46,6 +48,9 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW
} catch (Exception e) { } catch (Exception e) {
throw new IllegalArgumentException("Exception while reading hyphenation_patterns_path.", e); throw new IllegalArgumentException("Exception while reading hyphenation_patterns_path.", e);
} }
noSubMatches = settings.getAsBoolean("no_sub_matches", false);
noOverlappingMatches = settings.getAsBoolean("no_overlapping_matches", false);
} }
@Override @Override
@ -57,7 +62,9 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW
minWordSize, minWordSize,
minSubwordSize, minSubwordSize,
maxSubwordSize, maxSubwordSize,
onlyLongestMatch onlyLongestMatch,
noSubMatches,
noOverlappingMatches
); );
} }
} }

View file

@ -31,6 +31,9 @@ import org.elasticsearch.test.IndexSettingsModule;
import org.hamcrest.MatcherAssert; import org.hamcrest.MatcherAssert;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
@ -42,6 +45,7 @@ import static org.hamcrest.Matchers.hasItems;
import static org.hamcrest.Matchers.instanceOf; import static org.hamcrest.Matchers.instanceOf;
public class CompoundAnalysisTests extends ESTestCase { public class CompoundAnalysisTests extends ESTestCase {
public void testDefaultsCompoundAnalysis() throws Exception { public void testDefaultsCompoundAnalysis() throws Exception {
Settings settings = getJsonSettings(); Settings settings = getJsonSettings();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings); IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
@ -63,6 +67,44 @@ public class CompoundAnalysisTests extends ESTestCase {
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated"); assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
} }
/**
 * Runs the {@code hyphenationDecompoundingAnalyzerOnlyLongestMatch} analyzer over the same input with
 * both the JSON and the YAML flavour of the test settings and checks that every expected term is
 * among the produced tokens.
 */
public void testHyphenationDecompoundingAnalyzerOnlyLongestMatch() throws Exception {
    final String[] expectedTerms = {
        "kaffeemaschine",
        "kaffee",
        "fee",
        "maschine",
        "fussballpumpe",
        "fussball",
        "ballpumpe",
        "pumpe" };
    // Both settings objects are created up front, before the first analysis runs.
    for (Settings current : new Settings[] { getJsonSettings(), getYamlSettings() }) {
        List<String> tokens = analyze(current, "hyphenationDecompoundingAnalyzerOnlyLongestMatch", "kaffeemaschine fussballpumpe");
        MatcherAssert.assertThat(tokens, hasItems(expectedTerms));
    }
    assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
}
/**
 * Exercises the {@code no_sub_matches} flag with both the JSON and the YAML test settings.
 * <p>
 * For example, given a word list of ["kaffee", "fee", "maschine"], no_sub_matches should prevent
 * "fee" from being emitted as a token for "kaffeemaschine".
 * <p>
 * Note: the assertion uses {@code hasItems}, so it only verifies that the expected terms are present;
 * it does not assert that "fee" is absent.
 */
public void testHyphenationDecompoundingAnalyzerNoSubMatches() throws Exception {
    final String[] expectedTerms = { "kaffeemaschine", "kaffee", "maschine", "fussballpumpe", "fussball", "ballpumpe" };
    // Both settings objects are created up front, before the first analysis runs.
    for (Settings current : new Settings[] { getJsonSettings(), getYamlSettings() }) {
        List<String> tokens = analyze(current, "hyphenationDecompoundingAnalyzerNoSubMatches", "kaffeemaschine fussballpumpe");
        MatcherAssert.assertThat(tokens, hasItems(expectedTerms));
    }
    assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
}
/**
 * Exercises the {@code no_overlapping_matches} flag with both the JSON and the YAML test settings.
 * <p>
 * For example, given a word list of ["fuss", "fussball", "ballpumpe", "ball", "pumpe"],
 * no_overlapping_matches should prevent "ballpumpe" from being emitted as a token for "fussballpumpe".
 * <p>
 * Note: the assertion uses {@code hasItems}, so it only verifies that the expected terms are present;
 * it does not assert that "ballpumpe" is absent.
 */
public void testHyphenationDecompoundingAnalyzerNoOverlappingMatches() throws Exception {
    final String[] expectedTerms = { "kaffeemaschine", "kaffee", "maschine", "fussballpumpe", "fussball", "pumpe" };
    // Both settings objects are created up front, before the first analysis runs.
    for (Settings current : new Settings[] { getJsonSettings(), getYamlSettings() }) {
        List<String> tokens = analyze(current, "hyphenationDecompoundingAnalyzerNoOverlappingMatches", "kaffeemaschine fussballpumpe");
        MatcherAssert.assertThat(tokens, hasItems(expectedTerms));
    }
    assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
}
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException { private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings); IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
AnalysisModule analysisModule = createAnalysisModule(settings); AnalysisModule analysisModule = createAnalysisModule(settings);
@ -92,20 +134,25 @@ public class CompoundAnalysisTests extends ESTestCase {
} }
private Settings getJsonSettings() throws IOException { private Settings getJsonSettings() throws IOException {
String json = "/org/elasticsearch/analysis/common/test1.json"; return getSettings("/org/elasticsearch/analysis/common/test1.json");
return Settings.builder()
.loadFromStream(json, getClass().getResourceAsStream(json), false)
.put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current())
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
} }
private Settings getYamlSettings() throws IOException { private Settings getYamlSettings() throws IOException {
String yaml = "/org/elasticsearch/analysis/common/test1.yml"; return getSettings("/org/elasticsearch/analysis/common/test1.yml");
}
private Settings getSettings(String filePath) throws IOException {
String hypenationRulesFileName = "de_DR.xml";
InputStream hypenationRules = getClass().getResourceAsStream(hypenationRulesFileName);
Path home = createTempDir();
Path config = home.resolve("config");
Files.createDirectory(config);
Files.copy(hypenationRules, config.resolve(hypenationRulesFileName));
return Settings.builder() return Settings.builder()
.loadFromStream(yaml, getClass().getResourceAsStream(yaml), false) .loadFromStream(filePath, getClass().getResourceAsStream(filePath), false)
.put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current()) .put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current())
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .put(Environment.PATH_HOME_SETTING.getKey(), home.toString())
.build(); .build();
} }
} }

View file

@ -21,7 +21,52 @@
"dict_dec":{ "dict_dec":{
"type":"dictionary_decompounder", "type":"dictionary_decompounder",
"word_list":["donau", "dampf", "schiff", "spargel", "creme", "suppe"] "word_list":["donau", "dampf", "schiff", "spargel", "creme", "suppe"]
} },
"hyphenation_dec_only_longest_match": {
"type": "hyphenation_decompounder",
"hyphenation_patterns_path": "de_DR.xml",
"word_list": [
"fuss",
"fussball",
"ballpumpe",
"ball",
"pumpe",
"kaffee",
"fee",
"maschine"
],
"only_longest_match": true
},
"hyphenation_dec_no_sub_matches": {
"type": "hyphenation_decompounder",
"hyphenation_patterns_path": "de_DR.xml",
"word_list": [
"fuss",
"fussball",
"ballpumpe",
"ball",
"pumpe",
"kaffee",
"fee",
"maschine"
],
"no_sub_matches": true
},
"hyphenation_dec_no_overlapping_matches": {
"type": "hyphenation_decompounder",
"hyphenation_patterns_path": "de_DR.xml",
"word_list": [
"fuss",
"fussball",
"ballpumpe",
"ball",
"pumpe",
"kaffee",
"fee",
"maschine"
],
"no_overlapping_matches": true
}
}, },
"analyzer":{ "analyzer":{
"standard":{ "standard":{
@ -47,6 +92,18 @@
"decompoundingAnalyzer":{ "decompoundingAnalyzer":{
"tokenizer":"standard", "tokenizer":"standard",
"filter":["dict_dec"] "filter":["dict_dec"]
},
"hyphenationDecompoundingAnalyzerOnlyLongestMatch":{
"tokenizer":"standard",
"filter":["hyphenation_dec_only_longest_match"]
},
"hyphenationDecompoundingAnalyzerNoSubMatches": {
"tokenizer":"standard",
"filter":["hyphenation_dec_no_sub_matches"]
},
"hyphenationDecompoundingAnalyzerNoOverlappingMatches":{
"tokenizer":"standard",
"filter":["hyphenation_dec_no_overlapping_matches"]
} }
} }
} }

View file

@ -15,6 +15,21 @@ index :
dict_dec : dict_dec :
type : dictionary_decompounder type : dictionary_decompounder
word_list : [donau, dampf, schiff, spargel, creme, suppe] word_list : [donau, dampf, schiff, spargel, creme, suppe]
hyphenation_dec_only_longest_match :
type : hyphenation_decompounder
hyphenation_patterns_path : de_DR.xml
word_list : [fuss, fussball, ballpumpe, ball, pumpe, kaffee, fee, maschine]
only_longest_match : true
hyphenation_dec_no_sub_matches :
type : hyphenation_decompounder
hyphenation_patterns_path : de_DR.xml
word_list : [fuss, fussball, ballpumpe, ball, pumpe, kaffee, fee, maschine]
no_sub_matches : true
hyphenation_dec_no_overlapping_matches :
type : hyphenation_decompounder
hyphenation_patterns_path : de_DR.xml
word_list : [fuss, fussball, ballpumpe, ball, pumpe, kaffee, fee, maschine]
no_overlapping_matches: true
analyzer : analyzer :
standard : standard :
type : standard type : standard
@ -37,3 +52,13 @@ index :
decompoundingAnalyzer : decompoundingAnalyzer :
tokenizer : standard tokenizer : standard
filter : [dict_dec] filter : [dict_dec]
hyphenationDecompoundingAnalyzerOnlyLongestMatch :
tokenizer : standard
filter : [hyphenation_dec_only_longest_match]
hyphenationDecompoundingAnalyzerNoSubMatches:
tokenizer: standard
filter : [hyphenation_dec_no_sub_matches]
hyphenationDecompoundingAnalyzerNoOverlappingMatches:
tokenizer: standard
filter : [hyphenation_dec_no_overlapping_matches]