mirror of
https://github.com/elastic/elasticsearch.git
synced 2025-06-28 09:28:55 -04:00
Provide access to new settings for HyphenationCompoundWordTokenFilter (#115585)
Allow the new flags added in Lucene in the HyphenationCompoundWordTokenFilter. Adds access to the two new flags no_sub_matches and no_overlapping_matches. Lucene issue: https://github.com/apache/lucene/issues/9231
This commit is contained in:
parent
99689281e0
commit
c804953105
7 changed files with 1295 additions and 11 deletions
6
docs/changelog/115585.yaml
Normal file
6
docs/changelog/115585.yaml
Normal file
|
@ -0,0 +1,6 @@
|
|||
pr: 115459
|
||||
summary: Adds access to flags no_sub_matches and no_overlapping_matches to hyphenation-decompounder-tokenfilter
|
||||
area: Search
|
||||
type: enhancement
|
||||
issues:
|
||||
- 97849
|
|
@ -111,6 +111,18 @@ output. Defaults to `5`.
|
|||
(Optional, Boolean)
|
||||
If `true`, only include the longest matching subword. Defaults to `false`.
|
||||
|
||||
`no_sub_matches`::
|
||||
(Optional, Boolean)
|
||||
If `true`, do not match sub tokens in tokens that are in the word list.
|
||||
Defaults to `false`.
|
||||
|
||||
`no_overlapping_matches`::
|
||||
(Optional, Boolean)
|
||||
If `true`, do not allow overlapping tokens.
|
||||
Defaults to `false`.
|
||||
|
||||
Typically, users will only want to enable one of the three flags, because `no_overlapping_matches` is the most restrictive and `no_sub_matches` is more restrictive than `only_longest_match`. When a more restrictive option is enabled, the state of the less restrictive options has no effect.
|
||||
|
||||
[[analysis-hyp-decomp-tokenfilter-customize]]
|
||||
==== Customize and add to an analyzer
|
||||
|
||||
|
|
|
@ -28,6 +28,8 @@ import java.nio.file.Path;
|
|||
*/
|
||||
public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundWordTokenFilterFactory {
|
||||
|
||||
private final boolean noSubMatches;
|
||||
private final boolean noOverlappingMatches;
|
||||
private final HyphenationTree hyphenationTree;
|
||||
|
||||
HyphenationCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
|
||||
|
@ -46,6 +48,9 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW
|
|||
} catch (Exception e) {
|
||||
throw new IllegalArgumentException("Exception while reading hyphenation_patterns_path.", e);
|
||||
}
|
||||
|
||||
noSubMatches = settings.getAsBoolean("no_sub_matches", false);
|
||||
noOverlappingMatches = settings.getAsBoolean("no_overlapping_matches", false);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -57,7 +62,9 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW
|
|||
minWordSize,
|
||||
minSubwordSize,
|
||||
maxSubwordSize,
|
||||
onlyLongestMatch
|
||||
onlyLongestMatch,
|
||||
noSubMatches,
|
||||
noOverlappingMatches
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -31,6 +31,9 @@ import org.elasticsearch.test.IndexSettingsModule;
|
|||
import org.hamcrest.MatcherAssert;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
@ -42,6 +45,7 @@ import static org.hamcrest.Matchers.hasItems;
|
|||
import static org.hamcrest.Matchers.instanceOf;
|
||||
|
||||
public class CompoundAnalysisTests extends ESTestCase {
|
||||
|
||||
public void testDefaultsCompoundAnalysis() throws Exception {
|
||||
Settings settings = getJsonSettings();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
|
||||
|
@ -63,6 +67,44 @@ public class CompoundAnalysisTests extends ESTestCase {
|
|||
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
|
||||
}
|
||||
|
||||
public void testHyphenationDecompoundingAnalyzerOnlyLongestMatch() throws Exception {
|
||||
Settings[] settingsArr = new Settings[] { getJsonSettings(), getYamlSettings() };
|
||||
for (Settings settings : settingsArr) {
|
||||
List<String> terms = analyze(settings, "hyphenationDecompoundingAnalyzerOnlyLongestMatch", "kaffeemaschine fussballpumpe");
|
||||
MatcherAssert.assertThat(
|
||||
terms,
|
||||
hasItems("kaffeemaschine", "kaffee", "fee", "maschine", "fussballpumpe", "fussball", "ballpumpe", "pumpe")
|
||||
);
|
||||
}
|
||||
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
|
||||
}
|
||||
|
||||
/**
|
||||
* For example given a word list of: ["kaffee", "fee", "maschine"]
|
||||
* no_sub_matches should prevent the token "fee" as a token in "kaffeemaschine".
|
||||
*/
|
||||
public void testHyphenationDecompoundingAnalyzerNoSubMatches() throws Exception {
|
||||
Settings[] settingsArr = new Settings[] { getJsonSettings(), getYamlSettings() };
|
||||
for (Settings settings : settingsArr) {
|
||||
List<String> terms = analyze(settings, "hyphenationDecompoundingAnalyzerNoSubMatches", "kaffeemaschine fussballpumpe");
|
||||
MatcherAssert.assertThat(terms, hasItems("kaffeemaschine", "kaffee", "maschine", "fussballpumpe", "fussball", "ballpumpe"));
|
||||
}
|
||||
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
|
||||
}
|
||||
|
||||
/**
|
||||
* For example given a word list of: ["fuss", "fussball", "ballpumpe", "ball", "pumpe"]
|
||||
* no_overlapping_matches should prevent the token "ballpumpe" as a token in "fussballpumpe".
|
||||
*/
|
||||
public void testHyphenationDecompoundingAnalyzerNoOverlappingMatches() throws Exception {
|
||||
Settings[] settingsArr = new Settings[] { getJsonSettings(), getYamlSettings() };
|
||||
for (Settings settings : settingsArr) {
|
||||
List<String> terms = analyze(settings, "hyphenationDecompoundingAnalyzerNoOverlappingMatches", "kaffeemaschine fussballpumpe");
|
||||
MatcherAssert.assertThat(terms, hasItems("kaffeemaschine", "kaffee", "maschine", "fussballpumpe", "fussball", "pumpe"));
|
||||
}
|
||||
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
|
||||
}
|
||||
|
||||
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
|
||||
AnalysisModule analysisModule = createAnalysisModule(settings);
|
||||
|
@ -92,20 +134,25 @@ public class CompoundAnalysisTests extends ESTestCase {
|
|||
}
|
||||
|
||||
private Settings getJsonSettings() throws IOException {
|
||||
String json = "/org/elasticsearch/analysis/common/test1.json";
|
||||
return Settings.builder()
|
||||
.loadFromStream(json, getClass().getResourceAsStream(json), false)
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current())
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
return getSettings("/org/elasticsearch/analysis/common/test1.json");
|
||||
}
|
||||
|
||||
private Settings getYamlSettings() throws IOException {
|
||||
String yaml = "/org/elasticsearch/analysis/common/test1.yml";
|
||||
return getSettings("/org/elasticsearch/analysis/common/test1.yml");
|
||||
}
|
||||
|
||||
private Settings getSettings(String filePath) throws IOException {
|
||||
String hypenationRulesFileName = "de_DR.xml";
|
||||
InputStream hypenationRules = getClass().getResourceAsStream(hypenationRulesFileName);
|
||||
Path home = createTempDir();
|
||||
Path config = home.resolve("config");
|
||||
Files.createDirectory(config);
|
||||
Files.copy(hypenationRules, config.resolve(hypenationRulesFileName));
|
||||
|
||||
return Settings.builder()
|
||||
.loadFromStream(yaml, getClass().getResourceAsStream(yaml), false)
|
||||
.loadFromStream(filePath, getClass().getResourceAsStream(filePath), false)
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current())
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), home.toString())
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -21,6 +21,51 @@
|
|||
"dict_dec":{
|
||||
"type":"dictionary_decompounder",
|
||||
"word_list":["donau", "dampf", "schiff", "spargel", "creme", "suppe"]
|
||||
},
|
||||
"hyphenation_dec_only_longest_match": {
|
||||
"type": "hyphenation_decompounder",
|
||||
"hyphenation_patterns_path": "de_DR.xml",
|
||||
"word_list": [
|
||||
"fuss",
|
||||
"fussball",
|
||||
"ballpumpe",
|
||||
"ball",
|
||||
"pumpe",
|
||||
"kaffee",
|
||||
"fee",
|
||||
"maschine"
|
||||
],
|
||||
"only_longest_match": true
|
||||
},
|
||||
"hyphenation_dec_no_sub_matches": {
|
||||
"type": "hyphenation_decompounder",
|
||||
"hyphenation_patterns_path": "de_DR.xml",
|
||||
"word_list": [
|
||||
"fuss",
|
||||
"fussball",
|
||||
"ballpumpe",
|
||||
"ball",
|
||||
"pumpe",
|
||||
"kaffee",
|
||||
"fee",
|
||||
"maschine"
|
||||
],
|
||||
"no_sub_matches": true
|
||||
},
|
||||
"hyphenation_dec_no_overlapping_matches": {
|
||||
"type": "hyphenation_decompounder",
|
||||
"hyphenation_patterns_path": "de_DR.xml",
|
||||
"word_list": [
|
||||
"fuss",
|
||||
"fussball",
|
||||
"ballpumpe",
|
||||
"ball",
|
||||
"pumpe",
|
||||
"kaffee",
|
||||
"fee",
|
||||
"maschine"
|
||||
],
|
||||
"no_overlapping_matches": true
|
||||
}
|
||||
},
|
||||
"analyzer":{
|
||||
|
@ -47,6 +92,18 @@
|
|||
"decompoundingAnalyzer":{
|
||||
"tokenizer":"standard",
|
||||
"filter":["dict_dec"]
|
||||
},
|
||||
"hyphenationDecompoundingAnalyzerOnlyLongestMatch":{
|
||||
"tokenizer":"standard",
|
||||
"filter":["hyphenation_dec_only_longest_match"]
|
||||
},
|
||||
"hyphenationDecompoundingAnalyzerNoSubMatches": {
|
||||
"tokenizer":"standard",
|
||||
"filter":["hyphenation_dec_no_sub_matches"]
|
||||
},
|
||||
"hyphenationDecompoundingAnalyzerNoOverlappingMatches":{
|
||||
"tokenizer":"standard",
|
||||
"filter":["hyphenation_dec_no_overlapping_matches"]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -15,6 +15,21 @@ index :
|
|||
dict_dec :
|
||||
type : dictionary_decompounder
|
||||
word_list : [donau, dampf, schiff, spargel, creme, suppe]
|
||||
hyphenation_dec_only_longest_match :
|
||||
type : hyphenation_decompounder
|
||||
hyphenation_patterns_path : de_DR.xml
|
||||
word_list : [fuss, fussball, ballpumpe, ball, pumpe, kaffee, fee, maschine]
|
||||
only_longest_match : true
|
||||
hyphenation_dec_no_sub_matches :
|
||||
type : hyphenation_decompounder
|
||||
hyphenation_patterns_path : de_DR.xml
|
||||
word_list : [fuss, fussball, ballpumpe, ball, pumpe, kaffee, fee, maschine]
|
||||
no_sub_matches : true
|
||||
hyphenation_dec_no_overlapping_matches :
|
||||
type : hyphenation_decompounder
|
||||
hyphenation_patterns_path : de_DR.xml
|
||||
word_list : [fuss, fussball, ballpumpe, ball, pumpe, kaffee, fee, maschine]
|
||||
no_overlapping_matches: true
|
||||
analyzer :
|
||||
standard :
|
||||
type : standard
|
||||
|
@ -37,3 +52,13 @@ index :
|
|||
decompoundingAnalyzer :
|
||||
tokenizer : standard
|
||||
filter : [dict_dec]
|
||||
hyphenationDecompoundingAnalyzerOnlyLongestMatch :
|
||||
tokenizer : standard
|
||||
filter : [hyphenation_dec_only_longest_match]
|
||||
hyphenationDecompoundingAnalyzerNoSubMatches:
|
||||
tokenizer: standard
|
||||
filter : [hyphenation_dec_no_sub_matches]
|
||||
hyphenationDecompoundingAnalyzerNoOverlappingMatches:
|
||||
tokenizer: standard
|
||||
filter : [hyphenation_dec_no_overlapping_matches]
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue