mirror of
https://github.com/elastic/elasticsearch.git
synced 2025-06-28 17:34:17 -04:00
Provide access to new settings for HyphenationCompoundWordTokenFilter (#115585)
Allow the new flags added in Lucene in the HyphenationCompoundWordTokenFilter Adds access to the two new flags no_sub_matches and no_overlapping_matches. Lucene issue: https://github.com/apache/lucene/issues/9231
This commit is contained in:
parent
99689281e0
commit
c804953105
7 changed files with 1295 additions and 11 deletions
6
docs/changelog/115585.yaml
Normal file
6
docs/changelog/115585.yaml
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
pr: 115585
|
||||||
|
summary: Adds access to flags no_sub_matches and no_overlapping_matches to hyphenation-decompounder-tokenfilter
|
||||||
|
area: Search
|
||||||
|
type: enhancement
|
||||||
|
issues:
|
||||||
|
- 97849
|
|
@ -111,6 +111,18 @@ output. Defaults to `5`.
|
||||||
(Optional, Boolean)
|
(Optional, Boolean)
|
||||||
If `true`, only include the longest matching subword. Defaults to `false`.
|
If `true`, only include the longest matching subword. Defaults to `false`.
|
||||||
|
|
||||||
|
`no_sub_matches`::
|
||||||
|
(Optional, Boolean)
|
||||||
|
If `true`, do not match sub tokens in tokens that are in the word list.
|
||||||
|
Defaults to `false`.
|
||||||
|
|
||||||
|
`no_overlapping_matches`::
|
||||||
|
(Optional, Boolean)
|
||||||
|
If `true`, do not allow overlapping tokens.
|
||||||
|
Defaults to `false`.
|
||||||
|
|
||||||
|
Typically users will only want to include one of the three flags as enabling `no_overlapping_matches` is the most restrictive and `no_sub_matches` is more restrictive than `only_longest_match`. When enabling a more restrictive option the state of the less restrictive flags does not have any effect.
|
||||||
|
|
||||||
[[analysis-hyp-decomp-tokenfilter-customize]]
|
[[analysis-hyp-decomp-tokenfilter-customize]]
|
||||||
==== Customize and add to an analyzer
|
==== Customize and add to an analyzer
|
||||||
|
|
||||||
|
|
|
@ -28,6 +28,8 @@ import java.nio.file.Path;
|
||||||
*/
|
*/
|
||||||
public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundWordTokenFilterFactory {
|
public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundWordTokenFilterFactory {
|
||||||
|
|
||||||
|
private final boolean noSubMatches;
|
||||||
|
private final boolean noOverlappingMatches;
|
||||||
private final HyphenationTree hyphenationTree;
|
private final HyphenationTree hyphenationTree;
|
||||||
|
|
||||||
HyphenationCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
|
HyphenationCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
|
||||||
|
@ -46,6 +48,9 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new IllegalArgumentException("Exception while reading hyphenation_patterns_path.", e);
|
throw new IllegalArgumentException("Exception while reading hyphenation_patterns_path.", e);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
noSubMatches = settings.getAsBoolean("no_sub_matches", false);
|
||||||
|
noOverlappingMatches = settings.getAsBoolean("no_overlapping_matches", false);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -57,7 +62,9 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW
|
||||||
minWordSize,
|
minWordSize,
|
||||||
minSubwordSize,
|
minSubwordSize,
|
||||||
maxSubwordSize,
|
maxSubwordSize,
|
||||||
onlyLongestMatch
|
onlyLongestMatch,
|
||||||
|
noSubMatches,
|
||||||
|
noOverlappingMatches
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,6 +31,9 @@ import org.elasticsearch.test.IndexSettingsModule;
|
||||||
import org.hamcrest.MatcherAssert;
|
import org.hamcrest.MatcherAssert;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -42,6 +45,7 @@ import static org.hamcrest.Matchers.hasItems;
|
||||||
import static org.hamcrest.Matchers.instanceOf;
|
import static org.hamcrest.Matchers.instanceOf;
|
||||||
|
|
||||||
public class CompoundAnalysisTests extends ESTestCase {
|
public class CompoundAnalysisTests extends ESTestCase {
|
||||||
|
|
||||||
public void testDefaultsCompoundAnalysis() throws Exception {
|
public void testDefaultsCompoundAnalysis() throws Exception {
|
||||||
Settings settings = getJsonSettings();
|
Settings settings = getJsonSettings();
|
||||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
|
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
|
||||||
|
@ -63,6 +67,44 @@ public class CompoundAnalysisTests extends ESTestCase {
|
||||||
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
|
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testHyphenationDecompoundingAnalyzerOnlyLongestMatch() throws Exception {
|
||||||
|
Settings[] settingsArr = new Settings[] { getJsonSettings(), getYamlSettings() };
|
||||||
|
for (Settings settings : settingsArr) {
|
||||||
|
List<String> terms = analyze(settings, "hyphenationDecompoundingAnalyzerOnlyLongestMatch", "kaffeemaschine fussballpumpe");
|
||||||
|
MatcherAssert.assertThat(
|
||||||
|
terms,
|
||||||
|
hasItems("kaffeemaschine", "kaffee", "fee", "maschine", "fussballpumpe", "fussball", "ballpumpe", "pumpe")
|
||||||
|
);
|
||||||
|
}
|
||||||
|
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* For example given a word list of: ["kaffee", "fee", "maschine"]
|
||||||
|
* no_sub_matches should prevent the token "fee" as a token in "kaffeemaschine".
|
||||||
|
*/
|
||||||
|
public void testHyphenationDecompoundingAnalyzerNoSubMatches() throws Exception {
|
||||||
|
Settings[] settingsArr = new Settings[] { getJsonSettings(), getYamlSettings() };
|
||||||
|
for (Settings settings : settingsArr) {
|
||||||
|
List<String> terms = analyze(settings, "hyphenationDecompoundingAnalyzerNoSubMatches", "kaffeemaschine fussballpumpe");
|
||||||
|
MatcherAssert.assertThat(terms, hasItems("kaffeemaschine", "kaffee", "maschine", "fussballpumpe", "fussball", "ballpumpe"));
|
||||||
|
}
|
||||||
|
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* For example given a word list of: ["fuss", "fussball", "ballpumpe", "ball", "pumpe"]
|
||||||
|
* no_overlapping_matches should prevent the token "ballpumpe" as a token in "fussballpumpe".
|
||||||
|
*/
|
||||||
|
public void testHyphenationDecompoundingAnalyzerNoOverlappingMatches() throws Exception {
|
||||||
|
Settings[] settingsArr = new Settings[] { getJsonSettings(), getYamlSettings() };
|
||||||
|
for (Settings settings : settingsArr) {
|
||||||
|
List<String> terms = analyze(settings, "hyphenationDecompoundingAnalyzerNoOverlappingMatches", "kaffeemaschine fussballpumpe");
|
||||||
|
MatcherAssert.assertThat(terms, hasItems("kaffeemaschine", "kaffee", "maschine", "fussballpumpe", "fussball", "pumpe"));
|
||||||
|
}
|
||||||
|
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
|
||||||
|
}
|
||||||
|
|
||||||
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
|
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
|
||||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
|
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
|
||||||
AnalysisModule analysisModule = createAnalysisModule(settings);
|
AnalysisModule analysisModule = createAnalysisModule(settings);
|
||||||
|
@ -92,20 +134,25 @@ public class CompoundAnalysisTests extends ESTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
private Settings getJsonSettings() throws IOException {
|
private Settings getJsonSettings() throws IOException {
|
||||||
String json = "/org/elasticsearch/analysis/common/test1.json";
|
return getSettings("/org/elasticsearch/analysis/common/test1.json");
|
||||||
return Settings.builder()
|
|
||||||
.loadFromStream(json, getClass().getResourceAsStream(json), false)
|
|
||||||
.put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current())
|
|
||||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
|
||||||
.build();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private Settings getYamlSettings() throws IOException {
|
private Settings getYamlSettings() throws IOException {
|
||||||
String yaml = "/org/elasticsearch/analysis/common/test1.yml";
|
return getSettings("/org/elasticsearch/analysis/common/test1.yml");
|
||||||
|
}
|
||||||
|
|
||||||
|
private Settings getSettings(String filePath) throws IOException {
|
||||||
|
String hypenationRulesFileName = "de_DR.xml";
|
||||||
|
InputStream hypenationRules = getClass().getResourceAsStream(hypenationRulesFileName);
|
||||||
|
Path home = createTempDir();
|
||||||
|
Path config = home.resolve("config");
|
||||||
|
Files.createDirectory(config);
|
||||||
|
Files.copy(hypenationRules, config.resolve(hypenationRulesFileName));
|
||||||
|
|
||||||
return Settings.builder()
|
return Settings.builder()
|
||||||
.loadFromStream(yaml, getClass().getResourceAsStream(yaml), false)
|
.loadFromStream(filePath, getClass().getResourceAsStream(filePath), false)
|
||||||
.put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current())
|
.put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current())
|
||||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
.put(Environment.PATH_HOME_SETTING.getKey(), home.toString())
|
||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -21,6 +21,51 @@
|
||||||
"dict_dec":{
|
"dict_dec":{
|
||||||
"type":"dictionary_decompounder",
|
"type":"dictionary_decompounder",
|
||||||
"word_list":["donau", "dampf", "schiff", "spargel", "creme", "suppe"]
|
"word_list":["donau", "dampf", "schiff", "spargel", "creme", "suppe"]
|
||||||
|
},
|
||||||
|
"hyphenation_dec_only_longest_match": {
|
||||||
|
"type": "hyphenation_decompounder",
|
||||||
|
"hyphenation_patterns_path": "de_DR.xml",
|
||||||
|
"word_list": [
|
||||||
|
"fuss",
|
||||||
|
"fussball",
|
||||||
|
"ballpumpe",
|
||||||
|
"ball",
|
||||||
|
"pumpe",
|
||||||
|
"kaffee",
|
||||||
|
"fee",
|
||||||
|
"maschine"
|
||||||
|
],
|
||||||
|
"only_longest_match": true
|
||||||
|
},
|
||||||
|
"hyphenation_dec_no_sub_matches": {
|
||||||
|
"type": "hyphenation_decompounder",
|
||||||
|
"hyphenation_patterns_path": "de_DR.xml",
|
||||||
|
"word_list": [
|
||||||
|
"fuss",
|
||||||
|
"fussball",
|
||||||
|
"ballpumpe",
|
||||||
|
"ball",
|
||||||
|
"pumpe",
|
||||||
|
"kaffee",
|
||||||
|
"fee",
|
||||||
|
"maschine"
|
||||||
|
],
|
||||||
|
"no_sub_matches": true
|
||||||
|
},
|
||||||
|
"hyphenation_dec_no_overlapping_matches": {
|
||||||
|
"type": "hyphenation_decompounder",
|
||||||
|
"hyphenation_patterns_path": "de_DR.xml",
|
||||||
|
"word_list": [
|
||||||
|
"fuss",
|
||||||
|
"fussball",
|
||||||
|
"ballpumpe",
|
||||||
|
"ball",
|
||||||
|
"pumpe",
|
||||||
|
"kaffee",
|
||||||
|
"fee",
|
||||||
|
"maschine"
|
||||||
|
],
|
||||||
|
"no_overlapping_matches": true
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"analyzer":{
|
"analyzer":{
|
||||||
|
@ -47,6 +92,18 @@
|
||||||
"decompoundingAnalyzer":{
|
"decompoundingAnalyzer":{
|
||||||
"tokenizer":"standard",
|
"tokenizer":"standard",
|
||||||
"filter":["dict_dec"]
|
"filter":["dict_dec"]
|
||||||
|
},
|
||||||
|
"hyphenationDecompoundingAnalyzerOnlyLongestMatch":{
|
||||||
|
"tokenizer":"standard",
|
||||||
|
"filter":["hyphenation_dec_only_longest_match"]
|
||||||
|
},
|
||||||
|
"hyphenationDecompoundingAnalyzerNoSubMatches": {
|
||||||
|
"tokenizer":"standard",
|
||||||
|
"filter":["hyphenation_dec_no_sub_matches"]
|
||||||
|
},
|
||||||
|
"hyphenationDecompoundingAnalyzerNoOverlappingMatches":{
|
||||||
|
"tokenizer":"standard",
|
||||||
|
"filter":["hyphenation_dec_no_overlapping_matches"]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,6 +15,21 @@ index :
|
||||||
dict_dec :
|
dict_dec :
|
||||||
type : dictionary_decompounder
|
type : dictionary_decompounder
|
||||||
word_list : [donau, dampf, schiff, spargel, creme, suppe]
|
word_list : [donau, dampf, schiff, spargel, creme, suppe]
|
||||||
|
hyphenation_dec_only_longest_match :
|
||||||
|
type : hyphenation_decompounder
|
||||||
|
hyphenation_patterns_path : de_DR.xml
|
||||||
|
word_list : [fuss, fussball, ballpumpe, ball, pumpe, kaffee, fee, maschine]
|
||||||
|
only_longest_match : true
|
||||||
|
hyphenation_dec_no_sub_matches :
|
||||||
|
type : hyphenation_decompounder
|
||||||
|
hyphenation_patterns_path : de_DR.xml
|
||||||
|
word_list : [fuss, fussball, ballpumpe, ball, pumpe, kaffee, fee, maschine]
|
||||||
|
no_sub_matches : true
|
||||||
|
hyphenation_dec_no_overlapping_matches :
|
||||||
|
type : hyphenation_decompounder
|
||||||
|
hyphenation_patterns_path : de_DR.xml
|
||||||
|
word_list : [fuss, fussball, ballpumpe, ball, pumpe, kaffee, fee, maschine]
|
||||||
|
no_overlapping_matches: true
|
||||||
analyzer :
|
analyzer :
|
||||||
standard :
|
standard :
|
||||||
type : standard
|
type : standard
|
||||||
|
@ -37,3 +52,13 @@ index :
|
||||||
decompoundingAnalyzer :
|
decompoundingAnalyzer :
|
||||||
tokenizer : standard
|
tokenizer : standard
|
||||||
filter : [dict_dec]
|
filter : [dict_dec]
|
||||||
|
hyphenationDecompoundingAnalyzerOnlyLongestMatch :
|
||||||
|
tokenizer : standard
|
||||||
|
filter : [hyphenation_dec_only_longest_match]
|
||||||
|
hyphenationDecompoundingAnalyzerNoSubMatches:
|
||||||
|
tokenizer: standard
|
||||||
|
filter : [hyphenation_dec_no_sub_matches]
|
||||||
|
hyphenationDecompoundingAnalyzerNoOverlappingMatches:
|
||||||
|
tokenizer: standard
|
||||||
|
filter : [hyphenation_dec_no_overlapping_matches]
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue