mirror of
https://github.com/elastic/elasticsearch.git
synced 2025-06-28 09:28:55 -04:00
Improve execution of terms queries over wildcard fields (#128986)
This commit implements termsQuery(Collection<?> values, @Nullable SearchExecutionContext context) in the WildcardFieldMapper to avoid memory pressure when building the query.
This commit is contained in:
parent
a9ea5c87eb
commit
ba6987ffe6
4 changed files with 223 additions and 0 deletions
6
docs/changelog/128986.yaml
Normal file
6
docs/changelog/128986.yaml
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
pr: 128986
|
||||||
|
summary: Improve execution of terms queries over wildcard fields
|
||||||
|
area: Search
|
||||||
|
type: bug
|
||||||
|
issues:
|
||||||
|
- 128201
|
|
@ -7,6 +7,7 @@
|
||||||
|
|
||||||
apply plugin: 'elasticsearch.internal-es-plugin'
|
apply plugin: 'elasticsearch.internal-es-plugin'
|
||||||
apply plugin: 'elasticsearch.internal-yaml-rest-test'
|
apply plugin: 'elasticsearch.internal-yaml-rest-test'
|
||||||
|
apply plugin: 'elasticsearch.internal-cluster-test'
|
||||||
|
|
||||||
esplugin {
|
esplugin {
|
||||||
name = 'wildcard'
|
name = 'wildcard'
|
||||||
|
|
|
@ -0,0 +1,153 @@
|
||||||
|
/*
|
||||||
|
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||||
|
* or more contributor license agreements. Licensed under the Elastic License
|
||||||
|
* 2.0; you may not use this file except in compliance with the Elastic License
|
||||||
|
* 2.0.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.xpack.wildcard.search;
|
||||||
|
|
||||||
|
import org.elasticsearch.action.bulk.BulkRequestBuilder;
|
||||||
|
import org.elasticsearch.action.index.IndexRequest;
|
||||||
|
import org.elasticsearch.index.query.TermQueryBuilder;
|
||||||
|
import org.elasticsearch.index.query.TermsQueryBuilder;
|
||||||
|
import org.elasticsearch.plugins.Plugin;
|
||||||
|
import org.elasticsearch.test.ESIntegTestCase;
|
||||||
|
import org.elasticsearch.xcontent.XContentBuilder;
|
||||||
|
import org.elasticsearch.xcontent.XContentFactory;
|
||||||
|
import org.elasticsearch.xcontent.XContentType;
|
||||||
|
import org.elasticsearch.xpack.wildcard.Wildcard;
|
||||||
|
import org.hamcrest.Matchers;
|
||||||
|
import org.junit.Before;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertResponse;
|
||||||
|
|
||||||
|
public class WildcardSearchIT extends ESIntegTestCase {
|
||||||
|
|
||||||
|
private List<String> terms = null;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Collection<Class<? extends Plugin>> nodePlugins() {
|
||||||
|
return List.of(Wildcard.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void setup() throws IOException {
|
||||||
|
terms = new ArrayList<>();
|
||||||
|
XContentBuilder xcb = XContentFactory.jsonBuilder()
|
||||||
|
.startObject()
|
||||||
|
.startObject("properties")
|
||||||
|
.startObject("wildcard")
|
||||||
|
.field("type", "wildcard")
|
||||||
|
.endObject()
|
||||||
|
.startObject("keyword")
|
||||||
|
.field("type", "keyword")
|
||||||
|
.endObject()
|
||||||
|
.endObject()
|
||||||
|
.endObject();
|
||||||
|
indicesAdmin().prepareCreate("test").setMapping(xcb).get();
|
||||||
|
final int numDocs = randomIntBetween(100, 1000);
|
||||||
|
final BulkRequestBuilder builder = client().prepareBulk();
|
||||||
|
for (int i = 0; i < numDocs; i++) {
|
||||||
|
if (rarely()) {
|
||||||
|
indexMultiValue(builder);
|
||||||
|
} else {
|
||||||
|
indexSingleValue(builder);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertFalse(builder.get().hasFailures());
|
||||||
|
indicesAdmin().prepareRefresh("test").get();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void indexSingleValue(BulkRequestBuilder builder) {
|
||||||
|
String term = randomIndexString();
|
||||||
|
builder.add(
|
||||||
|
new IndexRequest("test").source("{\"wildcard\" : \"" + term + "\", \"keyword\" : \"" + term + "\"}", XContentType.JSON)
|
||||||
|
);
|
||||||
|
terms.add(term);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void indexMultiValue(BulkRequestBuilder builder) {
|
||||||
|
int docSize = randomIntBetween(1, 10);
|
||||||
|
String[] docTerms = new String[docSize];
|
||||||
|
for (int i = 0; i < docSize; i++) {
|
||||||
|
String term = randomIndexString();
|
||||||
|
terms.add(term);
|
||||||
|
docTerms[i] = "\"" + term + "\"";
|
||||||
|
}
|
||||||
|
builder.add(
|
||||||
|
new IndexRequest("test").source(
|
||||||
|
"{\"wildcard\" : " + Arrays.toString(docTerms) + ", \"keyword\" : " + Arrays.toString(docTerms) + "}",
|
||||||
|
XContentType.JSON
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testTermQueryDuel() {
|
||||||
|
for (int i = 0; i < 50; i++) {
|
||||||
|
String term = randomQueryString(terms);
|
||||||
|
TermQueryBuilder termQueryBuilder1 = new TermQueryBuilder("wildcard", term);
|
||||||
|
TermQueryBuilder termQueryBuilder2 = new TermQueryBuilder("keyword", term);
|
||||||
|
assertResponse(
|
||||||
|
client().prepareSearch("test").setQuery(termQueryBuilder1),
|
||||||
|
response -> assertResponse(
|
||||||
|
client().prepareSearch("test").setQuery(termQueryBuilder2),
|
||||||
|
response2 -> assertThat(
|
||||||
|
response.getHits().getTotalHits().value(),
|
||||||
|
Matchers.equalTo(response2.getHits().getTotalHits().value())
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testTermsQueryDuel() {
|
||||||
|
for (int i = 0; i < 10; i++) {
|
||||||
|
String[] terms = new String[randomIntBetween(2, 8192)];
|
||||||
|
for (int j = 0; j < terms.length; j++) {
|
||||||
|
terms[j] = randomQueryString(this.terms);
|
||||||
|
}
|
||||||
|
TermsQueryBuilder termsQueryBuilder1 = new TermsQueryBuilder("wildcard", terms);
|
||||||
|
TermsQueryBuilder termsQueryBuilder2 = new TermsQueryBuilder("keyword", terms);
|
||||||
|
assertResponse(
|
||||||
|
client().prepareSearch("test").setQuery(termsQueryBuilder1),
|
||||||
|
response -> assertResponse(
|
||||||
|
client().prepareSearch("test").setQuery(termsQueryBuilder2),
|
||||||
|
response2 -> assertThat(
|
||||||
|
response.getHits().getTotalHits().value(),
|
||||||
|
Matchers.equalTo(response2.getHits().getTotalHits().value())
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String randomIndexString() {
|
||||||
|
String string = randomAlphaOfLength(randomIntBetween(0, 30));
|
||||||
|
if (rarely()) {
|
||||||
|
return string + "*";
|
||||||
|
} else if (rarely()) {
|
||||||
|
return "*" + string;
|
||||||
|
} else if (rarely()) {
|
||||||
|
return "*" + string + "*";
|
||||||
|
} else {
|
||||||
|
return string;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String randomQueryString(List<String> terms) {
|
||||||
|
if (rarely()) {
|
||||||
|
return terms.get(randomIntBetween(0, terms.size() - 1));
|
||||||
|
} else if (randomBoolean()) {
|
||||||
|
return randomAlphaOfLength(randomIntBetween(0, 30));
|
||||||
|
} else {
|
||||||
|
return randomAlphaOfLength(1) + "*";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -25,6 +25,7 @@ import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.search.BooleanClause;
|
import org.apache.lucene.search.BooleanClause;
|
||||||
import org.apache.lucene.search.BooleanClause.Occur;
|
import org.apache.lucene.search.BooleanClause.Occur;
|
||||||
import org.apache.lucene.search.BooleanQuery;
|
import org.apache.lucene.search.BooleanQuery;
|
||||||
|
import org.apache.lucene.search.ConstantScoreQuery;
|
||||||
import org.apache.lucene.search.FieldExistsQuery;
|
import org.apache.lucene.search.FieldExistsQuery;
|
||||||
import org.apache.lucene.search.FuzzyQuery;
|
import org.apache.lucene.search.FuzzyQuery;
|
||||||
import org.apache.lucene.search.MatchAllDocsQuery;
|
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||||
|
@ -33,6 +34,7 @@ import org.apache.lucene.search.MultiTermQuery;
|
||||||
import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
|
import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
|
||||||
import org.apache.lucene.search.PrefixQuery;
|
import org.apache.lucene.search.PrefixQuery;
|
||||||
import org.apache.lucene.search.Query;
|
import org.apache.lucene.search.Query;
|
||||||
|
import org.apache.lucene.search.TermInSetQuery;
|
||||||
import org.apache.lucene.search.TermQuery;
|
import org.apache.lucene.search.TermQuery;
|
||||||
import org.apache.lucene.search.TermRangeQuery;
|
import org.apache.lucene.search.TermRangeQuery;
|
||||||
import org.apache.lucene.search.WildcardQuery;
|
import org.apache.lucene.search.WildcardQuery;
|
||||||
|
@ -80,11 +82,14 @@ import java.io.IOException;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.time.ZoneId;
|
import java.time.ZoneId;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
import java.util.Iterator;
|
||||||
import java.util.LinkedHashSet;
|
import java.util.LinkedHashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import java.util.TreeSet;
|
||||||
|
|
||||||
import static org.elasticsearch.index.IndexSettings.IGNORE_ABOVE_SETTING;
|
import static org.elasticsearch.index.IndexSettings.IGNORE_ABOVE_SETTING;
|
||||||
|
|
||||||
|
@ -95,6 +100,7 @@ public class WildcardFieldMapper extends FieldMapper {
|
||||||
|
|
||||||
public static final String CONTENT_TYPE = "wildcard";
|
public static final String CONTENT_TYPE = "wildcard";
|
||||||
public static final short MAX_CLAUSES_IN_APPROXIMATION_QUERY = 10;
|
public static final short MAX_CLAUSES_IN_APPROXIMATION_QUERY = 10;
|
||||||
|
private static final int WILDCARD_TERMS_EXPANSION_LIMIT = 16;
|
||||||
public static final int NGRAM_SIZE = 3;
|
public static final int NGRAM_SIZE = 3;
|
||||||
|
|
||||||
static final NamedAnalyzer WILDCARD_ANALYZER_7_10 = new NamedAnalyzer("_wildcard_7_10", AnalyzerScope.GLOBAL, new Analyzer() {
|
static final NamedAnalyzer WILDCARD_ANALYZER_7_10 = new NamedAnalyzer("_wildcard_7_10", AnalyzerScope.GLOBAL, new Analyzer() {
|
||||||
|
@ -859,6 +865,63 @@ public class WildcardFieldMapper extends FieldMapper {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Query termsQuery(Collection<?> values, @Nullable SearchExecutionContext context) {
|
||||||
|
final BytesRef[] terms = buildTerms(values);
|
||||||
|
final Query aproxQuery;
|
||||||
|
if (terms.length < WILDCARD_TERMS_EXPANSION_LIMIT) {
|
||||||
|
// If there are few terms, we can approximate each term using a BooleanQuery.
|
||||||
|
final BooleanQuery.Builder builder = new BooleanQuery.Builder();
|
||||||
|
for (BytesRef term : terms) {
|
||||||
|
final BooleanQuery.Builder rewritten = new BooleanQuery.Builder();
|
||||||
|
final Integer numClauses = getApproxWildCardQuery(escapeWildcardSyntax(term.utf8ToString()), rewritten);
|
||||||
|
if (numClauses != null && numClauses > 0) {
|
||||||
|
builder.add(rewritten.build(), Occur.SHOULD);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
aproxQuery = builder.build();
|
||||||
|
} else {
|
||||||
|
// If there are too many terms, we cannot rewrite approximate into a BooleanQuery as it will use too much memory.
|
||||||
|
// Instead, we generate a TermInSetQuery. In order to match the necessary documents we need to add at least one token
|
||||||
|
// per term, ideally we should add the token that makes the term most different from the others.
|
||||||
|
final Set<String> tokens = new LinkedHashSet<>();
|
||||||
|
final Set<BytesRef> tokenList = new TreeSet<>();
|
||||||
|
for (BytesRef term : terms) {
|
||||||
|
// Break search term into tokens
|
||||||
|
final boolean matchAll = breakIntoTokens(escapeWildcardSyntax(term.utf8ToString()), tokens);
|
||||||
|
assert matchAll == false;
|
||||||
|
if (tokens.isEmpty() == false) {
|
||||||
|
// If there are tokens, we take the middle one to represent the term
|
||||||
|
// which is probably the most different one.
|
||||||
|
tokenList.add(getMiddleToken(tokens));
|
||||||
|
}
|
||||||
|
tokens.clear();
|
||||||
|
}
|
||||||
|
aproxQuery = new TermInSetQuery(name(), tokenList);
|
||||||
|
}
|
||||||
|
return BinaryDvConfirmedQuery.fromTerms(new ConstantScoreQuery(aproxQuery), name(), terms);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static BytesRef getMiddleToken(Set<String> tokens) {
|
||||||
|
int mid = (tokens.size() + 1) / 2;
|
||||||
|
Iterator<String> iterator = tokens.iterator();
|
||||||
|
for (int i = 0; i < mid - 1; i++) {
|
||||||
|
iterator.next();
|
||||||
|
}
|
||||||
|
assert iterator.hasNext();
|
||||||
|
return BytesRefs.toBytesRef(iterator.next());
|
||||||
|
}
|
||||||
|
|
||||||
|
private static BytesRef[] buildTerms(Collection<?> values) {
|
||||||
|
final Set<?> dedupe = new HashSet<>(values);
|
||||||
|
final BytesRef[] terms = new BytesRef[dedupe.size()];
|
||||||
|
final Iterator<?> iterator = dedupe.iterator();
|
||||||
|
for (int i = 0; i < dedupe.size(); i++) {
|
||||||
|
terms[i] = BytesRefs.toBytesRef(iterator.next());
|
||||||
|
}
|
||||||
|
return terms;
|
||||||
|
}
|
||||||
|
|
||||||
private static String escapeWildcardSyntax(String term) {
|
private static String escapeWildcardSyntax(String term) {
|
||||||
StringBuilder result = new StringBuilder();
|
StringBuilder result = new StringBuilder();
|
||||||
for (int i = 0; i < term.length();) {
|
for (int i = 0; i < term.length();) {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue