From bab6e835d4ae11d9f2f4592b440fa639b15dbc54 Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Tue, 3 Jun 2025 11:17:34 +0200 Subject: [PATCH] Optimize sparse vector stats collection (#128740) (#128806) This change improves the performance of sparse vector statistics gathering by using the document count of each field's terms directly (`Terms#getDocCount`), rather than looking each field up in the `_field_names` field to compute stats. By avoiding per-term disk/network reads and instead leveraging statistics already loaded into leaf readers at index opening, we expect to significantly reduce overhead. Relates to #128583 --- docs/changelog/128740.yaml | 5 +++++ .../elasticsearch/index/engine/Engine.java | 19 +++++++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) create mode 100644 docs/changelog/128740.yaml diff --git a/docs/changelog/128740.yaml b/docs/changelog/128740.yaml new file mode 100644 index 000000000000..89ee856ce5a6 --- /dev/null +++ b/docs/changelog/128740.yaml @@ -0,0 +1,5 @@ +pr: 128740 +summary: Optimize sparse vector stats collection +area: Stats +type: enhancement +issues: [] diff --git a/server/src/main/java/org/elasticsearch/index/engine/Engine.java b/server/src/main/java/org/elasticsearch/index/engine/Engine.java index 36fd18144ad6..25660f54e849 100644 --- a/server/src/main/java/org/elasticsearch/index/engine/Engine.java +++ b/server/src/main/java/org/elasticsearch/index/engine/Engine.java @@ -25,7 +25,6 @@ import org.apache.lucene.index.SegmentCommitInfo; import org.apache.lucene.index.SegmentInfos; import org.apache.lucene.index.SegmentReader; import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.QueryCache; import org.apache.lucene.search.QueryCachingPolicy; @@ -61,7 +60,6 @@ import org.elasticsearch.core.TimeValue; import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.VersionType; import org.elasticsearch.index.mapper.DocumentParser; -import
org.elasticsearch.index.mapper.FieldNamesFieldMapper; import org.elasticsearch.index.mapper.LuceneDocument; import org.elasticsearch.index.mapper.Mapper; import org.elasticsearch.index.mapper.Mapping; @@ -337,14 +335,15 @@ public abstract class Engine implements Closeable { private long getSparseVectorValueCount(final LeafReader atomicReader, List fields) throws IOException { long count = 0; - Terms terms = atomicReader.terms(FieldNamesFieldMapper.NAME); - if (terms == null) { - return count; - } - TermsEnum termsEnum = terms.iterator(); - for (var fieldName : fields) { - if (termsEnum.seekExact(fieldName)) { - count += termsEnum.docFreq(); + for (var fieldNameBR : fields) { + var fieldName = fieldNameBR.utf8ToString(); + var fi = atomicReader.getFieldInfos().fieldInfo(fieldName); + if (fi == null) { + continue; + } + Terms terms = atomicReader.terms(fieldName); + if (terms != null) { + count += terms.getDocCount(); } } return count;