Optimize sparse vector stats collection (#128740) (#128806)

This change improves the performance of sparse vector statistics gathering by using the document count of each sparse vector field's terms directly, rather than looking up each field in the field names field (`FieldNamesFieldMapper.NAME`) to compute stats.
By avoiding per-term disk/network reads and instead leveraging statistics already loaded into leaf readers at index opening, we expect to significantly reduce overhead.

Relates to #128583
This commit is contained in:
Jim Ferenczi 2025-06-03 11:17:34 +02:00 committed by GitHub
parent 1b03d0956d
commit bab6e835d4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 14 additions and 10 deletions

View file

@ -0,0 +1,5 @@
pr: 128740
summary: Optimize sparse vector stats collection
area: Stats
type: enhancement
issues: []

View file

@ -25,7 +25,6 @@ import org.apache.lucene.index.SegmentCommitInfo;
import org.apache.lucene.index.SegmentInfos; import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.SegmentReader; import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.index.Terms; import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.QueryCache; import org.apache.lucene.search.QueryCache;
import org.apache.lucene.search.QueryCachingPolicy; import org.apache.lucene.search.QueryCachingPolicy;
@ -61,7 +60,6 @@ import org.elasticsearch.core.TimeValue;
import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.IndexVersion;
import org.elasticsearch.index.VersionType; import org.elasticsearch.index.VersionType;
import org.elasticsearch.index.mapper.DocumentParser; import org.elasticsearch.index.mapper.DocumentParser;
import org.elasticsearch.index.mapper.FieldNamesFieldMapper;
import org.elasticsearch.index.mapper.LuceneDocument; import org.elasticsearch.index.mapper.LuceneDocument;
import org.elasticsearch.index.mapper.Mapper; import org.elasticsearch.index.mapper.Mapper;
import org.elasticsearch.index.mapper.Mapping; import org.elasticsearch.index.mapper.Mapping;
@ -337,14 +335,15 @@ public abstract class Engine implements Closeable {
private long getSparseVectorValueCount(final LeafReader atomicReader, List<BytesRef> fields) throws IOException { private long getSparseVectorValueCount(final LeafReader atomicReader, List<BytesRef> fields) throws IOException {
long count = 0; long count = 0;
Terms terms = atomicReader.terms(FieldNamesFieldMapper.NAME); for (var fieldNameBR : fields) {
if (terms == null) { var fieldName = fieldNameBR.utf8ToString();
return count; var fi = atomicReader.getFieldInfos().fieldInfo(fieldName);
} if (fi == null) {
TermsEnum termsEnum = terms.iterator(); continue;
for (var fieldName : fields) { }
if (termsEnum.seekExact(fieldName)) { Terms terms = atomicReader.terms(fieldName);
count += termsEnum.docFreq(); if (terms != null) {
count += terms.getDocCount();
} }
} }
return count; return count;