Fix Synthetic Source Handling for bit Type in dense_vector Field (#114407)

**Description:**

This PR addresses the issue described in [#114402](https://github.com/elastic/elasticsearch/issues/114402), where the `synthetic_source` feature does not correctly handle the `bit` element type in `dense_vector` fields when `index` is set to `false`. The root cause was that the `bit` type was not accounted for when reading the vector back from doc values: the reader expected `dims` elements, which is 8 times the number of bytes actually stored for a `bit` vector (`dims / 8`). This mismatch causes an array out-of-bounds exception when reconstructing the document.

**Changes:**

- Adjusted the `synthetic_source` logic to correctly handle the `bit` type by ensuring the array size accounts for the 8x difference in dimensions.
- Added a YAML REST test to cover the `bit` type scenario in `dense_vector` fields with `index` set to `false`.

**Related Issues:**

- Closes [#114402](https://github.com/elastic/elasticsearch/issues/114402)
- Introduced in [#110059](https://github.com/elastic/elasticsearch/pull/110059)
This commit is contained in:
Rassyan 2024-10-15 02:59:56 +08:00 committed by GitHub
parent 35e79f85f0
commit 465c65c02f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 86 additions and 8 deletions

View file

@ -0,0 +1,6 @@
pr: 114407
summary: Fix synthetic source handling for `bit` type in `dense_vector` field
area: Search
type: bug
issues:
- 114402

View file

@ -354,3 +354,54 @@ setup:
dims: 40 dims: 40
index: true index: true
similarity: max_inner_product similarity: max_inner_product
---
"Search with synthetic source":
- requires:
capabilities:
- method: POST
path: /_search
capabilities: [ bit_dense_vector_synthetic_source ]
test_runner_features: capabilities
reason: "Support for bit dense vector synthetic source capability required"
- do:
indices.create:
index: test_synthetic_source
body:
mappings:
properties:
name:
type: keyword
vector1:
type: dense_vector
element_type: bit
dims: 40
index: false
vector2:
type: dense_vector
element_type: bit
dims: 40
index: true
similarity: l2_norm
- do:
index:
index: test_synthetic_source
id: "1"
body:
name: cow.jpg
vector1: [2, -1, 1, 4, -3]
vector2: [2, -1, 1, 4, -3]
- do:
indices.refresh: {}
- do:
search:
force_synthetic_source: true
index: test_synthetic_source
- match: {hits.hits.0._id: "1"}
- match: {hits.hits.0._source.vector1: [2, -1, 1, 4, -3]}
- match: {hits.hits.0._source.vector2: [2, -1, 1, 4, -3]}

View file

@ -41,6 +41,7 @@ import org.elasticsearch.simdvec.VectorSimilarityType;
import java.io.IOException; import java.io.IOException;
import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.DYNAMIC_CONFIDENCE_INTERVAL; import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.DYNAMIC_CONFIDENCE_INTERVAL;
import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.MAX_DIMS_COUNT;
public class ES814ScalarQuantizedVectorsFormat extends FlatVectorsFormat { public class ES814ScalarQuantizedVectorsFormat extends FlatVectorsFormat {
@ -291,4 +292,9 @@ public class ES814ScalarQuantizedVectorsFormat extends FlatVectorsFormat {
return delegate.getRandomVectorScorer(sim, values, query); return delegate.getRandomVectorScorer(sim, values, query);
} }
} }
@Override
public int getMaxDimensions(String fieldName) {
return MAX_DIMS_COUNT;
}
} }

View file

@ -25,6 +25,8 @@ import org.apache.lucene.util.quantization.RandomAccessQuantizedByteVectorValues
import java.io.IOException; import java.io.IOException;
import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.MAX_DIMS_COUNT;
class ES815BitFlatVectorsFormat extends FlatVectorsFormat { class ES815BitFlatVectorsFormat extends FlatVectorsFormat {
private static final FlatVectorsFormat delegate = new Lucene99FlatVectorsFormat(FlatBitVectorScorer.INSTANCE); private static final FlatVectorsFormat delegate = new Lucene99FlatVectorsFormat(FlatBitVectorScorer.INSTANCE);
@ -43,6 +45,11 @@ class ES815BitFlatVectorsFormat extends FlatVectorsFormat {
return delegate.fieldsReader(segmentReadState); return delegate.fieldsReader(segmentReadState);
} }
@Override
public int getMaxDimensions(String fieldName) {
return MAX_DIMS_COUNT;
}
static class FlatBitVectorScorer implements FlatVectorsScorer { static class FlatBitVectorScorer implements FlatVectorsScorer {
static final FlatBitVectorScorer INSTANCE = new FlatBitVectorScorer(); static final FlatBitVectorScorer INSTANCE = new FlatBitVectorScorer();

View file

@ -2270,7 +2270,7 @@ public class DenseVectorFieldMapper extends FieldMapper {
if (indexCreatedVersion.onOrAfter(LITTLE_ENDIAN_FLOAT_STORED_INDEX_VERSION)) { if (indexCreatedVersion.onOrAfter(LITTLE_ENDIAN_FLOAT_STORED_INDEX_VERSION)) {
byteBuffer.order(ByteOrder.LITTLE_ENDIAN); byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
} }
int dims = fieldType().dims; int dims = fieldType().elementType == ElementType.BIT ? fieldType().dims / Byte.SIZE : fieldType().dims;
for (int dim = 0; dim < dims; dim++) { for (int dim = 0; dim < dims; dim++) {
fieldType().elementType.readAndWriteValue(byteBuffer, b); fieldType().elementType.readAndWriteValue(byteBuffer, b);
} }

View file

@ -20,6 +20,11 @@ public final class SearchCapabilities {
/** Support regex and range match rules in interval queries. */ /** Support regex and range match rules in interval queries. */
private static final String RANGE_REGEX_INTERVAL_QUERY_CAPABILITY = "range_regexp_interval_queries"; private static final String RANGE_REGEX_INTERVAL_QUERY_CAPABILITY = "range_regexp_interval_queries";
/** Support synthetic source with `bit` type in `dense_vector` field when `index` is set to `false`. */
private static final String BIT_DENSE_VECTOR_SYNTHETIC_SOURCE_CAPABILITY = "bit_dense_vector_synthetic_source";
public static final Set<String> CAPABILITIES = Set.of(RANGE_REGEX_INTERVAL_QUERY_CAPABILITY); public static final Set<String> CAPABILITIES = Set.of(
RANGE_REGEX_INTERVAL_QUERY_CAPABILITY,
BIT_DENSE_VECTOR_SYNTHETIC_SOURCE_CAPABILITY
);
} }

View file

@ -2022,24 +2022,27 @@ public class DenseVectorFieldMapperTests extends MapperTestCase {
private static class DenseVectorSyntheticSourceSupport implements SyntheticSourceSupport { private static class DenseVectorSyntheticSourceSupport implements SyntheticSourceSupport {
private final int dims = between(5, 1000); private final int dims = between(5, 1000);
private final ElementType elementType = randomFrom(ElementType.BYTE, ElementType.FLOAT); private final ElementType elementType = randomFrom(ElementType.BYTE, ElementType.FLOAT, ElementType.BIT);
private final boolean indexed = randomBoolean(); private final boolean indexed = randomBoolean();
private final boolean indexOptionsSet = indexed && randomBoolean(); private final boolean indexOptionsSet = indexed && randomBoolean();
@Override @Override
public SyntheticSourceExample example(int maxValues) throws IOException { public SyntheticSourceExample example(int maxValues) throws IOException {
Object value = elementType == ElementType.BYTE Object value = switch (elementType) {
? randomList(dims, dims, ESTestCase::randomByte) case BYTE, BIT:
: randomList(dims, dims, ESTestCase::randomFloat); yield randomList(dims, dims, ESTestCase::randomByte);
case FLOAT:
yield randomList(dims, dims, ESTestCase::randomFloat);
};
return new SyntheticSourceExample(value, value, this::mapping); return new SyntheticSourceExample(value, value, this::mapping);
} }
private void mapping(XContentBuilder b) throws IOException { private void mapping(XContentBuilder b) throws IOException {
b.field("type", "dense_vector"); b.field("type", "dense_vector");
b.field("dims", dims); if (elementType == ElementType.BYTE || elementType == ElementType.BIT || randomBoolean()) {
if (elementType == ElementType.BYTE || randomBoolean()) {
b.field("element_type", elementType.toString()); b.field("element_type", elementType.toString());
} }
b.field("dims", elementType == ElementType.BIT ? dims * Byte.SIZE : dims);
if (indexed) { if (indexed) {
b.field("index", true); b.field("index", true);
b.field("similarity", "l2_norm"); b.field("similarity", "l2_norm");