mirror of
https://github.com/elastic/elasticsearch.git
synced 2025-06-29 18:03:32 -04:00
Fix Synthetic Source Handling for `bit` Type in `dense_vector` Field (#114407)
**Description:** This PR addresses the issue described in [#114402](https://github.com/elastic/elasticsearch/issues/114402), where the `synthetic_source` feature does not correctly handle the `bit` element type in `dense_vector` fields when `index` is set to `false`. The root cause was that the `bit` type was not accounted for when reading the stored doc value: for `bit` vectors, `dims` counts bits, while the doc value holds only `dims / 8` bytes, so the reconstruction loop expected 8 times as many elements as were actually stored. This mismatch causes an array out-of-bounds exception when reconstructing the document. **Changes:** - Adjusted the `synthetic_source` logic to correctly handle the `bit` type by ensuring the element count accounts for the 8x difference between `dims` and the stored byte length. - Added a YAML test to cover the `bit` type scenario in `dense_vector` fields with `index` set to `false`. **Related Issues:** - Closes [#114402](https://github.com/elastic/elasticsearch/issues/114402) - Introduced in [#110059](https://github.com/elastic/elasticsearch/pull/110059)
This commit is contained in:
parent
35e79f85f0
commit
465c65c02f
7 changed files with 86 additions and 8 deletions
6
docs/changelog/114407.yaml
Normal file
6
docs/changelog/114407.yaml
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
pr: 114407
|
||||||
|
summary: Fix synthetic source handling for `bit` type in `dense_vector` field
|
||||||
|
area: Search
|
||||||
|
type: bug
|
||||||
|
issues:
|
||||||
|
- 114402
|
|
@ -354,3 +354,54 @@ setup:
|
||||||
dims: 40
|
dims: 40
|
||||||
index: true
|
index: true
|
||||||
similarity: max_inner_product
|
similarity: max_inner_product
|
||||||
|
|
||||||
|
|
||||||
|
---
|
||||||
|
"Search with synthetic source":
|
||||||
|
- requires:
|
||||||
|
capabilities:
|
||||||
|
- method: POST
|
||||||
|
path: /_search
|
||||||
|
capabilities: [ bit_dense_vector_synthetic_source ]
|
||||||
|
test_runner_features: capabilities
|
||||||
|
reason: "Support for bit dense vector synthetic source capability required"
|
||||||
|
- do:
|
||||||
|
indices.create:
|
||||||
|
index: test_synthetic_source
|
||||||
|
body:
|
||||||
|
mappings:
|
||||||
|
properties:
|
||||||
|
name:
|
||||||
|
type: keyword
|
||||||
|
vector1:
|
||||||
|
type: dense_vector
|
||||||
|
element_type: bit
|
||||||
|
dims: 40
|
||||||
|
index: false
|
||||||
|
vector2:
|
||||||
|
type: dense_vector
|
||||||
|
element_type: bit
|
||||||
|
dims: 40
|
||||||
|
index: true
|
||||||
|
similarity: l2_norm
|
||||||
|
|
||||||
|
- do:
|
||||||
|
index:
|
||||||
|
index: test_synthetic_source
|
||||||
|
id: "1"
|
||||||
|
body:
|
||||||
|
name: cow.jpg
|
||||||
|
vector1: [2, -1, 1, 4, -3]
|
||||||
|
vector2: [2, -1, 1, 4, -3]
|
||||||
|
|
||||||
|
- do:
|
||||||
|
indices.refresh: {}
|
||||||
|
|
||||||
|
- do:
|
||||||
|
search:
|
||||||
|
force_synthetic_source: true
|
||||||
|
index: test_synthetic_source
|
||||||
|
|
||||||
|
- match: {hits.hits.0._id: "1"}
|
||||||
|
- match: {hits.hits.0._source.vector1: [2, -1, 1, 4, -3]}
|
||||||
|
- match: {hits.hits.0._source.vector2: [2, -1, 1, 4, -3]}
|
||||||
|
|
|
@ -41,6 +41,7 @@ import org.elasticsearch.simdvec.VectorSimilarityType;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.DYNAMIC_CONFIDENCE_INTERVAL;
|
import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.DYNAMIC_CONFIDENCE_INTERVAL;
|
||||||
|
import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.MAX_DIMS_COUNT;
|
||||||
|
|
||||||
public class ES814ScalarQuantizedVectorsFormat extends FlatVectorsFormat {
|
public class ES814ScalarQuantizedVectorsFormat extends FlatVectorsFormat {
|
||||||
|
|
||||||
|
@ -291,4 +292,9 @@ public class ES814ScalarQuantizedVectorsFormat extends FlatVectorsFormat {
|
||||||
return delegate.getRandomVectorScorer(sim, values, query);
|
return delegate.getRandomVectorScorer(sim, values, query);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getMaxDimensions(String fieldName) {
|
||||||
|
return MAX_DIMS_COUNT;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,6 +25,8 @@ import org.apache.lucene.util.quantization.RandomAccessQuantizedByteVectorValues
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.MAX_DIMS_COUNT;
|
||||||
|
|
||||||
class ES815BitFlatVectorsFormat extends FlatVectorsFormat {
|
class ES815BitFlatVectorsFormat extends FlatVectorsFormat {
|
||||||
|
|
||||||
private static final FlatVectorsFormat delegate = new Lucene99FlatVectorsFormat(FlatBitVectorScorer.INSTANCE);
|
private static final FlatVectorsFormat delegate = new Lucene99FlatVectorsFormat(FlatBitVectorScorer.INSTANCE);
|
||||||
|
@ -43,6 +45,11 @@ class ES815BitFlatVectorsFormat extends FlatVectorsFormat {
|
||||||
return delegate.fieldsReader(segmentReadState);
|
return delegate.fieldsReader(segmentReadState);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getMaxDimensions(String fieldName) {
|
||||||
|
return MAX_DIMS_COUNT;
|
||||||
|
}
|
||||||
|
|
||||||
static class FlatBitVectorScorer implements FlatVectorsScorer {
|
static class FlatBitVectorScorer implements FlatVectorsScorer {
|
||||||
|
|
||||||
static final FlatBitVectorScorer INSTANCE = new FlatBitVectorScorer();
|
static final FlatBitVectorScorer INSTANCE = new FlatBitVectorScorer();
|
||||||
|
|
|
@ -2270,7 +2270,7 @@ public class DenseVectorFieldMapper extends FieldMapper {
|
||||||
if (indexCreatedVersion.onOrAfter(LITTLE_ENDIAN_FLOAT_STORED_INDEX_VERSION)) {
|
if (indexCreatedVersion.onOrAfter(LITTLE_ENDIAN_FLOAT_STORED_INDEX_VERSION)) {
|
||||||
byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
|
byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
|
||||||
}
|
}
|
||||||
int dims = fieldType().dims;
|
int dims = fieldType().elementType == ElementType.BIT ? fieldType().dims / Byte.SIZE : fieldType().dims;
|
||||||
for (int dim = 0; dim < dims; dim++) {
|
for (int dim = 0; dim < dims; dim++) {
|
||||||
fieldType().elementType.readAndWriteValue(byteBuffer, b);
|
fieldType().elementType.readAndWriteValue(byteBuffer, b);
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,6 +20,11 @@ public final class SearchCapabilities {
|
||||||
|
|
||||||
/** Support regex and range match rules in interval queries. */
|
/** Support regex and range match rules in interval queries. */
|
||||||
private static final String RANGE_REGEX_INTERVAL_QUERY_CAPABILITY = "range_regexp_interval_queries";
|
private static final String RANGE_REGEX_INTERVAL_QUERY_CAPABILITY = "range_regexp_interval_queries";
|
||||||
|
/** Support synthetic source with `bit` type in `dense_vector` field when `index` is set to `false`. */
|
||||||
|
private static final String BIT_DENSE_VECTOR_SYNTHETIC_SOURCE_CAPABILITY = "bit_dense_vector_synthetic_source";
|
||||||
|
|
||||||
public static final Set<String> CAPABILITIES = Set.of(RANGE_REGEX_INTERVAL_QUERY_CAPABILITY);
|
public static final Set<String> CAPABILITIES = Set.of(
|
||||||
|
RANGE_REGEX_INTERVAL_QUERY_CAPABILITY,
|
||||||
|
BIT_DENSE_VECTOR_SYNTHETIC_SOURCE_CAPABILITY
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
|
@ -2022,24 +2022,27 @@ public class DenseVectorFieldMapperTests extends MapperTestCase {
|
||||||
|
|
||||||
private static class DenseVectorSyntheticSourceSupport implements SyntheticSourceSupport {
|
private static class DenseVectorSyntheticSourceSupport implements SyntheticSourceSupport {
|
||||||
private final int dims = between(5, 1000);
|
private final int dims = between(5, 1000);
|
||||||
private final ElementType elementType = randomFrom(ElementType.BYTE, ElementType.FLOAT);
|
private final ElementType elementType = randomFrom(ElementType.BYTE, ElementType.FLOAT, ElementType.BIT);
|
||||||
private final boolean indexed = randomBoolean();
|
private final boolean indexed = randomBoolean();
|
||||||
private final boolean indexOptionsSet = indexed && randomBoolean();
|
private final boolean indexOptionsSet = indexed && randomBoolean();
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public SyntheticSourceExample example(int maxValues) throws IOException {
|
public SyntheticSourceExample example(int maxValues) throws IOException {
|
||||||
Object value = elementType == ElementType.BYTE
|
Object value = switch (elementType) {
|
||||||
? randomList(dims, dims, ESTestCase::randomByte)
|
case BYTE, BIT:
|
||||||
: randomList(dims, dims, ESTestCase::randomFloat);
|
yield randomList(dims, dims, ESTestCase::randomByte);
|
||||||
|
case FLOAT:
|
||||||
|
yield randomList(dims, dims, ESTestCase::randomFloat);
|
||||||
|
};
|
||||||
return new SyntheticSourceExample(value, value, this::mapping);
|
return new SyntheticSourceExample(value, value, this::mapping);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void mapping(XContentBuilder b) throws IOException {
|
private void mapping(XContentBuilder b) throws IOException {
|
||||||
b.field("type", "dense_vector");
|
b.field("type", "dense_vector");
|
||||||
b.field("dims", dims);
|
if (elementType == ElementType.BYTE || elementType == ElementType.BIT || randomBoolean()) {
|
||||||
if (elementType == ElementType.BYTE || randomBoolean()) {
|
|
||||||
b.field("element_type", elementType.toString());
|
b.field("element_type", elementType.toString());
|
||||||
}
|
}
|
||||||
|
b.field("dims", elementType == ElementType.BIT ? dims * Byte.SIZE : dims);
|
||||||
if (indexed) {
|
if (indexed) {
|
||||||
b.field("index", true);
|
b.field("index", true);
|
||||||
b.field("similarity", "l2_norm");
|
b.field("similarity", "l2_norm");
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue