TSDB doc values: build jump table inline (#126499)
Build the jump table (DISI) while iterating over the SortedNumericDocValues to encode the values, instead of iterating over the SortedNumericDocValues a second time just to build the jump table. When index sorting is active, this requires an additional merge sort. Follow-up to #125403.
This commit is contained in:
parent
4248c9908c
commit
0d41e9a2a5
4 changed files with 955 additions and 143 deletions
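
In miniature, the change replaces a two-pass write with a single pass. The sketch below is illustrative only, based on the DISIAccumulator API introduced in this PR; encodeValue(...) is a hypothetical placeholder for the value-encoding work the consumer already performs:

    // Before: a second iteration over the doc values just for the jump table.
    SortedNumericDocValues values = valuesProducer.getSortedNumeric(field);
    for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
        encodeValue(values); // hypothetical placeholder for encoding
    }
    values = valuesProducer.getSortedNumeric(field); // second pass
    short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);

    // After: the accumulator records each docID while the values are encoded.
    DISIAccumulator disi = new DISIAccumulator(dir, context, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
    values = valuesProducer.getSortedNumeric(field);
    for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
        disi.addDocId(doc);  // jump-table bookkeeping happens inline
        encodeValue(values); // hypothetical placeholder for encoding
    }
    short jumpTableEntryCountInline = disi.build(data); // appends the DISI blocks and the jump table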
@@ -39,7 +39,6 @@ import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Threads;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.profile.AsyncProfiler;

@@ -51,9 +50,8 @@ import org.openjdk.jmh.runner.options.OptionsBuilder;
import java.io.IOException;
import java.nio.file.Files;
import java.util.Random;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;

@BenchmarkMode(Mode.SingleShotTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)

@@ -71,23 +69,10 @@ public class TSDBDocValuesMergeBenchmark {
        LogConfigurator.setNodeName("test");
    }

    @Param("20431204")
    private int nDocs;

    @Param("1000")
    private int deltaTime;

    @Param("42")
    private int seed;

    private static final String TIMESTAMP_FIELD = "@timestamp";
    private static final String HOSTNAME_FIELD = "host.name";
    private static final long BASE_TIMESTAMP = 1704067200000L;

    private IndexWriter indexWriterWithoutOptimizedMerge;
    private IndexWriter indexWriterWithOptimizedMerge;
    private ExecutorService executorService;

    public static void main(String[] args) throws RunnerException {
        final Options options = new OptionsBuilder().include(TSDBDocValuesMergeBenchmark.class.getSimpleName())
            .addProfiler(AsyncProfiler.class)

@@ -96,78 +81,168 @@ public class TSDBDocValuesMergeBenchmark {
        new Runner(options).run();
    }

    @Setup(Level.Trial)
    public void setup() throws IOException {
        executorService = Executors.newSingleThreadExecutor();

    @State(Scope.Benchmark)
    public static class StateDenseWithoutOptimizeMerge {

        final Directory tempDirectoryWithoutDocValuesSkipper = FSDirectory.open(Files.createTempDirectory("temp1-"));
        final Directory tempDirectoryWithDocValuesSkipper = FSDirectory.open(Files.createTempDirectory("temp2-"));

        @Param("20431204")
        private int nDocs;

        @Param("1000")
        private int deltaTime;

        @Param("42")
        private int seed;

        private Directory directory;
        private final Supplier<IndexWriterConfig> iwc = () -> createIndexWriterConfig(false);

        @Setup(Level.Trial)
        public void setup() throws IOException {
            directory = FSDirectory.open(Files.createTempDirectory("temp2-"));
            createIndex(directory, iwc.get(), false, nDocs, deltaTime, seed);
        }

        indexWriterWithoutOptimizedMerge = createIndex(tempDirectoryWithoutDocValuesSkipper, false);
        indexWriterWithOptimizedMerge = createIndex(tempDirectoryWithDocValuesSkipper, true);
    }

    private IndexWriter createIndex(final Directory directory, final boolean optimizedMergeEnabled) throws IOException {
        final var iwc = createIndexWriterConfig(optimizedMergeEnabled);

    @Benchmark
    public void forceMergeDenseWithoutOptimizedMerge(StateDenseWithoutOptimizeMerge state) throws IOException {
        forceMerge(state.directory, state.iwc.get());
    }

    @State(Scope.Benchmark)
    public static class StateDenseWithOptimizeMerge {

        @Param("20431204")
        private int nDocs;

        @Param("1000")
        private int deltaTime;

        @Param("42")
        private int seed;

        private Directory directory;
        private final Supplier<IndexWriterConfig> iwc = () -> createIndexWriterConfig(true);

        @Setup(Level.Trial)
        public void setup() throws IOException {
            directory = FSDirectory.open(Files.createTempDirectory("temp1-"));
            createIndex(directory, iwc.get(), false, nDocs, deltaTime, seed);
        }

    }

    @Benchmark
    public void forceMergeDenseWithOptimizedMerge(StateDenseWithOptimizeMerge state) throws IOException {
        forceMerge(state.directory, state.iwc.get());
    }

    @State(Scope.Benchmark)
    public static class StateSparseWithoutOptimizeMerge {

        @Param("20431204")
        private int nDocs;

        @Param("1000")
        private int deltaTime;

        @Param("42")
        private int seed;

        private Directory directory;
        private final Supplier<IndexWriterConfig> iwc = () -> createIndexWriterConfig(false);

        @Setup(Level.Trial)
        public void setup() throws IOException {
            directory = FSDirectory.open(Files.createTempDirectory("temp4-"));
            createIndex(directory, iwc.get(), true, nDocs, deltaTime, seed);
        }

    }

    @Benchmark
    public void forceMergeSparseWithoutOptimizedMerge(StateSparseWithoutOptimizeMerge state) throws IOException {
        forceMerge(state.directory, state.iwc.get());
    }

    @State(Scope.Benchmark)
    public static class StateSparseWithOptimizeMerge {

        @Param("20431204")
        private int nDocs;

        @Param("1000")
        private int deltaTime;

        @Param("42")
        private int seed;

        private Directory directory;
        private final Supplier<IndexWriterConfig> iwc = () -> createIndexWriterConfig(true);

        @Setup(Level.Trial)
        public void setup() throws IOException {
            directory = FSDirectory.open(Files.createTempDirectory("temp3-"));
            createIndex(directory, iwc.get(), true, nDocs, deltaTime, seed);
        }

    }

    @Benchmark
    public void forceMergeSparseWithOptimizedMerge(StateSparseWithOptimizeMerge state) throws IOException {
        forceMerge(state.directory, state.iwc.get());
    }

    private void forceMerge(Directory directory, IndexWriterConfig config) throws IOException {
        try (var indexWriter = new IndexWriter(directory, config)) {
            indexWriter.forceMerge(1);
        }
    }

    static void createIndex(Directory directory, IndexWriterConfig iwc, boolean sparse, int nDocs, int deltaTime, int seed)
        throws IOException {
        long counter1 = 0;
        long counter2 = 10_000_000;
        long[] gauge1Values = new long[] { 2, 4, 6, 8, 10, 12, 14, 16 };
        long[] gauge2Values = new long[] { -2, -4, -6, -8, -10, -12, -14, -16 };
        int numHosts = 1000;
        int numHosts = 10000;
        String[] tags = new String[] { "tag_1", "tag_2", "tag_3", "tag_4", "tag_5", "tag_6", "tag_7", "tag_8" };

        final Random random = new Random(seed);
        IndexWriter indexWriter = new IndexWriter(directory, iwc);
        for (int i = 0; i < nDocs; i++) {
            final Document doc = new Document();
        try (var indexWriter = new IndexWriter(directory, iwc)) {
            for (int i = 0; i < nDocs; i++) {
                final Document doc = new Document();

                final int batchIndex = i / numHosts;
                final String hostName = "host-" + batchIndex;
                // Slightly vary the timestamp in each document
                final long timestamp = BASE_TIMESTAMP + ((i % numHosts) * deltaTime) + random.nextInt(0, deltaTime);
                final int batchIndex = i % numHosts;
                final String hostName = "host-" + batchIndex;
                // Slightly vary the timestamp in each document
                final long timestamp = BASE_TIMESTAMP + ((i % numHosts) * deltaTime) + random.nextInt(0, deltaTime);

                doc.add(new SortedDocValuesField(HOSTNAME_FIELD, new BytesRef(hostName)));
                doc.add(new SortedNumericDocValuesField(TIMESTAMP_FIELD, timestamp));
                doc.add(new SortedNumericDocValuesField("counter_1", counter1++));
                doc.add(new SortedNumericDocValuesField("counter_2", counter2++));
                doc.add(new SortedNumericDocValuesField("gauge_1", gauge1Values[i % gauge1Values.length]));
                doc.add(new SortedNumericDocValuesField("gauge_2", gauge2Values[i % gauge1Values.length]));
                int numTags = tags.length % (i + 1);
                for (int j = 0; j < numTags; j++) {
                    doc.add(new SortedSetDocValuesField("tags", new BytesRef(tags[j])));
                }

                indexWriter.addDocument(doc);
            }
            indexWriter.commit();
            return indexWriter;
        }

    @Benchmark
    public void forceMergeWithoutOptimizedMerge() throws IOException {
        forceMerge(indexWriterWithoutOptimizedMerge);
    }

    @Benchmark
    public void forceMergeWithOptimizedMerge() throws IOException {
        forceMerge(indexWriterWithOptimizedMerge);
    }

    private void forceMerge(final IndexWriter indexWriter) throws IOException {
        indexWriter.forceMerge(1);
    }

    @TearDown(Level.Trial)
    public void tearDown() {
        if (executorService != null) {
            executorService.shutdown();
            try {
                if (executorService.awaitTermination(30, TimeUnit.SECONDS) == false) {
                    executorService.shutdownNow();
                doc.add(new SortedDocValuesField(HOSTNAME_FIELD, new BytesRef(hostName)));
                doc.add(new SortedNumericDocValuesField(TIMESTAMP_FIELD, timestamp));
                if (sparse == false || random.nextBoolean()) {
                    doc.add(new SortedNumericDocValuesField("counter_1", counter1++));
                }
                if (sparse == false || random.nextBoolean()) {
                    doc.add(new SortedNumericDocValuesField("counter_2", counter2++));
                }
                if (sparse == false || random.nextBoolean()) {
                    doc.add(new SortedNumericDocValuesField("gauge_1", gauge1Values[i % gauge1Values.length]));
                }
                if (sparse == false || random.nextBoolean()) {
                    doc.add(new SortedNumericDocValuesField("gauge_2", gauge2Values[i % gauge1Values.length]));
                }
                if (sparse == false || random.nextBoolean()) {
                    int numTags = tags.length % (i + 1);
                    for (int j = 0; j < numTags; j++) {
                        doc.add(new SortedSetDocValuesField("tags", new BytesRef(tags[j])));
                    }
                }
                indexWriter.addDocument(doc);

                if (i % 10000 == 0) {
                    indexWriter.commit();
                }
            } catch (InterruptedException e) {
                executorService.shutdownNow();
                Thread.currentThread().interrupt();
            }
        }
    }

@@ -0,0 +1,196 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the "Elastic License
 * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
 * Public License v 1"; you may not use this file except in compliance with, at
 * your election, the "Elastic License 2.0", the "GNU Affero General Public
 * License v3.0 only", or the "Server Side Public License, v 1".
 */

package org.elasticsearch.index.codec.tsdb.es819;

import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.core.SuppressForbidden;

import java.io.Closeable;
import java.io.IOException;

/**
 * Fork of {@link org.apache.lucene.codecs.lucene90.IndexedDISI#writeBitSet(DocIdSetIterator, IndexOutput)} that allows
 * building the jump table iteratively, one docID at a time, instead of relying on a DocIdSetIterator.
 */
final class DISIAccumulator implements Closeable {

    private static final int BLOCK_SIZE = 65536; // The number of docIDs that a single block represents

    private static final int DENSE_BLOCK_LONGS = BLOCK_SIZE / Long.SIZE; // 1024
    public static final byte DEFAULT_DENSE_RANK_POWER = 9; // Every 512 docIDs / 8 longs

    static final int MAX_ARRAY_LENGTH = (1 << 12) - 1;

    final Directory dir;
    final IOContext context;
    final String skipListTempFileName;
    final IndexOutput disiTempOutput;
    final byte denseRankPower;
    final long origo;

    int totalCardinality = 0;
    int blockCardinality = 0;
    final FixedBitSet buffer = new FixedBitSet(1 << 16);
    int[] jumps = new int[ArrayUtil.oversize(1, Integer.BYTES * 2)];
    int prevBlock = -1;
    int jumpBlockIndex = 0;

    DISIAccumulator(Directory dir, IOContext context, IndexOutput data, byte denseRankPower) throws IOException {
        this.dir = dir;
        this.context = context;
        this.denseRankPower = denseRankPower;
        if ((denseRankPower < 7 || denseRankPower > 15) && denseRankPower != -1) {
            throw new IllegalArgumentException(
                "Acceptable values for denseRankPower are 7-15 (every 128-32768 docIDs). "
                    + "The provided power was "
                    + denseRankPower
                    + " (every "
                    + (int) Math.pow(2, denseRankPower)
                    + " docIDs)"
            );
        }
        this.disiTempOutput = dir.createTempOutput(data.getName(), "disi", context);
        this.skipListTempFileName = disiTempOutput.getName();
        this.origo = disiTempOutput.getFilePointer(); // All jumps are relative to the origo
    }

    void addDocId(int doc) throws IOException {
        final int block = doc >>> 16;
        if (prevBlock != -1 && block != prevBlock) {
            // Track offset+index from previous block up to current
            jumps = addJumps(jumps, disiTempOutput.getFilePointer() - origo, totalCardinality, jumpBlockIndex, prevBlock + 1);
            jumpBlockIndex = prevBlock + 1;
            // Flush block
            flush(prevBlock, buffer, blockCardinality, denseRankPower, disiTempOutput);
            // Reset for next block
            buffer.clear();
            totalCardinality += blockCardinality;
            blockCardinality = 0;
        }
        buffer.set(doc & 0xFFFF);
        blockCardinality++;
        prevBlock = block;
    }

    short build(IndexOutput data) throws IOException {
        if (blockCardinality > 0) {
            jumps = addJumps(jumps, disiTempOutput.getFilePointer() - origo, totalCardinality, jumpBlockIndex, prevBlock + 1);
            totalCardinality += blockCardinality;
            flush(prevBlock, buffer, blockCardinality, denseRankPower, disiTempOutput);
            buffer.clear();
            prevBlock++;
        }
        final int lastBlock = prevBlock == -1 ? 0 : prevBlock; // There will always be at least 1 block (NO_MORE_DOCS)
        // Last entry is a SPARSE with blockIndex == 32767 and the single entry 65535, which becomes the docID NO_MORE_DOCS.
        // To avoid creating 65K jump-table entries, only a single entry is created pointing to the offset of the
        // NO_MORE_DOCS block, with the jumpBlockIndex set to the logical EMPTY block after all real blocks.
        jumps = addJumps(jumps, disiTempOutput.getFilePointer() - origo, totalCardinality, lastBlock, lastBlock + 1);
        buffer.set(DocIdSetIterator.NO_MORE_DOCS & 0xFFFF);
        flush(DocIdSetIterator.NO_MORE_DOCS >>> 16, buffer, 1, denseRankPower, disiTempOutput);
        // offset+index jump-table stored at the end
        short blockCount = flushBlockJumps(jumps, lastBlock + 1, disiTempOutput);
        disiTempOutput.close();
        try (var addressDataInput = dir.openInput(skipListTempFileName, context)) {
            data.copyBytes(addressDataInput, addressDataInput.length());
        }
        return blockCount;
    }

    // Adds entries to the offset & index jump-table for blocks
    private static int[] addJumps(int[] jumps, long offset, int index, int startBlock, int endBlock) {
        assert offset < Integer.MAX_VALUE : "Logically the offset should not exceed 2^30 but was >= Integer.MAX_VALUE";
        jumps = ArrayUtil.grow(jumps, (endBlock + 1) * 2);
        for (int b = startBlock; b < endBlock; b++) {
            jumps[b * 2] = index;
            jumps[b * 2 + 1] = (int) offset;
        }
        return jumps;
    }

    private static void flush(int block, FixedBitSet buffer, int cardinality, byte denseRankPower, IndexOutput out) throws IOException {
        assert block >= 0 && block < BLOCK_SIZE;
        out.writeShort((short) block);
        assert cardinality > 0 && cardinality <= BLOCK_SIZE;
        out.writeShort((short) (cardinality - 1));
        if (cardinality > MAX_ARRAY_LENGTH) {
            if (cardinality != BLOCK_SIZE) { // all docs are set
                if (denseRankPower != -1) {
                    final byte[] rank = createRank(buffer, denseRankPower);
                    out.writeBytes(rank, rank.length);
                }
                for (long word : buffer.getBits()) {
                    out.writeLong(word);
                }
            }
        } else {
            BitSetIterator it = new BitSetIterator(buffer, cardinality);
            for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
                out.writeShort((short) doc);
            }
        }
    }

    // Flushes the offset & index jump-table for blocks. This should be the last data written to out.
    // This method returns the blockCount for the blocks reachable for the jump_table, or -1 for no jump-table.
    private static short flushBlockJumps(int[] jumps, int blockCount, IndexOutput out) throws IOException {
        if (blockCount == 2) { // Jumps with a single real entry + NO_MORE_DOCS is just wasted space, so we ignore that
            blockCount = 0;
        }
        for (int i = 0; i < blockCount; i++) {
            out.writeInt(jumps[i * 2]); // index
            out.writeInt(jumps[i * 2 + 1]); // offset
        }
        // As there are at most 32k blocks, the count is a short
        // The jumpTableOffset will be at lastPos - (blockCount * Long.BYTES)
        return (short) blockCount;
    }

    // Creates a DENSE rank-entry (the number of set bits up to a given point) for the buffer.
    // One rank-entry for every {@code 2^denseRankPower} bits, with each rank-entry using 2 bytes.
    // Represented as a byte[] for fast flushing and mirroring of the retrieval representation.
    private static byte[] createRank(FixedBitSet buffer, byte denseRankPower) {
        final int longsPerRank = 1 << (denseRankPower - 6);
        final int rankMark = longsPerRank - 1;
        final int rankIndexShift = denseRankPower - 7; // 6 for the long (2^6) + 1 for 2 bytes/entry
        final byte[] rank = new byte[DENSE_BLOCK_LONGS >> rankIndexShift];
        final long[] bits = buffer.getBits();
        int bitCount = 0;
        for (int word = 0; word < DENSE_BLOCK_LONGS; word++) {
            if ((word & rankMark) == 0) { // Every longsPerRank longs
                rank[word >> rankIndexShift] = (byte) (bitCount >> 8);
                rank[(word >> rankIndexShift) + 1] = (byte) (bitCount & 0xFF);
            }
            bitCount += Long.bitCount(bits[word]);
        }
        return rank;
    }

    @Override
    @SuppressForbidden(reason = "require usage of Lucene's IOUtils#deleteFilesIgnoringExceptions(...)")
    public void close() throws IOException {
        IOUtils.close(disiTempOutput);
        if (skipListTempFileName != null) {
            IOUtils.deleteFilesIgnoringExceptions(dir, skipListTempFileName);
        }
    }

}

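To make the bookkeeping above concrete: addDocId splits each docID into a block number (the upper 16 bits) and an in-block bit (the lower 16 bits), and a block is flushed only when the first docID of a later block arrives. A small worked example, purely illustrative:

    int doc = 131077;            // = 2 * 65536 + 5
    int block = doc >>> 16;      // 2: the third 65536-docID block
    int inBlock = doc & 0xFFFF;  // 5: bit 5 of that block's FixedBitSet buffer
    // When a docID with block == 3 arrives, block 2 is flushed: with at most
    // MAX_ARRAY_LENGTH (4095) set bits it is written as SPARSE (one short per doc),
    // otherwise as DENSE (optional rank bytes plus 1024 raw longs); a completely
    // full block is written as header only, with no payload.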
@@ -29,6 +29,8 @@ import org.apache.lucene.search.SortedSetSelector;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.ByteBuffersIndexOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;

@@ -54,6 +56,8 @@ import static org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesFormat.

final class ES819TSDBDocValuesConsumer extends XDocValuesConsumer {

    final Directory dir;
    final IOContext context;
    IndexOutput data, meta;
    final int maxDoc;
    private byte[] termsDictBuffer;

@@ -70,6 +74,8 @@ final class ES819TSDBDocValuesConsumer extends XDocValuesConsumer {
        String metaExtension
    ) throws IOException {
        this.termsDictBuffer = new byte[1 << 14];
        this.dir = state.directory;
        this.context = state.context;
        boolean success = false;
        try {
            final String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);

@@ -138,84 +144,101 @@ final class ES819TSDBDocValuesConsumer extends XDocValuesConsumer {
        meta.writeLong(numValues);
        meta.writeInt(numDocsWithValue);

        if (numValues > 0) {
            // Special case for maxOrd of 1, signal -1 that no blocks will be written
            meta.writeInt(maxOrd != 1 ? ES819TSDBDocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT : -1);
            final ByteBuffersDataOutput indexOut = new ByteBuffersDataOutput();
            final DirectMonotonicWriter indexWriter = DirectMonotonicWriter.getInstance(
                meta,
                new ByteBuffersIndexOutput(indexOut, "temp-dv-index", "temp-dv-index"),
                1L + ((numValues - 1) >>> ES819TSDBDocValuesFormat.NUMERIC_BLOCK_SHIFT),
                ES819TSDBDocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT
            );
        DISIAccumulator disiAccumulator = null;
        try {
            if (numValues > 0) {
                assert numDocsWithValue > 0;
                // Special case for maxOrd of 1, signal -1 that no blocks will be written
                meta.writeInt(maxOrd != 1 ? ES819TSDBDocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT : -1);
                final ByteBuffersDataOutput indexOut = new ByteBuffersDataOutput();
                final DirectMonotonicWriter indexWriter = DirectMonotonicWriter.getInstance(
                    meta,
                    new ByteBuffersIndexOutput(indexOut, "temp-dv-index", "temp-dv-index"),
                    1L + ((numValues - 1) >>> ES819TSDBDocValuesFormat.NUMERIC_BLOCK_SHIFT),
                    ES819TSDBDocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT
                );

            final long valuesDataOffset = data.getFilePointer();
            // Special case for maxOrd of 1, skip writing the blocks
            if (maxOrd != 1) {
                final long[] buffer = new long[ES819TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE];
                int bufferSize = 0;
                final TSDBDocValuesEncoder encoder = new TSDBDocValuesEncoder(ES819TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE);
                values = valuesProducer.getSortedNumeric(field);
                final int bitsPerOrd = maxOrd >= 0 ? PackedInts.bitsRequired(maxOrd - 1) : -1;
                for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
                    final int count = values.docValueCount();
                    for (int i = 0; i < count; ++i) {
                        buffer[bufferSize++] = values.nextValue();
                        if (bufferSize == ES819TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE) {
                            indexWriter.add(data.getFilePointer() - valuesDataOffset);
                            if (maxOrd >= 0) {
                                encoder.encodeOrdinals(buffer, data, bitsPerOrd);
                            } else {
                                encoder.encode(buffer, data);
                final long valuesDataOffset = data.getFilePointer();
                // Special case for maxOrd of 1, skip writing the blocks
                if (maxOrd != 1) {
                    final long[] buffer = new long[ES819TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE];
                    int bufferSize = 0;
                    final TSDBDocValuesEncoder encoder = new TSDBDocValuesEncoder(ES819TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE);
                    values = valuesProducer.getSortedNumeric(field);
                    final int bitsPerOrd = maxOrd >= 0 ? PackedInts.bitsRequired(maxOrd - 1) : -1;
                    if (enableOptimizedMerge && numDocsWithValue < maxDoc) {
                        disiAccumulator = new DISIAccumulator(dir, context, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
                    }
                    for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
                        if (disiAccumulator != null) {
                            disiAccumulator.addDocId(doc);
                        }
                        final int count = values.docValueCount();
                        for (int i = 0; i < count; ++i) {
                            buffer[bufferSize++] = values.nextValue();
                            if (bufferSize == ES819TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE) {
                                indexWriter.add(data.getFilePointer() - valuesDataOffset);
                                if (maxOrd >= 0) {
                                    encoder.encodeOrdinals(buffer, data, bitsPerOrd);
                                } else {
                                    encoder.encode(buffer, data);
                                }
                                bufferSize = 0;
                            }
                            bufferSize = 0;
                        }
                    }
                    if (bufferSize > 0) {
                        indexWriter.add(data.getFilePointer() - valuesDataOffset);
                        // Fill unused slots in the block with zeroes rather than junk
                        Arrays.fill(buffer, bufferSize, ES819TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE, 0L);
                        if (maxOrd >= 0) {
                            encoder.encodeOrdinals(buffer, data, bitsPerOrd);
                        } else {
                            encoder.encode(buffer, data);
                        }
                    }
                }
                if (bufferSize > 0) {
                    indexWriter.add(data.getFilePointer() - valuesDataOffset);
                    // Fill unused slots in the block with zeroes rather than junk
                    Arrays.fill(buffer, bufferSize, ES819TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE, 0L);
                    if (maxOrd >= 0) {
                        encoder.encodeOrdinals(buffer, data, bitsPerOrd);
                    } else {
                        encoder.encode(buffer, data);
                    }

                final long valuesDataLength = data.getFilePointer() - valuesDataOffset;
                if (maxOrd != 1) {
                    // Special case for maxOrd of 1, indexWriter isn't really used, so no need to invoke finish() method.
                    indexWriter.finish();
                }
                final long indexDataOffset = data.getFilePointer();
                data.copyBytes(indexOut.toDataInput(), indexOut.size());
                meta.writeLong(indexDataOffset);
                meta.writeLong(data.getFilePointer() - indexDataOffset);

                meta.writeLong(valuesDataOffset);
                meta.writeLong(valuesDataLength);
            }

            final long valuesDataLength = data.getFilePointer() - valuesDataOffset;
            if (maxOrd != 1) {
                // Special case for maxOrd of 1, indexWriter isn't really used, so no need to invoke finish() method.
                indexWriter.finish();
            if (numDocsWithValue == 0) { // meta[-2, 0]: No documents with values
                meta.writeLong(-2); // docsWithFieldOffset
                meta.writeLong(0L); // docsWithFieldLength
                meta.writeShort((short) -1); // jumpTableEntryCount
                meta.writeByte((byte) -1); // denseRankPower
            } else if (numDocsWithValue == maxDoc) { // meta[-1, 0]: All documents have values
                meta.writeLong(-1); // docsWithFieldOffset
                meta.writeLong(0L); // docsWithFieldLength
                meta.writeShort((short) -1); // jumpTableEntryCount
                meta.writeByte((byte) -1); // denseRankPower
            } else { // meta[data.offset, data.length]: IndexedDISI structure for documents with values
                long offset = data.getFilePointer();
                meta.writeLong(offset); // docsWithFieldOffset
                final short jumpTableEntryCount;
                if (maxOrd != 1 && disiAccumulator != null) {
                    jumpTableEntryCount = disiAccumulator.build(data);
                } else {
                    values = valuesProducer.getSortedNumeric(field);
                    jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
                }
                meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
                meta.writeShort(jumpTableEntryCount);
                meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
            }
            final long indexDataOffset = data.getFilePointer();
            data.copyBytes(indexOut.toDataInput(), indexOut.size());
            meta.writeLong(indexDataOffset);
            meta.writeLong(data.getFilePointer() - indexDataOffset);

            meta.writeLong(valuesDataOffset);
            meta.writeLong(valuesDataLength);
        }

        if (numDocsWithValue == 0) { // meta[-2, 0]: No documents with values
            meta.writeLong(-2); // docsWithFieldOffset
            meta.writeLong(0L); // docsWithFieldLength
            meta.writeShort((short) -1); // jumpTableEntryCount
            meta.writeByte((byte) -1); // denseRankPower
        } else if (numDocsWithValue == maxDoc) { // meta[-1, 0]: All documents have values
            meta.writeLong(-1); // docsWithFieldOffset
            meta.writeLong(0L); // docsWithFieldLength
            meta.writeShort((short) -1); // jumpTableEntryCount
            meta.writeByte((byte) -1); // denseRankPower
        } else { // meta[data.offset, data.length]: IndexedDISI structure for documents with values
            long offset = data.getFilePointer();
            meta.writeLong(offset); // docsWithFieldOffset
            values = valuesProducer.getSortedNumeric(field);
            final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
            meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
            meta.writeShort(jumpTableEntryCount);
            meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
        } finally {
            IOUtils.close(disiAccumulator);
        }

        return new long[] { numDocsWithValue, numValues };

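The new test file that follows exercises the accumulator through a small writeJumpTable(...) helper; stripped of the test scaffolding, the essential call pattern is:

    // Essential pattern from the writeJumpTable(...) helper in the tests below.
    var disiAccumulator = new DISIAccumulator(dir, IOContext.DEFAULT, out, denseRankPower);
    var iterator = new BitSetIterator(set, cost);
    for (int docId = iterator.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = iterator.nextDoc()) {
        disiAccumulator.addDocId(docId); // docIDs are fed in increasing order
    }
    short jumpTableEntryCount = disiAccumulator.build(out); // the written data is then readable through IndexedDISI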
@@ -0,0 +1,518 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the "Elastic License
 * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
 * Public License v 1"; you may not use this file except in compliance with, at
 * your election, the "Elastic License 2.0", the "GNU Affero General Public
 * License v3.0 only", or the "Server Side Public License, v 1".
 */
package org.elasticsearch.index.codec.tsdb.es819;

import org.apache.lucene.codecs.lucene90.IndexedDISI;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.SparseFixedBitSet;
import org.elasticsearch.core.SuppressForbidden;

import java.io.IOException;
import java.util.Random;

// Copied from org.apache.lucene.codecs.lucene90.TestIndexedDISI, keeping the tests that we can run.
// The test suite has been modified to write the jump table using writeJumpTable(...) in this class.
// (Some original tests require access to a package-protected constructor of IndexedDISI and were removed.)
public class DISIAccumulatorTests extends LuceneTestCase {

    public void testEmpty() throws IOException {
        int maxDoc = TestUtil.nextInt(random(), 1, 100000);
        BitSet set = new SparseFixedBitSet(maxDoc);
        try (Directory dir = newDirectory()) {
            doTest(set, dir);
        }
    }

    // EMPTY blocks are special with regard to jumps as they have size 0
    public void testEmptyBlocks() throws IOException {
        final int B = 65536;
        int maxDoc = B * 11;
        BitSet set = new SparseFixedBitSet(maxDoc);
        // block 0: EMPTY
        set.set(B + 5); // block 1: SPARSE
        // block 2: EMPTY
        // block 3: EMPTY
        set.set(B * 4 + 5); // block 4: SPARSE

        for (int i = 0; i < B; i++) {
            set.set(B * 6 + i); // block 6: ALL
        }
        for (int i = 0; i < B; i += 3) {
            set.set(B * 7 + i); // block 7: DENSE
        }
        for (int i = 0; i < B; i++) {
            if (i != 32768) {
                set.set(B * 8 + i); // block 8: DENSE (all-1)
            }
        }
        // block 9-11: EMPTY

        try (Directory dir = newDirectory()) {
            doTestAllSingleJump(set, dir);
        }

        // Change the first block to DENSE to see if jump-tables sets to position 0
        set.set(0);
        try (Directory dir = newDirectory()) {
            doTestAllSingleJump(set, dir);
        }
    }

    // EMPTY blocks are special with regard to jumps as they have size 0
    public void testLastEmptyBlocks() throws IOException {
        final int B = 65536;
        int maxDoc = B * 3;
        BitSet set = new SparseFixedBitSet(maxDoc);
        for (int docID = 0; docID < B * 2; docID++) { // first 2 blocks are ALL
            set.set(docID);
        }
        // Last block is EMPTY

        try (Directory dir = newDirectory()) {
            doTestAllSingleJump(set, dir);
            assertAdvanceBeyondEnd(set, dir);
        }
    }

    // Checks that advance after the end of the blocks has been reached has the correct behaviour
    private void assertAdvanceBeyondEnd(BitSet set, Directory dir) throws IOException {
        final int cardinality = set.cardinality();
        final byte denseRankPower = 9; // Not tested here so fixed to isolate factors
        int jumpTableentryCount;
        try (IndexOutput out = dir.createOutput("bar", IOContext.DEFAULT)) {
            jumpTableentryCount = writeJumpTable(set, dir, out, denseRankPower);
        }

        try (IndexInput in = dir.openInput("bar", IOContext.DEFAULT)) {
            BitSetIterator disi2 = new BitSetIterator(set, cardinality);
            int doc = disi2.docID();
            int index = 0;
            while (doc < cardinality) {
                doc = disi2.nextDoc();
                index++;
            }

            IndexedDISI disi = new IndexedDISI(in, 0L, in.length(), jumpTableentryCount, denseRankPower, cardinality);
            // Advance 1 docID beyond end
            assertFalse("There should be no set bit beyond the valid docID range", disi.advanceExact(set.length()));
            disi.advance(doc); // Should be the special docID signifying NO_MORE_DOCS from the BitSetIterator
            // disi.index()+1 as the while-loop also counts the NO_MORE_DOCS
            assertEquals("The index when advancing beyond the last defined docID should be correct", index, disi.index() + 1);
        }
    }

    // TODO: can this be toned down?
    public void testRandomBlocks() throws IOException {
        final int BLOCKS = 5;
        BitSet set = createSetWithRandomBlocks(BLOCKS);
        try (Directory dir = newDirectory()) {
            doTestAllSingleJump(set, dir);
        }
    }

    private BitSet createSetWithRandomBlocks(int blockCount) {
        final int B = 65536;
        BitSet set = new SparseFixedBitSet(blockCount * B);
        for (int block = 0; block < blockCount; block++) {
            switch (random().nextInt(4)) {
                case 0: { // EMPTY
                    break;
                }
                case 1: { // ALL
                    for (int docID = block * B; docID < (block + 1) * B; docID++) {
                        set.set(docID);
                    }
                    break;
                }
                case 2: { // SPARSE ( < 4096 )
                    for (int docID = block * B; docID < (block + 1) * B; docID += 101) {
                        set.set(docID);
                    }
                    break;
                }
                case 3: { // DENSE ( >= 4096 )
                    for (int docID = block * B; docID < (block + 1) * B; docID += 3) {
                        set.set(docID);
                    }
                    break;
                }
                default:
                    throw new IllegalStateException("Modulo logic error: there should only be 4 possibilities");
            }
        }
        return set;
    }

    private void doTestAllSingleJump(BitSet set, Directory dir) throws IOException {
        final int cardinality = set.cardinality();
        final byte denseRankPower = rarely() ? -1 : (byte) (random().nextInt(7) + 7); // sane + chance of disable
        long length;
        int jumpTableentryCount;
        try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) {
            jumpTableentryCount = writeJumpTable(set, dir, out, denseRankPower);
            length = out.getFilePointer();
        }

        try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
            for (int i = 0; i < set.length(); i++) {
                IndexedDISI disi = new IndexedDISI(in, 0L, length, jumpTableentryCount, denseRankPower, cardinality);
                assertEquals("The bit at " + i + " should be correct with advanceExact", set.get(i), disi.advanceExact(i));

                IndexedDISI disi2 = new IndexedDISI(in, 0L, length, jumpTableentryCount, denseRankPower, cardinality);
                disi2.advance(i);
                // Proper sanity check with jump tables as an error could make them seek backwards
                assertTrue("The docID should at least be " + i + " after advance(" + i + ") but was " + disi2.docID(), i <= disi2.docID());
                if (set.get(i)) {
                    assertEquals("The docID should be present with advance", i, disi2.docID());
                } else {
                    assertNotSame("The docID should not be present with advance", i, disi2.docID());
                }
            }
        }
    }

    public void testOneDoc() throws IOException {
        int maxDoc = TestUtil.nextInt(random(), 1, 100000);
        BitSet set = new SparseFixedBitSet(maxDoc);
        set.set(random().nextInt(maxDoc));
        try (Directory dir = newDirectory()) {
            doTest(set, dir);
        }
    }

    public void testTwoDocs() throws IOException {
        int maxDoc = TestUtil.nextInt(random(), 1, 100000);
        BitSet set = new SparseFixedBitSet(maxDoc);
        set.set(random().nextInt(maxDoc));
        set.set(random().nextInt(maxDoc));
        try (Directory dir = newDirectory()) {
            doTest(set, dir);
        }
    }

    public void testAllDocs() throws IOException {
        int maxDoc = TestUtil.nextInt(random(), 1, 100000);
        FixedBitSet set = new FixedBitSet(maxDoc);
        set.set(1, maxDoc);
        try (Directory dir = newDirectory()) {
            doTest(set, dir);
        }
    }

    public void testHalfFull() throws IOException {
        int maxDoc = TestUtil.nextInt(random(), 1, 100000);
        BitSet set = new SparseFixedBitSet(maxDoc);
        for (int i = random().nextInt(2); i < maxDoc; i += TestUtil.nextInt(random(), 1, 3)) {
            set.set(i);
        }
        try (Directory dir = newDirectory()) {
            doTest(set, dir);
        }
    }

    public void testDocRange() throws IOException {
        try (Directory dir = newDirectory()) {
            for (int iter = 0; iter < 10; ++iter) {
                int maxDoc = TestUtil.nextInt(random(), 1, 1000000);
                FixedBitSet set = new FixedBitSet(maxDoc);
                final int start = random().nextInt(maxDoc);
                final int end = TestUtil.nextInt(random(), start + 1, maxDoc);
                set.set(start, end);
                doTest(set, dir);
            }
        }
    }

    public void testSparseDenseBoundary() throws IOException, NoSuchFieldException, IllegalAccessException {
        try (Directory dir = newDirectory()) {
            FixedBitSet set = new FixedBitSet(200000);
            int start = 65536 + random().nextInt(100);
            final byte denseRankPower = rarely() ? -1 : (byte) (random().nextInt(7) + 7); // sane + chance of disable

            // we set MAX_ARRAY_LENGTH bits so the encoding will be sparse
            set.set(start, start + DISIAccumulator.MAX_ARRAY_LENGTH);
            long length;
            int jumpTableEntryCount;
            try (IndexOutput out = dir.createOutput("sparse", IOContext.DEFAULT)) {
                jumpTableEntryCount = writeJumpTable(set, DISIAccumulator.MAX_ARRAY_LENGTH, dir, out, denseRankPower);
                length = out.getFilePointer();
            }
            try (IndexInput in = dir.openInput("sparse", IOContext.DEFAULT)) {
                IndexedDISI disi = new IndexedDISI(in, 0L, length, jumpTableEntryCount, denseRankPower, DISIAccumulator.MAX_ARRAY_LENGTH);
                assertEquals(start, disi.nextDoc());
                if (System.getSecurityManager() == null) {
                    assertEquals("SPARSE", getMethodFromDISI(disi));
                }
            }
            doTest(set, dir);

            // now we set one more bit so the encoding will be dense
            set.set(start + DISIAccumulator.MAX_ARRAY_LENGTH + random().nextInt(100));
            try (IndexOutput out = dir.createOutput("bar", IOContext.DEFAULT)) {
                writeJumpTable(set, DISIAccumulator.MAX_ARRAY_LENGTH, dir, out, denseRankPower);
                length = out.getFilePointer();
            }
            try (IndexInput in = dir.openInput("bar", IOContext.DEFAULT)) {
                IndexedDISI disi = new IndexedDISI(
                    in,
                    0L,
                    length,
                    jumpTableEntryCount,
                    denseRankPower,
                    DISIAccumulator.MAX_ARRAY_LENGTH + 1
                );
                assertEquals(start, disi.nextDoc());
                if (System.getSecurityManager() == null) {
                    assertEquals("DENSE", getMethodFromDISI(disi));
                }
            }
            doTest(set, dir);
        }
    }

    @SuppressForbidden(reason = "access violation required in order to read private field for this test")
    private static String getMethodFromDISI(Object o) throws NoSuchFieldException, IllegalAccessException {
        var field = IndexedDISI.class.getDeclaredField("method");
        field.setAccessible(true);
        return field.get(o).toString();
    }

    public void testOneDocMissing() throws IOException {
        int maxDoc = TestUtil.nextInt(random(), 1, 1000000);
        FixedBitSet set = new FixedBitSet(maxDoc);
        set.set(0, maxDoc);
        set.clear(random().nextInt(maxDoc));
        try (Directory dir = newDirectory()) {
            doTest(set, dir);
        }
    }

    public void testFewMissingDocs() throws IOException {
        try (Directory dir = newDirectory()) {
            int numIters = atLeast(10);
            for (int iter = 0; iter < numIters; ++iter) {
                int maxDoc = TestUtil.nextInt(random(), 1, 100000);
                FixedBitSet set = new FixedBitSet(maxDoc);
                set.set(0, maxDoc);
                final int numMissingDocs = TestUtil.nextInt(random(), 2, 1000);
                for (int i = 0; i < numMissingDocs; ++i) {
                    set.clear(random().nextInt(maxDoc));
                }
                doTest(set, dir);
            }
        }
    }

    public void testDenseMultiBlock() throws IOException {
        try (Directory dir = newDirectory()) {
            int maxDoc = 10 * 65536; // 10 blocks
            FixedBitSet set = new FixedBitSet(maxDoc);
            for (int i = 0; i < maxDoc; i += 2) { // Set every other to ensure dense
                set.set(i);
            }
            doTest(set, dir);
        }
    }

    public void testIllegalDenseRankPower() throws IOException {

        // Legal values
        for (byte denseRankPower : new byte[] { -1, 7, 8, 9, 10, 11, 12, 13, 14, 15 }) {
            createAndOpenDISI(denseRankPower, denseRankPower);
        }

        // Illegal values
        for (byte denseRankPower : new byte[] { -2, 0, 1, 6, 16 }) {
            expectThrows(IllegalArgumentException.class, () -> {
                createAndOpenDISI(denseRankPower, (byte) 8); // Illegal write, legal read (should not reach read)
            });

            expectThrows(IllegalArgumentException.class, () -> {
                createAndOpenDISI((byte) 8, denseRankPower); // Legal write, illegal read (should reach read)
            });
        }
    }

    private void createAndOpenDISI(byte denseRankPowerWrite, byte denseRankPowerRead) throws IOException {
        BitSet set = new FixedBitSet(10);
        set.set(set.length() - 1);
        try (Directory dir = newDirectory()) {
            long length;
            int jumpTableEntryCount = -1;
            try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) {
                jumpTableEntryCount = writeJumpTable(set, dir, out, denseRankPowerWrite);
                length = out.getFilePointer();
            }
            try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
                new IndexedDISI(in, 0L, length, jumpTableEntryCount, denseRankPowerRead, set.cardinality());
            }
            // This tests the legality of the denseRankPower only, so we don't do anything with the disi
        }
    }

    public void testOneDocMissingFixed() throws IOException {
        int maxDoc = 9699;
        final byte denseRankPower = rarely() ? -1 : (byte) (random().nextInt(7) + 7); // sane + chance of disable
        FixedBitSet set = new FixedBitSet(maxDoc);
        set.set(0, maxDoc);
        set.clear(1345);
        try (Directory dir = newDirectory()) {

            final int cardinality = set.cardinality();
            long length;
            int jumpTableentryCount;
            try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) {
                jumpTableentryCount = writeJumpTable(set, dir, out, denseRankPower);
                length = out.getFilePointer();
            }

            int step = 16000;
            try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
                IndexedDISI disi = new IndexedDISI(in, 0L, length, jumpTableentryCount, denseRankPower, cardinality);
                BitSetIterator disi2 = new BitSetIterator(set, cardinality);
                assertAdvanceEquality(disi, disi2, step);
            }
        }
    }

    public void testRandom() throws IOException {
        try (Directory dir = newDirectory()) {
            int numIters = atLeast(3);
            for (int i = 0; i < numIters; ++i) {
                doTestRandom(dir);
            }
        }
    }

    private void doTestRandom(Directory dir) throws IOException {
        Random random = random();
        final int maxStep = TestUtil.nextInt(random, 1, 1 << TestUtil.nextInt(random, 2, 20));
        final int numDocs = TestUtil.nextInt(random, 1, Math.min(100000, (Integer.MAX_VALUE - 1) / maxStep));
        BitSet docs = new SparseFixedBitSet(numDocs * maxStep + 1);
        int lastDoc = -1;
        for (int doc = -1, i = 0; i < numDocs; ++i) {
            doc += TestUtil.nextInt(random, 1, maxStep);
            docs.set(doc);
            lastDoc = doc;
        }
        final int maxDoc = lastDoc + TestUtil.nextInt(random, 1, 100);

        BitSet set = BitSet.of(new BitSetIterator(docs, docs.approximateCardinality()), maxDoc);
        doTest(set, dir);
    }

    private void doTest(BitSet set, Directory dir) throws IOException {
        final int cardinality = set.cardinality();
        final byte denseRankPower = rarely() ? -1 : (byte) (random().nextInt(7) + 7); // sane + chance of disable
        long length;
        int jumpTableentryCount;
        try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) {
            jumpTableentryCount = writeJumpTable(set, dir, out, denseRankPower);
            length = out.getFilePointer();
        }

        try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
            IndexedDISI disi = new IndexedDISI(in, 0L, length, jumpTableentryCount, denseRankPower, cardinality);
            BitSetIterator disi2 = new BitSetIterator(set, cardinality);
            assertSingleStepEquality(disi, disi2);
        }

        for (int step : new int[] { 1, 10, 100, 1000, 10000, 100000 }) {
            try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
                IndexedDISI disi = new IndexedDISI(in, 0L, length, jumpTableentryCount, denseRankPower, cardinality);
                BitSetIterator disi2 = new BitSetIterator(set, cardinality);
                assertAdvanceEquality(disi, disi2, step);
            }
        }

        for (int step : new int[] { 10, 100, 1000, 10000, 100000 }) {
            try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
                IndexedDISI disi = new IndexedDISI(in, 0L, length, jumpTableentryCount, denseRankPower, cardinality);
                BitSetIterator disi2 = new BitSetIterator(set, cardinality);
                int disi2length = set.length();
                assertAdvanceExactRandomized(disi, disi2, disi2length, step);
            }
        }

        dir.deleteFile("foo");
    }

    private void assertAdvanceExactRandomized(IndexedDISI disi, BitSetIterator disi2, int disi2length, int step) throws IOException {
        int index = -1;
        Random random = random();
        for (int target = 0; target < disi2length;) {
            target += TestUtil.nextInt(random, 0, step);
            int doc = disi2.docID();
            while (doc < target) {
                doc = disi2.nextDoc();
                index++;
            }

            boolean exists = disi.advanceExact(target);
            assertEquals(doc == target, exists);
            if (exists) {
                assertEquals(index, disi.index());
            } else if (random.nextBoolean()) {
                assertEquals(doc, disi.nextDoc());
                // This is a bit strange when doc == NO_MORE_DOCS as the index overcounts in the disi2 while-loop
                assertEquals(index, disi.index());
                target = doc;
            }
        }
    }

    private void assertSingleStepEquality(IndexedDISI disi, BitSetIterator disi2) throws IOException {
        int i = 0;
        for (int doc = disi2.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi2.nextDoc()) {
            assertEquals(doc, disi.nextDoc());
            assertEquals(i++, disi.index());
        }
        assertEquals(DocIdSetIterator.NO_MORE_DOCS, disi.nextDoc());
    }

    private void assertAdvanceEquality(IndexedDISI disi, BitSetIterator disi2, int step) throws IOException {
        int index = -1;
        while (true) {
            int target = disi2.docID() + step;
            int doc;
            do {
                doc = disi2.nextDoc();
                index++;
            } while (doc < target);
            assertEquals(doc, disi.advance(target));
            if (doc == DocIdSetIterator.NO_MORE_DOCS) {
                break;
            }
            assertEquals("Expected equality using step " + step + " at docID " + doc, index, disi.index());
        }
    }

    private static short writeJumpTable(BitSet set, Directory dir, IndexOutput out, byte denseRankPower) throws IOException {
        return writeJumpTable(set, set.cardinality(), dir, out, denseRankPower);
    }

    private static short writeJumpTable(BitSet set, long cost, Directory dir, IndexOutput out, byte denseRankPower) throws IOException {
        var disiAccumulator = new DISIAccumulator(dir, IOContext.DEFAULT, out, denseRankPower);
        var iterator = new BitSetIterator(set, cost);
        for (int docId = iterator.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = iterator.nextDoc()) {
            disiAccumulator.addDocId(docId);
        }
        return disiAccumulator.build(out);
    }
}