From 155c0da00a4c1025d83e3e7a9faa0e39aca5c702 Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Fri, 6 Jun 2025 12:07:32 -0400 Subject: [PATCH] Vector test tools (#128934) This adds some testing tools for verifying vector recall and latency directly without having to spin up an entire ES node and running a rally track. Its pretty barebones and takes inspiration from lucene-util, but I wanted access to our own formats and tooling to make our lives easier. Here is an example config file. This will build the initial index, run queries at num_candidates: 50, then again at num_candidates 100 (without reindexing, and re-using the cached nearest neighbors). ``` [{ "doc_vectors" : "path", "query_vectors" : "path", "num_docs" : 10000, "num_queries" : 10, "index_type" : "hnsw", "num_candidates" : 50, "k" : 10, "hnsw_m" : 16, "hnsw_ef_construction" : 200, "index_threads" : 4, "reindex" : true, "force_merge" : false, "vector_space" : "maximum_inner_product", "dimensions" : 768 }, { "doc_vectors" : "path", "query_vectors" : "path", "num_docs" : 10000, "num_queries" : 10, "index_type" : "hnsw", "num_candidates" : 100, "k" : 10, "hnsw_m" : 16, "hnsw_ef_construction" : 200, "vector_space" : "maximum_inner_product", "dimensions" : 768 } ] ``` To execute: ``` ./gradlew :qa:vector:checkVec --args="/Path/to/knn_tester_config.json" ``` Calling `./gradlew :qa:vector:checkVecHelp` gives some guidance on how to use it, additionally providing a way to run it via java directly (useful to bypass gradlew guff). --- qa/vector/build.gradle | 101 ++++ qa/vector/licenses/lucene-LICENSE.txt | 475 +++++++++++++++++ qa/vector/licenses/lucene-NOTICE.txt | 192 +++++++ qa/vector/src/main/java/module-info.java | 20 + .../elasticsearch/test/knn/CmdLineArgs.java | 292 +++++++++++ .../test/knn/KnnIndexTester.java | 399 ++++++++++++++ .../elasticsearch/test/knn/KnnIndexer.java | 329 ++++++++++++ .../elasticsearch/test/knn/KnnSearcher.java | 488 ++++++++++++++++++ server/src/main/java/module-info.java | 2 + .../index/codec/vectors/IVFVectorsReader.java | 1 + .../index/codec/vectors/IVFVectorsWriter.java | 1 + .../ES816BinaryQuantizedVectorsReader.java | 1 + .../DirectIOLucene99FlatVectorsReader.java | 1 + .../ES818BinaryQuantizedVectorsReader.java | 1 + .../ES818BinaryQuantizedVectorsWriter.java | 1 + settings.gradle | 2 + test/external-modules/build.gradle | 12 +- 17 files changed, 2312 insertions(+), 6 deletions(-) create mode 100644 qa/vector/build.gradle create mode 100644 qa/vector/licenses/lucene-LICENSE.txt create mode 100644 qa/vector/licenses/lucene-NOTICE.txt create mode 100644 qa/vector/src/main/java/module-info.java create mode 100644 qa/vector/src/main/java/org/elasticsearch/test/knn/CmdLineArgs.java create mode 100644 qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexTester.java create mode 100644 qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexer.java create mode 100644 qa/vector/src/main/java/org/elasticsearch/test/knn/KnnSearcher.java diff --git a/qa/vector/build.gradle b/qa/vector/build.gradle new file mode 100644 index 000000000000..52b2eeae8226 --- /dev/null +++ b/qa/vector/build.gradle @@ -0,0 +1,101 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +apply plugin: 'elasticsearch.java' +apply plugin: 'elasticsearch.build' + + +tasks.named("dependencyLicenses").configure { + mapping from: /lucene-.*/, to: 'lucene' +} + +tasks.named('forbiddenApisMain').configure { + enabled = false +} + +dependencies { + api "org.apache.lucene:lucene-core:${versions.lucene}" + api "org.apache.lucene:lucene-queries:${versions.lucene}" + api "org.apache.lucene:lucene-codecs:${versions.lucene}" + implementation project(':libs:logging') + implementation project(':server') +} +/** + * Task to run the KnnIndexTester with the provided parameters. + */ +tasks.register("checkVec", JavaExec) { + group = "Execution" + description = "Runs KnnIndexTester with the provided parameters to validate recall and performance." + classpath = sourceSets.main.runtimeClasspath + mainClass.set("org.elasticsearch.test.knn.KnnIndexTester") + // Configure logging to console + systemProperty "es.logger.out", "console" + systemProperty "es.logger.level", "INFO" // Change to DEBUG if needed + + if (buildParams.getRuntimeJavaVersion().map { it.majorVersion.toInteger() }.get() >= 21) { + jvmArgs '-Xms4g', '-Xmx4g', '--add-modules=jdk.incubator.vector', '--enable-native-access=ALL-UNNAMED', '-Djava.util.concurrent.ForkJoinPool.common.parallelism=8', '-XX:+UnlockDiagnosticVMOptions', '-XX:+DebugNonSafepoints', '-XX:+HeapDumpOnOutOfMemoryError' + } +} + +tasks.register("checkVecHelp", JavaExec) { + group = "Help" + description = "Prints help for the KnnIndexTester task." 
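+  // Runs the tester with --help so it prints its usage and an example JSON config,
+  // then prints the extended Gradle/Java invocation notes below.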
+ classpath = sourceSets.main.runtimeClasspath + mainClass.set("org.elasticsearch.test.knn.KnnIndexTester") + args = ["--help"] + doLast { + println """ + ============================================================================= + KnnIndexTester Help + ============================================================================= + + Run with Gradle: + ---------------- + # Using default configuration file + ./gradlew :qa:vector:checkVec + + # Using custom configuration file + ./gradlew :qa:vector:checkVec --args="path/to/your/config.json" + + # Adjust heap size + ./gradlew :qa:vector:checkVec -Dorg.gradle.jvmargs="-Xmx8g" --args="path/to/your/config.json" + + # Set environment variable for more extensive JVM options + export GRADLE_OPTS="-Xmx8g -XX:+UseG1GC -XX:MaxGCPauseMillis=100" + ./gradlew :qa:vector:checkVec + + + Run directly with Java: + ---------------------- + # Generate classpath (run once to create the file) + ./gradlew :qa:vector:printClasspath + + # Then use the classpath file with java + java -cp "\$(cat build/vector_classpath.txt)" \\ + --add-modules=jdk.incubator.vector \\ + --enable-native-access=ALL-UNNAMED \\ + -Djava.util.concurrent.ForkJoinPool.common.parallelism=8 \\ + -Xmx4g \\ + -Xms4g \\\\ + org.elasticsearch.test.knn.KnnIndexTester path/to/your/config.json + """ + } +} + +tasks.register("printClasspath") { + group = "Help" + description = "Prints the classpath needed to run KnnIndexTester directly with java" + + doLast { + def classpathFile = new File("${buildDir}/vector_classpath.txt") + classpathFile.parentFile.mkdirs() + classpathFile.text = sourceSets.main.runtimeClasspath.asPath + println "Classpath written to: ${classpathFile.absolutePath}" + } +} diff --git a/qa/vector/licenses/lucene-LICENSE.txt b/qa/vector/licenses/lucene-LICENSE.txt new file mode 100644 index 000000000000..28b134f5f8e4 --- /dev/null +++ b/qa/vector/licenses/lucene-LICENSE.txt @@ -0,0 +1,475 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + + +Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was +derived from unicode conversion examples available at +http://www.unicode.org/Public/PROGRAMS/CVTUTF. Here is the copyright +from those sources: + +/* + * Copyright 2001-2004 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute This Code + * + * Unicode, Inc. 
hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + */ + + +Some code in core/src/java/org/apache/lucene/util/ArrayUtil.java was +derived from Python 2.4.2 sources available at +http://www.python.org. Full license is here: + + http://www.python.org/download/releases/2.4.2/license/ + +Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was +derived from Python 3.1.2 sources available at +http://www.python.org. Full license is here: + + http://www.python.org/download/releases/3.1.2/license/ + +Some code in core/src/java/org/apache/lucene/util/automaton was +derived from Brics automaton sources available at +www.brics.dk/automaton/. Here is the copyright from those sources: + +/* + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +The levenshtein automata tables in core/src/java/org/apache/lucene/util/automaton +were automatically generated with the moman/finenight FSA package. +Here is the copyright for those sources: + +# Copyright (c) 2010, Jean-Philippe Barrette-LaPierre, +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was +derived from ICU (http://www.icu-project.org) +The full license is available here: + http://source.icu-project.org/repos/icu/icu/trunk/license.html + +/* + * Copyright (C) 1999-2010, International Business Machines + * Corporation and others. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, and/or sell copies of the + * Software, and to permit persons to whom the Software is furnished to do so, + * provided that the above copyright notice(s) and this permission notice appear + * in all copies of the Software and that both the above copyright notice(s) and + * this permission notice appear in supporting documentation. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE + * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR + * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER + * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall not + * be used in advertising or otherwise to promote the sale, use or other + * dealings in this Software without prior written authorization of the + * copyright holder. + */ + +The following license applies to the Snowball stemmers: + +Copyright (c) 2001, Dr Martin Porter +Copyright (c) 2002, Richard Boulton +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holders nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The following license applies to the KStemmer: + +Copyright © 2003, +Center for Intelligent Information Retrieval, +University of Massachusetts, Amherst. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. The names "Center for Intelligent Information Retrieval" and +"University of Massachusetts" must not be used to endorse or promote products +derived from this software without prior written permission. To obtain +permission, contact info@ciir.cs.umass.edu. + +THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +SUCH DAMAGE. + +The following license applies to the Morfologik project: + +Copyright (c) 2006 Dawid Weiss +Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of Morfologik nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +--- + +The dictionary comes from Morfologik project. Morfologik uses data from +Polish ispell/myspell dictionary hosted at http://www.sjp.pl/slownik/en/ and +is licenced on the terms of (inter alia) LGPL and Creative Commons +ShareAlike. The part-of-speech tags were added in Morfologik project and +are not found in the data from sjp.pl. The tagset is similar to IPI PAN +tagset. + +--- + +The following license applies to the Morfeusz project, +used by org.apache.lucene.analysis.morfologik. + +BSD-licensed dictionary of Polish (SGJP) +http://sgjp.pl/morfeusz/ + +Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński, + Marcin Woliński, Robert Wołosz + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. + +THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS +OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN +IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/qa/vector/licenses/lucene-NOTICE.txt b/qa/vector/licenses/lucene-NOTICE.txt new file mode 100644 index 000000000000..1a1d51572432 --- /dev/null +++ b/qa/vector/licenses/lucene-NOTICE.txt @@ -0,0 +1,192 @@ +Apache Lucene +Copyright 2014 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Includes software from other Apache Software Foundation projects, +including, but not limited to: + - Apache Ant + - Apache Jakarta Regexp + - Apache Commons + - Apache Xerces + +ICU4J, (under analysis/icu) is licensed under an MIT styles license +and Copyright (c) 1995-2008 International Business Machines Corporation and others + +Some data files (under analysis/icu/src/data) are derived from Unicode data such +as the Unicode Character Database. See http://unicode.org/copyright.html for more +details. + +Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is +BSD-licensed, created by Anders Møller. 
See http://www.brics.dk/automaton/ + +The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were +automatically generated with the moman/finenight FSA library, created by +Jean-Philippe Barrette-LaPierre. This library is available under an MIT license, +see http://sites.google.com/site/rrettesite/moman and +http://bitbucket.org/jpbarrette/moman/overview/ + +The class org.apache.lucene.util.WeakIdentityMap was derived from +the Apache CXF project and is Apache License 2.0. + +The Google Code Prettify is Apache License 2.0. +See http://code.google.com/p/google-code-prettify/ + +JUnit (junit-4.10) is licensed under the Common Public License v. 1.0 +See http://junit.sourceforge.net/cpl-v10.html + +This product includes code (JaspellTernarySearchTrie) from Java Spelling Checkin +g Package (jaspell): http://jaspell.sourceforge.net/ +License: The BSD License (http://www.opensource.org/licenses/bsd-license.php) + +The snowball stemmers in + analysis/common/src/java/net/sf/snowball +were developed by Martin Porter and Richard Boulton. +The snowball stopword lists in + analysis/common/src/resources/org/apache/lucene/analysis/snowball +were developed by Martin Porter and Richard Boulton. +The full snowball package is available from + http://snowball.tartarus.org/ + +The KStem stemmer in + analysis/common/src/org/apache/lucene/analysis/en +was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) +under the BSD-license. + +The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default +stopword list that is BSD-licensed created by Jacques Savoy. These files reside in: +analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt +See http://members.unine.ch/jacques.savoy/clef/index.html. + +The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers +(common) are based on BSD-licensed reference implementations created by Jacques Savoy and +Ljiljana Dolamic. These files reside in: +analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java + +The Stempel analyzer (stempel) includes BSD-licensed software developed +by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil, +and Edmond Nolan. 
+ +The Polish analyzer (stempel) comes with a default +stopword list that is BSD-licensed created by the Carrot2 project. The file resides +in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt. +See http://project.carrot2.org/license.html. + +The SmartChineseAnalyzer source code (smartcn) was +provided by Xiaoping Gao and copyright 2009 by www.imdict.net. + +WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/) +is derived from Unicode data such as the Unicode Character Database. +See http://unicode.org/copyright.html for more details. + +The Morfologik analyzer (morfologik) includes BSD-licensed software +developed by Dawid Weiss and Marcin Miłkowski (http://morfologik.blogspot.com/). + +Morfologik uses data from Polish ispell/myspell dictionary +(http://www.sjp.pl/slownik/en/) licenced on the terms of (inter alia) +LGPL and Creative Commons ShareAlike. + +Morfologic includes data from BSD-licensed dictionary of Polish (SGJP) +(http://sgjp.pl/morfeusz/) + +Servlet-api.jar and javax.servlet-*.jar are under the CDDL license, the original +source code for this can be found at http://www.eclipse.org/jetty/downloads.php + +=========================================================================== +Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ipadic-2.7.0-20070801 + +which can be obtained from + + http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz + +or + + http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz + +=========================================================================== +mecab-ipadic-2.7.0-20070801 Notice +=========================================================================== + +Nara Institute of Science and Technology (NAIST), +the copyright holders, disclaims all warranties with regard to this +software, including all implied warranties of merchantability and +fitness, in no event shall NAIST be liable for +any special, indirect or consequential damages or any damages +whatsoever resulting from loss of use, data or profits, whether in an +action of contract, negligence or other tortuous action, arising out +of or in connection with the use or performance of this software. + +A large portion of the dictionary entries +originate from ICOT Free Software. The following conditions for ICOT +Free Software applies to the current dictionary as well. + +Each User may also freely distribute the Program, whether in its +original form or modified, to any third party or parties, PROVIDED +that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear +on, or be attached to, the Program, which is distributed substantially +in the same form as set out herein and that such intended +distribution, if actually made, will neither violate or otherwise +contravene any of the laws and regulations of the countries having +jurisdiction over the User or the intended distribution itself. + +NO WARRANTY + +The program was produced on an experimental basis in the course of the +research and development conducted during the project and is provided +to users as so produced on an experimental basis. Accordingly, the +program is provided without any warranty whatsoever, whether express, +implied, statutory or otherwise. 
The term "warranty" used herein +includes, but is not limited to, any warranty of the quality, +performance, merchantability and fitness for a particular purpose of +the program and the nonexistence of any infringement or violation of +any right of any third party. + +Each user of the program will agree and understand, and be deemed to +have agreed and understood, that there is no warranty whatsoever for +the program and, accordingly, the entire risk arising from or +otherwise connected with the program is assumed by the user. + +Therefore, neither ICOT, the copyright holder, or any other +organization that participated in or was otherwise related to the +development of the program and their respective officials, directors, +officers and other employees shall be held liable for any and all +damages, including, without limitation, general, special, incidental +and consequential damages, arising out of or otherwise in connection +with the use or inability to use the program or any product, material +or result produced or otherwise obtained by using the program, +regardless of whether they have been advised of, or otherwise had +knowledge of, the possibility of such damages at any time during the +project or thereafter. Each user will be deemed to have agreed to the +foregoing by his or her commencement of use of the program. The term +"use" as used herein includes, but is not limited to, the use, +modification, copying and distribution of the program and the +production of secondary products from the program. + +In the case where the program, whether in its original form or +modified, was distributed or delivered to or received by a user from +any person, organization or entity other than ICOT, unless it makes or +grants independently of ICOT any specific warranty to the user in +writing, such person, organization or entity, will also be exempted +from and not be held liable to the user for any such damages as noted +above as far as the program is concerned. diff --git a/qa/vector/src/main/java/module-info.java b/qa/vector/src/main/java/module-info.java new file mode 100644 index 000000000000..9b9f58555968 --- /dev/null +++ b/qa/vector/src/main/java/module-info.java @@ -0,0 +1,20 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +module org.elasticsearch.test.knn { + requires org.elasticsearch.base; + requires org.elasticsearch.server; + requires org.elasticsearch.xcontent; + requires org.apache.lucene.core; + requires org.apache.lucene.codecs; + requires org.apache.lucene.queries; + requires org.elasticsearch.logging; + requires java.management; + requires jdk.management; +} diff --git a/qa/vector/src/main/java/org/elasticsearch/test/knn/CmdLineArgs.java b/qa/vector/src/main/java/org/elasticsearch/test/knn/CmdLineArgs.java new file mode 100644 index 000000000000..a5f66ce5ea83 --- /dev/null +++ b/qa/vector/src/main/java/org/elasticsearch/test/knn/CmdLineArgs.java @@ -0,0 +1,292 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.test.knn; + +import org.apache.lucene.index.VectorEncoding; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.elasticsearch.common.Strings; +import org.elasticsearch.core.PathUtils; +import org.elasticsearch.xcontent.ObjectParser; +import org.elasticsearch.xcontent.ParseField; +import org.elasticsearch.xcontent.ToXContentObject; +import org.elasticsearch.xcontent.XContentBuilder; +import org.elasticsearch.xcontent.XContentParser; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.Locale; + +/** + * Command line arguments for the KNN index tester. + * This class encapsulates all the parameters required to run the KNN index tests. + */ +record CmdLineArgs( + Path docVectors, + Path queryVectors, + int numDocs, + int numQueries, + KnnIndexTester.IndexType indexType, + int numCandidates, + int k, + int nProbe, + int ivfClusterSize, + int overSamplingFactor, + int hnswM, + int hnswEfConstruction, + int searchThreads, + int indexThreads, + boolean reindex, + boolean forceMerge, + VectorSimilarityFunction vectorSpace, + int quantizeBits, + VectorEncoding vectorEncoding, + int dimensions +) implements ToXContentObject { + + static final ParseField DOC_VECTORS_FIELD = new ParseField("doc_vectors"); + static final ParseField QUERY_VECTORS_FIELD = new ParseField("query_vectors"); + static final ParseField NUM_DOCS_FIELD = new ParseField("num_docs"); + static final ParseField NUM_QUERIES_FIELD = new ParseField("num_queries"); + static final ParseField INDEX_TYPE_FIELD = new ParseField("index_type"); + static final ParseField NUM_CANDIDATES_FIELD = new ParseField("num_candidates"); + static final ParseField K_FIELD = new ParseField("k"); + static final ParseField N_PROBE_FIELD = new ParseField("n_probe"); + static final ParseField IVF_CLUSTER_SIZE_FIELD = new ParseField("ivf_cluster_size"); + static final ParseField OVER_SAMPLING_FACTOR_FIELD = new ParseField("over_sampling_factor"); + static final ParseField HNSW_M_FIELD = new ParseField("hnsw_m"); + static final ParseField HNSW_EF_CONSTRUCTION_FIELD = new ParseField("hnsw_ef_construction"); + static final ParseField SEARCH_THREADS_FIELD = new ParseField("search_threads"); + static final ParseField INDEX_THREADS_FIELD = new ParseField("index_threads"); + static final ParseField REINDEX_FIELD = new ParseField("reindex"); + static final ParseField FORCE_MERGE_FIELD = new ParseField("force_merge"); + static final ParseField VECTOR_SPACE_FIELD = new ParseField("vector_space"); + static final ParseField QUANTIZE_BITS_FIELD = new ParseField("quantize_bits"); + static final ParseField VECTOR_ENCODING_FIELD = new ParseField("vector_encoding"); + static final ParseField DIMENSIONS_FIELD = new ParseField("dimensions"); + + static CmdLineArgs fromXContent(XContentParser parser) throws IOException { + Builder builder = PARSER.apply(parser, null); + return builder.build(); + } + + static final ObjectParser PARSER = new ObjectParser<>("cmd_line_args", true, Builder::new); + + static { + PARSER.declareString(Builder::setDocVectors, DOC_VECTORS_FIELD); + PARSER.declareString(Builder::setQueryVectors, QUERY_VECTORS_FIELD); + 
PARSER.declareInt(Builder::setNumDocs, NUM_DOCS_FIELD); + PARSER.declareInt(Builder::setNumQueries, NUM_QUERIES_FIELD); + PARSER.declareString(Builder::setIndexType, INDEX_TYPE_FIELD); + PARSER.declareInt(Builder::setNumCandidates, NUM_CANDIDATES_FIELD); + PARSER.declareInt(Builder::setK, K_FIELD); + PARSER.declareInt(Builder::setNProbe, N_PROBE_FIELD); + PARSER.declareInt(Builder::setIvfClusterSize, IVF_CLUSTER_SIZE_FIELD); + PARSER.declareInt(Builder::setOverSamplingFactor, OVER_SAMPLING_FACTOR_FIELD); + PARSER.declareInt(Builder::setHnswM, HNSW_M_FIELD); + PARSER.declareInt(Builder::setHnswEfConstruction, HNSW_EF_CONSTRUCTION_FIELD); + PARSER.declareInt(Builder::setSearchThreads, SEARCH_THREADS_FIELD); + PARSER.declareInt(Builder::setIndexThreads, INDEX_THREADS_FIELD); + PARSER.declareBoolean(Builder::setReindex, REINDEX_FIELD); + PARSER.declareBoolean(Builder::setForceMerge, FORCE_MERGE_FIELD); + PARSER.declareString(Builder::setVectorSpace, VECTOR_SPACE_FIELD); + PARSER.declareInt(Builder::setQuantizeBits, QUANTIZE_BITS_FIELD); + PARSER.declareString(Builder::setVectorEncoding, VECTOR_ENCODING_FIELD); + PARSER.declareInt(Builder::setDimensions, DIMENSIONS_FIELD); + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(); + if (docVectors != null) { + builder.field(DOC_VECTORS_FIELD.getPreferredName(), docVectors.toString()); + } + if (queryVectors != null) { + builder.field(QUERY_VECTORS_FIELD.getPreferredName(), queryVectors.toString()); + } + builder.field(NUM_DOCS_FIELD.getPreferredName(), numDocs); + builder.field(NUM_QUERIES_FIELD.getPreferredName(), numQueries); + builder.field(INDEX_TYPE_FIELD.getPreferredName(), indexType.name().toLowerCase(Locale.ROOT)); + builder.field(NUM_CANDIDATES_FIELD.getPreferredName(), numCandidates); + builder.field(K_FIELD.getPreferredName(), k); + builder.field(N_PROBE_FIELD.getPreferredName(), nProbe); + builder.field(IVF_CLUSTER_SIZE_FIELD.getPreferredName(), ivfClusterSize); + builder.field(OVER_SAMPLING_FACTOR_FIELD.getPreferredName(), overSamplingFactor); + builder.field(HNSW_M_FIELD.getPreferredName(), hnswM); + builder.field(HNSW_EF_CONSTRUCTION_FIELD.getPreferredName(), hnswEfConstruction); + builder.field(SEARCH_THREADS_FIELD.getPreferredName(), searchThreads); + builder.field(INDEX_THREADS_FIELD.getPreferredName(), indexThreads); + builder.field(REINDEX_FIELD.getPreferredName(), reindex); + builder.field(FORCE_MERGE_FIELD.getPreferredName(), forceMerge); + builder.field(VECTOR_SPACE_FIELD.getPreferredName(), vectorSpace.name().toLowerCase(Locale.ROOT)); + builder.field(QUANTIZE_BITS_FIELD.getPreferredName(), quantizeBits); + builder.field(VECTOR_ENCODING_FIELD.getPreferredName(), vectorEncoding.name().toLowerCase(Locale.ROOT)); + builder.field(DIMENSIONS_FIELD.getPreferredName(), dimensions); + return builder.endObject(); + } + + @Override + public String toString() { + return Strings.toString(this, false, false); + } + + static class Builder { + private Path docVectors; + private Path queryVectors; + private int numDocs = 1000; + private int numQueries = 100; + private KnnIndexTester.IndexType indexType = KnnIndexTester.IndexType.HNSW; + private int numCandidates = 1000; + private int k = 10; + private int nProbe = 10; + private int ivfClusterSize = 1000; + private int overSamplingFactor = 1; + private int hnswM = 16; + private int hnswEfConstruction = 200; + private int searchThreads = 1; + private int indexThreads = 1; + private boolean reindex = false; + 
private boolean forceMerge = false; + private VectorSimilarityFunction vectorSpace = VectorSimilarityFunction.EUCLIDEAN; + private int quantizeBits = 8; + private VectorEncoding vectorEncoding = VectorEncoding.FLOAT32; + private int dimensions; + + public Builder setDocVectors(String docVectors) { + this.docVectors = PathUtils.get(docVectors); + return this; + } + + public Builder setQueryVectors(String queryVectors) { + this.queryVectors = PathUtils.get(queryVectors); + return this; + } + + public Builder setNumDocs(int numDocs) { + this.numDocs = numDocs; + return this; + } + + public Builder setNumQueries(int numQueries) { + this.numQueries = numQueries; + return this; + } + + public Builder setIndexType(String indexType) { + this.indexType = KnnIndexTester.IndexType.valueOf(indexType.toUpperCase(Locale.ROOT)); + return this; + } + + public Builder setNumCandidates(int numCandidates) { + this.numCandidates = numCandidates; + return this; + } + + public Builder setK(int k) { + this.k = k; + return this; + } + + public Builder setNProbe(int nProbe) { + this.nProbe = nProbe; + return this; + } + + public Builder setIvfClusterSize(int ivfClusterSize) { + this.ivfClusterSize = ivfClusterSize; + return this; + } + + public Builder setOverSamplingFactor(int overSamplingFactor) { + this.overSamplingFactor = overSamplingFactor; + return this; + } + + public Builder setHnswM(int hnswM) { + this.hnswM = hnswM; + return this; + } + + public Builder setHnswEfConstruction(int hnswEfConstruction) { + this.hnswEfConstruction = hnswEfConstruction; + return this; + } + + public Builder setSearchThreads(int searchThreads) { + this.searchThreads = searchThreads; + return this; + } + + public Builder setIndexThreads(int indexThreads) { + this.indexThreads = indexThreads; + return this; + } + + public Builder setReindex(boolean reindex) { + this.reindex = reindex; + return this; + } + + public Builder setForceMerge(boolean forceMerge) { + this.forceMerge = forceMerge; + return this; + } + + public Builder setVectorSpace(String vectorSpace) { + this.vectorSpace = VectorSimilarityFunction.valueOf(vectorSpace.toUpperCase(Locale.ROOT)); + return this; + } + + public Builder setQuantizeBits(int quantizeBits) { + this.quantizeBits = quantizeBits; + return this; + } + + public Builder setVectorEncoding(String vectorEncoding) { + this.vectorEncoding = VectorEncoding.valueOf(vectorEncoding.toUpperCase(Locale.ROOT)); + return this; + } + + public Builder setDimensions(int dimensions) { + this.dimensions = dimensions; + return this; + } + + public CmdLineArgs build() { + if (docVectors == null) { + throw new IllegalArgumentException("Document vectors path must be provided"); + } + if (dimensions <= 0) { + throw new IllegalArgumentException("dimensions must be a positive integer"); + } + return new CmdLineArgs( + docVectors, + queryVectors, + numDocs, + numQueries, + indexType, + numCandidates, + k, + nProbe, + ivfClusterSize, + overSamplingFactor, + hnswM, + hnswEfConstruction, + searchThreads, + indexThreads, + reindex, + forceMerge, + vectorSpace, + quantizeBits, + vectorEncoding, + dimensions + ); + } + } +} diff --git a/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexTester.java b/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexTester.java new file mode 100644 index 000000000000..6aa2e051bacc --- /dev/null +++ b/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexTester.java @@ -0,0 +1,399 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. 
under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.test.knn; + +import com.sun.management.ThreadMXBean; + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.lucene101.Lucene101Codec; +import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; +import org.elasticsearch.common.Strings; +import org.elasticsearch.common.logging.LogConfigurator; +import org.elasticsearch.core.PathUtils; +import org.elasticsearch.index.codec.vectors.ES813Int8FlatVectorFormat; +import org.elasticsearch.index.codec.vectors.ES814HnswScalarQuantizedVectorsFormat; +import org.elasticsearch.index.codec.vectors.IVFVectorsFormat; +import org.elasticsearch.index.codec.vectors.es818.ES818BinaryQuantizedVectorsFormat; +import org.elasticsearch.index.codec.vectors.es818.ES818HnswBinaryQuantizedVectorsFormat; +import org.elasticsearch.logging.Level; +import org.elasticsearch.xcontent.XContentParser; +import org.elasticsearch.xcontent.XContentParserConfiguration; +import org.elasticsearch.xcontent.XContentType; + +import java.io.InputStream; +import java.lang.management.ThreadInfo; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; + +/** + * A utility class to create and test KNN indices using Lucene. + * It supports various index types (HNSW, FLAT, IVF) and configurations. 
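+ * <p>
+ * The configured index type maps to a Lucene/Elasticsearch vectors format in {@code createCodec}: {@code ivf}
+ * uses {@code IVFVectorsFormat}; {@code flat} and {@code hnsw} choose a format based on {@code quantize_bits}
+ * (1 bit selects the ES818 binary quantized formats, other values below 32 select the ES813/ES814 scalar
+ * quantized formats, and 32 falls back to the plain {@code Lucene99HnswVectorsFormat}).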
+ */ +public class KnnIndexTester { + static final Level LOG_LEVEL = Level.DEBUG; + + static final SysOutLogger logger = new SysOutLogger(); + + static { + LogConfigurator.loadLog4jPlugins(); + LogConfigurator.configureESLogging(); // native access requires logging to be initialized + } + + static final String INDEX_DIR = "target/knn_index"; + + enum IndexType { + HNSW, + FLAT, + IVF + } + + private static String formatIndexPath(CmdLineArgs args) { + List suffix = new ArrayList<>(); + if (args.indexType() == IndexType.FLAT) { + suffix.add("flat"); + } else if (args.indexType() == IndexType.IVF) { + suffix.add("ivf"); + suffix.add(Integer.toString(args.ivfClusterSize())); + } else { + suffix.add(Integer.toString(args.hnswM())); + suffix.add(Integer.toString(args.hnswEfConstruction())); + if (args.quantizeBits() < 32) { + suffix.add(Integer.toString(args.quantizeBits())); + } + } + return INDEX_DIR + "/" + args.docVectors().getFileName() + "-" + String.join("-", suffix) + ".index"; + } + + static Codec createCodec(CmdLineArgs args) { + final KnnVectorsFormat format; + if (args.indexType() == IndexType.IVF) { + format = new IVFVectorsFormat(args.ivfClusterSize()); + } else { + if (args.quantizeBits() == 1) { + if (args.indexType() == IndexType.FLAT) { + format = new ES818BinaryQuantizedVectorsFormat(); + } else { + format = new ES818HnswBinaryQuantizedVectorsFormat(args.hnswM(), args.hnswEfConstruction(), 1, null); + } + } else if (args.quantizeBits() < 32) { + if (args.indexType() == IndexType.FLAT) { + format = new ES813Int8FlatVectorFormat(null, args.quantizeBits(), true); + } else { + format = new ES814HnswScalarQuantizedVectorsFormat( + args.hnswM(), + args.hnswEfConstruction(), + null, + args.quantizeBits(), + true + ); + } + } else { + format = new Lucene99HnswVectorsFormat(args.hnswM(), args.hnswEfConstruction(), 1, null); + } + } + return new Lucene101Codec() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return format; + } + }; + } + + /** + * Main method to run the KNN index tester. + * It parses command line arguments, creates the index, and runs searches if specified. 
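+ * The single argument is the path to a JSON file holding either one configuration object or an array of
+ * configuration objects; each entry is parsed into a {@link CmdLineArgs} and executed in turn, so a later
+ * entry can search an index built by an earlier one without reindexing.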
+     *
+     * @param args Command line arguments
+     * @throws Exception If an error occurs during index creation or search
+     */
+    public static void main(String[] args) throws Exception {
+        if (args.length != 1 || args[0].equals("--help") || args[0].equals("-h")) {
+            // printout an example configuration formatted file and indicate that it is required
+            System.out.println("Usage: java -cp <classpath> org.elasticsearch.test.knn.KnnIndexTester <json_config_file>");
+            System.out.println("Where <json_config_file> is a JSON file containing one or more configurations for the KNN index tester.");
+            System.out.println("An example configuration object: ");
+            System.out.println(
+                Strings.toString(
+                    new CmdLineArgs.Builder().setDimensions(64)
+                        .setDocVectors("/doc/vectors/path")
+                        .setQueryVectors("/query/vectors/path")
+                        .build(),
+                    true,
+                    true
+                )
+            );
+            return;
+        }
+        String jsonConfig = args[0];
+        // Parse command line arguments
+        Path jsonConfigPath = PathUtils.get(jsonConfig);
+        if (Files.exists(jsonConfigPath) == false) {
+            throw new IllegalArgumentException("JSON config file does not exist: " + jsonConfigPath);
+        }
+        // Parse the JSON config file to get command line arguments
+        // This assumes that CmdLineArgs.fromXContent is implemented to parse the JSON file
+        List<CmdLineArgs> cmdLineArgsList = new ArrayList<>();
+        try (
+            InputStream jsonStream = Files.newInputStream(jsonConfigPath);
+            XContentParser parser = XContentType.JSON.xContent().createParser(XContentParserConfiguration.EMPTY, jsonStream)
+        ) {
+            // check if the parser is at the start of an object; if so, we only have one set of arguments
+            if (parser.currentToken() == null && parser.nextToken() == XContentParser.Token.START_OBJECT) {
+                cmdLineArgsList.add(CmdLineArgs.fromXContent(parser));
+            } else if (parser.currentToken() == XContentParser.Token.START_ARRAY) {
+                // if the parser is at the start of an array, we have multiple sets of arguments
+                while (parser.nextToken() != XContentParser.Token.END_ARRAY) {
+                    cmdLineArgsList.add(CmdLineArgs.fromXContent(parser));
+                }
+            } else {
+                throw new IllegalArgumentException("Invalid JSON format in config file: " + jsonConfigPath);
+            }
+        }
+        FormattedResults formattedResults = new FormattedResults();
+        for (CmdLineArgs cmdLineArgs : cmdLineArgsList) {
+            Results result = new Results(cmdLineArgs.indexType().name().toLowerCase(Locale.ROOT), cmdLineArgs.numDocs());
+            System.out.println("Running KNN index tester with arguments: " + cmdLineArgs);
+            Codec codec = createCodec(cmdLineArgs);
+            Path indexPath = PathUtils.get(formatIndexPath(cmdLineArgs));
+            if (cmdLineArgs.reindex() || cmdLineArgs.forceMerge()) {
+                KnnIndexer knnIndexer = new KnnIndexer(
+                    cmdLineArgs.docVectors(),
+                    indexPath,
+                    codec,
+                    cmdLineArgs.indexThreads(),
+                    cmdLineArgs.vectorEncoding(),
+                    cmdLineArgs.dimensions(),
+                    cmdLineArgs.vectorSpace(),
+                    cmdLineArgs.numDocs()
+                );
+                if (Files.exists(indexPath) == false) {
+                    if (cmdLineArgs.reindex() == false) {
+                        throw new IllegalArgumentException("Index path does not exist: " + indexPath);
+                    }
+                    if (cmdLineArgs.forceMerge()) {
+                        throw new IllegalArgumentException("Force merging without an existing index in: " + indexPath);
+                    }
+                }
+                if (cmdLineArgs.reindex()) {
+                    knnIndexer.createIndex(result);
+                }
+                if (cmdLineArgs.forceMerge()) {
+                    knnIndexer.forceMerge(result);
+                } else {
+                    knnIndexer.numSegments(result);
+                }
+            }
+            if (cmdLineArgs.queryVectors() != null) {
+                KnnSearcher knnSearcher = new KnnSearcher(indexPath, cmdLineArgs);
+                knnSearcher.runSearch(result);
+            }
+            formattedResults.results.add(result);
+        }
+        System.out.println("Results:");
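+        // The summary prints as a fixed-width table: index_type, num_docs, index and force-merge times, segment count, latency, CPU time, QPS, recall and visited nodes.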
System.out.println(formattedResults); + } + + static class FormattedResults { + List results = new ArrayList<>(); + + @Override + public String toString() { + if (results.isEmpty()) { + return "No results available."; + } + + // Define column headers + String[] headers = { + "index_type", + "num_docs", + "index_time(ms)", + "force_merge_time(ms)", + "num_segments", + "latency(ms)", + "net_cpu_time(ms)", + "avg_cpu_count", + "QPS", + "recall", + "visited" }; + + // Calculate appropriate column widths based on headers and data + int[] widths = calculateColumnWidths(headers); + + StringBuilder sb = new StringBuilder(); + + // Format and append header + sb.append(formatRow(headers, widths)); + sb.append("\n"); + + // Add separator line + for (int width : widths) { + sb.append("-".repeat(width)).append(" "); + } + sb.append("\n"); + + // Format and append each row of data + for (Results result : results) { + String[] rowData = { + result.indexType, + Integer.toString(result.numDocs), + Long.toString(result.indexTimeMS), + Long.toString(result.forceMergeTimeMS), + Integer.toString(result.numSegments), + String.format(Locale.ROOT, "%.2f", result.avgLatency), + String.format(Locale.ROOT, "%.2f", result.netCpuTimeMS), + String.format(Locale.ROOT, "%.2f", result.avgCpuCount), + String.format(Locale.ROOT, "%.2f", result.qps), + String.format(Locale.ROOT, "%.2f", result.avgRecall), + String.format(Locale.ROOT, "%.2f", result.averageVisited) }; + sb.append(formatRow(rowData, widths)); + sb.append("\n"); + } + + return sb.toString(); + } + + // Helper method to format a single row with proper column widths + private String formatRow(String[] values, int[] widths) { + StringBuilder row = new StringBuilder(); + for (int i = 0; i < values.length; i++) { + // Left-align text column (index_type), right-align numeric columns + String format = (i == 0) ? 
"%-" + widths[i] + "s" : "%" + widths[i] + "s"; + row.append(Strings.format(format, values[i])); + + // Add separation between columns + if (i < values.length - 1) { + row.append(" "); + } + } + return row.toString(); + } + + // Calculate appropriate column widths based on headers and data + private int[] calculateColumnWidths(String[] headers) { + int[] widths = new int[headers.length]; + + // Initialize widths with header lengths + for (int i = 0; i < headers.length; i++) { + widths[i] = headers[i].length(); + } + + // Update widths based on data + for (Results result : results) { + String[] values = { + result.indexType, + Integer.toString(result.numDocs), + Long.toString(result.indexTimeMS), + Long.toString(result.forceMergeTimeMS), + Integer.toString(result.numSegments), + String.format(Locale.ROOT, "%.2f", result.avgLatency), + String.format(Locale.ROOT, "%.2f", result.netCpuTimeMS), + String.format(Locale.ROOT, "%.2f", result.avgCpuCount), + String.format(Locale.ROOT, "%.2f", result.qps), + String.format(Locale.ROOT, "%.2f", result.avgRecall), + String.format(Locale.ROOT, "%.2f", result.averageVisited) }; + + for (int i = 0; i < values.length; i++) { + widths[i] = Math.max(widths[i], values[i].length()); + } + } + + return widths; + } + } + + static class Results { + final String indexType; + final int numDocs; + long indexTimeMS; + long forceMergeTimeMS; + int numSegments; + double avgLatency; + double qps; + double avgRecall; + double averageVisited; + double netCpuTimeMS; + double avgCpuCount; + + Results(String indexType, int numDocs) { + this.indexType = indexType; + this.numDocs = numDocs; + } + } + + static final class SysOutLogger { + + void warn(String message) { + if (LOG_LEVEL.ordinal() >= Level.WARN.ordinal()) { + System.out.println(message); + } + } + + void warn(String message, Object... params) { + if (LOG_LEVEL.ordinal() >= Level.WARN.ordinal()) { + System.out.println(String.format(Locale.ROOT, message, params)); + } + } + + void info(String message) { + if (LOG_LEVEL.ordinal() >= Level.INFO.ordinal()) { + System.out.println(message); + } + } + + void info(String message, Object... params) { + if (LOG_LEVEL.ordinal() >= Level.INFO.ordinal()) { + System.out.println(String.format(Locale.ROOT, message, params)); + } + } + + void debug(String message) { + if (LOG_LEVEL.ordinal() >= Level.DEBUG.ordinal()) { + System.out.println(message); + } + } + + void debug(String message, Object... params) { + if (LOG_LEVEL.ordinal() >= Level.DEBUG.ordinal()) { + System.out.println(String.format(Locale.ROOT, message, params)); + } + } + + void trace(String message) { + if (LOG_LEVEL == Level.TRACE) { + System.out.println(message); + } + } + + void trace(String message, Object... 
params) { + if (LOG_LEVEL == Level.TRACE) { + System.out.println(String.format(Locale.ROOT, message, params)); + } + } + } + + static final class ThreadDetails { + private static final ThreadMXBean threadBean = (ThreadMXBean) java.lang.management.ManagementFactory.getThreadMXBean(); + public final long[] threadIDs; + public final long[] cpuTimesNS; + public final ThreadInfo[] threadInfos; + public final long ns; + + ThreadDetails() { + ns = System.nanoTime(); + threadIDs = threadBean.getAllThreadIds(); + cpuTimesNS = threadBean.getThreadCpuTime(threadIDs); + threadInfos = threadBean.getThreadInfo(threadIDs); + } + } +} diff --git a/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexer.java b/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexer.java new file mode 100644 index 000000000000..07ee4975df7e --- /dev/null +++ b/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexer.java @@ -0,0 +1,329 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * a copy and modification from Lucene util + * Modifications copyright (C) 2025 Elasticsearch B.V. 
+ */ + +package org.elasticsearch.test.knn; + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.KnnByteVectorField; +import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.document.StoredField; +import org.apache.lucene.index.ConcurrentMergeScheduler; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.VectorEncoding; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.PrintStreamInfoStream; +import org.elasticsearch.common.io.Channels; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; + +import static org.elasticsearch.test.knn.KnnIndexTester.logger; + +class KnnIndexer { + private static final double WRITER_BUFFER_MB = 128; + static final String ID_FIELD = "id"; + static final String VECTOR_FIELD = "vector"; + + private final Path docsPath; + private final Path indexPath; + private final VectorEncoding vectorEncoding; + private final int dim; + private final VectorSimilarityFunction similarityFunction; + private final Codec codec; + private final int numDocs; + private final int numIndexThreads; + + KnnIndexer( + Path docsPath, + Path indexPath, + Codec codec, + int numIndexThreads, + VectorEncoding vectorEncoding, + int dim, + VectorSimilarityFunction similarityFunction, + int numDocs + ) { + this.docsPath = docsPath; + this.indexPath = indexPath; + this.codec = codec; + this.numIndexThreads = numIndexThreads; + this.vectorEncoding = vectorEncoding; + this.dim = dim; + this.similarityFunction = similarityFunction; + this.numDocs = numDocs; + } + + void numSegments(KnnIndexTester.Results result) { + try (FSDirectory dir = FSDirectory.open(indexPath); IndexReader reader = DirectoryReader.open(dir)) { + result.numSegments = reader.leaves().size(); + } catch (IOException e) { + throw new UncheckedIOException("Failed to get segment count for index at " + indexPath, e); + } + } + + void createIndex(KnnIndexTester.Results result) throws IOException, InterruptedException, ExecutionException { + IndexWriterConfig iwc = new IndexWriterConfig().setOpenMode(IndexWriterConfig.OpenMode.CREATE); + iwc.setCodec(codec); + iwc.setRAMBufferSizeMB(WRITER_BUFFER_MB); + iwc.setUseCompoundFile(false); + + iwc.setMaxFullFlushMergeWaitMillis(0); + + FieldType fieldType = switch (vectorEncoding) { + case BYTE -> KnnByteVectorField.createFieldType(dim, similarityFunction); + case FLOAT32 -> KnnFloatVectorField.createFieldType(dim, similarityFunction); + }; + iwc.setInfoStream(new PrintStreamInfoStream(System.out) { + @Override + public boolean isEnabled(String component) { + return Objects.equals(component, "IVF"); + } + }); + logger.debug( + "KnnIndexer: using codec=%s, vectorEncoding=%s, dim=%d, similarityFunction=%s", + 
            codec.getName(),
+            vectorEncoding,
+            dim,
+            similarityFunction
+        );
+
+        if (Files.exists(indexPath)) {
+            logger.debug("KnnIndexer: existing index at %s", indexPath);
+        } else {
+            Files.createDirectories(indexPath);
+        }
+
+        long start = System.nanoTime();
+        try (
+            FSDirectory dir = FSDirectory.open(indexPath);
+            IndexWriter iw = new IndexWriter(dir, iwc);
+            FileChannel in = FileChannel.open(docsPath)
+        ) {
+            long docsPathSizeInBytes = in.size();
+            if (docsPathSizeInBytes % ((long) dim * vectorEncoding.byteSize) != 0) {
+                throw new IllegalArgumentException(
+                    "docsPath \"" + docsPath + "\" does not contain a whole number of vectors? size=" + docsPathSizeInBytes
+                );
+            }
+            logger.info(
+                "docsPathSizeInBytes=%d, dim=%d, vectorEncoding=%s, byteSize=%d",
+                docsPathSizeInBytes,
+                dim,
+                vectorEncoding,
+                vectorEncoding.byteSize
+            );
+
+            VectorReader inReader = VectorReader.create(in, dim, vectorEncoding);
+            try (ExecutorService exec = Executors.newFixedThreadPool(numIndexThreads, r -> new Thread(r, "KnnIndexer-Thread"))) {
+                AtomicInteger numDocsIndexed = new AtomicInteger();
+                List<Future<?>> threads = new ArrayList<>();
+                for (int i = 0; i < numIndexThreads; i++) {
+                    Thread t = new IndexerThread(iw, inReader, dim, vectorEncoding, fieldType, numDocsIndexed, numDocs);
+                    t.setDaemon(true);
+                    threads.add(exec.submit(t));
+                }
+                for (Future<?> t : threads) {
+                    t.get();
+                }
+            }
+            logger.debug("all indexing threads finished, now IndexWriter.commit()");
+            iw.commit();
+            ConcurrentMergeScheduler cms = (ConcurrentMergeScheduler) iwc.getMergeScheduler();
+            cms.sync();
+        }
+
+        long elapsed = System.nanoTime() - start;
+        logger.debug("Indexing took %d ms for %d docs", TimeUnit.NANOSECONDS.toMillis(elapsed), numDocs);
+        result.indexTimeMS = TimeUnit.NANOSECONDS.toMillis(elapsed);
+    }
+
+    void forceMerge(KnnIndexTester.Results results) throws Exception {
+        IndexWriterConfig iwc = new IndexWriterConfig().setOpenMode(IndexWriterConfig.OpenMode.APPEND);
+        iwc.setInfoStream(new PrintStreamInfoStream(System.out) {
+            @Override
+            public boolean isEnabled(String component) {
+                return Objects.equals(component, "IVF");
+            }
+        });
+        iwc.setCodec(codec);
+        logger.debug("KnnIndexer: forceMerge in %s", indexPath);
+        long startNS = System.nanoTime();
+        try (IndexWriter iw = new IndexWriter(FSDirectory.open(indexPath), iwc)) {
+            iw.forceMerge(1);
+        }
+        long endNS = System.nanoTime();
+        long elapsedNSec = (endNS - startNS);
+        logger.info("forceMerge took %d ms", TimeUnit.NANOSECONDS.toMillis(elapsedNSec));
+        results.forceMergeTimeMS = TimeUnit.NANOSECONDS.toMillis(elapsedNSec);
+    }
+
+    static class IndexerThread extends Thread {
+        private final IndexWriter iw;
+        private final AtomicInteger numDocsIndexed;
+        private final int numDocsToIndex;
+        private final FieldType fieldType;
+        private final VectorEncoding vectorEncoding;
+        private final byte[] byteVectorBuffer;
+        private final float[] floatVectorBuffer;
+        private final VectorReader in;
+
+        private IndexerThread(
+            IndexWriter iw,
+            VectorReader in,
+            int dims,
+            VectorEncoding vectorEncoding,
+            FieldType fieldType,
+            AtomicInteger numDocsIndexed,
+            int numDocsToIndex
+        ) {
+            this.iw = iw;
+            this.in = in;
+            this.vectorEncoding = vectorEncoding;
+            this.fieldType = fieldType;
+            this.numDocsIndexed = numDocsIndexed;
+            this.numDocsToIndex = numDocsToIndex;
+            switch (vectorEncoding) {
+                case BYTE -> {
+                    byteVectorBuffer = new byte[dims];
+                    floatVectorBuffer = null;
+                }
+                case FLOAT32 -> {
+                    floatVectorBuffer = new float[dims];
+                    byteVectorBuffer = null;
+                }
+                default -> throw new
IllegalArgumentException("unexpected vector encoding: " + vectorEncoding); + } + } + + @Override + public void run() { + try { + _run(); + } catch (IOException ioe) { + throw new UncheckedIOException(ioe); + } + } + + private void _run() throws IOException { + while (true) { + int id = numDocsIndexed.getAndIncrement(); + if (id >= numDocsToIndex) { + break; + } + + Document doc = new Document(); + switch (vectorEncoding) { + case BYTE -> { + in.next(byteVectorBuffer); + doc.add(new KnnByteVectorField(VECTOR_FIELD, byteVectorBuffer, fieldType)); + } + case FLOAT32 -> { + in.next(floatVectorBuffer); + doc.add(new KnnFloatVectorField(VECTOR_FIELD, floatVectorBuffer, fieldType)); + } + } + + if ((id + 1) % 25000 == 0) { + logger.debug("Done indexing " + (id + 1) + " documents."); + } + doc.add(new StoredField(ID_FIELD, id)); + iw.addDocument(doc); + } + } + } + + static class VectorReader { + final float[] target; + final ByteBuffer bytes; + final FileChannel input; + long position; + + static VectorReader create(FileChannel input, int dim, VectorEncoding vectorEncoding) throws IOException { + int bufferSize = dim * vectorEncoding.byteSize; + if (input.size() % ((long) dim * vectorEncoding.byteSize) != 0) { + throw new IllegalArgumentException( + "vectors file \"" + input + "\" does not contain a whole number of vectors? size=" + input.size() + ); + } + return new VectorReader(input, dim, bufferSize); + } + + VectorReader(FileChannel input, int dim, int bufferSize) throws IOException { + this.bytes = ByteBuffer.wrap(new byte[bufferSize]).order(ByteOrder.LITTLE_ENDIAN); + this.input = input; + this.target = new float[dim]; + reset(); + } + + void reset() throws IOException { + position = 0; + input.position(position); + } + + private void readNext() throws IOException { + int bytesRead = Channels.readFromFileChannel(this.input, position, bytes); + if (bytesRead < bytes.capacity()) { + position = 0; + bytes.position(0); + // wrap around back to the start of the file if we hit the end: + logger.warn("VectorReader hit EOF when reading " + this.input + "; now wrapping around to start of file again"); + this.input.position(position); + bytesRead = Channels.readFromFileChannel(this.input, position, bytes); + if (bytesRead < bytes.capacity()) { + throw new IllegalStateException( + "vector file " + input + " doesn't even have enough bytes for a single vector? got bytesRead=" + bytesRead + ); + } + } + position += bytesRead; + bytes.position(0); + } + + synchronized void next(float[] dest) throws IOException { + readNext(); + bytes.asFloatBuffer().get(dest); + } + + synchronized void next(byte[] dest) throws IOException { + readNext(); + bytes.get(dest); + } + } +} diff --git a/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnSearcher.java b/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnSearcher.java new file mode 100644 index 000000000000..b0738a6ea5bf --- /dev/null +++ b/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnSearcher.java @@ -0,0 +1,488 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * a copy and modification from Lucene util + * Modifications copyright (C) 2025 Elasticsearch B.V. + */ + +package org.elasticsearch.test.knn; + +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.StoredFields; +import org.apache.lucene.index.VectorEncoding; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.queries.function.FunctionQuery; +import org.apache.lucene.queries.function.valuesource.ByteKnnVectorFieldSource; +import org.apache.lucene.queries.function.valuesource.ByteVectorSimilarityFunction; +import org.apache.lucene.queries.function.valuesource.ConstKnnByteVectorValueSource; +import org.apache.lucene.queries.function.valuesource.ConstKnnFloatValueSource; +import org.apache.lucene.queries.function.valuesource.FloatKnnVectorFieldSource; +import org.apache.lucene.queries.function.valuesource.FloatVectorSimilarityFunction; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TotalHits; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.MMapDirectory; +import org.elasticsearch.core.PathUtils; +import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper; +import org.elasticsearch.search.profile.query.QueryProfiler; +import org.elasticsearch.search.vectors.ESKnnByteVectorQuery; +import org.elasticsearch.search.vectors.ESKnnFloatVectorQuery; +import org.elasticsearch.search.vectors.IVFKnnFloatVectorQuery; +import org.elasticsearch.search.vectors.QueryProfilerProvider; +import org.elasticsearch.search.vectors.RescoreKnnVectorQuery; + +import java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.IntBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.attribute.FileTime; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Objects; +import java.util.Set; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.TimeUnit; + +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; +import static org.elasticsearch.test.knn.KnnIndexTester.logger; +import static org.elasticsearch.test.knn.KnnIndexer.ID_FIELD; +import static org.elasticsearch.test.knn.KnnIndexer.VECTOR_FIELD; + +class KnnSearcher { + + private final Path docPath; + private final Path indexPath; + private final Path queryPath; + private final int numDocs; + private final int numQueryVectors; + private final long randomSeed = 42; + private final float selectivity = 1f; + private final int topK; + private final int efSearch; + private final int nProbe; + private final KnnIndexTester.IndexType indexType; + private final int dim; + private final 
VectorSimilarityFunction similarityFunction; + private final VectorEncoding vectorEncoding; + private final float overSamplingFactor; + private final int searchThreads; + + KnnSearcher(Path indexPath, CmdLineArgs cmdLineArgs) { + this.docPath = cmdLineArgs.docVectors(); + this.indexPath = indexPath; + this.queryPath = cmdLineArgs.queryVectors(); + this.numDocs = cmdLineArgs.numDocs(); + this.numQueryVectors = cmdLineArgs.numQueries(); + this.topK = cmdLineArgs.k(); + this.dim = cmdLineArgs.dimensions(); + this.similarityFunction = cmdLineArgs.vectorSpace(); + this.vectorEncoding = cmdLineArgs.vectorEncoding(); + this.overSamplingFactor = cmdLineArgs.overSamplingFactor(); + if (numQueryVectors <= 0) { + throw new IllegalArgumentException("numQueryVectors must be > 0"); + } + this.efSearch = cmdLineArgs.numCandidates(); + this.nProbe = cmdLineArgs.nProbe(); + this.indexType = cmdLineArgs.indexType(); + this.searchThreads = cmdLineArgs.searchThreads(); + } + + void runSearch(KnnIndexTester.Results finalResults) throws IOException { + TopDocs[] results = new TopDocs[numQueryVectors]; + int[][] resultIds = new int[numQueryVectors][]; + long elapsed, totalCpuTimeMS, totalVisited = 0; + try ( + FileChannel input = FileChannel.open(queryPath); + ExecutorService executorService = Executors.newFixedThreadPool(searchThreads, r -> new Thread(r, "KnnSearcher-Thread")) + ) { + long queryPathSizeInBytes = input.size(); + logger.info( + "queryPath size: " + + queryPathSizeInBytes + + " bytes, assuming vector count is " + + (queryPathSizeInBytes / ((long) dim * vectorEncoding.byteSize)) + ); + KnnIndexer.VectorReader targetReader = KnnIndexer.VectorReader.create(input, dim, vectorEncoding); + long startNS; + try (MMapDirectory dir = new MMapDirectory(indexPath)) { + try (DirectoryReader reader = DirectoryReader.open(dir)) { + IndexSearcher searcher = searchThreads > 1 ? new IndexSearcher(reader, executorService) : new IndexSearcher(reader); + byte[] targetBytes = new byte[dim]; + float[] target = new float[dim]; + // warm up + for (int i = 0; i < numQueryVectors; i++) { + if (vectorEncoding.equals(VectorEncoding.BYTE)) { + targetReader.next(targetBytes); + doVectorQuery(targetBytes, searcher); + } else { + targetReader.next(target); + doVectorQuery(target, searcher); + } + } + targetReader.reset(); + startNS = System.nanoTime(); + KnnIndexTester.ThreadDetails startThreadDetails = new KnnIndexTester.ThreadDetails(); + for (int i = 0; i < numQueryVectors; i++) { + if (vectorEncoding.equals(VectorEncoding.BYTE)) { + targetReader.next(targetBytes); + results[i] = doVectorQuery(targetBytes, searcher); + } else { + targetReader.next(target); + results[i] = doVectorQuery(target, searcher); + } + } + KnnIndexTester.ThreadDetails endThreadDetails = new KnnIndexTester.ThreadDetails(); + elapsed = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNS); + long startCPUTimeNS = 0; + long endCPUTimeNS = 0; + for (int i = 0; i < startThreadDetails.threadInfos.length; i++) { + if (startThreadDetails.threadInfos[i].getThreadName().startsWith("KnnSearcher-Thread")) { + startCPUTimeNS += startThreadDetails.cpuTimesNS[i]; + } + } + + for (int i = 0; i < endThreadDetails.threadInfos.length; i++) { + if (endThreadDetails.threadInfos[i].getThreadName().startsWith("KnnSearcher-Thread")) { + endCPUTimeNS += endThreadDetails.cpuTimesNS[i]; + } + } + totalCpuTimeMS = TimeUnit.NANOSECONDS.toMillis(endCPUTimeNS - startCPUTimeNS); + + // Fetch, validate and write result document ids. 
+ StoredFields storedFields = reader.storedFields(); + for (int i = 0; i < numQueryVectors; i++) { + totalVisited += results[i].totalHits.value(); + resultIds[i] = getResultIds(results[i], storedFields); + } + logger.info( + "completed %d searches in %d ms: %d QPS CPU time=%dms", + numQueryVectors, + elapsed, + (1000L * numQueryVectors) / elapsed, + totalCpuTimeMS + ); + } + } + } + logger.info("checking results"); + int[][] nn = getOrCalculateExactNN(); + finalResults.avgRecall = checkResults(resultIds, nn, topK); + finalResults.qps = (1000f * numQueryVectors) / elapsed; + finalResults.avgLatency = (float) elapsed / numQueryVectors; + finalResults.averageVisited = (double) totalVisited / numQueryVectors; + finalResults.netCpuTimeMS = (double) totalCpuTimeMS / numQueryVectors; + finalResults.avgCpuCount = (double) totalCpuTimeMS / elapsed; + } + + private int[][] getOrCalculateExactNN() throws IOException { + // look in working directory for cached nn file + String hash = Integer.toString( + Objects.hash( + docPath, + indexPath, + queryPath, + numDocs, + numQueryVectors, + topK, + similarityFunction.ordinal(), + selectivity, + randomSeed + ), + 36 + ); + String nnFileName = "nn-" + hash + ".bin"; + Path nnPath = PathUtils.get("target/" + nnFileName); + if (Files.exists(nnPath) && isNewer(nnPath, docPath, indexPath, queryPath)) { + logger.info("read pre-cached exact match vectors from cache file \"" + nnPath + "\""); + return readExactNN(nnPath); + } else { + logger.info("computing brute-force exact KNN matches for " + numQueryVectors + " query vectors from \"" + queryPath + "\""); + long startNS = System.nanoTime(); + // TODO: enable computing NN from high precision vectors when + // checking low-precision recall + int[][] nn; + if (vectorEncoding.equals(VectorEncoding.BYTE)) { + nn = computeExactNNByte(queryPath); + } else { + nn = computeExactNN(queryPath); + } + writeExactNN(nn, nnPath); + long elapsedMS = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNS); // ns -> ms + logger.info("computed " + numQueryVectors + " exact matches in " + elapsedMS + " ms"); + return nn; + } + } + + private boolean isNewer(Path path, Path... 
others) throws IOException { + FileTime modified = Files.getLastModifiedTime(path); + for (Path other : others) { + if (Files.getLastModifiedTime(other).compareTo(modified) >= 0) { + return false; + } + } + return true; + } + + TopDocs doVectorQuery(byte[] vector, IndexSearcher searcher) throws IOException { + Query knnQuery; + if (overSamplingFactor > 1f) { + throw new IllegalArgumentException("oversampling factor > 1 is not supported for byte vectors"); + } + if (indexType == KnnIndexTester.IndexType.IVF) { + throw new IllegalArgumentException("IVF index type does not support byte vectors"); + } else { + knnQuery = new ESKnnByteVectorQuery( + VECTOR_FIELD, + vector, + topK, + efSearch, + null, + DenseVectorFieldMapper.FilterHeuristic.ACORN.getKnnSearchStrategy() + ); + } + QueryProfiler profiler = new QueryProfiler(); + TopDocs docs = searcher.search(knnQuery, this.topK); + QueryProfilerProvider queryProfilerProvider = (QueryProfilerProvider) knnQuery; + queryProfilerProvider.profile(profiler); + return new TopDocs(new TotalHits(profiler.getVectorOpsCount(), docs.totalHits.relation()), docs.scoreDocs); + } + + TopDocs doVectorQuery(float[] vector, IndexSearcher searcher) throws IOException { + Query knnQuery; + int topK = this.topK; + int efSearch = this.efSearch; + if (overSamplingFactor > 1f) { + // oversample the topK results to get more candidates for the final result + topK = (int) Math.ceil(topK * overSamplingFactor); + efSearch = Math.max(topK, efSearch); + } + if (indexType == KnnIndexTester.IndexType.IVF) { + knnQuery = new IVFKnnFloatVectorQuery(VECTOR_FIELD, vector, topK, efSearch, null, nProbe); + } else { + knnQuery = new ESKnnFloatVectorQuery( + VECTOR_FIELD, + vector, + topK, + efSearch, + null, + DenseVectorFieldMapper.FilterHeuristic.ACORN.getKnnSearchStrategy() + ); + } + if (overSamplingFactor > 1f) { + // oversample the topK results to get more candidates for the final result + knnQuery = new RescoreKnnVectorQuery(VECTOR_FIELD, vector, similarityFunction, this.topK, knnQuery); + } + QueryProfiler profiler = new QueryProfiler(); + TopDocs docs = searcher.search(knnQuery, this.topK); + QueryProfilerProvider queryProfilerProvider = (QueryProfilerProvider) knnQuery; + queryProfilerProvider.profile(profiler); + return new TopDocs(new TotalHits(profiler.getVectorOpsCount(), docs.totalHits.relation()), docs.scoreDocs); + } + + private static float checkResults(int[][] results, int[][] nn, int topK) { + int totalMatches = 0; + int totalResults = results.length * topK; + for (int i = 0; i < results.length; i++) { + totalMatches += compareNN(nn[i], results[i], topK); + } + return totalMatches / (float) totalResults; + } + + private static int compareNN(int[] expected, int[] results, int topK) { + int matched = 0; + Set expectedSet = new HashSet<>(); + Set alreadySeen = new HashSet<>(); + for (int i = 0; i < topK; i++) { + expectedSet.add(expected[i]); + } + for (int docId : results) { + if (alreadySeen.add(docId) == false) { + throw new IllegalStateException("duplicate docId=" + docId); + } + if (expectedSet.contains(docId)) { + ++matched; + } + } + return matched; + } + + private int[][] readExactNN(Path nnPath) throws IOException { + int[][] result = new int[numQueryVectors][]; + try (FileChannel in = FileChannel.open(nnPath)) { + IntBuffer intBuffer = in.map(FileChannel.MapMode.READ_ONLY, 0, (long) numQueryVectors * topK * Integer.BYTES) + .order(ByteOrder.LITTLE_ENDIAN) + .asIntBuffer(); + for (int i = 0; i < numQueryVectors; i++) { + result[i] = new int[topK]; + 
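+                // each row in the cache file holds topK little-endian ints for one query, matching the layout written by writeExactNN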
                intBuffer.get(result[i]);
+            }
+        }
+        return result;
+    }
+
+    private void writeExactNN(int[][] nn, Path nnPath) throws IOException {
+        logger.info("writing true nearest neighbors to cache file \"" + nnPath + "\"");
+        ByteBuffer tmp = ByteBuffer.allocate(nn[0].length * Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN);
+        try (OutputStream out = Files.newOutputStream(nnPath)) {
+            for (int i = 0; i < numQueryVectors; i++) {
+                tmp.asIntBuffer().put(nn[i]);
+                out.write(tmp.array());
+            }
+        }
+    }
+
+    private int[][] computeExactNN(Path queryPath) throws IOException {
+        int[][] result = new int[numQueryVectors][];
+        try (Directory dir = FSDirectory.open(indexPath); DirectoryReader reader = DirectoryReader.open(dir)) {
+            List<Callable<Void>> tasks = new ArrayList<>();
+            try (FileChannel qIn = FileChannel.open(queryPath)) {
+                KnnIndexer.VectorReader queryReader = KnnIndexer.VectorReader.create(qIn, dim, VectorEncoding.FLOAT32);
+                for (int i = 0; i < numQueryVectors; i++) {
+                    float[] queryVector = new float[dim];
+                    queryReader.next(queryVector);
+                    tasks.add(new ComputeNNFloatTask(i, topK, queryVector, result, reader, similarityFunction));
+                }
+                ForkJoinPool.commonPool().invokeAll(tasks);
+            }
+            return result;
+        }
+    }
+
+    private int[][] computeExactNNByte(Path queryPath) throws IOException {
+        int[][] result = new int[numQueryVectors][];
+        try (Directory dir = FSDirectory.open(indexPath); DirectoryReader reader = DirectoryReader.open(dir)) {
+            List<Callable<Void>> tasks = new ArrayList<>();
+            try (FileChannel qIn = FileChannel.open(queryPath)) {
+                KnnIndexer.VectorReader queryReader = KnnIndexer.VectorReader.create(qIn, dim, VectorEncoding.BYTE);
+                for (int i = 0; i < numQueryVectors; i++) {
+                    byte[] queryVector = new byte[dim];
+                    queryReader.next(queryVector);
+                    tasks.add(new ComputeNNByteTask(i, queryVector, result, reader, similarityFunction));
+                }
+                ForkJoinPool.commonPool().invokeAll(tasks);
+            }
+            return result;
+        }
+    }
+
+    static class ComputeNNFloatTask implements Callable<Void> {
+
+        private final int queryOrd;
+        private final float[] query;
+        private final int[][] result;
+        private final IndexReader reader;
+        private final VectorSimilarityFunction similarityFunction;
+        private final int topK;
+
+        ComputeNNFloatTask(
+            int queryOrd,
+            int topK,
+            float[] query,
+            int[][] result,
+            IndexReader reader,
+            VectorSimilarityFunction similarityFunction
+        ) {
+            this.queryOrd = queryOrd;
+            this.query = query;
+            this.result = result;
+            this.reader = reader;
+            this.similarityFunction = similarityFunction;
+            this.topK = topK;
+        }
+
+        @Override
+        public Void call() {
+            IndexSearcher searcher = new IndexSearcher(reader);
+            try {
+                var queryVector = new ConstKnnFloatValueSource(query);
+                var docVectors = new FloatKnnVectorFieldSource(VECTOR_FIELD);
+                Query query = new FunctionQuery(new FloatVectorSimilarityFunction(similarityFunction, queryVector, docVectors));
+                var topDocs = searcher.search(query, topK);
+                result[queryOrd] = getResultIds(topDocs, reader.storedFields());
+                if ((queryOrd + 1) % 10 == 0) {
+                    logger.info(" exact knn scored " + (queryOrd + 1));
+                }
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+            return null;
+        }
+    }
+
+    static class ComputeNNByteTask implements Callable<Void> {
+
+        private final int queryOrd;
+        private final byte[] query;
+        private final int[][] result;
+        private final IndexReader reader;
+        private final VectorSimilarityFunction similarityFunction;
+
+        ComputeNNByteTask(int queryOrd, byte[] query, int[][] result, IndexReader reader, VectorSimilarityFunction similarityFunction) {
+            this.queryOrd = queryOrd;
+ this.query = query; + this.result = result; + this.reader = reader; + this.similarityFunction = similarityFunction; + } + + @Override + public Void call() { + IndexSearcher searcher = new IndexSearcher(reader); + int topK = result[0].length; + try { + var queryVector = new ConstKnnByteVectorValueSource(query); + var docVectors = new ByteKnnVectorFieldSource(VECTOR_FIELD); + Query query = new FunctionQuery(new ByteVectorSimilarityFunction(similarityFunction, queryVector, docVectors)); + var topDocs = searcher.search(query, topK); + result[queryOrd] = getResultIds(topDocs, reader.storedFields()); + if ((queryOrd + 1) % 10 == 0) { + logger.info(" exact knn scored " + (queryOrd + 1)); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + return null; + } + } + + static int[] getResultIds(TopDocs topDocs, StoredFields storedFields) throws IOException { + int[] resultIds = new int[topDocs.scoreDocs.length]; + int i = 0; + for (ScoreDoc doc : topDocs.scoreDocs) { + if (doc.doc != NO_MORE_DOCS) { + // there is a bug somewhere that can result in doc=NO_MORE_DOCS! I think it happens + // in some degenerate case (like input query has NaN in it?) that causes no results to + // be returned from HNSW search? + resultIds[i++] = Integer.parseInt(storedFields.document(doc.doc).get(ID_FIELD)); + } + } + return resultIds; + } + +} diff --git a/server/src/main/java/module-info.java b/server/src/main/java/module-info.java index c0e180c543a5..cd418d21e05c 100644 --- a/server/src/main/java/module-info.java +++ b/server/src/main/java/module-info.java @@ -479,4 +479,6 @@ module org.elasticsearch.server { exports org.elasticsearch.lucene.util.automaton; exports org.elasticsearch.index.codec.perfield; exports org.elasticsearch.lucene.search; + exports org.elasticsearch.index.codec.vectors to org.elasticsearch.test.knn; + exports org.elasticsearch.index.codec.vectors.es818 to org.elasticsearch.test.knn; } diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/IVFVectorsReader.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/IVFVectorsReader.java index 12726836719b..f2145b463ad9 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/IVFVectorsReader.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/IVFVectorsReader.java @@ -51,6 +51,7 @@ public abstract class IVFVectorsReader extends KnnVectorsReader { protected final IntObjectHashMap fields; private final FlatVectorsReader rawVectorsReader; + @SuppressWarnings("this-escape") protected IVFVectorsReader(SegmentReadState state, FlatVectorsReader rawVectorsReader) throws IOException { this.state = state; this.fieldInfos = state.fieldInfos; diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/IVFVectorsWriter.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/IVFVectorsWriter.java index 4e9c4ee47e3f..d6188703881a 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/IVFVectorsWriter.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/IVFVectorsWriter.java @@ -55,6 +55,7 @@ public abstract class IVFVectorsWriter extends KnnVectorsWriter { private final FlatVectorsWriter rawVectorDelegate; private final SegmentWriteState segmentWriteState; + @SuppressWarnings("this-escape") protected IVFVectorsWriter(SegmentWriteState state, FlatVectorsWriter rawVectorDelegate) throws IOException { this.segmentWriteState = state; this.rawVectorDelegate = rawVectorDelegate; diff --git 
a/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsReader.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsReader.java index f809fd81bbd5..c86dec80c7ba 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsReader.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsReader.java @@ -69,6 +69,7 @@ public class ES816BinaryQuantizedVectorsReader extends FlatVectorsReader impleme private final FlatVectorsReader rawVectorsReader; private final ES816BinaryFlatVectorsScorer vectorScorer; + @SuppressWarnings("this-escape") ES816BinaryQuantizedVectorsReader( SegmentReadState state, FlatVectorsReader rawVectorsReader, diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOLucene99FlatVectorsReader.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOLucene99FlatVectorsReader.java index d1c107ebe15a..bdf1e3c925b2 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOLucene99FlatVectorsReader.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOLucene99FlatVectorsReader.java @@ -65,6 +65,7 @@ public class DirectIOLucene99FlatVectorsReader extends FlatVectorsReader impleme private final IndexInput vectorData; private final FieldInfos fieldInfos; + @SuppressWarnings("this-escape") public DirectIOLucene99FlatVectorsReader(SegmentReadState state, FlatVectorsScorer scorer) throws IOException { super(scorer); int versionMeta = readMetadata(state); diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsReader.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsReader.java index ac707d155ea3..333f47a284da 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsReader.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsReader.java @@ -70,6 +70,7 @@ public class ES818BinaryQuantizedVectorsReader extends FlatVectorsReader impleme private final FlatVectorsReader rawVectorsReader; private final ES818BinaryFlatVectorsScorer vectorScorer; + @SuppressWarnings("this-escape") ES818BinaryQuantizedVectorsReader( SegmentReadState state, FlatVectorsReader rawVectorsReader, diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsWriter.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsWriter.java index 7cfa755c2610..a4983e234c8d 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsWriter.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsWriter.java @@ -84,6 +84,7 @@ public class ES818BinaryQuantizedVectorsWriter extends FlatVectorsWriter { * * @param vectorsScorer the scorer to use for scoring vectors */ + @SuppressWarnings("this-escape") protected ES818BinaryQuantizedVectorsWriter( ES818BinaryFlatVectorsScorer vectorsScorer, FlatVectorsWriter rawVectorDelegate, diff --git a/settings.gradle b/settings.gradle index 61487d204828..6bca35f40bb2 100644 --- a/settings.gradle +++ b/settings.gradle @@ -171,3 +171,5 @@ if (extraProjects.exists()) { addSubProjects('', extraProjectDir) } } + +include 'qa:vector' \ No newline at end of 
file diff --git a/test/external-modules/build.gradle b/test/external-modules/build.gradle index dfdc47d9f5be..7b317f27f0b8 100644 --- a/test/external-modules/build.gradle +++ b/test/external-modules/build.gradle @@ -8,11 +8,11 @@ */ subprojects { - apply plugin: 'elasticsearch.base-internal-es-plugin' + apply plugin: 'elasticsearch.base-internal-es-plugin' - esplugin { - name = it.name - licenseFile = layout.settingsDirectory.file('licenses/AGPL-3.0+SSPL-1.0+ELASTIC-LICENSE-2.0.txt').asFile - noticeFile = layout.settingsDirectory.file('NOTICE.txt').asFile - } + esplugin { + name = it.name + licenseFile = layout.settingsDirectory.file('licenses/AGPL-3.0+SSPL-1.0+ELASTIC-LICENSE-2.0.txt').asFile + noticeFile = layout.settingsDirectory.file('NOTICE.txt').asFile + } }