mirror of
https://github.com/elastic/elasticsearch.git
synced 2025-06-28 01:22:26 -04:00
Vector test tools (#128934)
This adds some testing tools for verifying vector recall and latency directly without having to spin up an entire ES node and running a rally track. Its pretty barebones and takes inspiration from lucene-util, but I wanted access to our own formats and tooling to make our lives easier. Here is an example config file. This will build the initial index, run queries at num_candidates: 50, then again at num_candidates 100 (without reindexing, and re-using the cached nearest neighbors). ``` [{ "doc_vectors" : "path", "query_vectors" : "path", "num_docs" : 10000, "num_queries" : 10, "index_type" : "hnsw", "num_candidates" : 50, "k" : 10, "hnsw_m" : 16, "hnsw_ef_construction" : 200, "index_threads" : 4, "reindex" : true, "force_merge" : false, "vector_space" : "maximum_inner_product", "dimensions" : 768 }, { "doc_vectors" : "path", "query_vectors" : "path", "num_docs" : 10000, "num_queries" : 10, "index_type" : "hnsw", "num_candidates" : 100, "k" : 10, "hnsw_m" : 16, "hnsw_ef_construction" : 200, "vector_space" : "maximum_inner_product", "dimensions" : 768 } ] ``` To execute: ``` ./gradlew :qa:vector:checkVec --args="/Path/to/knn_tester_config.json" ``` Calling `./gradlew :qa:vector:checkVecHelp` gives some guidance on how to use it, additionally providing a way to run it via java directly (useful to bypass gradlew guff).
This commit is contained in:
parent
ffa8927a9f
commit
155c0da00a
17 changed files with 2312 additions and 6 deletions
101
qa/vector/build.gradle
Normal file
101
qa/vector/build.gradle
Normal file
|
@ -0,0 +1,101 @@
|
||||||
|
/*
|
||||||
|
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||||
|
* or more contributor license agreements. Licensed under the "Elastic License
|
||||||
|
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
|
||||||
|
* Public License v 1"; you may not use this file except in compliance with, at
|
||||||
|
* your election, the "Elastic License 2.0", the "GNU Affero General Public
|
||||||
|
* License v3.0 only", or the "Server Side Public License, v 1".
|
||||||
|
*/
|
||||||
|
|
||||||
|
apply plugin: 'elasticsearch.java'
|
||||||
|
apply plugin: 'elasticsearch.build'
|
||||||
|
|
||||||
|
|
||||||
|
tasks.named("dependencyLicenses").configure {
|
||||||
|
mapping from: /lucene-.*/, to: 'lucene'
|
||||||
|
}
|
||||||
|
|
||||||
|
tasks.named('forbiddenApisMain').configure {
|
||||||
|
enabled = false
|
||||||
|
}
|
||||||
|
|
||||||
|
dependencies {
|
||||||
|
api "org.apache.lucene:lucene-core:${versions.lucene}"
|
||||||
|
api "org.apache.lucene:lucene-queries:${versions.lucene}"
|
||||||
|
api "org.apache.lucene:lucene-codecs:${versions.lucene}"
|
||||||
|
implementation project(':libs:logging')
|
||||||
|
implementation project(':server')
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Task to run the KnnIndexTester with the provided parameters.
|
||||||
|
*/
|
||||||
|
tasks.register("checkVec", JavaExec) {
|
||||||
|
group = "Execution"
|
||||||
|
description = "Runs KnnIndexTester with the provided parameters to validate recall and performance."
|
||||||
|
classpath = sourceSets.main.runtimeClasspath
|
||||||
|
mainClass.set("org.elasticsearch.test.knn.KnnIndexTester")
|
||||||
|
// Configure logging to console
|
||||||
|
systemProperty "es.logger.out", "console"
|
||||||
|
systemProperty "es.logger.level", "INFO" // Change to DEBUG if needed
|
||||||
|
|
||||||
|
if (buildParams.getRuntimeJavaVersion().map { it.majorVersion.toInteger() }.get() >= 21) {
|
||||||
|
jvmArgs '-Xms4g', '-Xmx4g', '--add-modules=jdk.incubator.vector', '--enable-native-access=ALL-UNNAMED', '-Djava.util.concurrent.ForkJoinPool.common.parallelism=8', '-XX:+UnlockDiagnosticVMOptions', '-XX:+DebugNonSafepoints', '-XX:+HeapDumpOnOutOfMemoryError'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tasks.register("checkVecHelp", JavaExec) {
|
||||||
|
group = "Help"
|
||||||
|
description = "Prints help for the KnnIndexTester task."
|
||||||
|
classpath = sourceSets.main.runtimeClasspath
|
||||||
|
mainClass.set("org.elasticsearch.test.knn.KnnIndexTester")
|
||||||
|
args = ["--help"]
|
||||||
|
doLast {
|
||||||
|
println """
|
||||||
|
=============================================================================
|
||||||
|
KnnIndexTester Help
|
||||||
|
=============================================================================
|
||||||
|
|
||||||
|
Run with Gradle:
|
||||||
|
----------------
|
||||||
|
# Using default configuration file
|
||||||
|
./gradlew :qa:vector:checkVec
|
||||||
|
|
||||||
|
# Using custom configuration file
|
||||||
|
./gradlew :qa:vector:checkVec --args="path/to/your/config.json"
|
||||||
|
|
||||||
|
# Adjust heap size
|
||||||
|
./gradlew :qa:vector:checkVec -Dorg.gradle.jvmargs="-Xmx8g" --args="path/to/your/config.json"
|
||||||
|
|
||||||
|
# Set environment variable for more extensive JVM options
|
||||||
|
export GRADLE_OPTS="-Xmx8g -XX:+UseG1GC -XX:MaxGCPauseMillis=100"
|
||||||
|
./gradlew :qa:vector:checkVec
|
||||||
|
|
||||||
|
|
||||||
|
Run directly with Java:
|
||||||
|
----------------------
|
||||||
|
# Generate classpath (run once to create the file)
|
||||||
|
./gradlew :qa:vector:printClasspath
|
||||||
|
|
||||||
|
# Then use the classpath file with java
|
||||||
|
java -cp "\$(cat build/vector_classpath.txt)" \\
|
||||||
|
--add-modules=jdk.incubator.vector \\
|
||||||
|
--enable-native-access=ALL-UNNAMED \\
|
||||||
|
-Djava.util.concurrent.ForkJoinPool.common.parallelism=8 \\
|
||||||
|
-Xmx4g \\
|
||||||
|
-Xms4g \\\\
|
||||||
|
org.elasticsearch.test.knn.KnnIndexTester path/to/your/config.json
|
||||||
|
"""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tasks.register("printClasspath") {
|
||||||
|
group = "Help"
|
||||||
|
description = "Prints the classpath needed to run KnnIndexTester directly with java"
|
||||||
|
|
||||||
|
doLast {
|
||||||
|
def classpathFile = new File("${buildDir}/vector_classpath.txt")
|
||||||
|
classpathFile.parentFile.mkdirs()
|
||||||
|
classpathFile.text = sourceSets.main.runtimeClasspath.asPath
|
||||||
|
println "Classpath written to: ${classpathFile.absolutePath}"
|
||||||
|
}
|
||||||
|
}
|
475
qa/vector/licenses/lucene-LICENSE.txt
Normal file
475
qa/vector/licenses/lucene-LICENSE.txt
Normal file
|
@ -0,0 +1,475 @@
|
||||||
|
|
||||||
|
Apache License
|
||||||
|
Version 2.0, January 2004
|
||||||
|
http://www.apache.org/licenses/
|
||||||
|
|
||||||
|
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||||
|
|
||||||
|
1. Definitions.
|
||||||
|
|
||||||
|
"License" shall mean the terms and conditions for use, reproduction,
|
||||||
|
and distribution as defined by Sections 1 through 9 of this document.
|
||||||
|
|
||||||
|
"Licensor" shall mean the copyright owner or entity authorized by
|
||||||
|
the copyright owner that is granting the License.
|
||||||
|
|
||||||
|
"Legal Entity" shall mean the union of the acting entity and all
|
||||||
|
other entities that control, are controlled by, or are under common
|
||||||
|
control with that entity. For the purposes of this definition,
|
||||||
|
"control" means (i) the power, direct or indirect, to cause the
|
||||||
|
direction or management of such entity, whether by contract or
|
||||||
|
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||||
|
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||||
|
|
||||||
|
"You" (or "Your") shall mean an individual or Legal Entity
|
||||||
|
exercising permissions granted by this License.
|
||||||
|
|
||||||
|
"Source" form shall mean the preferred form for making modifications,
|
||||||
|
including but not limited to software source code, documentation
|
||||||
|
source, and configuration files.
|
||||||
|
|
||||||
|
"Object" form shall mean any form resulting from mechanical
|
||||||
|
transformation or translation of a Source form, including but
|
||||||
|
not limited to compiled object code, generated documentation,
|
||||||
|
and conversions to other media types.
|
||||||
|
|
||||||
|
"Work" shall mean the work of authorship, whether in Source or
|
||||||
|
Object form, made available under the License, as indicated by a
|
||||||
|
copyright notice that is included in or attached to the work
|
||||||
|
(an example is provided in the Appendix below).
|
||||||
|
|
||||||
|
"Derivative Works" shall mean any work, whether in Source or Object
|
||||||
|
form, that is based on (or derived from) the Work and for which the
|
||||||
|
editorial revisions, annotations, elaborations, or other modifications
|
||||||
|
represent, as a whole, an original work of authorship. For the purposes
|
||||||
|
of this License, Derivative Works shall not include works that remain
|
||||||
|
separable from, or merely link (or bind by name) to the interfaces of,
|
||||||
|
the Work and Derivative Works thereof.
|
||||||
|
|
||||||
|
"Contribution" shall mean any work of authorship, including
|
||||||
|
the original version of the Work and any modifications or additions
|
||||||
|
to that Work or Derivative Works thereof, that is intentionally
|
||||||
|
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||||
|
or by an individual or Legal Entity authorized to submit on behalf of
|
||||||
|
the copyright owner. For the purposes of this definition, "submitted"
|
||||||
|
means any form of electronic, verbal, or written communication sent
|
||||||
|
to the Licensor or its representatives, including but not limited to
|
||||||
|
communication on electronic mailing lists, source code control systems,
|
||||||
|
and issue tracking systems that are managed by, or on behalf of, the
|
||||||
|
Licensor for the purpose of discussing and improving the Work, but
|
||||||
|
excluding communication that is conspicuously marked or otherwise
|
||||||
|
designated in writing by the copyright owner as "Not a Contribution."
|
||||||
|
|
||||||
|
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||||
|
on behalf of whom a Contribution has been received by Licensor and
|
||||||
|
subsequently incorporated within the Work.
|
||||||
|
|
||||||
|
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
copyright license to reproduce, prepare Derivative Works of,
|
||||||
|
publicly display, publicly perform, sublicense, and distribute the
|
||||||
|
Work and such Derivative Works in Source or Object form.
|
||||||
|
|
||||||
|
3. Grant of Patent License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
(except as stated in this section) patent license to make, have made,
|
||||||
|
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||||
|
where such license applies only to those patent claims licensable
|
||||||
|
by such Contributor that are necessarily infringed by their
|
||||||
|
Contribution(s) alone or by combination of their Contribution(s)
|
||||||
|
with the Work to which such Contribution(s) was submitted. If You
|
||||||
|
institute patent litigation against any entity (including a
|
||||||
|
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||||
|
or a Contribution incorporated within the Work constitutes direct
|
||||||
|
or contributory patent infringement, then any patent licenses
|
||||||
|
granted to You under this License for that Work shall terminate
|
||||||
|
as of the date such litigation is filed.
|
||||||
|
|
||||||
|
4. Redistribution. You may reproduce and distribute copies of the
|
||||||
|
Work or Derivative Works thereof in any medium, with or without
|
||||||
|
modifications, and in Source or Object form, provided that You
|
||||||
|
meet the following conditions:
|
||||||
|
|
||||||
|
(a) You must give any other recipients of the Work or
|
||||||
|
Derivative Works a copy of this License; and
|
||||||
|
|
||||||
|
(b) You must cause any modified files to carry prominent notices
|
||||||
|
stating that You changed the files; and
|
||||||
|
|
||||||
|
(c) You must retain, in the Source form of any Derivative Works
|
||||||
|
that You distribute, all copyright, patent, trademark, and
|
||||||
|
attribution notices from the Source form of the Work,
|
||||||
|
excluding those notices that do not pertain to any part of
|
||||||
|
the Derivative Works; and
|
||||||
|
|
||||||
|
(d) If the Work includes a "NOTICE" text file as part of its
|
||||||
|
distribution, then any Derivative Works that You distribute must
|
||||||
|
include a readable copy of the attribution notices contained
|
||||||
|
within such NOTICE file, excluding those notices that do not
|
||||||
|
pertain to any part of the Derivative Works, in at least one
|
||||||
|
of the following places: within a NOTICE text file distributed
|
||||||
|
as part of the Derivative Works; within the Source form or
|
||||||
|
documentation, if provided along with the Derivative Works; or,
|
||||||
|
within a display generated by the Derivative Works, if and
|
||||||
|
wherever such third-party notices normally appear. The contents
|
||||||
|
of the NOTICE file are for informational purposes only and
|
||||||
|
do not modify the License. You may add Your own attribution
|
||||||
|
notices within Derivative Works that You distribute, alongside
|
||||||
|
or as an addendum to the NOTICE text from the Work, provided
|
||||||
|
that such additional attribution notices cannot be construed
|
||||||
|
as modifying the License.
|
||||||
|
|
||||||
|
You may add Your own copyright statement to Your modifications and
|
||||||
|
may provide additional or different license terms and conditions
|
||||||
|
for use, reproduction, or distribution of Your modifications, or
|
||||||
|
for any such Derivative Works as a whole, provided Your use,
|
||||||
|
reproduction, and distribution of the Work otherwise complies with
|
||||||
|
the conditions stated in this License.
|
||||||
|
|
||||||
|
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||||
|
any Contribution intentionally submitted for inclusion in the Work
|
||||||
|
by You to the Licensor shall be under the terms and conditions of
|
||||||
|
this License, without any additional terms or conditions.
|
||||||
|
Notwithstanding the above, nothing herein shall supersede or modify
|
||||||
|
the terms of any separate license agreement you may have executed
|
||||||
|
with Licensor regarding such Contributions.
|
||||||
|
|
||||||
|
6. Trademarks. This License does not grant permission to use the trade
|
||||||
|
names, trademarks, service marks, or product names of the Licensor,
|
||||||
|
except as required for reasonable and customary use in describing the
|
||||||
|
origin of the Work and reproducing the content of the NOTICE file.
|
||||||
|
|
||||||
|
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||||
|
agreed to in writing, Licensor provides the Work (and each
|
||||||
|
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||||
|
implied, including, without limitation, any warranties or conditions
|
||||||
|
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||||
|
appropriateness of using or redistributing the Work and assume any
|
||||||
|
risks associated with Your exercise of permissions under this License.
|
||||||
|
|
||||||
|
8. Limitation of Liability. In no event and under no legal theory,
|
||||||
|
whether in tort (including negligence), contract, or otherwise,
|
||||||
|
unless required by applicable law (such as deliberate and grossly
|
||||||
|
negligent acts) or agreed to in writing, shall any Contributor be
|
||||||
|
liable to You for damages, including any direct, indirect, special,
|
||||||
|
incidental, or consequential damages of any character arising as a
|
||||||
|
result of this License or out of the use or inability to use the
|
||||||
|
Work (including but not limited to damages for loss of goodwill,
|
||||||
|
work stoppage, computer failure or malfunction, or any and all
|
||||||
|
other commercial damages or losses), even if such Contributor
|
||||||
|
has been advised of the possibility of such damages.
|
||||||
|
|
||||||
|
9. Accepting Warranty or Additional Liability. While redistributing
|
||||||
|
the Work or Derivative Works thereof, You may choose to offer,
|
||||||
|
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||||
|
or other liability obligations and/or rights consistent with this
|
||||||
|
License. However, in accepting such obligations, You may act only
|
||||||
|
on Your own behalf and on Your sole responsibility, not on behalf
|
||||||
|
of any other Contributor, and only if You agree to indemnify,
|
||||||
|
defend, and hold each Contributor harmless for any liability
|
||||||
|
incurred by, or claims asserted against, such Contributor by reason
|
||||||
|
of your accepting any such warranty or additional liability.
|
||||||
|
|
||||||
|
END OF TERMS AND CONDITIONS
|
||||||
|
|
||||||
|
APPENDIX: How to apply the Apache License to your work.
|
||||||
|
|
||||||
|
To apply the Apache License to your work, attach the following
|
||||||
|
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||||
|
replaced with your own identifying information. (Don't include
|
||||||
|
the brackets!) The text should be enclosed in the appropriate
|
||||||
|
comment syntax for the file format. We also recommend that a
|
||||||
|
file or class name and description of purpose be included on the
|
||||||
|
same "printed page" as the copyright notice for easier
|
||||||
|
identification within third-party archives.
|
||||||
|
|
||||||
|
Copyright [yyyy] [name of copyright owner]
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was
|
||||||
|
derived from unicode conversion examples available at
|
||||||
|
http://www.unicode.org/Public/PROGRAMS/CVTUTF. Here is the copyright
|
||||||
|
from those sources:
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright 2001-2004 Unicode, Inc.
|
||||||
|
*
|
||||||
|
* Disclaimer
|
||||||
|
*
|
||||||
|
* This source code is provided as is by Unicode, Inc. No claims are
|
||||||
|
* made as to fitness for any particular purpose. No warranties of any
|
||||||
|
* kind are expressed or implied. The recipient agrees to determine
|
||||||
|
* applicability of information provided. If this file has been
|
||||||
|
* purchased on magnetic or optical media from Unicode, Inc., the
|
||||||
|
* sole remedy for any claim will be exchange of defective media
|
||||||
|
* within 90 days of receipt.
|
||||||
|
*
|
||||||
|
* Limitations on Rights to Redistribute This Code
|
||||||
|
*
|
||||||
|
* Unicode, Inc. hereby grants the right to freely use the information
|
||||||
|
* supplied in this file in the creation of products supporting the
|
||||||
|
* Unicode Standard, and to make copies of this file in any form
|
||||||
|
* for internal or external distribution as long as this notice
|
||||||
|
* remains attached.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
Some code in core/src/java/org/apache/lucene/util/ArrayUtil.java was
|
||||||
|
derived from Python 2.4.2 sources available at
|
||||||
|
http://www.python.org. Full license is here:
|
||||||
|
|
||||||
|
http://www.python.org/download/releases/2.4.2/license/
|
||||||
|
|
||||||
|
Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was
|
||||||
|
derived from Python 3.1.2 sources available at
|
||||||
|
http://www.python.org. Full license is here:
|
||||||
|
|
||||||
|
http://www.python.org/download/releases/3.1.2/license/
|
||||||
|
|
||||||
|
Some code in core/src/java/org/apache/lucene/util/automaton was
|
||||||
|
derived from Brics automaton sources available at
|
||||||
|
www.brics.dk/automaton/. Here is the copyright from those sources:
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2001-2009 Anders Moeller
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
* 3. The name of the author may not be used to endorse or promote products
|
||||||
|
* derived from this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
||||||
|
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||||
|
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||||
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
||||||
|
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||||
|
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
The levenshtein automata tables in core/src/java/org/apache/lucene/util/automaton
|
||||||
|
were automatically generated with the moman/finenight FSA package.
|
||||||
|
Here is the copyright for those sources:
|
||||||
|
|
||||||
|
# Copyright (c) 2010, Jean-Philippe Barrette-LaPierre, <jpb@rrette.com>
|
||||||
|
#
|
||||||
|
# Permission is hereby granted, free of charge, to any person
|
||||||
|
# obtaining a copy of this software and associated documentation
|
||||||
|
# files (the "Software"), to deal in the Software without
|
||||||
|
# restriction, including without limitation the rights to use,
|
||||||
|
# copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
# copies of the Software, and to permit persons to whom the
|
||||||
|
# Software is furnished to do so, subject to the following
|
||||||
|
# conditions:
|
||||||
|
#
|
||||||
|
# The above copyright notice and this permission notice shall be
|
||||||
|
# included in all copies or substantial portions of the Software.
|
||||||
|
#
|
||||||
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
# OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was
|
||||||
|
derived from ICU (http://www.icu-project.org)
|
||||||
|
The full license is available here:
|
||||||
|
http://source.icu-project.org/repos/icu/icu/trunk/license.html
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (C) 1999-2010, International Business Machines
|
||||||
|
* Corporation and others. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
* of this software and associated documentation files (the "Software"), to deal
|
||||||
|
* in the Software without restriction, including without limitation the rights
|
||||||
|
* to use, copy, modify, merge, publish, distribute, and/or sell copies of the
|
||||||
|
* Software, and to permit persons to whom the Software is furnished to do so,
|
||||||
|
* provided that the above copyright notice(s) and this permission notice appear
|
||||||
|
* in all copies of the Software and that both the above copyright notice(s) and
|
||||||
|
* this permission notice appear in supporting documentation.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
|
||||||
|
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
|
||||||
|
* LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
|
||||||
|
* ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
|
||||||
|
* IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
|
||||||
|
* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||||
|
*
|
||||||
|
* Except as contained in this notice, the name of a copyright holder shall not
|
||||||
|
* be used in advertising or otherwise to promote the sale, use or other
|
||||||
|
* dealings in this Software without prior written authorization of the
|
||||||
|
* copyright holder.
|
||||||
|
*/
|
||||||
|
|
||||||
|
The following license applies to the Snowball stemmers:
|
||||||
|
|
||||||
|
Copyright (c) 2001, Dr Martin Porter
|
||||||
|
Copyright (c) 2002, Richard Boulton
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
* Neither the name of the copyright holders nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
|
||||||
|
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
The following license applies to the KStemmer:
|
||||||
|
|
||||||
|
Copyright © 2003,
|
||||||
|
Center for Intelligent Information Retrieval,
|
||||||
|
University of Massachusetts, Amherst.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without modification,
|
||||||
|
are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
1. Redistributions of source code must retain the above copyright notice, this
|
||||||
|
list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer in the documentation
|
||||||
|
and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
3. The names "Center for Intelligent Information Retrieval" and
|
||||||
|
"University of Massachusetts" must not be used to endorse or promote products
|
||||||
|
derived from this software without prior written permission. To obtain
|
||||||
|
permission, contact info@ciir.cs.umass.edu.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||||
|
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||||
|
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||||
|
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||||
|
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
SUCH DAMAGE.
|
||||||
|
|
||||||
|
The following license applies to the Morfologik project:
|
||||||
|
|
||||||
|
Copyright (c) 2006 Dawid Weiss
|
||||||
|
Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without modification,
|
||||||
|
are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer in the documentation
|
||||||
|
and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Morfologik nor the names of its contributors
|
||||||
|
may be used to endorse or promote products derived from this software
|
||||||
|
without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
|
||||||
|
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||||
|
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
The dictionary comes from Morfologik project. Morfologik uses data from
|
||||||
|
Polish ispell/myspell dictionary hosted at http://www.sjp.pl/slownik/en/ and
|
||||||
|
is licenced on the terms of (inter alia) LGPL and Creative Commons
|
||||||
|
ShareAlike. The part-of-speech tags were added in Morfologik project and
|
||||||
|
are not found in the data from sjp.pl. The tagset is similar to IPI PAN
|
||||||
|
tagset.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
The following license applies to the Morfeusz project,
|
||||||
|
used by org.apache.lucene.analysis.morfologik.
|
||||||
|
|
||||||
|
BSD-licensed dictionary of Polish (SGJP)
|
||||||
|
http://sgjp.pl/morfeusz/
|
||||||
|
|
||||||
|
Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński,
|
||||||
|
Marcin Woliński, Robert Wołosz
|
||||||
|
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
|
||||||
|
OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||||
|
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||||
|
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||||
|
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||||
|
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
192
qa/vector/licenses/lucene-NOTICE.txt
Normal file
192
qa/vector/licenses/lucene-NOTICE.txt
Normal file
|
@ -0,0 +1,192 @@
|
||||||
|
Apache Lucene
|
||||||
|
Copyright 2014 The Apache Software Foundation
|
||||||
|
|
||||||
|
This product includes software developed at
|
||||||
|
The Apache Software Foundation (http://www.apache.org/).
|
||||||
|
|
||||||
|
Includes software from other Apache Software Foundation projects,
|
||||||
|
including, but not limited to:
|
||||||
|
- Apache Ant
|
||||||
|
- Apache Jakarta Regexp
|
||||||
|
- Apache Commons
|
||||||
|
- Apache Xerces
|
||||||
|
|
||||||
|
ICU4J, (under analysis/icu) is licensed under an MIT styles license
|
||||||
|
and Copyright (c) 1995-2008 International Business Machines Corporation and others
|
||||||
|
|
||||||
|
Some data files (under analysis/icu/src/data) are derived from Unicode data such
|
||||||
|
as the Unicode Character Database. See http://unicode.org/copyright.html for more
|
||||||
|
details.
|
||||||
|
|
||||||
|
Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is
|
||||||
|
BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/
|
||||||
|
|
||||||
|
The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were
|
||||||
|
automatically generated with the moman/finenight FSA library, created by
|
||||||
|
Jean-Philippe Barrette-LaPierre. This library is available under an MIT license,
|
||||||
|
see http://sites.google.com/site/rrettesite/moman and
|
||||||
|
http://bitbucket.org/jpbarrette/moman/overview/
|
||||||
|
|
||||||
|
The class org.apache.lucene.util.WeakIdentityMap was derived from
|
||||||
|
the Apache CXF project and is Apache License 2.0.
|
||||||
|
|
||||||
|
The Google Code Prettify is Apache License 2.0.
|
||||||
|
See http://code.google.com/p/google-code-prettify/
|
||||||
|
|
||||||
|
JUnit (junit-4.10) is licensed under the Common Public License v. 1.0
|
||||||
|
See http://junit.sourceforge.net/cpl-v10.html
|
||||||
|
|
||||||
|
This product includes code (JaspellTernarySearchTrie) from Java Spelling Checkin
|
||||||
|
g Package (jaspell): http://jaspell.sourceforge.net/
|
||||||
|
License: The BSD License (http://www.opensource.org/licenses/bsd-license.php)
|
||||||
|
|
||||||
|
The snowball stemmers in
|
||||||
|
analysis/common/src/java/net/sf/snowball
|
||||||
|
were developed by Martin Porter and Richard Boulton.
|
||||||
|
The snowball stopword lists in
|
||||||
|
analysis/common/src/resources/org/apache/lucene/analysis/snowball
|
||||||
|
were developed by Martin Porter and Richard Boulton.
|
||||||
|
The full snowball package is available from
|
||||||
|
http://snowball.tartarus.org/
|
||||||
|
|
||||||
|
The KStem stemmer in
|
||||||
|
analysis/common/src/org/apache/lucene/analysis/en
|
||||||
|
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
||||||
|
under the BSD-license.
|
||||||
|
|
||||||
|
The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
|
||||||
|
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
||||||
|
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
||||||
|
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
||||||
|
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
||||||
|
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
||||||
|
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
|
||||||
|
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
|
||||||
|
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
|
||||||
|
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
||||||
|
(common) are based on BSD-licensed reference implementations created by Jacques Savoy and
|
||||||
|
Ljiljana Dolamic. These files reside in:
|
||||||
|
analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java
|
||||||
|
analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java
|
||||||
|
analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java
|
||||||
|
analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java
|
||||||
|
analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java
|
||||||
|
analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java
|
||||||
|
analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java
|
||||||
|
analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java
|
||||||
|
analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java
|
||||||
|
analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java
|
||||||
|
analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java
|
||||||
|
|
||||||
|
The Stempel analyzer (stempel) includes BSD-licensed software developed
|
||||||
|
by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil,
|
||||||
|
and Edmond Nolan.
|
||||||
|
|
||||||
|
The Polish analyzer (stempel) comes with a default
|
||||||
|
stopword list that is BSD-licensed created by the Carrot2 project. The file resides
|
||||||
|
in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt.
|
||||||
|
See http://project.carrot2.org/license.html.
|
||||||
|
|
||||||
|
The SmartChineseAnalyzer source code (smartcn) was
|
||||||
|
provided by Xiaoping Gao and copyright 2009 by www.imdict.net.
|
||||||
|
|
||||||
|
WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/)
|
||||||
|
is derived from Unicode data such as the Unicode Character Database.
|
||||||
|
See http://unicode.org/copyright.html for more details.
|
||||||
|
|
||||||
|
The Morfologik analyzer (morfologik) includes BSD-licensed software
|
||||||
|
developed by Dawid Weiss and Marcin Miłkowski (http://morfologik.blogspot.com/).
|
||||||
|
|
||||||
|
Morfologik uses data from Polish ispell/myspell dictionary
|
||||||
|
(http://www.sjp.pl/slownik/en/) licenced on the terms of (inter alia)
|
||||||
|
LGPL and Creative Commons ShareAlike.
|
||||||
|
|
||||||
|
Morfologic includes data from BSD-licensed dictionary of Polish (SGJP)
|
||||||
|
(http://sgjp.pl/morfeusz/)
|
||||||
|
|
||||||
|
Servlet-api.jar and javax.servlet-*.jar are under the CDDL license, the original
|
||||||
|
source code for this can be found at http://www.eclipse.org/jetty/downloads.php
|
||||||
|
|
||||||
|
===========================================================================
|
||||||
|
Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration
|
||||||
|
===========================================================================
|
||||||
|
|
||||||
|
This software includes a binary and/or source version of data from
|
||||||
|
|
||||||
|
mecab-ipadic-2.7.0-20070801
|
||||||
|
|
||||||
|
which can be obtained from
|
||||||
|
|
||||||
|
http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
|
http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz
|
||||||
|
|
||||||
|
===========================================================================
|
||||||
|
mecab-ipadic-2.7.0-20070801 Notice
|
||||||
|
===========================================================================
|
||||||
|
|
||||||
|
Nara Institute of Science and Technology (NAIST),
|
||||||
|
the copyright holders, disclaims all warranties with regard to this
|
||||||
|
software, including all implied warranties of merchantability and
|
||||||
|
fitness, in no event shall NAIST be liable for
|
||||||
|
any special, indirect or consequential damages or any damages
|
||||||
|
whatsoever resulting from loss of use, data or profits, whether in an
|
||||||
|
action of contract, negligence or other tortuous action, arising out
|
||||||
|
of or in connection with the use or performance of this software.
|
||||||
|
|
||||||
|
A large portion of the dictionary entries
|
||||||
|
originate from ICOT Free Software. The following conditions for ICOT
|
||||||
|
Free Software applies to the current dictionary as well.
|
||||||
|
|
||||||
|
Each User may also freely distribute the Program, whether in its
|
||||||
|
original form or modified, to any third party or parties, PROVIDED
|
||||||
|
that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
|
||||||
|
on, or be attached to, the Program, which is distributed substantially
|
||||||
|
in the same form as set out herein and that such intended
|
||||||
|
distribution, if actually made, will neither violate or otherwise
|
||||||
|
contravene any of the laws and regulations of the countries having
|
||||||
|
jurisdiction over the User or the intended distribution itself.
|
||||||
|
|
||||||
|
NO WARRANTY
|
||||||
|
|
||||||
|
The program was produced on an experimental basis in the course of the
|
||||||
|
research and development conducted during the project and is provided
|
||||||
|
to users as so produced on an experimental basis. Accordingly, the
|
||||||
|
program is provided without any warranty whatsoever, whether express,
|
||||||
|
implied, statutory or otherwise. The term "warranty" used herein
|
||||||
|
includes, but is not limited to, any warranty of the quality,
|
||||||
|
performance, merchantability and fitness for a particular purpose of
|
||||||
|
the program and the nonexistence of any infringement or violation of
|
||||||
|
any right of any third party.
|
||||||
|
|
||||||
|
Each user of the program will agree and understand, and be deemed to
|
||||||
|
have agreed and understood, that there is no warranty whatsoever for
|
||||||
|
the program and, accordingly, the entire risk arising from or
|
||||||
|
otherwise connected with the program is assumed by the user.
|
||||||
|
|
||||||
|
Therefore, neither ICOT, the copyright holder, or any other
|
||||||
|
organization that participated in or was otherwise related to the
|
||||||
|
development of the program and their respective officials, directors,
|
||||||
|
officers and other employees shall be held liable for any and all
|
||||||
|
damages, including, without limitation, general, special, incidental
|
||||||
|
and consequential damages, arising out of or otherwise in connection
|
||||||
|
with the use or inability to use the program or any product, material
|
||||||
|
or result produced or otherwise obtained by using the program,
|
||||||
|
regardless of whether they have been advised of, or otherwise had
|
||||||
|
knowledge of, the possibility of such damages at any time during the
|
||||||
|
project or thereafter. Each user will be deemed to have agreed to the
|
||||||
|
foregoing by his or her commencement of use of the program. The term
|
||||||
|
"use" as used herein includes, but is not limited to, the use,
|
||||||
|
modification, copying and distribution of the program and the
|
||||||
|
production of secondary products from the program.
|
||||||
|
|
||||||
|
In the case where the program, whether in its original form or
|
||||||
|
modified, was distributed or delivered to or received by a user from
|
||||||
|
any person, organization or entity other than ICOT, unless it makes or
|
||||||
|
grants independently of ICOT any specific warranty to the user in
|
||||||
|
writing, such person, organization or entity, will also be exempted
|
||||||
|
from and not be held liable to the user for any such damages as noted
|
||||||
|
above as far as the program is concerned.
|
20
qa/vector/src/main/java/module-info.java
Normal file
20
qa/vector/src/main/java/module-info.java
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
/*
|
||||||
|
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||||
|
* or more contributor license agreements. Licensed under the "Elastic License
|
||||||
|
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
|
||||||
|
* Public License v 1"; you may not use this file except in compliance with, at
|
||||||
|
* your election, the "Elastic License 2.0", the "GNU Affero General Public
|
||||||
|
* License v3.0 only", or the "Server Side Public License, v 1".
|
||||||
|
*/
|
||||||
|
|
||||||
|
module org.elasticsearch.test.knn {
|
||||||
|
requires org.elasticsearch.base;
|
||||||
|
requires org.elasticsearch.server;
|
||||||
|
requires org.elasticsearch.xcontent;
|
||||||
|
requires org.apache.lucene.core;
|
||||||
|
requires org.apache.lucene.codecs;
|
||||||
|
requires org.apache.lucene.queries;
|
||||||
|
requires org.elasticsearch.logging;
|
||||||
|
requires java.management;
|
||||||
|
requires jdk.management;
|
||||||
|
}
|
|
@ -0,0 +1,292 @@
|
||||||
|
/*
|
||||||
|
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||||
|
* or more contributor license agreements. Licensed under the "Elastic License
|
||||||
|
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
|
||||||
|
* Public License v 1"; you may not use this file except in compliance with, at
|
||||||
|
* your election, the "Elastic License 2.0", the "GNU Affero General Public
|
||||||
|
* License v3.0 only", or the "Server Side Public License, v 1".
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.test.knn;
|
||||||
|
|
||||||
|
import org.apache.lucene.index.VectorEncoding;
|
||||||
|
import org.apache.lucene.index.VectorSimilarityFunction;
|
||||||
|
import org.elasticsearch.common.Strings;
|
||||||
|
import org.elasticsearch.core.PathUtils;
|
||||||
|
import org.elasticsearch.xcontent.ObjectParser;
|
||||||
|
import org.elasticsearch.xcontent.ParseField;
|
||||||
|
import org.elasticsearch.xcontent.ToXContentObject;
|
||||||
|
import org.elasticsearch.xcontent.XContentBuilder;
|
||||||
|
import org.elasticsearch.xcontent.XContentParser;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Command line arguments for the KNN index tester.
|
||||||
|
* This class encapsulates all the parameters required to run the KNN index tests.
|
||||||
|
*/
|
||||||
|
record CmdLineArgs(
|
||||||
|
Path docVectors,
|
||||||
|
Path queryVectors,
|
||||||
|
int numDocs,
|
||||||
|
int numQueries,
|
||||||
|
KnnIndexTester.IndexType indexType,
|
||||||
|
int numCandidates,
|
||||||
|
int k,
|
||||||
|
int nProbe,
|
||||||
|
int ivfClusterSize,
|
||||||
|
int overSamplingFactor,
|
||||||
|
int hnswM,
|
||||||
|
int hnswEfConstruction,
|
||||||
|
int searchThreads,
|
||||||
|
int indexThreads,
|
||||||
|
boolean reindex,
|
||||||
|
boolean forceMerge,
|
||||||
|
VectorSimilarityFunction vectorSpace,
|
||||||
|
int quantizeBits,
|
||||||
|
VectorEncoding vectorEncoding,
|
||||||
|
int dimensions
|
||||||
|
) implements ToXContentObject {
|
||||||
|
|
||||||
|
static final ParseField DOC_VECTORS_FIELD = new ParseField("doc_vectors");
|
||||||
|
static final ParseField QUERY_VECTORS_FIELD = new ParseField("query_vectors");
|
||||||
|
static final ParseField NUM_DOCS_FIELD = new ParseField("num_docs");
|
||||||
|
static final ParseField NUM_QUERIES_FIELD = new ParseField("num_queries");
|
||||||
|
static final ParseField INDEX_TYPE_FIELD = new ParseField("index_type");
|
||||||
|
static final ParseField NUM_CANDIDATES_FIELD = new ParseField("num_candidates");
|
||||||
|
static final ParseField K_FIELD = new ParseField("k");
|
||||||
|
static final ParseField N_PROBE_FIELD = new ParseField("n_probe");
|
||||||
|
static final ParseField IVF_CLUSTER_SIZE_FIELD = new ParseField("ivf_cluster_size");
|
||||||
|
static final ParseField OVER_SAMPLING_FACTOR_FIELD = new ParseField("over_sampling_factor");
|
||||||
|
static final ParseField HNSW_M_FIELD = new ParseField("hnsw_m");
|
||||||
|
static final ParseField HNSW_EF_CONSTRUCTION_FIELD = new ParseField("hnsw_ef_construction");
|
||||||
|
static final ParseField SEARCH_THREADS_FIELD = new ParseField("search_threads");
|
||||||
|
static final ParseField INDEX_THREADS_FIELD = new ParseField("index_threads");
|
||||||
|
static final ParseField REINDEX_FIELD = new ParseField("reindex");
|
||||||
|
static final ParseField FORCE_MERGE_FIELD = new ParseField("force_merge");
|
||||||
|
static final ParseField VECTOR_SPACE_FIELD = new ParseField("vector_space");
|
||||||
|
static final ParseField QUANTIZE_BITS_FIELD = new ParseField("quantize_bits");
|
||||||
|
static final ParseField VECTOR_ENCODING_FIELD = new ParseField("vector_encoding");
|
||||||
|
static final ParseField DIMENSIONS_FIELD = new ParseField("dimensions");
|
||||||
|
|
||||||
|
static CmdLineArgs fromXContent(XContentParser parser) throws IOException {
|
||||||
|
Builder builder = PARSER.apply(parser, null);
|
||||||
|
return builder.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
static final ObjectParser<CmdLineArgs.Builder, Void> PARSER = new ObjectParser<>("cmd_line_args", true, Builder::new);
|
||||||
|
|
||||||
|
static {
|
||||||
|
PARSER.declareString(Builder::setDocVectors, DOC_VECTORS_FIELD);
|
||||||
|
PARSER.declareString(Builder::setQueryVectors, QUERY_VECTORS_FIELD);
|
||||||
|
PARSER.declareInt(Builder::setNumDocs, NUM_DOCS_FIELD);
|
||||||
|
PARSER.declareInt(Builder::setNumQueries, NUM_QUERIES_FIELD);
|
||||||
|
PARSER.declareString(Builder::setIndexType, INDEX_TYPE_FIELD);
|
||||||
|
PARSER.declareInt(Builder::setNumCandidates, NUM_CANDIDATES_FIELD);
|
||||||
|
PARSER.declareInt(Builder::setK, K_FIELD);
|
||||||
|
PARSER.declareInt(Builder::setNProbe, N_PROBE_FIELD);
|
||||||
|
PARSER.declareInt(Builder::setIvfClusterSize, IVF_CLUSTER_SIZE_FIELD);
|
||||||
|
PARSER.declareInt(Builder::setOverSamplingFactor, OVER_SAMPLING_FACTOR_FIELD);
|
||||||
|
PARSER.declareInt(Builder::setHnswM, HNSW_M_FIELD);
|
||||||
|
PARSER.declareInt(Builder::setHnswEfConstruction, HNSW_EF_CONSTRUCTION_FIELD);
|
||||||
|
PARSER.declareInt(Builder::setSearchThreads, SEARCH_THREADS_FIELD);
|
||||||
|
PARSER.declareInt(Builder::setIndexThreads, INDEX_THREADS_FIELD);
|
||||||
|
PARSER.declareBoolean(Builder::setReindex, REINDEX_FIELD);
|
||||||
|
PARSER.declareBoolean(Builder::setForceMerge, FORCE_MERGE_FIELD);
|
||||||
|
PARSER.declareString(Builder::setVectorSpace, VECTOR_SPACE_FIELD);
|
||||||
|
PARSER.declareInt(Builder::setQuantizeBits, QUANTIZE_BITS_FIELD);
|
||||||
|
PARSER.declareString(Builder::setVectorEncoding, VECTOR_ENCODING_FIELD);
|
||||||
|
PARSER.declareInt(Builder::setDimensions, DIMENSIONS_FIELD);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
|
||||||
|
builder.startObject();
|
||||||
|
if (docVectors != null) {
|
||||||
|
builder.field(DOC_VECTORS_FIELD.getPreferredName(), docVectors.toString());
|
||||||
|
}
|
||||||
|
if (queryVectors != null) {
|
||||||
|
builder.field(QUERY_VECTORS_FIELD.getPreferredName(), queryVectors.toString());
|
||||||
|
}
|
||||||
|
builder.field(NUM_DOCS_FIELD.getPreferredName(), numDocs);
|
||||||
|
builder.field(NUM_QUERIES_FIELD.getPreferredName(), numQueries);
|
||||||
|
builder.field(INDEX_TYPE_FIELD.getPreferredName(), indexType.name().toLowerCase(Locale.ROOT));
|
||||||
|
builder.field(NUM_CANDIDATES_FIELD.getPreferredName(), numCandidates);
|
||||||
|
builder.field(K_FIELD.getPreferredName(), k);
|
||||||
|
builder.field(N_PROBE_FIELD.getPreferredName(), nProbe);
|
||||||
|
builder.field(IVF_CLUSTER_SIZE_FIELD.getPreferredName(), ivfClusterSize);
|
||||||
|
builder.field(OVER_SAMPLING_FACTOR_FIELD.getPreferredName(), overSamplingFactor);
|
||||||
|
builder.field(HNSW_M_FIELD.getPreferredName(), hnswM);
|
||||||
|
builder.field(HNSW_EF_CONSTRUCTION_FIELD.getPreferredName(), hnswEfConstruction);
|
||||||
|
builder.field(SEARCH_THREADS_FIELD.getPreferredName(), searchThreads);
|
||||||
|
builder.field(INDEX_THREADS_FIELD.getPreferredName(), indexThreads);
|
||||||
|
builder.field(REINDEX_FIELD.getPreferredName(), reindex);
|
||||||
|
builder.field(FORCE_MERGE_FIELD.getPreferredName(), forceMerge);
|
||||||
|
builder.field(VECTOR_SPACE_FIELD.getPreferredName(), vectorSpace.name().toLowerCase(Locale.ROOT));
|
||||||
|
builder.field(QUANTIZE_BITS_FIELD.getPreferredName(), quantizeBits);
|
||||||
|
builder.field(VECTOR_ENCODING_FIELD.getPreferredName(), vectorEncoding.name().toLowerCase(Locale.ROOT));
|
||||||
|
builder.field(DIMENSIONS_FIELD.getPreferredName(), dimensions);
|
||||||
|
return builder.endObject();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return Strings.toString(this, false, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
static class Builder {
|
||||||
|
private Path docVectors;
|
||||||
|
private Path queryVectors;
|
||||||
|
private int numDocs = 1000;
|
||||||
|
private int numQueries = 100;
|
||||||
|
private KnnIndexTester.IndexType indexType = KnnIndexTester.IndexType.HNSW;
|
||||||
|
private int numCandidates = 1000;
|
||||||
|
private int k = 10;
|
||||||
|
private int nProbe = 10;
|
||||||
|
private int ivfClusterSize = 1000;
|
||||||
|
private int overSamplingFactor = 1;
|
||||||
|
private int hnswM = 16;
|
||||||
|
private int hnswEfConstruction = 200;
|
||||||
|
private int searchThreads = 1;
|
||||||
|
private int indexThreads = 1;
|
||||||
|
private boolean reindex = false;
|
||||||
|
private boolean forceMerge = false;
|
||||||
|
private VectorSimilarityFunction vectorSpace = VectorSimilarityFunction.EUCLIDEAN;
|
||||||
|
private int quantizeBits = 8;
|
||||||
|
private VectorEncoding vectorEncoding = VectorEncoding.FLOAT32;
|
||||||
|
private int dimensions;
|
||||||
|
|
||||||
|
public Builder setDocVectors(String docVectors) {
|
||||||
|
this.docVectors = PathUtils.get(docVectors);
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Builder setQueryVectors(String queryVectors) {
|
||||||
|
this.queryVectors = PathUtils.get(queryVectors);
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Builder setNumDocs(int numDocs) {
|
||||||
|
this.numDocs = numDocs;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Builder setNumQueries(int numQueries) {
|
||||||
|
this.numQueries = numQueries;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Builder setIndexType(String indexType) {
|
||||||
|
this.indexType = KnnIndexTester.IndexType.valueOf(indexType.toUpperCase(Locale.ROOT));
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Builder setNumCandidates(int numCandidates) {
|
||||||
|
this.numCandidates = numCandidates;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Builder setK(int k) {
|
||||||
|
this.k = k;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Builder setNProbe(int nProbe) {
|
||||||
|
this.nProbe = nProbe;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Builder setIvfClusterSize(int ivfClusterSize) {
|
||||||
|
this.ivfClusterSize = ivfClusterSize;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Builder setOverSamplingFactor(int overSamplingFactor) {
|
||||||
|
this.overSamplingFactor = overSamplingFactor;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Builder setHnswM(int hnswM) {
|
||||||
|
this.hnswM = hnswM;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Builder setHnswEfConstruction(int hnswEfConstruction) {
|
||||||
|
this.hnswEfConstruction = hnswEfConstruction;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Builder setSearchThreads(int searchThreads) {
|
||||||
|
this.searchThreads = searchThreads;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Builder setIndexThreads(int indexThreads) {
|
||||||
|
this.indexThreads = indexThreads;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Builder setReindex(boolean reindex) {
|
||||||
|
this.reindex = reindex;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Builder setForceMerge(boolean forceMerge) {
|
||||||
|
this.forceMerge = forceMerge;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Builder setVectorSpace(String vectorSpace) {
|
||||||
|
this.vectorSpace = VectorSimilarityFunction.valueOf(vectorSpace.toUpperCase(Locale.ROOT));
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Builder setQuantizeBits(int quantizeBits) {
|
||||||
|
this.quantizeBits = quantizeBits;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Builder setVectorEncoding(String vectorEncoding) {
|
||||||
|
this.vectorEncoding = VectorEncoding.valueOf(vectorEncoding.toUpperCase(Locale.ROOT));
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Builder setDimensions(int dimensions) {
|
||||||
|
this.dimensions = dimensions;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public CmdLineArgs build() {
|
||||||
|
if (docVectors == null) {
|
||||||
|
throw new IllegalArgumentException("Document vectors path must be provided");
|
||||||
|
}
|
||||||
|
if (dimensions <= 0) {
|
||||||
|
throw new IllegalArgumentException("dimensions must be a positive integer");
|
||||||
|
}
|
||||||
|
return new CmdLineArgs(
|
||||||
|
docVectors,
|
||||||
|
queryVectors,
|
||||||
|
numDocs,
|
||||||
|
numQueries,
|
||||||
|
indexType,
|
||||||
|
numCandidates,
|
||||||
|
k,
|
||||||
|
nProbe,
|
||||||
|
ivfClusterSize,
|
||||||
|
overSamplingFactor,
|
||||||
|
hnswM,
|
||||||
|
hnswEfConstruction,
|
||||||
|
searchThreads,
|
||||||
|
indexThreads,
|
||||||
|
reindex,
|
||||||
|
forceMerge,
|
||||||
|
vectorSpace,
|
||||||
|
quantizeBits,
|
||||||
|
vectorEncoding,
|
||||||
|
dimensions
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,399 @@
|
||||||
|
/*
|
||||||
|
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||||
|
* or more contributor license agreements. Licensed under the "Elastic License
|
||||||
|
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
|
||||||
|
* Public License v 1"; you may not use this file except in compliance with, at
|
||||||
|
* your election, the "Elastic License 2.0", the "GNU Affero General Public
|
||||||
|
* License v3.0 only", or the "Server Side Public License, v 1".
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.test.knn;
|
||||||
|
|
||||||
|
import com.sun.management.ThreadMXBean;
|
||||||
|
|
||||||
|
import org.apache.lucene.codecs.Codec;
|
||||||
|
import org.apache.lucene.codecs.KnnVectorsFormat;
|
||||||
|
import org.apache.lucene.codecs.lucene101.Lucene101Codec;
|
||||||
|
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
|
||||||
|
import org.elasticsearch.common.Strings;
|
||||||
|
import org.elasticsearch.common.logging.LogConfigurator;
|
||||||
|
import org.elasticsearch.core.PathUtils;
|
||||||
|
import org.elasticsearch.index.codec.vectors.ES813Int8FlatVectorFormat;
|
||||||
|
import org.elasticsearch.index.codec.vectors.ES814HnswScalarQuantizedVectorsFormat;
|
||||||
|
import org.elasticsearch.index.codec.vectors.IVFVectorsFormat;
|
||||||
|
import org.elasticsearch.index.codec.vectors.es818.ES818BinaryQuantizedVectorsFormat;
|
||||||
|
import org.elasticsearch.index.codec.vectors.es818.ES818HnswBinaryQuantizedVectorsFormat;
|
||||||
|
import org.elasticsearch.logging.Level;
|
||||||
|
import org.elasticsearch.xcontent.XContentParser;
|
||||||
|
import org.elasticsearch.xcontent.XContentParserConfiguration;
|
||||||
|
import org.elasticsearch.xcontent.XContentType;
|
||||||
|
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.lang.management.ThreadInfo;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A utility class to create and test KNN indices using Lucene.
|
||||||
|
* It supports various index types (HNSW, FLAT, IVF) and configurations.
|
||||||
|
*/
|
||||||
|
public class KnnIndexTester {
|
||||||
|
static final Level LOG_LEVEL = Level.DEBUG;
|
||||||
|
|
||||||
|
static final SysOutLogger logger = new SysOutLogger();
|
||||||
|
|
||||||
|
static {
|
||||||
|
LogConfigurator.loadLog4jPlugins();
|
||||||
|
LogConfigurator.configureESLogging(); // native access requires logging to be initialized
|
||||||
|
}
|
||||||
|
|
||||||
|
static final String INDEX_DIR = "target/knn_index";
|
||||||
|
|
||||||
|
enum IndexType {
|
||||||
|
HNSW,
|
||||||
|
FLAT,
|
||||||
|
IVF
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String formatIndexPath(CmdLineArgs args) {
|
||||||
|
List<String> suffix = new ArrayList<>();
|
||||||
|
if (args.indexType() == IndexType.FLAT) {
|
||||||
|
suffix.add("flat");
|
||||||
|
} else if (args.indexType() == IndexType.IVF) {
|
||||||
|
suffix.add("ivf");
|
||||||
|
suffix.add(Integer.toString(args.ivfClusterSize()));
|
||||||
|
} else {
|
||||||
|
suffix.add(Integer.toString(args.hnswM()));
|
||||||
|
suffix.add(Integer.toString(args.hnswEfConstruction()));
|
||||||
|
if (args.quantizeBits() < 32) {
|
||||||
|
suffix.add(Integer.toString(args.quantizeBits()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return INDEX_DIR + "/" + args.docVectors().getFileName() + "-" + String.join("-", suffix) + ".index";
|
||||||
|
}
|
||||||
|
|
||||||
|
static Codec createCodec(CmdLineArgs args) {
|
||||||
|
final KnnVectorsFormat format;
|
||||||
|
if (args.indexType() == IndexType.IVF) {
|
||||||
|
format = new IVFVectorsFormat(args.ivfClusterSize());
|
||||||
|
} else {
|
||||||
|
if (args.quantizeBits() == 1) {
|
||||||
|
if (args.indexType() == IndexType.FLAT) {
|
||||||
|
format = new ES818BinaryQuantizedVectorsFormat();
|
||||||
|
} else {
|
||||||
|
format = new ES818HnswBinaryQuantizedVectorsFormat(args.hnswM(), args.hnswEfConstruction(), 1, null);
|
||||||
|
}
|
||||||
|
} else if (args.quantizeBits() < 32) {
|
||||||
|
if (args.indexType() == IndexType.FLAT) {
|
||||||
|
format = new ES813Int8FlatVectorFormat(null, args.quantizeBits(), true);
|
||||||
|
} else {
|
||||||
|
format = new ES814HnswScalarQuantizedVectorsFormat(
|
||||||
|
args.hnswM(),
|
||||||
|
args.hnswEfConstruction(),
|
||||||
|
null,
|
||||||
|
args.quantizeBits(),
|
||||||
|
true
|
||||||
|
);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
format = new Lucene99HnswVectorsFormat(args.hnswM(), args.hnswEfConstruction(), 1, null);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return new Lucene101Codec() {
|
||||||
|
@Override
|
||||||
|
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||||
|
return format;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
    /**
     * Entry point for the KNN index tester.
     * <p>
     * Expects exactly one argument: the path to a JSON config file containing either a single
     * configuration object or an array of them. For each configuration it (optionally) builds
     * and/or force-merges a Lucene vector index, (optionally) runs the query workload, and
     * finally prints all collected {@link Results} as a formatted table.
     *
     * @param args Command line arguments: a single JSON config file path, or {@code --help}/{@code -h}
     * @throws Exception If an error occurs during index creation or search
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 1 || args[0].equals("--help") || args[0].equals("-h")) {
            // Print usage plus an example configuration object so users can copy/paste a starting point.
            System.out.println("Usage: java -cp <your-classpath> org.elasticsearch.test.knn.KnnIndexTester <config-file>");
            System.out.println("Where <config-file> is a JSON file containing one or more configurations for the KNN index tester.");
            System.out.println("An example configuration object: ");
            System.out.println(
                Strings.toString(
                    new CmdLineArgs.Builder().setDimensions(64)
                        .setDocVectors("/doc/vectors/path")
                        .setQueryVectors("/query/vectors/path")
                        .build(),
                    true,
                    true
                )
            );
            return;
        }
        String jsonConfig = args[0];
        // Resolve and validate the config file path before attempting to parse it.
        Path jsonConfigPath = PathUtils.get(jsonConfig);
        if (Files.exists(jsonConfigPath) == false) {
            throw new IllegalArgumentException("JSON config file does not exist: " + jsonConfigPath);
        }
        // Parse the JSON config file into one CmdLineArgs per benchmark run.
        // A top-level object means a single run; a top-level array means several runs in sequence.
        List<CmdLineArgs> cmdLineArgsList = new ArrayList<>();
        try (
            InputStream jsonStream = Files.newInputStream(jsonConfigPath);
            XContentParser parser = XContentType.JSON.xContent().createParser(XContentParserConfiguration.EMPTY, jsonStream)
        ) {
            // Token order matters here: nextToken() is only advanced when no token has been read yet.
            if (parser.currentToken() == null && parser.nextToken() == XContentParser.Token.START_OBJECT) {
                cmdLineArgsList.add(CmdLineArgs.fromXContent(parser));
            } else if (parser.currentToken() == XContentParser.Token.START_ARRAY) {
                // Array form: parse each object until the closing bracket.
                while (parser.nextToken() != XContentParser.Token.END_ARRAY) {
                    cmdLineArgsList.add(CmdLineArgs.fromXContent(parser));
                }
            } else {
                throw new IllegalArgumentException("Invalid JSON format in config file: " + jsonConfigPath);
            }
        }
        FormattedResults formattedResults = new FormattedResults();
        for (CmdLineArgs cmdLineArgs : cmdLineArgsList) {
            Results result = new Results(cmdLineArgs.indexType().name().toLowerCase(Locale.ROOT), cmdLineArgs.numDocs());
            System.out.println("Running KNN index tester with arguments: " + cmdLineArgs);
            Codec codec = createCodec(cmdLineArgs);
            Path indexPath = PathUtils.get(formatIndexPath(cmdLineArgs));
            if (cmdLineArgs.reindex() || cmdLineArgs.forceMerge()) {
                KnnIndexer knnIndexer = new KnnIndexer(
                    cmdLineArgs.docVectors(),
                    indexPath,
                    codec,
                    cmdLineArgs.indexThreads(),
                    cmdLineArgs.vectorEncoding(),
                    cmdLineArgs.dimensions(),
                    cmdLineArgs.vectorSpace(),
                    cmdLineArgs.numDocs()
                );
                if (Files.exists(indexPath) == false) {
                    if (cmdLineArgs.reindex() == false) {
                        throw new IllegalArgumentException("Index path does not exist: " + indexPath);
                    }
                    // NOTE(review): this rejects force_merge on a missing index even when reindex=true,
                    // although createIndex below would have created it first — confirm this is intended.
                    if (cmdLineArgs.forceMerge()) {
                        throw new IllegalArgumentException("Force merging without an existing index in: " + indexPath);
                    }
                }
                if (cmdLineArgs.reindex()) {
                    knnIndexer.createIndex(result);
                }
                // Force-merging records its own timing and resulting segment count; otherwise
                // just record how many segments the (possibly pre-existing) index has.
                if (cmdLineArgs.forceMerge()) {
                    knnIndexer.forceMerge(result);
                } else {
                    knnIndexer.numSegments(result);
                }
            }
            // Query phase is optional: only runs when query vectors were configured.
            if (cmdLineArgs.queryVectors() != null) {
                KnnSearcher knnSearcher = new KnnSearcher(indexPath, cmdLineArgs);
                knnSearcher.runSearch(result);
            }
            formattedResults.results.add(result);
        }
        System.out.println("Results:");
        System.out.println(formattedResults);
    }
|
||||||
|
|
||||||
|
static class FormattedResults {
|
||||||
|
List<Results> results = new ArrayList<>();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
if (results.isEmpty()) {
|
||||||
|
return "No results available.";
|
||||||
|
}
|
||||||
|
|
||||||
|
// Define column headers
|
||||||
|
String[] headers = {
|
||||||
|
"index_type",
|
||||||
|
"num_docs",
|
||||||
|
"index_time(ms)",
|
||||||
|
"force_merge_time(ms)",
|
||||||
|
"num_segments",
|
||||||
|
"latency(ms)",
|
||||||
|
"net_cpu_time(ms)",
|
||||||
|
"avg_cpu_count",
|
||||||
|
"QPS",
|
||||||
|
"recall",
|
||||||
|
"visited" };
|
||||||
|
|
||||||
|
// Calculate appropriate column widths based on headers and data
|
||||||
|
int[] widths = calculateColumnWidths(headers);
|
||||||
|
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
|
||||||
|
// Format and append header
|
||||||
|
sb.append(formatRow(headers, widths));
|
||||||
|
sb.append("\n");
|
||||||
|
|
||||||
|
// Add separator line
|
||||||
|
for (int width : widths) {
|
||||||
|
sb.append("-".repeat(width)).append(" ");
|
||||||
|
}
|
||||||
|
sb.append("\n");
|
||||||
|
|
||||||
|
// Format and append each row of data
|
||||||
|
for (Results result : results) {
|
||||||
|
String[] rowData = {
|
||||||
|
result.indexType,
|
||||||
|
Integer.toString(result.numDocs),
|
||||||
|
Long.toString(result.indexTimeMS),
|
||||||
|
Long.toString(result.forceMergeTimeMS),
|
||||||
|
Integer.toString(result.numSegments),
|
||||||
|
String.format(Locale.ROOT, "%.2f", result.avgLatency),
|
||||||
|
String.format(Locale.ROOT, "%.2f", result.netCpuTimeMS),
|
||||||
|
String.format(Locale.ROOT, "%.2f", result.avgCpuCount),
|
||||||
|
String.format(Locale.ROOT, "%.2f", result.qps),
|
||||||
|
String.format(Locale.ROOT, "%.2f", result.avgRecall),
|
||||||
|
String.format(Locale.ROOT, "%.2f", result.averageVisited) };
|
||||||
|
sb.append(formatRow(rowData, widths));
|
||||||
|
sb.append("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper method to format a single row with proper column widths
|
||||||
|
private String formatRow(String[] values, int[] widths) {
|
||||||
|
StringBuilder row = new StringBuilder();
|
||||||
|
for (int i = 0; i < values.length; i++) {
|
||||||
|
// Left-align text column (index_type), right-align numeric columns
|
||||||
|
String format = (i == 0) ? "%-" + widths[i] + "s" : "%" + widths[i] + "s";
|
||||||
|
row.append(Strings.format(format, values[i]));
|
||||||
|
|
||||||
|
// Add separation between columns
|
||||||
|
if (i < values.length - 1) {
|
||||||
|
row.append(" ");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return row.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate appropriate column widths based on headers and data
|
||||||
|
private int[] calculateColumnWidths(String[] headers) {
|
||||||
|
int[] widths = new int[headers.length];
|
||||||
|
|
||||||
|
// Initialize widths with header lengths
|
||||||
|
for (int i = 0; i < headers.length; i++) {
|
||||||
|
widths[i] = headers[i].length();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update widths based on data
|
||||||
|
for (Results result : results) {
|
||||||
|
String[] values = {
|
||||||
|
result.indexType,
|
||||||
|
Integer.toString(result.numDocs),
|
||||||
|
Long.toString(result.indexTimeMS),
|
||||||
|
Long.toString(result.forceMergeTimeMS),
|
||||||
|
Integer.toString(result.numSegments),
|
||||||
|
String.format(Locale.ROOT, "%.2f", result.avgLatency),
|
||||||
|
String.format(Locale.ROOT, "%.2f", result.netCpuTimeMS),
|
||||||
|
String.format(Locale.ROOT, "%.2f", result.avgCpuCount),
|
||||||
|
String.format(Locale.ROOT, "%.2f", result.qps),
|
||||||
|
String.format(Locale.ROOT, "%.2f", result.avgRecall),
|
||||||
|
String.format(Locale.ROOT, "%.2f", result.averageVisited) };
|
||||||
|
|
||||||
|
for (int i = 0; i < values.length; i++) {
|
||||||
|
widths[i] = Math.max(widths[i], values[i].length());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return widths;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
    /**
     * Mutable holder for the metrics collected during one benchmark run; the indexing,
     * merging and searching phases each fill in the fields they measure.
     */
    static class Results {
        // Name of the index type under test (lower-cased IndexType enum name, e.g. "hnsw").
        final String indexType;
        // Number of documents configured for this run.
        final int numDocs;
        // Wall-clock time spent in createIndex, in milliseconds.
        long indexTimeMS;
        // Wall-clock time spent force-merging to one segment, in milliseconds.
        long forceMergeTimeMS;
        // Segment count of the index after indexing/merging.
        int numSegments;
        // Average per-query latency, in milliseconds.
        double avgLatency;
        // Queries per second over the measured search phase.
        double qps;
        // Average recall of the returned neighbors against the true nearest neighbors.
        double avgRecall;
        // Average number of vectors visited per query.
        double averageVisited;
        // Total CPU time consumed by searcher threads, in milliseconds.
        double netCpuTimeMS;
        // Average number of CPUs effectively used during search (CPU time / wall time).
        double avgCpuCount;

        Results(String indexType, int numDocs) {
            this.indexType = indexType;
            this.numDocs = numDocs;
        }
    }
|
||||||
|
|
||||||
|
static final class SysOutLogger {
|
||||||
|
|
||||||
|
void warn(String message) {
|
||||||
|
if (LOG_LEVEL.ordinal() >= Level.WARN.ordinal()) {
|
||||||
|
System.out.println(message);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void warn(String message, Object... params) {
|
||||||
|
if (LOG_LEVEL.ordinal() >= Level.WARN.ordinal()) {
|
||||||
|
System.out.println(String.format(Locale.ROOT, message, params));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void info(String message) {
|
||||||
|
if (LOG_LEVEL.ordinal() >= Level.INFO.ordinal()) {
|
||||||
|
System.out.println(message);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void info(String message, Object... params) {
|
||||||
|
if (LOG_LEVEL.ordinal() >= Level.INFO.ordinal()) {
|
||||||
|
System.out.println(String.format(Locale.ROOT, message, params));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void debug(String message) {
|
||||||
|
if (LOG_LEVEL.ordinal() >= Level.DEBUG.ordinal()) {
|
||||||
|
System.out.println(message);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void debug(String message, Object... params) {
|
||||||
|
if (LOG_LEVEL.ordinal() >= Level.DEBUG.ordinal()) {
|
||||||
|
System.out.println(String.format(Locale.ROOT, message, params));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void trace(String message) {
|
||||||
|
if (LOG_LEVEL == Level.TRACE) {
|
||||||
|
System.out.println(message);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void trace(String message, Object... params) {
|
||||||
|
if (LOG_LEVEL == Level.TRACE) {
|
||||||
|
System.out.println(String.format(Locale.ROOT, message, params));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
    /**
     * Point-in-time snapshot of every live JVM thread's CPU usage; two snapshots can be
     * diffed to compute the CPU time consumed during a measured interval.
     */
    static final class ThreadDetails {
        // NOTE(review): the cast presumably targets com.sun.management.ThreadMXBean
        // (per this file's imports, not visible here) — that is the interface providing
        // the bulk getThreadCpuTime(long[]) overload used below; confirm the import.
        private static final ThreadMXBean threadBean = (ThreadMXBean) java.lang.management.ManagementFactory.getThreadMXBean();
        // IDs of all live threads at snapshot time.
        public final long[] threadIDs;
        // Per-thread cumulative CPU time in nanoseconds, parallel to threadIDs.
        public final long[] cpuTimesNS;
        // Per-thread metadata (name, state, ...), parallel to threadIDs.
        public final ThreadInfo[] threadInfos;
        // Wall-clock timestamp (System.nanoTime) at which this snapshot was taken.
        public final long ns;

        ThreadDetails() {
            ns = System.nanoTime();
            threadIDs = threadBean.getAllThreadIds();
            cpuTimesNS = threadBean.getThreadCpuTime(threadIDs);
            threadInfos = threadBean.getThreadInfo(threadIDs);
        }
    }
|
||||||
|
}
|
|
@ -0,0 +1,329 @@
|
||||||
|
/*
|
||||||
|
* @notice
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
* a copy and modification from Lucene util
|
||||||
|
* Modifications copyright (C) 2025 Elasticsearch B.V.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.test.knn;
|
||||||
|
|
||||||
|
import org.apache.lucene.codecs.Codec;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.FieldType;
|
||||||
|
import org.apache.lucene.document.KnnByteVectorField;
|
||||||
|
import org.apache.lucene.document.KnnFloatVectorField;
|
||||||
|
import org.apache.lucene.document.StoredField;
|
||||||
|
import org.apache.lucene.index.ConcurrentMergeScheduler;
|
||||||
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
import org.apache.lucene.index.IndexWriterConfig;
|
||||||
|
import org.apache.lucene.index.VectorEncoding;
|
||||||
|
import org.apache.lucene.index.VectorSimilarityFunction;
|
||||||
|
import org.apache.lucene.store.FSDirectory;
|
||||||
|
import org.apache.lucene.util.PrintStreamInfoStream;
|
||||||
|
import org.elasticsearch.common.io.Channels;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.UncheckedIOException;
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
|
import java.nio.ByteOrder;
|
||||||
|
import java.nio.channels.FileChannel;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.concurrent.ExecutionException;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
import java.util.concurrent.Future;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
|
import static org.elasticsearch.test.knn.KnnIndexTester.logger;
|
||||||
|
|
||||||
|
class KnnIndexer {
|
||||||
|
private static final double WRITER_BUFFER_MB = 128;
|
||||||
|
static final String ID_FIELD = "id";
|
||||||
|
static final String VECTOR_FIELD = "vector";
|
||||||
|
|
||||||
|
private final Path docsPath;
|
||||||
|
private final Path indexPath;
|
||||||
|
private final VectorEncoding vectorEncoding;
|
||||||
|
private final int dim;
|
||||||
|
private final VectorSimilarityFunction similarityFunction;
|
||||||
|
private final Codec codec;
|
||||||
|
private final int numDocs;
|
||||||
|
private final int numIndexThreads;
|
||||||
|
|
||||||
|
KnnIndexer(
|
||||||
|
Path docsPath,
|
||||||
|
Path indexPath,
|
||||||
|
Codec codec,
|
||||||
|
int numIndexThreads,
|
||||||
|
VectorEncoding vectorEncoding,
|
||||||
|
int dim,
|
||||||
|
VectorSimilarityFunction similarityFunction,
|
||||||
|
int numDocs
|
||||||
|
) {
|
||||||
|
this.docsPath = docsPath;
|
||||||
|
this.indexPath = indexPath;
|
||||||
|
this.codec = codec;
|
||||||
|
this.numIndexThreads = numIndexThreads;
|
||||||
|
this.vectorEncoding = vectorEncoding;
|
||||||
|
this.dim = dim;
|
||||||
|
this.similarityFunction = similarityFunction;
|
||||||
|
this.numDocs = numDocs;
|
||||||
|
}
|
||||||
|
|
||||||
|
void numSegments(KnnIndexTester.Results result) {
|
||||||
|
try (FSDirectory dir = FSDirectory.open(indexPath); IndexReader reader = DirectoryReader.open(dir)) {
|
||||||
|
result.numSegments = reader.leaves().size();
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new UncheckedIOException("Failed to get segment count for index at " + indexPath, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void createIndex(KnnIndexTester.Results result) throws IOException, InterruptedException, ExecutionException {
|
||||||
|
IndexWriterConfig iwc = new IndexWriterConfig().setOpenMode(IndexWriterConfig.OpenMode.CREATE);
|
||||||
|
iwc.setCodec(codec);
|
||||||
|
iwc.setRAMBufferSizeMB(WRITER_BUFFER_MB);
|
||||||
|
iwc.setUseCompoundFile(false);
|
||||||
|
|
||||||
|
iwc.setMaxFullFlushMergeWaitMillis(0);
|
||||||
|
|
||||||
|
FieldType fieldType = switch (vectorEncoding) {
|
||||||
|
case BYTE -> KnnByteVectorField.createFieldType(dim, similarityFunction);
|
||||||
|
case FLOAT32 -> KnnFloatVectorField.createFieldType(dim, similarityFunction);
|
||||||
|
};
|
||||||
|
iwc.setInfoStream(new PrintStreamInfoStream(System.out) {
|
||||||
|
@Override
|
||||||
|
public boolean isEnabled(String component) {
|
||||||
|
return Objects.equals(component, "IVF");
|
||||||
|
}
|
||||||
|
});
|
||||||
|
logger.debug(
|
||||||
|
"KnnIndexer: using codec=%s, vectorEncoding=%s, dim=%d, similarityFunction=%s",
|
||||||
|
codec.getName(),
|
||||||
|
vectorEncoding,
|
||||||
|
dim,
|
||||||
|
similarityFunction
|
||||||
|
);
|
||||||
|
|
||||||
|
if (Files.exists(indexPath)) {
|
||||||
|
logger.debug("KnnIndexer: existing index at %s", indexPath);
|
||||||
|
} else {
|
||||||
|
Files.createDirectories(indexPath);
|
||||||
|
}
|
||||||
|
|
||||||
|
long start = System.nanoTime();
|
||||||
|
try (
|
||||||
|
FSDirectory dir = FSDirectory.open(indexPath);
|
||||||
|
IndexWriter iw = new IndexWriter(dir, iwc);
|
||||||
|
FileChannel in = FileChannel.open(docsPath)
|
||||||
|
) {
|
||||||
|
long docsPathSizeInBytes = in.size();
|
||||||
|
if (docsPathSizeInBytes % ((long) dim * vectorEncoding.byteSize) != 0) {
|
||||||
|
throw new IllegalArgumentException(
|
||||||
|
"docsPath \"" + docsPath + "\" does not contain a whole number of vectors? size=" + docsPathSizeInBytes
|
||||||
|
);
|
||||||
|
}
|
||||||
|
logger.info(
|
||||||
|
"docsPathSizeInBytes=%d, dim=%d, vectorEncoding=%s, byteSize=%d",
|
||||||
|
docsPathSizeInBytes,
|
||||||
|
dim,
|
||||||
|
vectorEncoding,
|
||||||
|
vectorEncoding.byteSize
|
||||||
|
);
|
||||||
|
|
||||||
|
VectorReader inReader = VectorReader.create(in, dim, vectorEncoding);
|
||||||
|
try (ExecutorService exec = Executors.newFixedThreadPool(numIndexThreads, r -> new Thread(r, "KnnIndexer-Thread"))) {
|
||||||
|
AtomicInteger numDocsIndexed = new AtomicInteger();
|
||||||
|
List<Future<?>> threads = new ArrayList<>();
|
||||||
|
for (int i = 0; i < numIndexThreads; i++) {
|
||||||
|
Thread t = new IndexerThread(iw, inReader, dim, vectorEncoding, fieldType, numDocsIndexed, numDocs);
|
||||||
|
t.setDaemon(true);
|
||||||
|
threads.add(exec.submit(t));
|
||||||
|
}
|
||||||
|
for (Future<?> t : threads) {
|
||||||
|
t.get();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
logger.debug("all indexing threads finished, now IndexWriter.commit()");
|
||||||
|
iw.commit();
|
||||||
|
ConcurrentMergeScheduler cms = (ConcurrentMergeScheduler) iwc.getMergeScheduler();
|
||||||
|
cms.sync();
|
||||||
|
}
|
||||||
|
|
||||||
|
long elapsed = System.nanoTime() - start;
|
||||||
|
logger.debug("Indexing took %d ms for %d docs", TimeUnit.NANOSECONDS.toMillis(elapsed), numDocs);
|
||||||
|
result.indexTimeMS = TimeUnit.NANOSECONDS.toMillis(elapsed);
|
||||||
|
}
|
||||||
|
|
||||||
|
void forceMerge(KnnIndexTester.Results results) throws Exception {
|
||||||
|
IndexWriterConfig iwc = new IndexWriterConfig().setOpenMode(IndexWriterConfig.OpenMode.APPEND);
|
||||||
|
iwc.setInfoStream(new PrintStreamInfoStream(System.out) {
|
||||||
|
@Override
|
||||||
|
public boolean isEnabled(String component) {
|
||||||
|
return Objects.equals(component, "IVF");
|
||||||
|
}
|
||||||
|
});
|
||||||
|
iwc.setCodec(codec);
|
||||||
|
logger.debug("KnnIndexer: forceMerge in %s", indexPath);
|
||||||
|
long startNS = System.nanoTime();
|
||||||
|
try (IndexWriter iw = new IndexWriter(FSDirectory.open(indexPath), iwc)) {
|
||||||
|
iw.forceMerge(1);
|
||||||
|
}
|
||||||
|
long endNS = System.nanoTime();
|
||||||
|
long elapsedNSec = (endNS - startNS);
|
||||||
|
logger.info("forceMerge took %d ms", TimeUnit.NANOSECONDS.toMillis(elapsedNSec));
|
||||||
|
results.forceMergeTimeMS = TimeUnit.NANOSECONDS.toMillis(elapsedNSec);
|
||||||
|
}
|
||||||
|
|
||||||
|
static class IndexerThread extends Thread {
|
||||||
|
private final IndexWriter iw;
|
||||||
|
private final AtomicInteger numDocsIndexed;
|
||||||
|
private final int numDocsToIndex;
|
||||||
|
private final FieldType fieldType;
|
||||||
|
private final VectorEncoding vectorEncoding;
|
||||||
|
private final byte[] byteVectorBuffer;
|
||||||
|
private final float[] floatVectorBuffer;
|
||||||
|
private final VectorReader in;
|
||||||
|
|
||||||
|
private IndexerThread(
|
||||||
|
IndexWriter iw,
|
||||||
|
VectorReader in,
|
||||||
|
int dims,
|
||||||
|
VectorEncoding vectorEncoding,
|
||||||
|
FieldType fieldType,
|
||||||
|
AtomicInteger numDocsIndexed,
|
||||||
|
int numDocsToIndex
|
||||||
|
) {
|
||||||
|
this.iw = iw;
|
||||||
|
this.in = in;
|
||||||
|
this.vectorEncoding = vectorEncoding;
|
||||||
|
this.fieldType = fieldType;
|
||||||
|
this.numDocsIndexed = numDocsIndexed;
|
||||||
|
this.numDocsToIndex = numDocsToIndex;
|
||||||
|
switch (vectorEncoding) {
|
||||||
|
case BYTE -> {
|
||||||
|
byteVectorBuffer = new byte[dims];
|
||||||
|
floatVectorBuffer = null;
|
||||||
|
}
|
||||||
|
case FLOAT32 -> {
|
||||||
|
floatVectorBuffer = new float[dims];
|
||||||
|
byteVectorBuffer = null;
|
||||||
|
}
|
||||||
|
default -> throw new IllegalArgumentException("unexpected vector encoding: " + vectorEncoding);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
try {
|
||||||
|
_run();
|
||||||
|
} catch (IOException ioe) {
|
||||||
|
throw new UncheckedIOException(ioe);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void _run() throws IOException {
|
||||||
|
while (true) {
|
||||||
|
int id = numDocsIndexed.getAndIncrement();
|
||||||
|
if (id >= numDocsToIndex) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
Document doc = new Document();
|
||||||
|
switch (vectorEncoding) {
|
||||||
|
case BYTE -> {
|
||||||
|
in.next(byteVectorBuffer);
|
||||||
|
doc.add(new KnnByteVectorField(VECTOR_FIELD, byteVectorBuffer, fieldType));
|
||||||
|
}
|
||||||
|
case FLOAT32 -> {
|
||||||
|
in.next(floatVectorBuffer);
|
||||||
|
doc.add(new KnnFloatVectorField(VECTOR_FIELD, floatVectorBuffer, fieldType));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((id + 1) % 25000 == 0) {
|
||||||
|
logger.debug("Done indexing " + (id + 1) + " documents.");
|
||||||
|
}
|
||||||
|
doc.add(new StoredField(ID_FIELD, id));
|
||||||
|
iw.addDocument(doc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class VectorReader {
|
||||||
|
final float[] target;
|
||||||
|
final ByteBuffer bytes;
|
||||||
|
final FileChannel input;
|
||||||
|
long position;
|
||||||
|
|
||||||
|
static VectorReader create(FileChannel input, int dim, VectorEncoding vectorEncoding) throws IOException {
|
||||||
|
int bufferSize = dim * vectorEncoding.byteSize;
|
||||||
|
if (input.size() % ((long) dim * vectorEncoding.byteSize) != 0) {
|
||||||
|
throw new IllegalArgumentException(
|
||||||
|
"vectors file \"" + input + "\" does not contain a whole number of vectors? size=" + input.size()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
return new VectorReader(input, dim, bufferSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
VectorReader(FileChannel input, int dim, int bufferSize) throws IOException {
|
||||||
|
this.bytes = ByteBuffer.wrap(new byte[bufferSize]).order(ByteOrder.LITTLE_ENDIAN);
|
||||||
|
this.input = input;
|
||||||
|
this.target = new float[dim];
|
||||||
|
reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
void reset() throws IOException {
|
||||||
|
position = 0;
|
||||||
|
input.position(position);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void readNext() throws IOException {
|
||||||
|
int bytesRead = Channels.readFromFileChannel(this.input, position, bytes);
|
||||||
|
if (bytesRead < bytes.capacity()) {
|
||||||
|
position = 0;
|
||||||
|
bytes.position(0);
|
||||||
|
// wrap around back to the start of the file if we hit the end:
|
||||||
|
logger.warn("VectorReader hit EOF when reading " + this.input + "; now wrapping around to start of file again");
|
||||||
|
this.input.position(position);
|
||||||
|
bytesRead = Channels.readFromFileChannel(this.input, position, bytes);
|
||||||
|
if (bytesRead < bytes.capacity()) {
|
||||||
|
throw new IllegalStateException(
|
||||||
|
"vector file " + input + " doesn't even have enough bytes for a single vector? got bytesRead=" + bytesRead
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
position += bytesRead;
|
||||||
|
bytes.position(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
synchronized void next(float[] dest) throws IOException {
|
||||||
|
readNext();
|
||||||
|
bytes.asFloatBuffer().get(dest);
|
||||||
|
}
|
||||||
|
|
||||||
|
synchronized void next(byte[] dest) throws IOException {
|
||||||
|
readNext();
|
||||||
|
bytes.get(dest);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,488 @@
|
||||||
|
/*
|
||||||
|
* @notice
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
* a copy and modification from Lucene util
|
||||||
|
* Modifications copyright (C) 2025 Elasticsearch B.V.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.test.knn;
|
||||||
|
|
||||||
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.StoredFields;
|
||||||
|
import org.apache.lucene.index.VectorEncoding;
|
||||||
|
import org.apache.lucene.index.VectorSimilarityFunction;
|
||||||
|
import org.apache.lucene.queries.function.FunctionQuery;
|
||||||
|
import org.apache.lucene.queries.function.valuesource.ByteKnnVectorFieldSource;
|
||||||
|
import org.apache.lucene.queries.function.valuesource.ByteVectorSimilarityFunction;
|
||||||
|
import org.apache.lucene.queries.function.valuesource.ConstKnnByteVectorValueSource;
|
||||||
|
import org.apache.lucene.queries.function.valuesource.ConstKnnFloatValueSource;
|
||||||
|
import org.apache.lucene.queries.function.valuesource.FloatKnnVectorFieldSource;
|
||||||
|
import org.apache.lucene.queries.function.valuesource.FloatVectorSimilarityFunction;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
import org.apache.lucene.search.Query;
|
||||||
|
import org.apache.lucene.search.ScoreDoc;
|
||||||
|
import org.apache.lucene.search.TopDocs;
|
||||||
|
import org.apache.lucene.search.TotalHits;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.store.FSDirectory;
|
||||||
|
import org.apache.lucene.store.MMapDirectory;
|
||||||
|
import org.elasticsearch.core.PathUtils;
|
||||||
|
import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper;
|
||||||
|
import org.elasticsearch.search.profile.query.QueryProfiler;
|
||||||
|
import org.elasticsearch.search.vectors.ESKnnByteVectorQuery;
|
||||||
|
import org.elasticsearch.search.vectors.ESKnnFloatVectorQuery;
|
||||||
|
import org.elasticsearch.search.vectors.IVFKnnFloatVectorQuery;
|
||||||
|
import org.elasticsearch.search.vectors.QueryProfilerProvider;
|
||||||
|
import org.elasticsearch.search.vectors.RescoreKnnVectorQuery;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
|
import java.nio.ByteOrder;
|
||||||
|
import java.nio.IntBuffer;
|
||||||
|
import java.nio.channels.FileChannel;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.attribute.FileTime;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.concurrent.Callable;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
import java.util.concurrent.ForkJoinPool;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
|
||||||
|
import static org.elasticsearch.test.knn.KnnIndexTester.logger;
|
||||||
|
import static org.elasticsearch.test.knn.KnnIndexer.ID_FIELD;
|
||||||
|
import static org.elasticsearch.test.knn.KnnIndexer.VECTOR_FIELD;
|
||||||
|
|
||||||
|
class KnnSearcher {
|
||||||
|
|
||||||
|
private final Path docPath;
|
||||||
|
private final Path indexPath;
|
||||||
|
private final Path queryPath;
|
||||||
|
private final int numDocs;
|
||||||
|
private final int numQueryVectors;
|
||||||
|
private final long randomSeed = 42;
|
||||||
|
private final float selectivity = 1f;
|
||||||
|
private final int topK;
|
||||||
|
private final int efSearch;
|
||||||
|
private final int nProbe;
|
||||||
|
private final KnnIndexTester.IndexType indexType;
|
||||||
|
private final int dim;
|
||||||
|
private final VectorSimilarityFunction similarityFunction;
|
||||||
|
private final VectorEncoding vectorEncoding;
|
||||||
|
private final float overSamplingFactor;
|
||||||
|
private final int searchThreads;
|
||||||
|
|
||||||
|
    /**
     * Creates a searcher for the index at {@code indexPath}, copying all search-relevant
     * settings (k, num_candidates, nProbe, encoding, similarity, thread count, ...) out of
     * the parsed command-line configuration.
     *
     * @throws IllegalArgumentException if the configured number of query vectors is not positive
     */
    KnnSearcher(Path indexPath, CmdLineArgs cmdLineArgs) {
        this.docPath = cmdLineArgs.docVectors();
        this.indexPath = indexPath;
        this.queryPath = cmdLineArgs.queryVectors();
        this.numDocs = cmdLineArgs.numDocs();
        this.numQueryVectors = cmdLineArgs.numQueries();
        this.topK = cmdLineArgs.k();
        this.dim = cmdLineArgs.dimensions();
        this.similarityFunction = cmdLineArgs.vectorSpace();
        this.vectorEncoding = cmdLineArgs.vectorEncoding();
        this.overSamplingFactor = cmdLineArgs.overSamplingFactor();
        // Validate early: the search loop below assumes at least one query vector.
        if (numQueryVectors <= 0) {
            throw new IllegalArgumentException("numQueryVectors must be > 0");
        }
        // num_candidates maps onto HNSW's efSearch parameter.
        this.efSearch = cmdLineArgs.numCandidates();
        this.nProbe = cmdLineArgs.nProbe();
        this.indexType = cmdLineArgs.indexType();
        this.searchThreads = cmdLineArgs.searchThreads();
    }
|
||||||
|
|
||||||
|
/**
 * Runs the configured number of KNN queries against the prebuilt index and fills {@code finalResults}
 * with recall, QPS, average latency, average visited nodes, and net CPU time per query.
 * <p>
 * Flow: warm-up pass over all query vectors, then a timed pass (wall clock plus per-thread CPU time
 * sampled via {@link KnnIndexTester.ThreadDetails} before/after), then result-id extraction and
 * comparison against the exact nearest neighbors from {@link #getOrCalculateExactNN()}.
 *
 * @param finalResults mutable result holder that this method populates
 * @throws IOException if the query file or index cannot be read
 */
void runSearch(KnnIndexTester.Results finalResults) throws IOException {
    TopDocs[] results = new TopDocs[numQueryVectors];
    int[][] resultIds = new int[numQueryVectors][];
    long elapsed, totalCpuTimeMS, totalVisited = 0;
    try (
        // Query vectors are read straight from the raw vector file.
        FileChannel input = FileChannel.open(queryPath);
        // Named threads so the CPU-time accounting below can filter on the "KnnSearcher-Thread" prefix.
        ExecutorService executorService = Executors.newFixedThreadPool(searchThreads, r -> new Thread(r, "KnnSearcher-Thread"))
    ) {
        long queryPathSizeInBytes = input.size();
        // Vector count is inferred from file size; assumes the file holds nothing but dim-sized vectors.
        logger.info(
            "queryPath size: "
                + queryPathSizeInBytes
                + " bytes, assuming vector count is "
                + (queryPathSizeInBytes / ((long) dim * vectorEncoding.byteSize))
        );
        KnnIndexer.VectorReader targetReader = KnnIndexer.VectorReader.create(input, dim, vectorEncoding);
        long startNS;
        try (MMapDirectory dir = new MMapDirectory(indexPath)) {
            try (DirectoryReader reader = DirectoryReader.open(dir)) {
                // Only use the executor when more than one search thread was requested.
                IndexSearcher searcher = searchThreads > 1 ? new IndexSearcher(reader, executorService) : new IndexSearcher(reader);
                byte[] targetBytes = new byte[dim];
                float[] target = new float[dim];
                // warm up: run every query once, untimed, so JIT/page-cache effects don't skew the timed pass
                for (int i = 0; i < numQueryVectors; i++) {
                    if (vectorEncoding.equals(VectorEncoding.BYTE)) {
                        targetReader.next(targetBytes);
                        doVectorQuery(targetBytes, searcher);
                    } else {
                        targetReader.next(target);
                        doVectorQuery(target, searcher);
                    }
                }
                // Rewind the reader so the timed pass sees the same query vectors again.
                targetReader.reset();
                startNS = System.nanoTime();
                // Snapshot per-thread CPU times before the timed pass.
                KnnIndexTester.ThreadDetails startThreadDetails = new KnnIndexTester.ThreadDetails();
                for (int i = 0; i < numQueryVectors; i++) {
                    if (vectorEncoding.equals(VectorEncoding.BYTE)) {
                        targetReader.next(targetBytes);
                        results[i] = doVectorQuery(targetBytes, searcher);
                    } else {
                        targetReader.next(target);
                        results[i] = doVectorQuery(target, searcher);
                    }
                }
                KnnIndexTester.ThreadDetails endThreadDetails = new KnnIndexTester.ThreadDetails();
                elapsed = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNS);
                long startCPUTimeNS = 0;
                long endCPUTimeNS = 0;
                // Sum CPU time only for the searcher pool threads (matched by name prefix).
                for (int i = 0; i < startThreadDetails.threadInfos.length; i++) {
                    if (startThreadDetails.threadInfos[i].getThreadName().startsWith("KnnSearcher-Thread")) {
                        startCPUTimeNS += startThreadDetails.cpuTimesNS[i];
                    }
                }

                for (int i = 0; i < endThreadDetails.threadInfos.length; i++) {
                    if (endThreadDetails.threadInfos[i].getThreadName().startsWith("KnnSearcher-Thread")) {
                        endCPUTimeNS += endThreadDetails.cpuTimesNS[i];
                    }
                }
                totalCpuTimeMS = TimeUnit.NANOSECONDS.toMillis(endCPUTimeNS - startCPUTimeNS);

                // Fetch, validate and write result document ids.
                StoredFields storedFields = reader.storedFields();
                for (int i = 0; i < numQueryVectors; i++) {
                    // totalHits carries the vector-ops count (see doVectorQuery), i.e. visited nodes.
                    totalVisited += results[i].totalHits.value();
                    resultIds[i] = getResultIds(results[i], storedFields);
                }
                // NOTE(review): "%d" placeholders require a printf-style logger; if `logger` is a log4j2
                // Logger its parameterized messages use "{}" — confirm these values actually get substituted.
                // NOTE(review): (1000L * numQueryVectors) / elapsed throws ArithmeticException when the
                // timed pass completes in under 1 ms (elapsed == 0) — consider guarding.
                logger.info(
                    "completed %d searches in %d ms: %d QPS CPU time=%dms",
                    numQueryVectors,
                    elapsed,
                    (1000L * numQueryVectors) / elapsed,
                    totalCpuTimeMS
                );
            }
        }
    }
    logger.info("checking results");
    // Exact neighbors are computed (or read from cache) to measure recall of the approximate search.
    int[][] nn = getOrCalculateExactNN();
    finalResults.avgRecall = checkResults(resultIds, nn, topK);
    finalResults.qps = (1000f * numQueryVectors) / elapsed;
    finalResults.avgLatency = (float) elapsed / numQueryVectors;
    finalResults.averageVisited = (double) totalVisited / numQueryVectors;
    finalResults.netCpuTimeMS = (double) totalCpuTimeMS / numQueryVectors;
    // Average concurrency: CPU-ms burned per wall-clock ms.
    finalResults.avgCpuCount = (double) totalCpuTimeMS / elapsed;
}
|
||||||
|
|
||||||
|
/**
 * Returns the exact (brute-force) nearest neighbors for every query vector, reading them from a
 * cache file when a valid one exists and computing + caching them otherwise.
 * <p>
 * The cache key hashes every input that affects the ground truth (paths, counts, k, similarity,
 * selectivity, seed), so a change to any of them produces a different cache file. A cached file is
 * only trusted if it is strictly newer than the doc/index/query files (see {@code isNewer}).
 *
 * @return one row per query vector, each row holding the doc ids of the top-K exact matches
 * @throws IOException if the cache file or index cannot be read/written
 */
private int[][] getOrCalculateExactNN() throws IOException {
    // look in working directory for cached nn file
    // Base-36 keeps the filename short; Objects.hash can collide, but a collision also has to
    // survive the freshness check below to serve stale data.
    String hash = Integer.toString(
        Objects.hash(
            docPath,
            indexPath,
            queryPath,
            numDocs,
            numQueryVectors,
            topK,
            similarityFunction.ordinal(),
            selectivity,
            randomSeed
        ),
        36
    );
    String nnFileName = "nn-" + hash + ".bin";
    // NOTE(review): assumes a "target/" directory exists in the working directory — confirm it is
    // created elsewhere, otherwise writeExactNN will fail with NoSuchFileException.
    Path nnPath = PathUtils.get("target/" + nnFileName);
    if (Files.exists(nnPath) && isNewer(nnPath, docPath, indexPath, queryPath)) {
        logger.info("read pre-cached exact match vectors from cache file \"" + nnPath + "\"");
        return readExactNN(nnPath);
    } else {
        logger.info("computing brute-force exact KNN matches for " + numQueryVectors + " query vectors from \"" + queryPath + "\"");
        long startNS = System.nanoTime();
        // TODO: enable computing NN from high precision vectors when
        // checking low-precision recall
        int[][] nn;
        if (vectorEncoding.equals(VectorEncoding.BYTE)) {
            nn = computeExactNNByte(queryPath);
        } else {
            nn = computeExactNN(queryPath);
        }
        // Persist so subsequent runs with the same config skip the brute-force pass.
        writeExactNN(nn, nnPath);
        long elapsedMS = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNS); // ns -> ms
        logger.info("computed " + numQueryVectors + " exact matches in " + elapsedMS + " ms");
        return nn;
    }
}
|
||||||
|
|
||||||
|
private boolean isNewer(Path path, Path... others) throws IOException {
|
||||||
|
FileTime modified = Files.getLastModifiedTime(path);
|
||||||
|
for (Path other : others) {
|
||||||
|
if (Files.getLastModifiedTime(other).compareTo(modified) >= 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
TopDocs doVectorQuery(byte[] vector, IndexSearcher searcher) throws IOException {
|
||||||
|
Query knnQuery;
|
||||||
|
if (overSamplingFactor > 1f) {
|
||||||
|
throw new IllegalArgumentException("oversampling factor > 1 is not supported for byte vectors");
|
||||||
|
}
|
||||||
|
if (indexType == KnnIndexTester.IndexType.IVF) {
|
||||||
|
throw new IllegalArgumentException("IVF index type does not support byte vectors");
|
||||||
|
} else {
|
||||||
|
knnQuery = new ESKnnByteVectorQuery(
|
||||||
|
VECTOR_FIELD,
|
||||||
|
vector,
|
||||||
|
topK,
|
||||||
|
efSearch,
|
||||||
|
null,
|
||||||
|
DenseVectorFieldMapper.FilterHeuristic.ACORN.getKnnSearchStrategy()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
QueryProfiler profiler = new QueryProfiler();
|
||||||
|
TopDocs docs = searcher.search(knnQuery, this.topK);
|
||||||
|
QueryProfilerProvider queryProfilerProvider = (QueryProfilerProvider) knnQuery;
|
||||||
|
queryProfilerProvider.profile(profiler);
|
||||||
|
return new TopDocs(new TotalHits(profiler.getVectorOpsCount(), docs.totalHits.relation()), docs.scoreDocs);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Executes one KNN query for a float vector and returns its hits, with the reported total-hit
 * count replaced by the number of vector operations performed (visited nodes).
 * <p>
 * When {@code overSamplingFactor > 1} the inner query fetches an oversampled top-K and is then
 * wrapped in a {@link RescoreKnnVectorQuery} that re-scores down to the original {@code this.topK}.
 *
 * @param vector   the query vector (float encoding)
 * @param searcher searcher over the prebuilt index
 * @throws IOException if the search fails
 */
TopDocs doVectorQuery(float[] vector, IndexSearcher searcher) throws IOException {
    Query knnQuery;
    // Locals shadow the fields on purpose: only the inner query sees the oversampled values,
    // while the final searcher.search(...) below still uses the un-oversampled this.topK.
    int topK = this.topK;
    int efSearch = this.efSearch;
    if (overSamplingFactor > 1f) {
        // oversample the topK results to get more candidates for the final result
        topK = (int) Math.ceil(topK * overSamplingFactor);
        // efSearch must be at least as large as the (oversampled) number of candidates requested.
        efSearch = Math.max(topK, efSearch);
    }
    if (indexType == KnnIndexTester.IndexType.IVF) {
        // IVF path uses nProbe instead of the HNSW filter heuristic.
        knnQuery = new IVFKnnFloatVectorQuery(VECTOR_FIELD, vector, topK, efSearch, null, nProbe);
    } else {
        knnQuery = new ESKnnFloatVectorQuery(
            VECTOR_FIELD,
            vector,
            topK,
            efSearch,
            null,
            DenseVectorFieldMapper.FilterHeuristic.ACORN.getKnnSearchStrategy()
        );
    }
    if (overSamplingFactor > 1f) {
        // oversample the topK results to get more candidates for the final result
        // Re-score the oversampled candidates and keep only the original this.topK.
        knnQuery = new RescoreKnnVectorQuery(VECTOR_FIELD, vector, similarityFunction, this.topK, knnQuery);
    }
    QueryProfiler profiler = new QueryProfiler();
    TopDocs docs = searcher.search(knnQuery, this.topK);
    QueryProfilerProvider queryProfilerProvider = (QueryProfilerProvider) knnQuery;
    queryProfilerProvider.profile(profiler);
    // Surface the vector-ops count (visited nodes) through TotalHits for the caller's stats.
    return new TopDocs(new TotalHits(profiler.getVectorOpsCount(), docs.totalHits.relation()), docs.scoreDocs);
}
|
||||||
|
|
||||||
|
private static float checkResults(int[][] results, int[][] nn, int topK) {
|
||||||
|
int totalMatches = 0;
|
||||||
|
int totalResults = results.length * topK;
|
||||||
|
for (int i = 0; i < results.length; i++) {
|
||||||
|
totalMatches += compareNN(nn[i], results[i], topK);
|
||||||
|
}
|
||||||
|
return totalMatches / (float) totalResults;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int compareNN(int[] expected, int[] results, int topK) {
|
||||||
|
int matched = 0;
|
||||||
|
Set<Integer> expectedSet = new HashSet<>();
|
||||||
|
Set<Integer> alreadySeen = new HashSet<>();
|
||||||
|
for (int i = 0; i < topK; i++) {
|
||||||
|
expectedSet.add(expected[i]);
|
||||||
|
}
|
||||||
|
for (int docId : results) {
|
||||||
|
if (alreadySeen.add(docId) == false) {
|
||||||
|
throw new IllegalStateException("duplicate docId=" + docId);
|
||||||
|
}
|
||||||
|
if (expectedSet.contains(docId)) {
|
||||||
|
++matched;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return matched;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int[][] readExactNN(Path nnPath) throws IOException {
|
||||||
|
int[][] result = new int[numQueryVectors][];
|
||||||
|
try (FileChannel in = FileChannel.open(nnPath)) {
|
||||||
|
IntBuffer intBuffer = in.map(FileChannel.MapMode.READ_ONLY, 0, (long) numQueryVectors * topK * Integer.BYTES)
|
||||||
|
.order(ByteOrder.LITTLE_ENDIAN)
|
||||||
|
.asIntBuffer();
|
||||||
|
for (int i = 0; i < numQueryVectors; i++) {
|
||||||
|
result[i] = new int[topK];
|
||||||
|
intBuffer.get(result[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void writeExactNN(int[][] nn, Path nnPath) throws IOException {
|
||||||
|
logger.info("writing true nearest neighbors to cache file \"" + nnPath + "\"");
|
||||||
|
ByteBuffer tmp = ByteBuffer.allocate(nn[0].length * Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN);
|
||||||
|
try (OutputStream out = Files.newOutputStream(nnPath)) {
|
||||||
|
for (int i = 0; i < numQueryVectors; i++) {
|
||||||
|
tmp.asIntBuffer().put(nn[i]);
|
||||||
|
out.write(tmp.array());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private int[][] computeExactNN(Path queryPath) throws IOException {
|
||||||
|
int[][] result = new int[numQueryVectors][];
|
||||||
|
try (Directory dir = FSDirectory.open(indexPath); DirectoryReader reader = DirectoryReader.open(dir)) {
|
||||||
|
List<Callable<Void>> tasks = new ArrayList<>();
|
||||||
|
try (FileChannel qIn = FileChannel.open(queryPath)) {
|
||||||
|
KnnIndexer.VectorReader queryReader = KnnIndexer.VectorReader.create(qIn, dim, VectorEncoding.FLOAT32);
|
||||||
|
for (int i = 0; i < numQueryVectors; i++) {
|
||||||
|
float[] queryVector = new float[dim];
|
||||||
|
queryReader.next(queryVector);
|
||||||
|
tasks.add(new ComputeNNFloatTask(i, topK, queryVector, result, reader, similarityFunction));
|
||||||
|
}
|
||||||
|
ForkJoinPool.commonPool().invokeAll(tasks);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private int[][] computeExactNNByte(Path queryPath) throws IOException {
|
||||||
|
int[][] result = new int[numQueryVectors][];
|
||||||
|
try (Directory dir = FSDirectory.open(indexPath); DirectoryReader reader = DirectoryReader.open(dir)) {
|
||||||
|
List<Callable<Void>> tasks = new ArrayList<>();
|
||||||
|
try (FileChannel qIn = FileChannel.open(queryPath)) {
|
||||||
|
KnnIndexer.VectorReader queryReader = KnnIndexer.VectorReader.create(qIn, dim, VectorEncoding.BYTE);
|
||||||
|
for (int i = 0; i < numQueryVectors; i++) {
|
||||||
|
byte[] queryVector = new byte[dim];
|
||||||
|
queryReader.next(queryVector);
|
||||||
|
tasks.add(new ComputeNNByteTask(i, queryVector, result, reader, similarityFunction));
|
||||||
|
}
|
||||||
|
ForkJoinPool.commonPool().invokeAll(tasks);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class ComputeNNFloatTask implements Callable<Void> {
|
||||||
|
|
||||||
|
private final int queryOrd;
|
||||||
|
private final float[] query;
|
||||||
|
private final int[][] result;
|
||||||
|
private final IndexReader reader;
|
||||||
|
private final VectorSimilarityFunction similarityFunction;
|
||||||
|
private final int topK;
|
||||||
|
|
||||||
|
ComputeNNFloatTask(
|
||||||
|
int queryOrd,
|
||||||
|
int topK,
|
||||||
|
float[] query,
|
||||||
|
int[][] result,
|
||||||
|
IndexReader reader,
|
||||||
|
VectorSimilarityFunction similarityFunction
|
||||||
|
) {
|
||||||
|
this.queryOrd = queryOrd;
|
||||||
|
this.query = query;
|
||||||
|
this.result = result;
|
||||||
|
this.reader = reader;
|
||||||
|
this.similarityFunction = similarityFunction;
|
||||||
|
this.topK = topK;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Void call() {
|
||||||
|
IndexSearcher searcher = new IndexSearcher(reader);
|
||||||
|
try {
|
||||||
|
var queryVector = new ConstKnnFloatValueSource(query);
|
||||||
|
var docVectors = new FloatKnnVectorFieldSource(VECTOR_FIELD);
|
||||||
|
Query query = new FunctionQuery(new FloatVectorSimilarityFunction(similarityFunction, queryVector, docVectors));
|
||||||
|
var topDocs = searcher.search(query, topK);
|
||||||
|
result[queryOrd] = getResultIds(topDocs, reader.storedFields());
|
||||||
|
if ((queryOrd + 1) % 10 == 0) {
|
||||||
|
logger.info(" exact knn scored " + (queryOrd + 1));
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class ComputeNNByteTask implements Callable<Void> {
|
||||||
|
|
||||||
|
private final int queryOrd;
|
||||||
|
private final byte[] query;
|
||||||
|
private final int[][] result;
|
||||||
|
private final IndexReader reader;
|
||||||
|
private final VectorSimilarityFunction similarityFunction;
|
||||||
|
|
||||||
|
ComputeNNByteTask(int queryOrd, byte[] query, int[][] result, IndexReader reader, VectorSimilarityFunction similarityFunction) {
|
||||||
|
this.queryOrd = queryOrd;
|
||||||
|
this.query = query;
|
||||||
|
this.result = result;
|
||||||
|
this.reader = reader;
|
||||||
|
this.similarityFunction = similarityFunction;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Void call() {
|
||||||
|
IndexSearcher searcher = new IndexSearcher(reader);
|
||||||
|
int topK = result[0].length;
|
||||||
|
try {
|
||||||
|
var queryVector = new ConstKnnByteVectorValueSource(query);
|
||||||
|
var docVectors = new ByteKnnVectorFieldSource(VECTOR_FIELD);
|
||||||
|
Query query = new FunctionQuery(new ByteVectorSimilarityFunction(similarityFunction, queryVector, docVectors));
|
||||||
|
var topDocs = searcher.search(query, topK);
|
||||||
|
result[queryOrd] = getResultIds(topDocs, reader.storedFields());
|
||||||
|
if ((queryOrd + 1) % 10 == 0) {
|
||||||
|
logger.info(" exact knn scored " + (queryOrd + 1));
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static int[] getResultIds(TopDocs topDocs, StoredFields storedFields) throws IOException {
|
||||||
|
int[] resultIds = new int[topDocs.scoreDocs.length];
|
||||||
|
int i = 0;
|
||||||
|
for (ScoreDoc doc : topDocs.scoreDocs) {
|
||||||
|
if (doc.doc != NO_MORE_DOCS) {
|
||||||
|
// there is a bug somewhere that can result in doc=NO_MORE_DOCS! I think it happens
|
||||||
|
// in some degenerate case (like input query has NaN in it?) that causes no results to
|
||||||
|
// be returned from HNSW search?
|
||||||
|
resultIds[i++] = Integer.parseInt(storedFields.document(doc.doc).get(ID_FIELD));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return resultIds;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -479,4 +479,6 @@ module org.elasticsearch.server {
|
||||||
exports org.elasticsearch.lucene.util.automaton;
|
exports org.elasticsearch.lucene.util.automaton;
|
||||||
exports org.elasticsearch.index.codec.perfield;
|
exports org.elasticsearch.index.codec.perfield;
|
||||||
exports org.elasticsearch.lucene.search;
|
exports org.elasticsearch.lucene.search;
|
||||||
|
exports org.elasticsearch.index.codec.vectors to org.elasticsearch.test.knn;
|
||||||
|
exports org.elasticsearch.index.codec.vectors.es818 to org.elasticsearch.test.knn;
|
||||||
}
|
}
|
||||||
|
|
|
@ -51,6 +51,7 @@ public abstract class IVFVectorsReader extends KnnVectorsReader {
|
||||||
protected final IntObjectHashMap<FieldEntry> fields;
|
protected final IntObjectHashMap<FieldEntry> fields;
|
||||||
private final FlatVectorsReader rawVectorsReader;
|
private final FlatVectorsReader rawVectorsReader;
|
||||||
|
|
||||||
|
@SuppressWarnings("this-escape")
|
||||||
protected IVFVectorsReader(SegmentReadState state, FlatVectorsReader rawVectorsReader) throws IOException {
|
protected IVFVectorsReader(SegmentReadState state, FlatVectorsReader rawVectorsReader) throws IOException {
|
||||||
this.state = state;
|
this.state = state;
|
||||||
this.fieldInfos = state.fieldInfos;
|
this.fieldInfos = state.fieldInfos;
|
||||||
|
|
|
@ -55,6 +55,7 @@ public abstract class IVFVectorsWriter extends KnnVectorsWriter {
|
||||||
private final FlatVectorsWriter rawVectorDelegate;
|
private final FlatVectorsWriter rawVectorDelegate;
|
||||||
private final SegmentWriteState segmentWriteState;
|
private final SegmentWriteState segmentWriteState;
|
||||||
|
|
||||||
|
@SuppressWarnings("this-escape")
|
||||||
protected IVFVectorsWriter(SegmentWriteState state, FlatVectorsWriter rawVectorDelegate) throws IOException {
|
protected IVFVectorsWriter(SegmentWriteState state, FlatVectorsWriter rawVectorDelegate) throws IOException {
|
||||||
this.segmentWriteState = state;
|
this.segmentWriteState = state;
|
||||||
this.rawVectorDelegate = rawVectorDelegate;
|
this.rawVectorDelegate = rawVectorDelegate;
|
||||||
|
|
|
@ -69,6 +69,7 @@ public class ES816BinaryQuantizedVectorsReader extends FlatVectorsReader impleme
|
||||||
private final FlatVectorsReader rawVectorsReader;
|
private final FlatVectorsReader rawVectorsReader;
|
||||||
private final ES816BinaryFlatVectorsScorer vectorScorer;
|
private final ES816BinaryFlatVectorsScorer vectorScorer;
|
||||||
|
|
||||||
|
@SuppressWarnings("this-escape")
|
||||||
ES816BinaryQuantizedVectorsReader(
|
ES816BinaryQuantizedVectorsReader(
|
||||||
SegmentReadState state,
|
SegmentReadState state,
|
||||||
FlatVectorsReader rawVectorsReader,
|
FlatVectorsReader rawVectorsReader,
|
||||||
|
|
|
@ -65,6 +65,7 @@ public class DirectIOLucene99FlatVectorsReader extends FlatVectorsReader impleme
|
||||||
private final IndexInput vectorData;
|
private final IndexInput vectorData;
|
||||||
private final FieldInfos fieldInfos;
|
private final FieldInfos fieldInfos;
|
||||||
|
|
||||||
|
@SuppressWarnings("this-escape")
|
||||||
public DirectIOLucene99FlatVectorsReader(SegmentReadState state, FlatVectorsScorer scorer) throws IOException {
|
public DirectIOLucene99FlatVectorsReader(SegmentReadState state, FlatVectorsScorer scorer) throws IOException {
|
||||||
super(scorer);
|
super(scorer);
|
||||||
int versionMeta = readMetadata(state);
|
int versionMeta = readMetadata(state);
|
||||||
|
|
|
@ -70,6 +70,7 @@ public class ES818BinaryQuantizedVectorsReader extends FlatVectorsReader impleme
|
||||||
private final FlatVectorsReader rawVectorsReader;
|
private final FlatVectorsReader rawVectorsReader;
|
||||||
private final ES818BinaryFlatVectorsScorer vectorScorer;
|
private final ES818BinaryFlatVectorsScorer vectorScorer;
|
||||||
|
|
||||||
|
@SuppressWarnings("this-escape")
|
||||||
ES818BinaryQuantizedVectorsReader(
|
ES818BinaryQuantizedVectorsReader(
|
||||||
SegmentReadState state,
|
SegmentReadState state,
|
||||||
FlatVectorsReader rawVectorsReader,
|
FlatVectorsReader rawVectorsReader,
|
||||||
|
|
|
@ -84,6 +84,7 @@ public class ES818BinaryQuantizedVectorsWriter extends FlatVectorsWriter {
|
||||||
*
|
*
|
||||||
* @param vectorsScorer the scorer to use for scoring vectors
|
* @param vectorsScorer the scorer to use for scoring vectors
|
||||||
*/
|
*/
|
||||||
|
@SuppressWarnings("this-escape")
|
||||||
protected ES818BinaryQuantizedVectorsWriter(
|
protected ES818BinaryQuantizedVectorsWriter(
|
||||||
ES818BinaryFlatVectorsScorer vectorsScorer,
|
ES818BinaryFlatVectorsScorer vectorsScorer,
|
||||||
FlatVectorsWriter rawVectorDelegate,
|
FlatVectorsWriter rawVectorDelegate,
|
||||||
|
|
|
@ -171,3 +171,5 @@ if (extraProjects.exists()) {
|
||||||
addSubProjects('', extraProjectDir)
|
addSubProjects('', extraProjectDir)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
include 'qa:vector'
|
Loading…
Add table
Add a link
Reference in a new issue