mirror of
https://github.com/elastic/elasticsearch.git
synced 2025-04-25 07:37:19 -04:00
This change adds support for kNN vector fields to the `_disk_usage` API. The strategy: * Iterate the vector values (using the same strategy as for doc values) to estimate the vector data size * Run some random vector searches to estimate the vector index size Co-authored-by: Yannick Welsch <yannick@welsch.lu> Closes #84801
190 lines
6.3 KiB
Text
190 lines
6.3 KiB
Text
[[indices-disk-usage]]
|
|
=== Analyze index disk usage API
|
|
++++
|
|
<titleabbrev>Analyze index disk usage</titleabbrev>
|
|
++++
|
|
|
|
experimental[]
|
|
|
|
Analyzes the disk usage of each field of an index or data stream.
|
|
This API might not support indices created in previous {es} versions.
|
|
The results for a small index can be inaccurate because some parts of an index
|
|
might not be analyzed by the API.
|
|
|
|
[source,console]
|
|
--------------------------------------------------
|
|
POST /my-index-000001/_disk_usage?run_expensive_tasks=true
|
|
--------------------------------------------------
|
|
// TEST[setup:messages]
|
|
|
|
[[analyze-index-disk-usage-api-request]]
|
|
==== {api-request-title}
|
|
|
|
`POST /<target>/_disk_usage`
|
|
|
|
[[analyze-index-disk-usage-api-request-prereqs]]
|
|
==== {api-prereq-title}
|
|
|
|
* If the {es} {security-features} are enabled, you must have the `manage`
|
|
<<privileges-list-indices,index privilege>> for the target index, data stream,
|
|
or alias.
|
|
|
|
[[analyze-index-disk-usage-api-path-params]]
|
|
==== {api-path-parms-title}
|
|
|
|
`<target>`::
|
|
(Required, string) Comma-separated list of data streams, indices, and aliases
|
|
used to limit the request. It's recommended to execute this API with a single
|
|
index (or the latest backing index of a data stream) as the API consumes
|
|
significant resources.
|
|
|
|
[[analyze-index-disk-usage-api-query-params]]
|
|
==== {api-query-parms-title}
|
|
|
|
include::{es-repo-dir}/rest-api/common-parms.asciidoc[tag=allow-no-indices]
|
|
+
|
|
Defaults to `true`.
|
|
|
|
include::{es-repo-dir}/rest-api/common-parms.asciidoc[tag=expand-wildcards]
|
|
+
|
|
Defaults to `open`.
|
|
|
|
`flush`::
|
|
(Optional, Boolean) If `true`, the API performs a flush before analysis. If
|
|
`false`, the response may not include uncommitted data. Defaults to `true`.
|
|
|
|
include::{es-repo-dir}/rest-api/common-parms.asciidoc[tag=index-ignore-unavailable]
|
|
|
|
`run_expensive_tasks`::
|
|
(Required, Boolean) Analyzing field disk usage is resource-intensive. To use the
|
|
API, this parameter must be set to `true`. Defaults to `false`.
|
|
|
|
include::{es-repo-dir}/rest-api/common-parms.asciidoc[tag=wait_for_active_shards]
|
|
|
|
|
|
[[analyze-index-disk-usage-api-example]]
|
|
==== {api-examples-title}
|
|
|
|
[source,console]
|
|
--------------------------------------------------
|
|
POST /my-index-000001/_disk_usage?run_expensive_tasks=true
|
|
--------------------------------------------------
|
|
// TEST[setup:messages]
|
|
|
|
The API returns:
|
|
|
|
[source,console-response]
|
|
--------------------------------------------------
|
|
{
|
|
"_shards": {
|
|
"total": 1,
|
|
"successful": 1,
|
|
"failed": 0
|
|
},
|
|
"my-index-000001": {
|
|
"store_size": "929mb", <1>
|
|
"store_size_in_bytes": 974192723,
|
|
"all_fields": {
|
|
"total": "928.9mb", <2>
|
|
"total_in_bytes": 973977084,
|
|
"inverted_index": {
|
|
"total": "107.8mb",
|
|
"total_in_bytes": 113128526
|
|
},
|
|
"stored_fields": "623.5mb",
|
|
"stored_fields_in_bytes": 653819143,
|
|
"doc_values": "125.7mb",
|
|
"doc_values_in_bytes": 131885142,
|
|
"points": "59.9mb",
|
|
"points_in_bytes": 62885773,
|
|
"norms": "2.3kb",
|
|
"norms_in_bytes": 2356,
|
|
"term_vectors": "2.2kb",
|
|
"term_vectors_in_bytes": 2310,
|
|
"knn_vectors": "0b",
|
|
"knn_vectors_in_bytes": 0
|
|
},
|
|
"fields": {
|
|
"_id": {
|
|
"total": "49.3mb",
|
|
"total_in_bytes": 51709993,
|
|
"inverted_index": {
|
|
"total": "29.7mb",
|
|
"total_in_bytes": 31172745
|
|
},
|
|
"stored_fields": "19.5mb", <3>
|
|
"stored_fields_in_bytes": 20537248,
|
|
"doc_values": "0b",
|
|
"doc_values_in_bytes": 0,
|
|
"points": "0b",
|
|
"points_in_bytes": 0,
|
|
"norms": "0b",
|
|
"norms_in_bytes": 0,
|
|
"term_vectors": "0b",
|
|
"term_vectors_in_bytes": 0,
|
|
"knn_vectors": "0b",
|
|
"knn_vectors_in_bytes": 0
|
|
},
|
|
"_primary_term": {...},
|
|
"_seq_no": {...},
|
|
"_version": {...},
|
|
"_source": {
|
|
"total": "603.9mb",
|
|
"total_in_bytes": 633281895,
|
|
"inverted_index": {...},
|
|
"stored_fields": "603.9mb", <4>
|
|
"stored_fields_in_bytes": 633281895,
|
|
"doc_values": "0b",
|
|
"doc_values_in_bytes": 0,
|
|
"points": "0b",
|
|
"points_in_bytes": 0,
|
|
"norms": "0b",
|
|
"norms_in_bytes": 0,
|
|
"term_vectors": "0b",
|
|
"term_vectors_in_bytes": 0,
|
|
"knn_vectors": "0b",
|
|
"knn_vectors_in_bytes": 0
|
|
},
|
|
"context": {
|
|
"total": "28.6mb",
|
|
"total_in_bytes": 30060405,
|
|
"inverted_index": {
|
|
"total": "22mb",
|
|
"total_in_bytes": 23090908
|
|
},
|
|
"stored_fields": "0b",
|
|
"stored_fields_in_bytes": 0,
|
|
"doc_values": "0b",
|
|
"doc_values_in_bytes": 0,
|
|
"points": "0b",
|
|
"points_in_bytes": 0,
|
|
"norms": "2.3kb",
|
|
"norms_in_bytes": 2356,
|
|
"term_vectors": "2.2kb",
|
|
"term_vectors_in_bytes": 2310,
|
|
"knn_vectors": "0b",
|
|
"knn_vectors_in_bytes": 0
|
|
},
|
|
"context.keyword": {...},
|
|
"message": {...},
|
|
"message.keyword": {...}
|
|
}
|
|
}
|
|
}
|
|
--------------------------------------------------
|
|
// TESTRESPONSE[s/: \{\.\.\.\}/: $body.$_path/]
|
|
// TESTRESPONSE[s/: (\-)?[0-9]+/: $body.$_path/]
|
|
// TESTRESPONSE[s/: "[^"]*"/: $body.$_path/]
|
|
|
|
<1> The store size of only the analyzed shards of the index.
|
|
|
|
<2> The total size of fields of the analyzed shards of the index. This total
|
|
is usually smaller than the index size specified in <1> as some small metadata
|
|
files are ignored and some parts of data files might not be scanned by the API.
|
|
|
|
<3> The stored size of the `_id` field.
|
|
|
|
<4> The stored size of the `_source` field. As stored fields are stored
|
|
together in a compressed format, the estimated sizes of stored fields are
|
|
best efforts and can be inaccurate. The stored size of the `_id` field
|
|
is likely underestimated, while that of the `_source` field is likely overestimated.
|