diff --git a/docs/changelog/94954.yaml b/docs/changelog/94954.yaml index 06745c81ce9a..aaac45153bef 100644 --- a/docs/changelog/94954.yaml +++ b/docs/changelog/94954.yaml @@ -1,5 +1,25 @@ pr: 94954 -summary: Asset tracking - geo_line for TSDB +summary: Asset tracking - geo_line in time-series aggregations area: Geo type: enhancement -issues: [] +issues: [ ] +highlight: + title: Asset tracking - geo_line in time-series aggregations + body: |- + The <> builds tracks from `geo_points`. + It has previously needed to use large arrays in memory for collecting points into multiple buckets + and sorting those buckets. + + With the advances made in TSDB features and the `time_series` aggregation in particular, + it is now possible to rely on data aggregating in both TSID and timestamp order, + enabling the removal of all sorting, as well as the use of only a single bucket's + worth of memory, a dramatic improvement in memory footprint. In addition, we can use the streaming line + simplifier algorithm introduced in https://github.com/elastic/elasticsearch/pull/94859 to replace the previous + behaviour of truncating very large tracks with the far preferable approach of simplifying those tracks. + + [role="screenshot"] + image:images/spatial/kodiak_geo_line_simplified.png[North shore of Kodiak Island simplified to 100 points] + + In this diagram, the grey line is the original geometry, the blue line is the truncated geometry as would be + produced by the original `geo_line` aggregation, and the magenta line is the new simplified geometry. 
+ notable: false diff --git a/docs/reference/aggregations/metrics/geoline-aggregation.asciidoc b/docs/reference/aggregations/metrics/geoline-aggregation.asciidoc index c5b54f8ecd08..4ed684aa090d 100644 --- a/docs/reference/aggregations/metrics/geoline-aggregation.asciidoc +++ b/docs/reference/aggregations/metrics/geoline-aggregation.asciidoc @@ -5,7 +5,7 @@ Geo-Line ++++ -The `geo_line` aggregation aggregates all `geo_point` values within a bucket into a LineString ordered +The `geo_line` aggregation aggregates all `geo_point` values within a bucket into a `LineString` ordered by the chosen `sort` field. This `sort` can be a date field, for example. The bucket returned is a valid https://tools.ietf.org/html/rfc7946#section-3.2[GeoJSON Feature] representing the line geometry. @@ -14,31 +14,25 @@ https://tools.ietf.org/html/rfc7946#section-3.2[GeoJSON Feature] representing th PUT test { "mappings": { - "dynamic": "strict", - "_source": { - "enabled": false - }, "properties": { - "my_location": { - "type": "geo_point" - }, - "group": { - "type": "keyword" - }, - "@timestamp": { - "type": "date" - } + "my_location": { "type": "geo_point" }, + "group": { "type": "keyword" }, + "@timestamp": { "type": "date" } } } } POST /test/_bulk?refresh -{"index": {}} -{"my_location": {"lat":37.3450570, "lon": -122.0499820}, "@timestamp": "2013-09-06T16:00:36"} -{"index": {}} -{"my_location": {"lat": 37.3451320, "lon": -122.0499820}, "@timestamp": "2013-09-06T16:00:37Z"} -{"index": {}} -{"my_location": {"lat": 37.349283, "lon": -122.0505010}, "@timestamp": "2013-09-06T16:00:37Z"} +{"index":{}} +{"my_location": {"lat":52.373184, "lon":4.889187}, "@timestamp": "2023-01-02T09:00:00Z"} +{"index":{}} +{"my_location": {"lat":52.370159, "lon":4.885057}, "@timestamp": "2023-01-02T10:00:00Z"} +{"index":{}} +{"my_location": {"lat":52.369219, "lon":4.901618}, "@timestamp": "2023-01-02T13:00:00Z"} +{"index":{}} +{"my_location": {"lat":52.374081, "lon":4.912350}, "@timestamp": 
"2023-01-02T16:00:00Z"} +{"index":{}} +{"my_location": {"lat":52.371667, "lon":4.914722}, "@timestamp": "2023-01-03T12:00:00Z"} POST /test/_search?filter_path=aggregations { @@ -46,7 +40,7 @@ POST /test/_search?filter_path=aggregations "line": { "geo_line": { "point": {"field": "my_location"}, - "sort": {"field": "@timestamp"} + "sort": {"field": "@timestamp"} } } } @@ -60,26 +54,19 @@ Which returns: { "aggregations": { "line": { - "type" : "Feature", - "geometry" : { - "type" : "LineString", - "coordinates" : [ - [ - -122.049982, - 37.345057 - ], - [ - -122.050501, - 37.349283 - ], - [ - -122.049982, - 37.345132 - ] + "type": "Feature", + "geometry": { + "type": "LineString", + "coordinates": [ + [ 4.889187, 52.373184 ], + [ 4.885057, 52.370159 ], + [ 4.901618, 52.369219 ], + [ 4.912350, 52.374081 ], + [ 4.914722, 52.371667 ] ] }, - "properties" : { - "complete" : true + "properties": { + "complete": true } } } @@ -87,6 +74,19 @@ Which returns: ---- // TESTRESPONSE +The resulting https://tools.ietf.org/html/rfc7946#section-3.2[GeoJSON Feature] contains both a `LineString` geometry +for the path generated by the aggregation, as well as a map of `properties`. +The property `complete` informs of whether all documents matched were used to generate the geometry. +The `size` option described below can be used to limit the number of documents included in the aggregation, +leading to results with `complete: false`. +Exactly which documents are dropped from results depends on whether the aggregation is based +on `time_series` or not, and this is discussed in +<>. 
+ +The above result could be displayed in a map user interface: + +image:images/spatial/geo_line.png[Kibana map with museum tour of Amsterdam] + [[search-aggregations-metrics-geo-line-options]] ==== Options @@ -106,37 +106,378 @@ Example usage configuring `my_location` as the point field: // NOTCONSOLE `sort`:: -(Required) +(Required outside <> aggregations) -This option specifies the name of the numeric field to use as the sort key -for ordering the points +This option specifies the name of the numeric field to use as the sort key for ordering the points. +When the `geo_line` aggregation is nested inside a +<> +aggregation, this field defaults to `@timestamp`, and any other value will result in an error. Example usage configuring `@timestamp` as the sort key: [source,js] ---- -"point": { +"sort": { "field": "@timestamp" } ---- // NOTCONSOLE `include_sort`:: -(Optional, boolean, default: `false`) - -This option includes, when true, an additional array of the sort values in the +(Optional, boolean, default: `false`) This option includes, when true, an additional array of the sort values in the feature properties. `sort_order`:: -(Optional, string, default: `"ASC"`) - -This option accepts one of two values: "ASC", "DESC". - +(Optional, string, default: `"ASC"`) This option accepts one of two values: "ASC", "DESC". The line is sorted in ascending order by the sort key when set to "ASC", and in descending with "DESC". `size`:: -(Optional, integer, default: `10000`) +(Optional, integer, default: `10000`) The maximum length of the line represented in the aggregation. +Valid sizes are between one and 10000. +Within <> +the aggregation uses line simplification to constrain the size, otherwise it uses truncation. +See <> +for a discussion on the subtleties involved. -The maximum length of the line represented in the aggregation. Valid sizes are -between one and 10000. 
+[[search-aggregations-metrics-geo-line-grouping]] +==== Grouping + +The simple example above will produce a single track for all the data selected by the query. However, it is far more +common to need to group the data into multiple tracks. For example, grouping flight transponder measurements by +flight call-sign before sorting each flight by timestamp and producing a separate track for each. + +In the following examples we will group the locations of points of interest in the cities of +Amsterdam, Antwerp and Paris. +The tracks will be ordered by the planned visit sequence for a walking tour of the museums and other attractions. + +In order to demonstrate the difference between a time-series grouping and a non-time-series grouping, we will +first create an index with <>, +and then give examples of grouping the same data without time-series and with time-series. + +[source,console,id=search-aggregations-metrics-geo-line-grouping-setup] +---- +PUT tour +{ + "mappings": { + "properties": { + "city": { + "type": "keyword", + "time_series_dimension": true + }, + "category": { "type": "keyword" }, + "route": { "type": "long" }, + "name": { "type": "keyword" }, + "location": { "type": "geo_point" }, + "@timestamp": { "type": "date" } + } + }, + "settings": { + "index": { + "mode": "time_series", + "routing_path": [ "city" ], + "time_series": { + "start_time": "2023-01-01T00:00:00Z", + "end_time": "2024-01-01T00:00:00Z" + } + } + } +} + +POST /tour/_bulk?refresh +{"index":{}} +{"@timestamp": "2023-01-02T09:00:00Z", "route": 0, "location": "POINT(4.889187 52.373184)", "city": "Amsterdam", "category": "Attraction", "name": "Royal Palace Amsterdam"} +{"index":{}} +{"@timestamp": "2023-01-02T10:00:00Z", "route": 1, "location": "POINT(4.885057 52.370159)", "city": "Amsterdam", "category": "Attraction", "name": "The Amsterdam Dungeon"} +{"index":{}} +{"@timestamp": "2023-01-02T13:00:00Z", "route": 2, "location": "POINT(4.901618 52.369219)", "city": "Amsterdam", "category": 
"Museum", "name": "Museum Het Rembrandthuis"} +{"index":{}} +{"@timestamp": "2023-01-02T16:00:00Z", "route": 3, "location": "POINT(4.912350 52.374081)", "city": "Amsterdam", "category": "Museum", "name": "NEMO Science Museum"} +{"index":{}} +{"@timestamp": "2023-01-03T12:00:00Z", "route": 4, "location": "POINT(4.914722 52.371667)", "city": "Amsterdam", "category": "Museum", "name": "Nederlands Scheepvaartmuseum"} +{"index":{}} +{"@timestamp": "2023-01-04T09:00:00Z", "route": 5, "location": "POINT(4.401384 51.220292)", "city": "Antwerp", "category": "Attraction", "name": "Cathedral of Our Lady"} +{"index":{}} +{"@timestamp": "2023-01-04T12:00:00Z", "route": 6, "location": "POINT(4.405819 51.221758)", "city": "Antwerp", "category": "Museum", "name": "Snijders&Rockoxhuis"} +{"index":{}} +{"@timestamp": "2023-01-04T15:00:00Z", "route": 7, "location": "POINT(4.405200 51.222900)", "city": "Antwerp", "category": "Museum", "name": "Letterenhuis"} +{"index":{}} +{"@timestamp": "2023-01-05T10:00:00Z", "route": 8, "location": "POINT(2.336389 48.861111)", "city": "Paris", "category": "Museum", "name": "Musée du Louvre"} +{"index":{}} +{"@timestamp": "2023-01-05T14:00:00Z", "route": 9, "location": "POINT(2.327000 48.860000)", "city": "Paris", "category": "Museum", "name": "Musée dOrsay"} +---- + +[[search-aggregations-metrics-geo-line-grouping-terms]] +==== Grouping with terms + +Using the above data, for a non-time-series use case, the grouping can be done using a +<> based on city name. +This would work whether or not we had defined the `tour` index as a time series index. 
+ +[source,console,id=search-aggregations-metrics-geo-line-terms] +---- +POST /tour/_search?filter_path=aggregations +{ + "aggregations": { + "path": { + "terms": {"field": "city"}, + "aggregations": { + "museum_tour": { + "geo_line": { + "point": {"field": "location"}, + "sort": {"field": "@timestamp"} + } + } + } + } + } +} +---- +// TEST[continued] + +Which returns: + +[source,js] +---- +{ + "aggregations": { + "path": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [ + { + "key": "Amsterdam", + "doc_count": 5, + "museum_tour": { + "type": "Feature", + "geometry": { + "coordinates": [ [ 4.889187, 52.373184 ], [ 4.885057, 52.370159 ], [ 4.901618, 52.369219 ], [ 4.91235, 52.374081 ], [ 4.914722, 52.371667 ] ], + "type": "LineString" + }, + "properties": { + "complete": true + } + } + }, + { + "key": "Antwerp", + "doc_count": 3, + "museum_tour": { + "type": "Feature", + "geometry": { + "coordinates": [ [ 4.401384, 51.220292 ], [ 4.405819, 51.221758 ], [ 4.4052, 51.2229 ] ], + "type": "LineString" + }, + "properties": { + "complete": true + } + } + }, + { + "key": "Paris", + "doc_count": 2, + "museum_tour": { + "type": "Feature", + "geometry": { + "coordinates": [ [ 2.336389, 48.861111 ], [ 2.327, 48.86 ] ], + "type": "LineString" + }, + "properties": { + "complete": true + } + } + } + ] + } + } +} +---- +// TESTRESPONSE + +The above results contain an array of buckets, where each bucket is a JSON object with the `key` showing the name +of the `city` field, and an inner aggregation result called `museum_tour` containing a +https://tools.ietf.org/html/rfc7946#section-3.2[GeoJSON Feature] describing the +actual route between the various attractions in that city. +Each result also includes a `properties` object with a `complete` value which will be `false` if the geometry +was truncated to the limits specified in the `size` parameter. 
+Note that when we use `time_series` in the example below, we will get the same results structured a little differently. + +[[search-aggregations-metrics-geo-line-grouping-time-series]] +==== Grouping with time-series + +Using the same data as before, we can also perform the grouping with a +<>. +This will group by TSID, which is defined as the combinations of all fields with `time_series_dimension: true`, +in this case the same `city` field used in the previous +<>. +This example will only work if we defined the `tour` index as a time series index using `index.mode="time_series"`. + +[source,console,id=search-aggregations-metrics-geo-line-time-series] +---- +POST /tour/_search?filter_path=aggregations +{ + "aggregations": { + "path": { + "time_series": {}, + "aggregations": { + "museum_tour": { + "geo_line": { + "point": {"field": "location"} + } + } + } + } + } +} +---- +// TEST[continued] + +NOTE: The `geo_line` aggregation no longer requires the `sort` field when nested within a +<>. +This is because the sort field is set to `@timestamp`, which all time-series indexes are pre-sorted by. +If you do set this parameter, and set it to something other than `@timestamp` you will get an error. 
+ +The above query will result in: + +[source,js] +---- +{ + "aggregations": { + "path": { + "buckets": { + "{city=Paris}": { + "key": { + "city": "Paris" + }, + "doc_count": 2, + "museum_tour": { + "type": "Feature", + "geometry": { + "coordinates": [ [ 2.336389, 48.861111 ], [ 2.327, 48.86 ] ], + "type": "LineString" + }, + "properties": { + "complete": true + } + } + }, + "{city=Antwerp}": { + "key": { + "city": "Antwerp" + }, + "doc_count": 3, + "museum_tour": { + "type": "Feature", + "geometry": { + "coordinates": [ [ 4.401384, 51.220292 ], [ 4.405819, 51.221758 ], [ 4.4052, 51.2229 ] ], + "type": "LineString" + }, + "properties": { + "complete": true + } + } + }, + "{city=Amsterdam}": { + "key": { + "city": "Amsterdam" + }, + "doc_count": 5, + "museum_tour": { + "type": "Feature", + "geometry": { + "coordinates": [ [ 4.889187, 52.373184 ], [ 4.885057, 52.370159 ], [ 4.901618, 52.369219 ], [ 4.91235, 52.374081 ], [ 4.914722, 52.371667 ] ], + "type": "LineString" + }, + "properties": { + "complete": true + } + } + } + } + } + } +} +---- +// TESTRESPONSE + +The above results are essentially the same as with the previous `terms` aggregation example, but structured differently. +Here we see the buckets returned as a map, where the key is an internal description of the TSID. +This TSID is unique for each unique combination of fields with `time_series_dimension: true`. +Each bucket contains a `key` field which is also a map of all dimension values for the TSID, in this case only the city +name is used for grouping. +In addition, there is an inner aggregation result called `museum_tour` containing a +https://tools.ietf.org/html/rfc7946#section-3.2[GeoJSON Feature] describing the +actual route between the various attractions in that city. +Each result also includes a `properties` object with a `complete` value which will be false if the geometry +was simplified to the limits specified in the `size` parameter. 
+ +[[search-aggregations-metrics-geo-line-grouping-time-series-advantages]] +==== Why group with time-series? + +When reviewing the above examples, you might think that there is little difference between using +<> or +<> +to group the geo-lines. However, there are some important differences in behaviour between the two cases. +Time series indexes are stored in a very specific order on disk. +They are pre-grouped by the time-series dimension fields, and pre-sorted by the `@timestamp` field. +This allows the `geo_line` aggregation to be considerably optimized: + +* The same memory allocated for the first bucket can be re-used over and over for all subsequent buckets. + This is substantially less memory than required for non-time-series cases where all buckets are collected + concurrently. +* No sorting needs to be done, since the data is pre-sorted by `@timestamp`. + The time-series data will naturally arrive at the aggregation collector in `DESC` order. + This means that if we specify `sort_order:ASC` (the default), we still collect in `DESC` order, + but perform an efficient in-memory reverse order before generating the final `LineString` geometry. +* The `size` parameter can be used for a streaming line-simplification algorithm. + Without time-series, we are forced to truncate data, by default after 10000 documents per bucket, in order to + prevent memory usage from being unbounded. + This can result in geo-lines being truncated, and therefore losing important data. + With time-series we can run a streaming line-simplification algorithm, retaining control over memory usage, + while also maintaining the overall geometry shape. + In fact, for most use cases it would work to set this `size` parameter to a much lower bound, and save even more + memory. For example, if the `geo_line` is to be drawn on a display map with a specific resolution, it might look + just as good to simplify to as few as 100 or 200 points. 
This will save memory on the server, on the network and + in the client. + +Note: There are other significant advantages to working with time-series data and using `time_series` index mode. +These are discussed in the documentation on <>. + +[[search-aggregations-metrics-geo-line-simplification]] +==== Streaming line simplification + +Line simplification is a great way to reduce the size of the final results sent to the client, and displayed in a map +user interface. However, normally these algorithms use a lot of memory to perform the simplification, requiring the +entire geometry to be maintained in memory together with supporting data for the simplification itself. +The use of a streaming line simplification algorithm allows for minimal memory usage during the simplification +process by constraining memory to the bounds defined for the simplified geometry. This is only possible if no sorting +is required, which is the case when grouping is done by the +<>, +running on an index with the `time_series` index mode. + +Under these conditions the `geo_line` aggregation allocates memory to the `size` specified, and then fills that +memory with the incoming documents. +Once the memory is completely filled, documents from within the line are removed as new documents are added. +The choice of document to remove is made to minimize the visual impact on the geometry. +This process makes use of the +https://en.wikipedia.org/wiki/Visvalingam%E2%80%93Whyatt_algorithm[Visvalingam–Whyatt algorithm]. +Essentially this means points are removed if they have the minimum triangle area, with the triangle defined +by the point under consideration and the two points before and after it in the line. +In addition, we calculate the area using spherical coordinates so that no planar distortions affect the choice. + +In order to demonstrate how much better line simplification is than line truncation, consider this example of the north +shore of Kodiak Island. 
+The data for this is only 209 points, but if we want to set `size` to `100` we get dramatic truncation. + +image:images/spatial/kodiak_geo_line_truncated.png[North shore of Kodiak Island truncated to 100 points] + +The grey line is the entire geometry of 209 points, while the blue line is the first 100 points, a very different +geometry than the original. + +Now consider the same geometry simplified to 100 points. + +image:images/spatial/kodiak_geo_line_simplified.png[North shore of Kodiak Island simplified to 100 points] + +For comparison we have shown the original in grey, the truncated in blue and the new simplified geometry +in magenta. It is possible to see where the new simplified line deviates from the original, but the overall +geometry appears almost identical and is still clearly recognizable as the north shore of Kodiak Island. diff --git a/docs/reference/images/spatial/geo_line.png b/docs/reference/images/spatial/geo_line.png new file mode 100644 index 000000000000..472ff8ea363b Binary files /dev/null and b/docs/reference/images/spatial/geo_line.png differ diff --git a/docs/reference/images/spatial/kodiak_geo_line_simplified.png b/docs/reference/images/spatial/kodiak_geo_line_simplified.png new file mode 100644 index 000000000000..7e3d08dbd461 Binary files /dev/null and b/docs/reference/images/spatial/kodiak_geo_line_simplified.png differ diff --git a/docs/reference/images/spatial/kodiak_geo_line_truncated.png b/docs/reference/images/spatial/kodiak_geo_line_truncated.png new file mode 100644 index 000000000000..cacd2ebb9f1a Binary files /dev/null and b/docs/reference/images/spatial/kodiak_geo_line_truncated.png differ diff --git a/docs/reference/mapping/types/geo-point.asciidoc b/docs/reference/mapping/types/geo-point.asciidoc index 47996b8e4822..cdd874d32fb2 100644 --- a/docs/reference/mapping/types/geo-point.asciidoc +++ b/docs/reference/mapping/types/geo-point.asciidoc @@ -9,11 +9,13 @@ Fields of type `geo_point` accept latitude-longitude pairs, 
which can be used: * to find geopoints within a <>, within a certain <> of a central point, or within a <> (for example, points in a polygon). -* to aggregate documents by <> from a central point +* to aggregate documents by <> from a central point. * to aggregate documents by geographic grids: either <>, <> or <>. +* to aggregate geopoints into a track using the metrics aggregation + <>. * to integrate distance into a document's <>. * to <> documents by distance.