mirror of
https://github.com/elastic/elasticsearch.git
synced 2025-04-25 15:47:23 -04:00
538 lines
No EOL
14 KiB
Text
538 lines
No EOL
14 KiB
Text
[role="xpack"]
|
|
[[ml-configuring-transform]]
|
|
= Altering data in your {dfeed} with runtime fields
|
|
|
|
If you use {dfeeds}, you can use runtime fields to alter your data before it
|
|
is analyzed. You can add an optional `runtime_mappings` property to your
|
|
{dfeeds}, where you can specify field types and scripts that evaluate custom
|
|
expressions without affecting the indices that you're retrieving the data from.
|
|
|
|
If your {dfeed} defines runtime fields, you can use those fields in your
|
|
{anomaly-job}. For example, you can use the runtime fields in the analysis
|
|
functions in one or more detectors. Runtime fields can impact search performance
|
|
based on the computation defined in the runtime script.
|
|
|
|
[NOTE]
|
|
===============================
|
|
Some of these examples use regular expressions. By default, regular
|
|
expressions are disabled because they circumvent the protection that Painless
|
|
provides against long-running and memory-hungry scripts. For more information,
|
|
see {ref}/modules-scripting-painless.html[Painless scripting language].
|
|
|
|
{ml-cap} analysis is case sensitive. For example, "John" is considered to be
|
|
different from "john". This is one reason you might consider using scripts that
|
|
convert your strings to upper or lowercase letters.
|
|
===============================
|
|
|
|
* <<ml-configuring-transform1>>
|
|
* <<ml-configuring-transform2>>
|
|
* <<ml-configuring-transform3>>
|
|
* <<ml-configuring-transform4>>
|
|
* <<ml-configuring-transform5>>
|
|
* <<ml-configuring-transform6>>
|
|
* <<ml-configuring-transform7>>
|
|
* <<ml-configuring-transform8>>
|
|
// * <<ml-configuring-transform9>>
|
|
|
|
The following index APIs create and add content to an index that is used in
|
|
subsequent examples:
|
|
|
|
[source,console]
|
|
----------------------------------
|
|
PUT /my-index-000001
|
|
{
|
|
"mappings":{
|
|
"properties": {
|
|
"@timestamp": { "type": "date" },
|
|
"aborted_count": { "type": "long" },
|
|
"another_field": { "type": "keyword" }, <1>
|
|
"clientip": { "type": "keyword" },
|
|
"coords": {
|
|
"properties": {
|
|
"lat": { "type": "keyword" },
|
|
"lon": { "type": "keyword" }
|
|
}
|
|
},
|
|
"error_count": { "type": "long" },
|
|
"query": { "type": "keyword" },
|
|
"some_field": { "type": "keyword" },
|
|
"tokenstring1":{ "type":"keyword" },
|
|
"tokenstring2":{ "type":"keyword" },
|
|
"tokenstring3":{ "type":"keyword" }
|
|
}
|
|
}
|
|
}
|
|
|
|
PUT /my-index-000001/_doc/1
|
|
{
|
|
"@timestamp":"2017-03-23T13:00:00",
|
|
"error_count":36320,
|
|
"aborted_count":4156,
|
|
"some_field":"JOE",
|
|
"another_field":"SMITH ",
|
|
"tokenstring1":"foo-bar-baz",
|
|
"tokenstring2":"foo bar baz",
|
|
"tokenstring3":"foo-bar-19",
|
|
"query":"www.ml.elastic.co",
|
|
"clientip":"123.456.78.900",
|
|
"coords": {
|
|
"lat" : 41.44,
|
|
"lon":90.5
|
|
}
|
|
}
|
|
----------------------------------
|
|
// TEST[skip:SETUP]
|
|
|
|
<1> In this example, string fields are mapped as `keyword` fields to support
|
|
aggregation. If you want both a full text (`text`) and a keyword (`keyword`)
|
|
version of the same field, use multi-fields. For more information, see
|
|
{ref}/multi-fields.html[fields].
|
|
|
|
|
|
[[ml-configuring-transform1]]
|
|
.Example 1: Adding two numerical fields
|
|
|
|
[source,console]
|
|
----------------------------------
|
|
PUT _ml/anomaly_detectors/test1
|
|
{
|
|
"analysis_config":{
|
|
"bucket_span": "10m",
|
|
"detectors":[
|
|
{
|
|
"function":"mean",
|
|
"field_name": "total_error_count" <1>
|
|
}
|
|
]
|
|
},
|
|
"data_description": {
|
|
"time_field":"@timestamp"
|
|
},
|
|
"datafeed_config":{
|
|
"datafeed_id": "datafeed-test1",
|
|
"indices": ["my-index-000001"],
|
|
"runtime_mappings": {
|
|
"total_error_count": { <2>
|
|
"type": "long",
|
|
"script": {
|
|
"source": "emit(doc['error_count'].value + doc['aborted_count'].value)"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
----------------------------------
|
|
// TEST[skip:needs-licence]
|
|
|
|
<1> A runtime field named `total_error_count` is referenced in the detector
|
|
within the job.
|
|
<2> The runtime field is defined in the {dfeed}.
|
|
|
|
This `test1` {anomaly-job} contains a detector that uses a runtime field in a
|
|
mean analysis function. The `datafeed-test1` {dfeed} defines the runtime field.
|
|
It contains a script that adds two fields in the document to produce a "total"
|
|
error count.
|
|
|
|
The syntax for the `runtime_mappings` property is identical to that used by
|
|
{es}. For more information, see {ref}/runtime.html[Runtime fields].
|
|
|
|
You can preview the contents of the {dfeed} by using the following API:
|
|
|
|
[source,console]
|
|
----------------------------------
|
|
GET _ml/datafeeds/datafeed-test1/_preview
|
|
----------------------------------
|
|
// TEST[skip:continued]
|
|
|
|
In this example, the API returns the following results, which contain a sum of
|
|
the `error_count` and `aborted_count` values:
|
|
|
|
[source,js]
|
|
----------------------------------
|
|
[
|
|
{
|
|
"@timestamp": 1490274000000,
|
|
"total_error_count": 40476
|
|
}
|
|
]
|
|
----------------------------------
|
|
|
|
NOTE: This example demonstrates how to use runtime fields, but it contains
|
|
insufficient data to generate meaningful results.
|
|
|
|
//For a full demonstration of
|
|
//how to create jobs with sample data, see <<ml-getting-started>>.
|
|
|
|
You can alternatively use {kib} to create an advanced {anomaly-job} that uses
|
|
runtime fields. To add the `runtime_mappings` property to your {dfeed}, you must
|
|
use the **Edit JSON** tab. For example:
|
|
|
|
[role="screenshot"]
|
|
image::images/ml-runtimefields.jpg[Using runtime_mappings in {dfeed} config via {kib}]
|
|
|
|
|
|
[[ml-configuring-transform2]]
|
|
.Example 2: Concatenating strings
|
|
|
|
[source,console]
|
|
--------------------------------------------------
|
|
PUT _ml/anomaly_detectors/test2
|
|
{
|
|
"analysis_config":{
|
|
"bucket_span": "10m",
|
|
"detectors":[
|
|
{
|
|
"function":"low_info_content",
|
|
"field_name":"my_runtime_field" <1>
|
|
}
|
|
]
|
|
},
|
|
"data_description": {
|
|
"time_field":"@timestamp"
|
|
},
|
|
"datafeed_config":{
|
|
"datafeed_id": "datafeed-test2",
|
|
"indices": ["my-index-000001"],
|
|
"runtime_mappings": {
|
|
"my_runtime_field": {
|
|
"type": "keyword",
|
|
"script": {
|
|
"source": "emit(doc['some_field'].value + '_' + doc['another_field'].value)" <2>
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
GET _ml/datafeeds/datafeed-test2/_preview
|
|
--------------------------------------------------
|
|
// TEST[skip:needs-licence]
|
|
|
|
<1> The runtime field has a generic name in this case, since it is used for
|
|
various tests in the examples.
|
|
<2> The runtime field uses the plus (+) operator to concatenate strings.
|
|
|
|
The preview {dfeed} API returns the following results, which show that "JOE"
|
|
and "SMITH " have been concatenated and an underscore was added:
|
|
|
|
[source,js]
|
|
----------------------------------
|
|
[
|
|
{
|
|
"@timestamp": 1490274000000,
|
|
"my_runtime_field": "JOE_SMITH "
|
|
}
|
|
]
|
|
----------------------------------
|
|
|
|
[[ml-configuring-transform3]]
|
|
.Example 3: Trimming strings
|
|
|
|
[source,console]
|
|
--------------------------------------------------
|
|
POST _ml/datafeeds/datafeed-test2/_update
|
|
{
|
|
"runtime_mappings": {
|
|
"my_runtime_field": {
|
|
"type": "keyword",
|
|
"script": {
|
|
"source": "emit(doc['another_field'].value.trim())" <1>
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
GET _ml/datafeeds/datafeed-test2/_preview
|
|
--------------------------------------------------
|
|
// TEST[skip:continued]
|
|
|
|
<1> This runtime field uses the `trim()` function to trim extra white space from
|
|
a string.
|
|
|
|
The preview {dfeed} API returns the following results, which show that "SMITH "
|
|
has been trimmed to "SMITH":
|
|
|
|
[source,js]
|
|
----------------------------------
|
|
[
|
|
{
|
|
"@timestamp": 1490274000000,
|
|
"my_runtime_field": "SMITH"
|
|
}
|
|
]
|
|
----------------------------------
|
|
|
|
[[ml-configuring-transform4]]
|
|
.Example 4: Converting strings to lowercase
|
|
|
|
[source,console]
|
|
--------------------------------------------------
|
|
POST _ml/datafeeds/datafeed-test2/_update
|
|
{
|
|
"runtime_mappings": {
|
|
"my_runtime_field": {
|
|
"type": "keyword",
|
|
"script": {
|
|
"source": "emit(doc['some_field'].value.toLowerCase())" <1>
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
GET _ml/datafeeds/datafeed-test2/_preview
|
|
--------------------------------------------------
|
|
// TEST[skip:continued]
|
|
|
|
<1> This runtime field uses the `toLowerCase()` function to convert a string to
|
|
all lowercase letters. Likewise, you can use the `toUpperCase()` function to
|
|
convert a string to uppercase letters.
|
|
|
|
The preview {dfeed} API returns the following results, which show that "JOE"
|
|
has been converted to "joe":
|
|
|
|
[source,js]
|
|
----------------------------------
|
|
[
|
|
{
|
|
"@timestamp": 1490274000000,
|
|
"my_runtime_field": "joe"
|
|
}
|
|
]
|
|
----------------------------------
|
|
|
|
[[ml-configuring-transform5]]
|
|
.Example 5: Converting strings to mixed case formats
|
|
|
|
[source,console]
|
|
--------------------------------------------------
|
|
POST _ml/datafeeds/datafeed-test2/_update
|
|
{
|
|
"runtime_mappings": {
|
|
"my_runtime_field": {
|
|
"type": "keyword",
|
|
"script": {
|
|
"source": "emit(doc['some_field'].value.substring(0, 1).toUpperCase() + doc['some_field'].value.substring(1).toLowerCase())" <1>
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
GET _ml/datafeeds/datafeed-test2/_preview
|
|
--------------------------------------------------
|
|
// TEST[skip:continued]
|
|
|
|
<1> This runtime field is a more complicated example of case manipulation. It
|
|
uses the `substring()` function to capitalize the first letter of a string and
|
|
converts the remaining characters to lowercase.
|
|
|
|
The preview {dfeed} API returns the following results, which show that "JOE" has
|
|
been converted to "Joe":
|
|
|
|
[source,js]
|
|
----------------------------------
|
|
[
|
|
{
|
|
"@timestamp": 1490274000000,
|
|
"my_runtime_field": "Joe"
|
|
}
|
|
]
|
|
----------------------------------
|
|
|
|
[[ml-configuring-transform6]]
|
|
.Example 6: Replacing tokens
|
|
|
|
[source,console]
|
|
--------------------------------------------------
|
|
POST _ml/datafeeds/datafeed-test2/_update
|
|
{
|
|
"runtime_mappings": {
|
|
"my_runtime_field": {
|
|
"type": "keyword",
|
|
"script": {
|
|
"source": "emit(/\\s/.matcher(doc['tokenstring2'].value).replaceAll('_'))" <1>
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
GET _ml/datafeeds/datafeed-test2/_preview
|
|
--------------------------------------------------
|
|
// TEST[skip:continued]
|
|
|
|
<1> This script uses regular expressions to replace white space with
|
|
underscores.
|
|
|
|
The preview {dfeed} API returns the following results, which show that "foo bar
|
|
baz" has been converted to "foo_bar_baz":
|
|
|
|
[source,js]
|
|
----------------------------------
|
|
[
|
|
{
|
|
"@timestamp": 1490274000000,
|
|
"my_runtime_field": "foo_bar_baz"
|
|
}
|
|
]
|
|
----------------------------------
|
|
|
|
[[ml-configuring-transform7]]
|
|
.Example 7: Regular expression matching and concatenation
|
|
|
|
[source,console]
|
|
--------------------------------------------------
|
|
POST _ml/datafeeds/datafeed-test2/_update
|
|
{
|
|
"runtime_mappings": {
|
|
"my_runtime_field": {
|
|
"type": "keyword",
|
|
"script": {
|
|
"source": "def m = /(.*)-bar-([0-9][0-9])/.matcher(doc['tokenstring3'].value); emit(m.find() ? m.group(1) + '_' + m.group(2) : '');" <1>
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
GET _ml/datafeeds/datafeed-test2/_preview
|
|
--------------------------------------------------
|
|
// TEST[skip:continued]
|
|
|
|
<1> This script looks for a specific regular expression pattern and emits the
|
|
matched groups as a concatenated string. If no match is found, it emits an empty
|
|
string.
|
|
|
|
The preview {dfeed} API returns the following results, which show that
|
|
"foo-bar-19" has been converted to "foo_19":
|
|
|
|
[source,js]
|
|
----------------------------------
|
|
[
|
|
{
|
|
"@timestamp": 1490274000000,
|
|
"my_runtime_field": "foo_19"
|
|
}
|
|
]
|
|
----------------------------------
|
|
|
|
|
|
[[ml-configuring-transform8]]
|
|
.Example 8: Transforming geopoint data
|
|
|
|
[source,console]
|
|
--------------------------------------------------
|
|
PUT _ml/anomaly_detectors/test3
|
|
{
|
|
"analysis_config":{
|
|
"bucket_span": "10m",
|
|
"detectors":[
|
|
{
|
|
"function":"lat_long",
|
|
"field_name": "my_coordinates"
|
|
}
|
|
]
|
|
},
|
|
"data_description": {
|
|
"time_field":"@timestamp"
|
|
},
|
|
"datafeed_config":{
|
|
"datafeed_id": "datafeed-test3",
|
|
"indices": ["my-index-000001"],
|
|
"runtime_mappings": {
|
|
"my_coordinates": {
|
|
"type": "keyword",
|
|
"script": {
|
|
"source": "emit(doc['coords.lat'].value + ',' + doc['coords.lon'].value)"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
GET _ml/datafeeds/datafeed-test3/_preview
|
|
--------------------------------------------------
|
|
// TEST[skip:needs-licence]
|
|
|
|
In {es}, location data can be stored in `geo_point` fields but this data type is
|
|
not supported natively in {ml} analytics. This example of a runtime field
|
|
transforms the data into an appropriate format. For more information,
|
|
see <<ml-geo-functions>>.
|
|
|
|
The preview {dfeed} API returns the following results, which show that
|
|
`41.44` and `90.5` have been combined into "41.44,90.5":
|
|
|
|
[source,js]
|
|
----------------------------------
|
|
[
|
|
{
|
|
"@timestamp": 1490274000000,
|
|
"my_coordinates": "41.44,90.5"
|
|
}
|
|
]
|
|
----------------------------------
|
|
|
|
////
|
|
|
|
[[ml-configuring-transform9]]
|
|
.Example 9: Splitting strings by domain name
|
|
|
|
[source,console]
|
|
--------------------------------------------------
|
|
PUT _ml/anomaly_detectors/test4
|
|
{
|
|
"description":"DNS tunneling",
|
|
"analysis_config":{
|
|
"bucket_span": "30m",
|
|
"influencers": ["clientip","hrd"],
|
|
"detectors":[
|
|
{
|
|
"function":"high_info_content",
|
|
"field_name": "sub",
|
|
"over_field_name": "hrd",
|
|
"exclude_frequent":"all"
|
|
}
|
|
]
|
|
},
|
|
"data_description": {
|
|
"time_field":"@timestamp"
|
|
},
|
|
"datafeed_config":{
|
|
"datafeed_id": "datafeed-test4",
|
|
"indices": ["my-index-000001"],
|
|
"script_fields":{
|
|
"sub":{
|
|
"script":"return domainSplit(doc['query'].value).get(0);"
|
|
},
|
|
"hrd":{
|
|
"script":"return domainSplit(doc['query'].value).get(1);"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
GET _ml/datafeeds/datafeed-test4/_preview
|
|
--------------------------------------------------
|
|
// TEST[skip:needs-licence]
|
|
|
|
If you have a single field that contains a well-formed DNS domain name, you can
|
|
use the `domainSplit()` function to split the string into its highest registered
|
|
domain and the sub-domain, which is everything to the left of the highest
|
|
registered domain. For example, the highest registered domain of
|
|
`www.ml.elastic.co` is `elastic.co` and the sub-domain is `www.ml`. The
|
|
`domainSplit()` function returns an array of two values: the first value is the
|
|
subdomain; the second value is the highest registered domain.
|
|
|
|
The preview {dfeed} API returns the following results, which show that
|
|
"www.ml.elastic.co" has been split into "elastic.co" and "www.ml":
|
|
|
|
[source,js]
|
|
----------------------------------
|
|
[
|
|
{
|
|
"@timestamp": 1490274000000,
|
|
"clientip.keyword": "123.456.78.900",
|
|
"hrd": "elastic.co",
|
|
"sub": "www.ml"
|
|
}
|
|
]
|
|
----------------------------------
|
|
|
|
//// |