mirror of
https://github.com/elastic/elasticsearch.git
synced 2025-04-25 07:37:19 -04:00
* Remove `es-test-dir` book-scoped variable * Remove `plugins-examples-dir` book-scoped variable * Remove `:dependencies-dir:` and `:xes-repo-dir:` book-scoped variables - In `index.asciidoc`, two variables (`:dependencies-dir:` and `:xes-repo-dir:`) were removed. - In `sql/index.asciidoc`, the `:sql-tests:` path was updated to fuller path - In `esql/index.asciidoc`, the `:esql-tests:` path was updated idem * Replace `es-repo-dir` with `es-ref-dir` * Move `:include-xpack: true` to few files that use it, remove from index.asciidoc
1807 lines
47 KiB
Text
1807 lines
47 KiB
Text
[role="xpack"]
|
|
[[find-structure]]
|
|
= Find text structure API
|
|
|
|
Finds the structure of text. The text must
|
|
contain data that is suitable to be ingested into the
|
|
{stack}.
|
|
|
|
[discrete]
|
|
[[find-structure-request]]
|
|
== {api-request-title}
|
|
|
|
`POST _text_structure/find_structure`
|
|
|
|
[discrete]
|
|
[[find-structure-prereqs]]
|
|
== {api-prereq-title}
|
|
|
|
* If the {es} {security-features} are enabled, you must have `monitor_text_structure` or
|
|
`monitor` cluster privileges to use this API. See
|
|
<<security-privileges>>.
|
|
|
|
[discrete]
|
|
[[find-structure-desc]]
|
|
== {api-description-title}
|
|
|
|
This API provides a starting point for ingesting data into {es} in a format that
|
|
is suitable for subsequent use with other {stack} functionality.
|
|
|
|
Unlike other {es} endpoints, the data that is posted to this endpoint does not
|
|
need to be UTF-8 encoded and in JSON format. It must, however, be text; binary
|
|
formats are not currently supported.
|
|
|
|
The response from the API contains:
|
|
|
|
* A couple of messages from the beginning of the text.
|
|
* Statistics that reveal the most common values for all fields detected within
|
|
the text and basic numeric statistics for numeric fields.
|
|
* Information about the structure of the text, which is useful when you write
|
|
ingest configurations to index it or similarly formatted text.
|
|
* Appropriate mappings for an {es} index, which you could use to ingest the text.
|
|
|
|
All this information can be calculated by the structure finder with no guidance.
|
|
However, you can optionally override some of the decisions about the text
|
|
structure by specifying one or more query parameters.
|
|
|
|
Details of the output can be seen in the <<find-structure-examples,examples>>.
|
|
|
|
If the structure finder produces unexpected results for some text,
|
|
specify the `explain` query parameter. It causes an `explanation` to appear in
|
|
the response, which should help in determining why the returned structure was
|
|
chosen.
|
|
|
|
[discrete]
|
|
[[find-structure-query-parms]]
|
|
== {api-query-parms-title}
|
|
|
|
include::{es-ref-dir}/text-structure/apis/find-structure-shared.asciidoc[tag=param-charset]
|
|
include::{es-ref-dir}/text-structure/apis/find-structure-shared.asciidoc[tag=param-column-names]
|
|
include::{es-ref-dir}/text-structure/apis/find-structure-shared.asciidoc[tag=param-delimiter]
|
|
include::{es-ref-dir}/text-structure/apis/find-structure-shared.asciidoc[tag=param-explain]
|
|
include::{es-ref-dir}/text-structure/apis/find-structure-shared.asciidoc[tag=param-format]
|
|
include::{es-ref-dir}/text-structure/apis/find-structure-shared.asciidoc[tag=param-grok-pattern]
|
|
include::{es-ref-dir}/text-structure/apis/find-structure-shared.asciidoc[tag=param-ecs-compatibility]
|
|
include::{es-ref-dir}/text-structure/apis/find-structure-shared.asciidoc[tag=param-has-header-row]
|
|
include::{es-ref-dir}/text-structure/apis/find-structure-shared.asciidoc[tag=param-line-merge-size-limit]
|
|
include::{es-ref-dir}/text-structure/apis/find-structure-shared.asciidoc[tag=param-lines-to-sample]
|
|
include::{es-ref-dir}/text-structure/apis/find-structure-shared.asciidoc[tag=param-quote]
|
|
include::{es-ref-dir}/text-structure/apis/find-structure-shared.asciidoc[tag=param-should-trim-fields]
|
|
include::{es-ref-dir}/text-structure/apis/find-structure-shared.asciidoc[tag=param-timeout]
|
|
include::{es-ref-dir}/text-structure/apis/find-structure-shared.asciidoc[tag=param-timestamp-field]
|
|
include::{es-ref-dir}/text-structure/apis/find-structure-shared.asciidoc[tag=param-timestamp-format]
|
|
|
|
[discrete]
|
|
[[find-structure-request-body]]
|
|
== {api-request-body-title}
|
|
|
|
The text that you want to analyze. It must contain data that is suitable to
|
|
be ingested into {es}. It does not need to be in JSON format and it does not
|
|
need to be UTF-8 encoded. The size is limited to the {es} HTTP receive buffer
|
|
size, which defaults to 100 MB.
|
|
|
|
[discrete]
|
|
[[find-structure-examples]]
|
|
== {api-examples-title}
|
|
|
|
[discrete]
|
|
[[find-structure-example-nld-json]]
|
|
=== Ingesting newline-delimited JSON
|
|
|
|
Suppose you have newline-delimited JSON text that contains information about
|
|
some books. You can send the contents to the `find_structure` endpoint:
|
|
|
|
[source,console]
|
|
----
|
|
POST _text_structure/find_structure
|
|
{"name": "Leviathan Wakes", "author": "James S.A. Corey", "release_date": "2011-06-02", "page_count": 561}
|
|
{"name": "Hyperion", "author": "Dan Simmons", "release_date": "1989-05-26", "page_count": 482}
|
|
{"name": "Dune", "author": "Frank Herbert", "release_date": "1965-06-01", "page_count": 604}
|
|
{"name": "Dune Messiah", "author": "Frank Herbert", "release_date": "1969-10-15", "page_count": 331}
|
|
{"name": "Children of Dune", "author": "Frank Herbert", "release_date": "1976-04-21", "page_count": 408}
|
|
{"name": "God Emperor of Dune", "author": "Frank Herbert", "release_date": "1981-05-28", "page_count": 454}
|
|
{"name": "Consider Phlebas", "author": "Iain M. Banks", "release_date": "1987-04-23", "page_count": 471}
|
|
{"name": "Pandora's Star", "author": "Peter F. Hamilton", "release_date": "2004-03-02", "page_count": 768}
|
|
{"name": "Revelation Space", "author": "Alastair Reynolds", "release_date": "2000-03-15", "page_count": 585}
|
|
{"name": "A Fire Upon the Deep", "author": "Vernor Vinge", "release_date": "1992-06-01", "page_count": 613}
|
|
{"name": "Ender's Game", "author": "Orson Scott Card", "release_date": "1985-06-01", "page_count": 324}
|
|
{"name": "1984", "author": "George Orwell", "release_date": "1985-06-01", "page_count": 328}
|
|
{"name": "Fahrenheit 451", "author": "Ray Bradbury", "release_date": "1953-10-15", "page_count": 227}
|
|
{"name": "Brave New World", "author": "Aldous Huxley", "release_date": "1932-06-01", "page_count": 268}
|
|
{"name": "Foundation", "author": "Isaac Asimov", "release_date": "1951-06-01", "page_count": 224}
|
|
{"name": "The Giver", "author": "Lois Lowry", "release_date": "1993-04-26", "page_count": 208}
|
|
{"name": "Slaughterhouse-Five", "author": "Kurt Vonnegut", "release_date": "1969-06-01", "page_count": 275}
|
|
{"name": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams", "release_date": "1979-10-12", "page_count": 180}
|
|
{"name": "Snow Crash", "author": "Neal Stephenson", "release_date": "1992-06-01", "page_count": 470}
|
|
{"name": "Neuromancer", "author": "William Gibson", "release_date": "1984-07-01", "page_count": 271}
|
|
{"name": "The Handmaid's Tale", "author": "Margaret Atwood", "release_date": "1985-06-01", "page_count": 311}
|
|
{"name": "Starship Troopers", "author": "Robert A. Heinlein", "release_date": "1959-12-01", "page_count": 335}
|
|
{"name": "The Left Hand of Darkness", "author": "Ursula K. Le Guin", "release_date": "1969-06-01", "page_count": 304}
|
|
{"name": "The Moon is a Harsh Mistress", "author": "Robert A. Heinlein", "release_date": "1966-04-01", "page_count": 288}
|
|
----
|
|
// TEST
|
|
|
|
If the request does not encounter errors, you receive the following result:
|
|
|
|
[source,console-result]
|
|
----
|
|
{
|
|
"num_lines_analyzed" : 24, <1>
|
|
"num_messages_analyzed" : 24, <2>
|
|
"sample_start" : "{\"name\": \"Leviathan Wakes\", \"author\": \"James S.A. Corey\", \"release_date\": \"2011-06-02\", \"page_count\": 561}\n{\"name\": \"Hyperion\", \"author\": \"Dan Simmons\", \"release_date\": \"1989-05-26\", \"page_count\": 482}\n", <3>
|
|
"charset" : "UTF-8", <4>
|
|
"has_byte_order_marker" : false, <5>
|
|
"format" : "ndjson", <6>
|
|
"ecs_compatibility" : "disabled", <7>
|
|
"timestamp_field" : "release_date", <8>
|
|
"joda_timestamp_formats" : [ <9>
|
|
"ISO8601"
|
|
],
|
|
"java_timestamp_formats" : [ <10>
|
|
"ISO8601"
|
|
],
|
|
"need_client_timezone" : true, <11>
|
|
"mappings" : { <12>
|
|
"properties" : {
|
|
"@timestamp" : {
|
|
"type" : "date"
|
|
},
|
|
"author" : {
|
|
"type" : "keyword"
|
|
},
|
|
"name" : {
|
|
"type" : "keyword"
|
|
},
|
|
"page_count" : {
|
|
"type" : "long"
|
|
},
|
|
"release_date" : {
|
|
"type" : "date",
|
|
"format" : "iso8601"
|
|
}
|
|
}
|
|
},
|
|
"ingest_pipeline" : {
|
|
"description" : "Ingest pipeline created by text structure finder",
|
|
"processors" : [
|
|
{
|
|
"date" : {
|
|
"field" : "release_date",
|
|
"timezone" : "{{ event.timezone }}",
|
|
"formats" : [
|
|
"ISO8601"
|
|
]
|
|
}
|
|
}
|
|
]
|
|
},
|
|
"field_stats" : { <13>
|
|
"author" : {
|
|
"count" : 24,
|
|
"cardinality" : 20,
|
|
"top_hits" : [
|
|
{
|
|
"value" : "Frank Herbert",
|
|
"count" : 4
|
|
},
|
|
{
|
|
"value" : "Robert A. Heinlein",
|
|
"count" : 2
|
|
},
|
|
{
|
|
"value" : "Alastair Reynolds",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "Aldous Huxley",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "Dan Simmons",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "Douglas Adams",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "George Orwell",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "Iain M. Banks",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "Isaac Asimov",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "James S.A. Corey",
|
|
"count" : 1
|
|
}
|
|
]
|
|
},
|
|
"name" : {
|
|
"count" : 24,
|
|
"cardinality" : 24,
|
|
"top_hits" : [
|
|
{
|
|
"value" : "1984",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "A Fire Upon the Deep",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "Brave New World",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "Children of Dune",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "Consider Phlebas",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "Dune",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "Dune Messiah",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "Ender's Game",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "Fahrenheit 451",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "Foundation",
|
|
"count" : 1
|
|
}
|
|
]
|
|
},
|
|
"page_count" : {
|
|
"count" : 24,
|
|
"cardinality" : 24,
|
|
"min_value" : 180,
|
|
"max_value" : 768,
|
|
"mean_value" : 387.0833333333333,
|
|
"median_value" : 329.5,
|
|
"top_hits" : [
|
|
{
|
|
"value" : 180,
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : 208,
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : 224,
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : 227,
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : 268,
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : 271,
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : 275,
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : 288,
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : 304,
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : 311,
|
|
"count" : 1
|
|
}
|
|
]
|
|
},
|
|
"release_date" : {
|
|
"count" : 24,
|
|
"cardinality" : 20,
|
|
"earliest" : "1932-06-01",
|
|
"latest" : "2011-06-02",
|
|
"top_hits" : [
|
|
{
|
|
"value" : "1985-06-01",
|
|
"count" : 3
|
|
},
|
|
{
|
|
"value" : "1969-06-01",
|
|
"count" : 2
|
|
},
|
|
{
|
|
"value" : "1992-06-01",
|
|
"count" : 2
|
|
},
|
|
{
|
|
"value" : "1932-06-01",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "1951-06-01",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "1953-10-15",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "1959-12-01",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "1965-06-01",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "1966-04-01",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "1969-10-15",
|
|
"count" : 1
|
|
}
|
|
]
|
|
}
|
|
}
|
|
}
|
|
----
|
|
// TESTRESPONSE[s/"sample_start" : ".*",/"sample_start" : "$body.sample_start",/]
|
|
// The substitution is because the text is pre-processed by the test harness,
|
|
// so the fields may get reordered in the JSON the endpoint sees
|
|
|
|
<1> `num_lines_analyzed` indicates how many lines of the text were analyzed.
|
|
<2> `num_messages_analyzed` indicates how many distinct messages the lines
|
|
contained. For NDJSON, this value is the same as `num_lines_analyzed`. For other
|
|
text formats, messages can span several lines.
|
|
<3> `sample_start` reproduces the first two messages in the text verbatim. This
|
|
may help diagnose parse errors or accidental uploads of the wrong text.
|
|
<4> `charset` indicates the character encoding used to parse the text.
|
|
<5> For UTF character encodings, `has_byte_order_marker` indicates whether the
|
|
text begins with a byte order marker.
|
|
<6> `format` is one of `ndjson`, `xml`, `delimited` or `semi_structured_text`.
|
|
<7> `ecs_compatibility` is either `disabled` or `v1`; it defaults to `disabled`.
|
|
<8> The `timestamp_field` names the field considered most likely to be the
|
|
primary timestamp of each document.
|
|
<9> `joda_timestamp_formats` are used to tell {ls} how to parse timestamps.
|
|
<10> `java_timestamp_formats` are the Java time formats recognized in the time
|
|
fields. {es} mappings and ingest pipelines use this format.
|
|
<11> If a timestamp format is detected that does not include a timezone,
|
|
`need_client_timezone` will be `true`. The server that parses the text must
|
|
therefore be told the correct timezone by the client.
|
|
<12> `mappings` contains some suitable mappings for an index into which the data
|
|
could be ingested. In this case, the `release_date` field has been given the
|
|
`date` type with the `iso8601` format, because its values were recognized as
|
|
ISO 8601 dates.
|
|
<13> `field_stats` contains the most common values of each field, plus basic
|
|
numeric statistics for the numeric `page_count` field. This information may
|
|
provide clues that the data needs to be cleaned or transformed prior to use by
|
|
other {stack} functionality.
|
|
|
|
[discrete]
|
|
[[find-structure-example-nyc]]
|
|
=== Finding the structure of NYC yellow cab example data
|
|
|
|
The next example shows how it's possible to find the structure of some New York
|
|
City yellow cab trip data. The first `curl` command downloads the data, the
|
|
first 20000 lines of which are then piped into the `find_structure`
|
|
endpoint. The `lines_to_sample` query parameter of the endpoint is set to 20000
|
|
to match what is specified in the `head` command.
|
|
|
|
[source,js]
|
|
----
|
|
curl -s "s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2018-06.csv" | head -20000 | curl -s -H "Content-Type: application/json" -XPOST "localhost:9200/_text_structure/find_structure?pretty&lines_to_sample=20000" -T -
|
|
----
|
|
// NOTCONSOLE
|
|
// Not converting to console because this shows how curl can be used
|
|
|
|
--
|
|
NOTE: The `Content-Type: application/json` header must be set even though in
|
|
this case the data is not JSON. (Alternatively the `Content-Type` can be set
|
|
to any other content type supported by {es}, but it must be set.)
|
|
|
|
--
|
|
|
|
If the request does not encounter errors, you receive the following result:
|
|
|
|
[source,js]
|
|
----
|
|
{
|
|
"num_lines_analyzed" : 20000,
|
|
"num_messages_analyzed" : 19998, <1>
|
|
"sample_start" : "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount\n\n1,2018-06-01 00:15:40,2018-06-01 00:16:46,1,.00,1,N,145,145,2,3,0.5,0.5,0,0,0.3,4.3\n",
|
|
"charset" : "UTF-8",
|
|
"has_byte_order_marker" : false,
|
|
"format" : "delimited", <2>
|
|
"multiline_start_pattern" : "^.*?,\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}",
|
|
"exclude_lines_pattern" : "^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?,\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?,\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?",
|
|
"column_names" : [ <3>
|
|
"VendorID",
|
|
"tpep_pickup_datetime",
|
|
"tpep_dropoff_datetime",
|
|
"passenger_count",
|
|
"trip_distance",
|
|
"RatecodeID",
|
|
"store_and_fwd_flag",
|
|
"PULocationID",
|
|
"DOLocationID",
|
|
"payment_type",
|
|
"fare_amount",
|
|
"extra",
|
|
"mta_tax",
|
|
"tip_amount",
|
|
"tolls_amount",
|
|
"improvement_surcharge",
|
|
"total_amount"
|
|
],
|
|
"has_header_row" : true, <4>
|
|
"delimiter" : ",", <5>
|
|
"quote" : "\"", <6>
|
|
"timestamp_field" : "tpep_pickup_datetime", <7>
|
|
"joda_timestamp_formats" : [ <8>
|
|
"YYYY-MM-dd HH:mm:ss"
|
|
],
|
|
"java_timestamp_formats" : [ <9>
|
|
"yyyy-MM-dd HH:mm:ss"
|
|
],
|
|
"need_client_timezone" : true, <10>
|
|
"mappings" : {
|
|
"properties" : {
|
|
"@timestamp" : {
|
|
"type" : "date"
|
|
},
|
|
"DOLocationID" : {
|
|
"type" : "long"
|
|
},
|
|
"PULocationID" : {
|
|
"type" : "long"
|
|
},
|
|
"RatecodeID" : {
|
|
"type" : "long"
|
|
},
|
|
"VendorID" : {
|
|
"type" : "long"
|
|
},
|
|
"extra" : {
|
|
"type" : "double"
|
|
},
|
|
"fare_amount" : {
|
|
"type" : "double"
|
|
},
|
|
"improvement_surcharge" : {
|
|
"type" : "double"
|
|
},
|
|
"mta_tax" : {
|
|
"type" : "double"
|
|
},
|
|
"passenger_count" : {
|
|
"type" : "long"
|
|
},
|
|
"payment_type" : {
|
|
"type" : "long"
|
|
},
|
|
"store_and_fwd_flag" : {
|
|
"type" : "keyword"
|
|
},
|
|
"tip_amount" : {
|
|
"type" : "double"
|
|
},
|
|
"tolls_amount" : {
|
|
"type" : "double"
|
|
},
|
|
"total_amount" : {
|
|
"type" : "double"
|
|
},
|
|
"tpep_dropoff_datetime" : {
|
|
"type" : "date",
|
|
"format" : "yyyy-MM-dd HH:mm:ss"
|
|
},
|
|
"tpep_pickup_datetime" : {
|
|
"type" : "date",
|
|
"format" : "yyyy-MM-dd HH:mm:ss"
|
|
},
|
|
"trip_distance" : {
|
|
"type" : "double"
|
|
}
|
|
}
|
|
},
|
|
"ingest_pipeline" : {
|
|
"description" : "Ingest pipeline created by text structure finder",
|
|
"processors" : [
|
|
{
|
|
"csv" : {
|
|
"field" : "message",
|
|
"target_fields" : [
|
|
"VendorID",
|
|
"tpep_pickup_datetime",
|
|
"tpep_dropoff_datetime",
|
|
"passenger_count",
|
|
"trip_distance",
|
|
"RatecodeID",
|
|
"store_and_fwd_flag",
|
|
"PULocationID",
|
|
"DOLocationID",
|
|
"payment_type",
|
|
"fare_amount",
|
|
"extra",
|
|
"mta_tax",
|
|
"tip_amount",
|
|
"tolls_amount",
|
|
"improvement_surcharge",
|
|
"total_amount"
|
|
]
|
|
}
|
|
},
|
|
{
|
|
"date" : {
|
|
"field" : "tpep_pickup_datetime",
|
|
"timezone" : "{{ event.timezone }}",
|
|
"formats" : [
|
|
"yyyy-MM-dd HH:mm:ss"
|
|
]
|
|
}
|
|
},
|
|
{
|
|
"convert" : {
|
|
"field" : "DOLocationID",
|
|
"type" : "long"
|
|
}
|
|
},
|
|
{
|
|
"convert" : {
|
|
"field" : "PULocationID",
|
|
"type" : "long"
|
|
}
|
|
},
|
|
{
|
|
"convert" : {
|
|
"field" : "RatecodeID",
|
|
"type" : "long"
|
|
}
|
|
},
|
|
{
|
|
"convert" : {
|
|
"field" : "VendorID",
|
|
"type" : "long"
|
|
}
|
|
},
|
|
{
|
|
"convert" : {
|
|
"field" : "extra",
|
|
"type" : "double"
|
|
}
|
|
},
|
|
{
|
|
"convert" : {
|
|
"field" : "fare_amount",
|
|
"type" : "double"
|
|
}
|
|
},
|
|
{
|
|
"convert" : {
|
|
"field" : "improvement_surcharge",
|
|
"type" : "double"
|
|
}
|
|
},
|
|
{
|
|
"convert" : {
|
|
"field" : "mta_tax",
|
|
"type" : "double"
|
|
}
|
|
},
|
|
{
|
|
"convert" : {
|
|
"field" : "passenger_count",
|
|
"type" : "long"
|
|
}
|
|
},
|
|
{
|
|
"convert" : {
|
|
"field" : "payment_type",
|
|
"type" : "long"
|
|
}
|
|
},
|
|
{
|
|
"convert" : {
|
|
"field" : "tip_amount",
|
|
"type" : "double"
|
|
}
|
|
},
|
|
{
|
|
"convert" : {
|
|
"field" : "tolls_amount",
|
|
"type" : "double"
|
|
}
|
|
},
|
|
{
|
|
"convert" : {
|
|
"field" : "total_amount",
|
|
"type" : "double"
|
|
}
|
|
},
|
|
{
|
|
"convert" : {
|
|
"field" : "trip_distance",
|
|
"type" : "double"
|
|
}
|
|
},
|
|
{
|
|
"remove" : {
|
|
"field" : "message"
|
|
}
|
|
}
|
|
]
|
|
},
|
|
"field_stats" : {
|
|
"DOLocationID" : {
|
|
"count" : 19998,
|
|
"cardinality" : 240,
|
|
"min_value" : 1,
|
|
"max_value" : 265,
|
|
"mean_value" : 150.26532653265312,
|
|
"median_value" : 148,
|
|
"top_hits" : [
|
|
{
|
|
"value" : 79,
|
|
"count" : 760
|
|
},
|
|
{
|
|
"value" : 48,
|
|
"count" : 683
|
|
},
|
|
{
|
|
"value" : 68,
|
|
"count" : 529
|
|
},
|
|
{
|
|
"value" : 170,
|
|
"count" : 506
|
|
},
|
|
{
|
|
"value" : 107,
|
|
"count" : 468
|
|
},
|
|
{
|
|
"value" : 249,
|
|
"count" : 457
|
|
},
|
|
{
|
|
"value" : 230,
|
|
"count" : 441
|
|
},
|
|
{
|
|
"value" : 186,
|
|
"count" : 432
|
|
},
|
|
{
|
|
"value" : 141,
|
|
"count" : 409
|
|
},
|
|
{
|
|
"value" : 263,
|
|
"count" : 386
|
|
}
|
|
]
|
|
},
|
|
"PULocationID" : {
|
|
"count" : 19998,
|
|
"cardinality" : 154,
|
|
"min_value" : 1,
|
|
"max_value" : 265,
|
|
"mean_value" : 153.4042404240424,
|
|
"median_value" : 148,
|
|
"top_hits" : [
|
|
{
|
|
"value" : 79,
|
|
"count" : 1067
|
|
},
|
|
{
|
|
"value" : 230,
|
|
"count" : 949
|
|
},
|
|
{
|
|
"value" : 148,
|
|
"count" : 940
|
|
},
|
|
{
|
|
"value" : 132,
|
|
"count" : 897
|
|
},
|
|
{
|
|
"value" : 48,
|
|
"count" : 853
|
|
},
|
|
{
|
|
"value" : 161,
|
|
"count" : 820
|
|
},
|
|
{
|
|
"value" : 234,
|
|
"count" : 750
|
|
},
|
|
{
|
|
"value" : 249,
|
|
"count" : 722
|
|
},
|
|
{
|
|
"value" : 164,
|
|
"count" : 663
|
|
},
|
|
{
|
|
"value" : 114,
|
|
"count" : 646
|
|
}
|
|
]
|
|
},
|
|
"RatecodeID" : {
|
|
"count" : 19998,
|
|
"cardinality" : 5,
|
|
"min_value" : 1,
|
|
"max_value" : 5,
|
|
"mean_value" : 1.0656565656565653,
|
|
"median_value" : 1,
|
|
"top_hits" : [
|
|
{
|
|
"value" : 1,
|
|
"count" : 19311
|
|
},
|
|
{
|
|
"value" : 2,
|
|
"count" : 468
|
|
},
|
|
{
|
|
"value" : 5,
|
|
"count" : 195
|
|
},
|
|
{
|
|
"value" : 4,
|
|
"count" : 17
|
|
},
|
|
{
|
|
"value" : 3,
|
|
"count" : 7
|
|
}
|
|
]
|
|
},
|
|
"VendorID" : {
|
|
"count" : 19998,
|
|
"cardinality" : 2,
|
|
"min_value" : 1,
|
|
"max_value" : 2,
|
|
"mean_value" : 1.59005900590059,
|
|
"median_value" : 2,
|
|
"top_hits" : [
|
|
{
|
|
"value" : 2,
|
|
"count" : 11800
|
|
},
|
|
{
|
|
"value" : 1,
|
|
"count" : 8198
|
|
}
|
|
]
|
|
},
|
|
"extra" : {
|
|
"count" : 19998,
|
|
"cardinality" : 3,
|
|
"min_value" : -0.5,
|
|
"max_value" : 0.5,
|
|
"mean_value" : 0.4815981598159816,
|
|
"median_value" : 0.5,
|
|
"top_hits" : [
|
|
{
|
|
"value" : 0.5,
|
|
"count" : 19281
|
|
},
|
|
{
|
|
"value" : 0,
|
|
"count" : 698
|
|
},
|
|
{
|
|
"value" : -0.5,
|
|
"count" : 19
|
|
}
|
|
]
|
|
},
|
|
"fare_amount" : {
|
|
"count" : 19998,
|
|
"cardinality" : 208,
|
|
"min_value" : -100,
|
|
"max_value" : 300,
|
|
"mean_value" : 13.937719771977209,
|
|
"median_value" : 9.5,
|
|
"top_hits" : [
|
|
{
|
|
"value" : 6,
|
|
"count" : 1004
|
|
},
|
|
{
|
|
"value" : 6.5,
|
|
"count" : 935
|
|
},
|
|
{
|
|
"value" : 5.5,
|
|
"count" : 909
|
|
},
|
|
{
|
|
"value" : 7,
|
|
"count" : 903
|
|
},
|
|
{
|
|
"value" : 5,
|
|
"count" : 889
|
|
},
|
|
{
|
|
"value" : 7.5,
|
|
"count" : 854
|
|
},
|
|
{
|
|
"value" : 4.5,
|
|
"count" : 802
|
|
},
|
|
{
|
|
"value" : 8.5,
|
|
"count" : 790
|
|
},
|
|
{
|
|
"value" : 8,
|
|
"count" : 789
|
|
},
|
|
{
|
|
"value" : 9,
|
|
"count" : 711
|
|
}
|
|
]
|
|
},
|
|
"improvement_surcharge" : {
|
|
"count" : 19998,
|
|
"cardinality" : 3,
|
|
"min_value" : -0.3,
|
|
"max_value" : 0.3,
|
|
"mean_value" : 0.29915991599159913,
|
|
"median_value" : 0.3,
|
|
"top_hits" : [
|
|
{
|
|
"value" : 0.3,
|
|
"count" : 19964
|
|
},
|
|
{
|
|
"value" : -0.3,
|
|
"count" : 22
|
|
},
|
|
{
|
|
"value" : 0,
|
|
"count" : 12
|
|
}
|
|
]
|
|
},
|
|
"mta_tax" : {
|
|
"count" : 19998,
|
|
"cardinality" : 3,
|
|
"min_value" : -0.5,
|
|
"max_value" : 0.5,
|
|
"mean_value" : 0.4962246224622462,
|
|
"median_value" : 0.5,
|
|
"top_hits" : [
|
|
{
|
|
"value" : 0.5,
|
|
"count" : 19868
|
|
},
|
|
{
|
|
"value" : 0,
|
|
"count" : 109
|
|
},
|
|
{
|
|
"value" : -0.5,
|
|
"count" : 21
|
|
}
|
|
]
|
|
},
|
|
"passenger_count" : {
|
|
"count" : 19998,
|
|
"cardinality" : 7,
|
|
"min_value" : 0,
|
|
"max_value" : 6,
|
|
"mean_value" : 1.6201620162016201,
|
|
"median_value" : 1,
|
|
"top_hits" : [
|
|
{
|
|
"value" : 1,
|
|
"count" : 14219
|
|
},
|
|
{
|
|
"value" : 2,
|
|
"count" : 2886
|
|
},
|
|
{
|
|
"value" : 5,
|
|
"count" : 1047
|
|
},
|
|
{
|
|
"value" : 3,
|
|
"count" : 804
|
|
},
|
|
{
|
|
"value" : 6,
|
|
"count" : 523
|
|
},
|
|
{
|
|
"value" : 4,
|
|
"count" : 406
|
|
},
|
|
{
|
|
"value" : 0,
|
|
"count" : 113
|
|
}
|
|
]
|
|
},
|
|
"payment_type" : {
|
|
"count" : 19998,
|
|
"cardinality" : 4,
|
|
"min_value" : 1,
|
|
"max_value" : 4,
|
|
"mean_value" : 1.315631563156316,
|
|
"median_value" : 1,
|
|
"top_hits" : [
|
|
{
|
|
"value" : 1,
|
|
"count" : 13936
|
|
},
|
|
{
|
|
"value" : 2,
|
|
"count" : 5857
|
|
},
|
|
{
|
|
"value" : 3,
|
|
"count" : 160
|
|
},
|
|
{
|
|
"value" : 4,
|
|
"count" : 45
|
|
}
|
|
]
|
|
},
|
|
"store_and_fwd_flag" : {
|
|
"count" : 19998,
|
|
"cardinality" : 2,
|
|
"top_hits" : [
|
|
{
|
|
"value" : "N",
|
|
"count" : 19910
|
|
},
|
|
{
|
|
"value" : "Y",
|
|
"count" : 88
|
|
}
|
|
]
|
|
},
|
|
"tip_amount" : {
|
|
"count" : 19998,
|
|
"cardinality" : 717,
|
|
"min_value" : 0,
|
|
"max_value" : 128,
|
|
"mean_value" : 2.010959095909593,
|
|
"median_value" : 1.45,
|
|
"top_hits" : [
|
|
{
|
|
"value" : 0,
|
|
"count" : 6917
|
|
},
|
|
{
|
|
"value" : 1,
|
|
"count" : 1178
|
|
},
|
|
{
|
|
"value" : 2,
|
|
"count" : 624
|
|
},
|
|
{
|
|
"value" : 3,
|
|
"count" : 248
|
|
},
|
|
{
|
|
"value" : 1.56,
|
|
"count" : 206
|
|
},
|
|
{
|
|
"value" : 1.46,
|
|
"count" : 205
|
|
},
|
|
{
|
|
"value" : 1.76,
|
|
"count" : 196
|
|
},
|
|
{
|
|
"value" : 1.45,
|
|
"count" : 195
|
|
},
|
|
{
|
|
"value" : 1.36,
|
|
"count" : 191
|
|
},
|
|
{
|
|
"value" : 1.5,
|
|
"count" : 187
|
|
}
|
|
]
|
|
},
|
|
"tolls_amount" : {
|
|
"count" : 19998,
|
|
"cardinality" : 26,
|
|
"min_value" : 0,
|
|
"max_value" : 35,
|
|
"mean_value" : 0.2729697969796978,
|
|
"median_value" : 0,
|
|
"top_hits" : [
|
|
{
|
|
"value" : 0,
|
|
"count" : 19107
|
|
},
|
|
{
|
|
"value" : 5.76,
|
|
"count" : 791
|
|
},
|
|
{
|
|
"value" : 10.5,
|
|
"count" : 36
|
|
},
|
|
{
|
|
"value" : 2.64,
|
|
"count" : 21
|
|
},
|
|
{
|
|
"value" : 11.52,
|
|
"count" : 8
|
|
},
|
|
{
|
|
"value" : 5.54,
|
|
"count" : 4
|
|
},
|
|
{
|
|
"value" : 8.5,
|
|
"count" : 4
|
|
},
|
|
{
|
|
"value" : 17.28,
|
|
"count" : 4
|
|
},
|
|
{
|
|
"value" : 2,
|
|
"count" : 2
|
|
},
|
|
{
|
|
"value" : 2.16,
|
|
"count" : 2
|
|
}
|
|
]
|
|
},
|
|
"total_amount" : {
|
|
"count" : 19998,
|
|
"cardinality" : 1267,
|
|
"min_value" : -100.3,
|
|
"max_value" : 389.12,
|
|
"mean_value" : 17.499898989898995,
|
|
"median_value" : 12.35,
|
|
"top_hits" : [
|
|
{
|
|
"value" : 7.3,
|
|
"count" : 478
|
|
},
|
|
{
|
|
"value" : 8.3,
|
|
"count" : 443
|
|
},
|
|
{
|
|
"value" : 8.8,
|
|
"count" : 420
|
|
},
|
|
{
|
|
"value" : 6.8,
|
|
"count" : 406
|
|
},
|
|
{
|
|
"value" : 7.8,
|
|
"count" : 405
|
|
},
|
|
{
|
|
"value" : 6.3,
|
|
"count" : 371
|
|
},
|
|
{
|
|
"value" : 9.8,
|
|
"count" : 368
|
|
},
|
|
{
|
|
"value" : 5.8,
|
|
"count" : 362
|
|
},
|
|
{
|
|
"value" : 9.3,
|
|
"count" : 332
|
|
},
|
|
{
|
|
"value" : 10.3,
|
|
"count" : 332
|
|
}
|
|
]
|
|
},
|
|
"tpep_dropoff_datetime" : {
|
|
"count" : 19998,
|
|
"cardinality" : 9066,
|
|
"earliest" : "2018-05-31 06:18:15",
|
|
"latest" : "2018-06-02 02:25:44",
|
|
"top_hits" : [
|
|
{
|
|
"value" : "2018-06-01 01:12:12",
|
|
"count" : 10
|
|
},
|
|
{
|
|
"value" : "2018-06-01 00:32:15",
|
|
"count" : 9
|
|
},
|
|
{
|
|
"value" : "2018-06-01 00:44:27",
|
|
"count" : 9
|
|
},
|
|
{
|
|
"value" : "2018-06-01 00:46:42",
|
|
"count" : 9
|
|
},
|
|
{
|
|
"value" : "2018-06-01 01:03:22",
|
|
"count" : 9
|
|
},
|
|
{
|
|
"value" : "2018-06-01 01:05:13",
|
|
"count" : 9
|
|
},
|
|
{
|
|
"value" : "2018-06-01 00:11:20",
|
|
"count" : 8
|
|
},
|
|
{
|
|
"value" : "2018-06-01 00:16:03",
|
|
"count" : 8
|
|
},
|
|
{
|
|
"value" : "2018-06-01 00:19:47",
|
|
"count" : 8
|
|
},
|
|
{
|
|
"value" : "2018-06-01 00:25:17",
|
|
"count" : 8
|
|
}
|
|
]
|
|
},
|
|
"tpep_pickup_datetime" : {
|
|
"count" : 19998,
|
|
"cardinality" : 8760,
|
|
"earliest" : "2018-05-31 06:08:31",
|
|
"latest" : "2018-06-02 01:21:21",
|
|
"top_hits" : [
|
|
{
|
|
"value" : "2018-06-01 00:01:23",
|
|
"count" : 12
|
|
},
|
|
{
|
|
"value" : "2018-06-01 00:04:31",
|
|
"count" : 10
|
|
},
|
|
{
|
|
"value" : "2018-06-01 00:05:38",
|
|
"count" : 10
|
|
},
|
|
{
|
|
"value" : "2018-06-01 00:09:50",
|
|
"count" : 10
|
|
},
|
|
{
|
|
"value" : "2018-06-01 00:12:01",
|
|
"count" : 10
|
|
},
|
|
{
|
|
"value" : "2018-06-01 00:14:17",
|
|
"count" : 10
|
|
},
|
|
{
|
|
"value" : "2018-06-01 00:00:34",
|
|
"count" : 9
|
|
},
|
|
{
|
|
"value" : "2018-06-01 00:00:40",
|
|
"count" : 9
|
|
},
|
|
{
|
|
"value" : "2018-06-01 00:02:53",
|
|
"count" : 9
|
|
},
|
|
{
|
|
"value" : "2018-06-01 00:05:40",
|
|
"count" : 9
|
|
}
|
|
]
|
|
},
|
|
"trip_distance" : {
|
|
"count" : 19998,
|
|
"cardinality" : 1687,
|
|
"min_value" : 0,
|
|
"max_value" : 64.63,
|
|
"mean_value" : 3.6521062106210715,
|
|
"median_value" : 2.16,
|
|
"top_hits" : [
|
|
{
|
|
"value" : 0.9,
|
|
"count" : 335
|
|
},
|
|
{
|
|
"value" : 0.8,
|
|
"count" : 320
|
|
},
|
|
{
|
|
"value" : 1.1,
|
|
"count" : 316
|
|
},
|
|
{
|
|
"value" : 0.7,
|
|
"count" : 304
|
|
},
|
|
{
|
|
"value" : 1.2,
|
|
"count" : 303
|
|
},
|
|
{
|
|
"value" : 1,
|
|
"count" : 296
|
|
},
|
|
{
|
|
"value" : 1.3,
|
|
"count" : 280
|
|
},
|
|
{
|
|
"value" : 1.5,
|
|
"count" : 268
|
|
},
|
|
{
|
|
"value" : 1.6,
|
|
"count" : 268
|
|
},
|
|
{
|
|
"value" : 0.6,
|
|
"count" : 256
|
|
}
|
|
]
|
|
}
|
|
}
|
|
}
|
|
----
|
|
// NOTCONSOLE
|
|
|
|
<1> `num_messages_analyzed` is 2 lower than `num_lines_analyzed` because only
|
|
data records count as messages. The first line contains the column names and in
|
|
this sample the second line is blank.
|
|
<2> Unlike the first example, in this case the `format` has been identified as
|
|
`delimited`.
|
|
<3> Because the `format` is `delimited`, the `column_names` field in the output
|
|
lists the column names in the order they appear in the sample.
|
|
<4> `has_header_row` indicates that for this sample the column names were in
|
|
the first row of the sample. (If they hadn't been then it would have been a good
|
|
idea to specify them in the `column_names` query parameter.)
|
|
<5> The `delimiter` for this sample is a comma, as it's CSV formatted text.
|
|
<6> The `quote` character is the default double quote. (The structure finder
|
|
does not attempt to deduce any other quote character, so if you have delimited
|
|
text that's quoted with some other character you must specify it using the
|
|
`quote` query parameter.)
|
|
<7> The `timestamp_field` has been chosen to be `tpep_pickup_datetime`.
|
|
`tpep_dropoff_datetime` would work just as well, but `tpep_pickup_datetime` was
|
|
chosen because it comes first in the column order. If you prefer
|
|
`tpep_dropoff_datetime` then force it to be chosen using the
|
|
`timestamp_field` query parameter.
|
|
<8> `joda_timestamp_formats` are used to tell {ls} how to parse timestamps.
|
|
<9> `java_timestamp_formats` are the Java time formats recognized in the time
|
|
fields. {es} mappings and ingest pipelines use this format.
|
|
<10> The timestamp format in this sample doesn't specify a timezone, so to
|
|
accurately convert the timestamps to UTC for storage in {es}, it's necessary to
|
|
supply the timezone they relate to. `need_client_timezone` will be `false` for
|
|
timestamp formats that include the timezone.
|
|
|
|
[discrete]
|
|
[[find-structure-example-timeout]]
|
|
=== Setting the timeout parameter
|
|
|
|
If you try to analyze a lot of data then the analysis will take a long time. If
|
|
you want to limit the amount of processing your {es} cluster performs for a
|
|
request, use the `timeout` query parameter. The analysis will be aborted and an
|
|
error returned when the timeout expires. For example, you can replace 20000
|
|
lines in the previous example with 200000 and set a 1 second timeout on the
|
|
analysis:
|
|
|
|
[source,js]
|
|
----
|
|
curl -s "s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2018-06.csv" | head -200000 | curl -s -H "Content-Type: application/json" -XPOST "localhost:9200/_text_structure/find_structure?pretty&lines_to_sample=200000&timeout=1s" -T -
|
|
----
|
|
// NOTCONSOLE
|
|
// Not converting to console because this shows how curl can be used
|
|
|
|
Unless you are using an incredibly fast computer, you'll receive a timeout error:
|
|
|
|
[source,js]
|
|
----
|
|
{
|
|
"error" : {
|
|
"root_cause" : [
|
|
{
|
|
"type" : "timeout_exception",
|
|
"reason" : "Aborting structure analysis during [delimited record parsing] as it has taken longer than the timeout of [1s]"
|
|
}
|
|
],
|
|
"type" : "timeout_exception",
|
|
"reason" : "Aborting structure analysis during [delimited record parsing] as it has taken longer than the timeout of [1s]"
|
|
},
|
|
"status" : 500
|
|
}
|
|
----
|
|
// NOTCONSOLE
|
|
|
|
--
|
|
NOTE: If you try the example above yourself you will note that the overall
|
|
running time of the `curl` commands is considerably longer than 1 second. This
|
|
is because it takes a while to download 200000 lines of CSV from the internet,
|
|
and the timeout is measured from the time this endpoint starts to process the
|
|
data.
|
|
|
|
--
|
|
|
|
[discrete]
|
|
[[find-structure-example-eslog]]
|
|
=== Analyzing {es} log files
|
|
|
|
This is an example of analyzing an {es} log file:
|
|
|
|
[source,js]
|
|
----
|
|
curl -s -H "Content-Type: application/json" -XPOST \
|
|
"localhost:9200/_text_structure/find_structure?pretty&ecs_compatibility=disabled" -T "$ES_HOME/logs/elasticsearch.log"
|
|
----
|
|
// NOTCONSOLE
|
|
// Not converting to console because this shows how curl can be used
|
|
|
|
If the request does not encounter errors, the result will look something like
|
|
this:
|
|
|
|
[source,js]
|
|
----
|
|
{
|
|
"num_lines_analyzed" : 53,
|
|
"num_messages_analyzed" : 53,
|
|
"sample_start" : "[2018-09-27T14:39:28,518][INFO ][o.e.e.NodeEnvironment ] [node-0] using [1] data paths, mounts [[/ (/dev/disk1)]], net usable_space [165.4gb], net total_space [464.7gb], types [hfs]\n[2018-09-27T14:39:28,521][INFO ][o.e.e.NodeEnvironment ] [node-0] heap size [494.9mb], compressed ordinary object pointers [true]\n",
|
|
"charset" : "UTF-8",
|
|
"has_byte_order_marker" : false,
|
|
"format" : "semi_structured_text", <1>
|
|
"multiline_start_pattern" : "^\\[\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", <2>
|
|
"grok_pattern" : "\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel}.*", <3>
|
|
"ecs_compatibility" : "disabled", <4>
|
|
"timestamp_field" : "timestamp",
|
|
"joda_timestamp_formats" : [
|
|
"ISO8601"
|
|
],
|
|
"java_timestamp_formats" : [
|
|
"ISO8601"
|
|
],
|
|
"need_client_timezone" : true,
|
|
"mappings" : {
|
|
"properties" : {
|
|
"@timestamp" : {
|
|
"type" : "date"
|
|
},
|
|
"loglevel" : {
|
|
"type" : "keyword"
|
|
},
|
|
"message" : {
|
|
"type" : "text"
|
|
}
|
|
}
|
|
},
|
|
"ingest_pipeline" : {
|
|
"description" : "Ingest pipeline created by text structure finder",
|
|
"processors" : [
|
|
{
|
|
"grok" : {
|
|
"field" : "message",
|
|
"patterns" : [
|
|
"\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel}.*"
|
|
]
|
|
}
|
|
},
|
|
{
|
|
"date" : {
|
|
"field" : "timestamp",
|
|
"timezone" : "{{ event.timezone }}",
|
|
"formats" : [
|
|
"ISO8601"
|
|
]
|
|
}
|
|
},
|
|
{
|
|
"remove" : {
|
|
"field" : "timestamp"
|
|
}
|
|
}
|
|
]
|
|
},
|
|
"field_stats" : {
|
|
"loglevel" : {
|
|
"count" : 53,
|
|
"cardinality" : 3,
|
|
"top_hits" : [
|
|
{
|
|
"value" : "INFO",
|
|
"count" : 51
|
|
},
|
|
{
|
|
"value" : "DEBUG",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "WARN",
|
|
"count" : 1
|
|
}
|
|
]
|
|
},
|
|
"timestamp" : {
|
|
"count" : 53,
|
|
"cardinality" : 28,
|
|
"earliest" : "2018-09-27T14:39:28,518",
|
|
"latest" : "2018-09-27T14:39:37,012",
|
|
"top_hits" : [
|
|
{
|
|
"value" : "2018-09-27T14:39:29,859",
|
|
"count" : 10
|
|
},
|
|
{
|
|
"value" : "2018-09-27T14:39:29,860",
|
|
"count" : 9
|
|
},
|
|
{
|
|
"value" : "2018-09-27T14:39:29,858",
|
|
"count" : 6
|
|
},
|
|
{
|
|
"value" : "2018-09-27T14:39:28,523",
|
|
"count" : 3
|
|
},
|
|
{
|
|
"value" : "2018-09-27T14:39:34,234",
|
|
"count" : 2
|
|
},
|
|
{
|
|
"value" : "2018-09-27T14:39:28,518",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "2018-09-27T14:39:28,521",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "2018-09-27T14:39:28,522",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "2018-09-27T14:39:29,861",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "2018-09-27T14:39:32,786",
|
|
"count" : 1
|
|
}
|
|
]
|
|
}
|
|
}
|
|
}
|
|
----
|
|
// NOTCONSOLE
|
|
|
|
<1> This time the `format` has been identified as `semi_structured_text`.
|
|
<2> The `multiline_start_pattern` is set on the basis that the timestamp appears
|
|
in the first line of each multi-line log message.
|
|
<3> A very simple `grok_pattern` has been created, which extracts the timestamp
|
|
and recognizable fields that appear in every analyzed message. In this case the
|
|
only field that was recognized beyond the timestamp was the log level.
|
|
<4> The ECS Grok pattern compatibility mode that was used. It is either `disabled`
|
|
(the default if not specified in the request) or `v1`.
|
|
|
|
[discrete]
|
|
[[find-structure-example-grok]]
|
|
=== Specifying `grok_pattern` as query parameter
|
|
|
|
If you recognize more fields than the simple `grok_pattern` produced by the
|
|
structure finder unaided, then you can resubmit the request specifying a more
|
|
advanced `grok_pattern` as a query parameter and the structure finder will
|
|
calculate `field_stats` for your additional fields.
|
|
|
|
In the case of the {es} log a more complete Grok pattern is
|
|
`\[%{TIMESTAMP_ISO8601:timestamp}\]\[%{LOGLEVEL:loglevel} *\]\[%{JAVACLASS:class} *\] \[%{HOSTNAME:node}\] %{JAVALOGMESSAGE:message}`.
|
|
You can analyze the same text again, submitting this `grok_pattern` as a
|
|
query parameter (appropriately URL escaped):
|
|
|
|
[source,js]
|
|
----
|
|
curl -s -H "Content-Type: application/json" -XPOST "localhost:9200/_text_structure/find_structure?pretty&format=semi_structured_text&grok_pattern=%5C%5B%25%7BTIMESTAMP_ISO8601:timestamp%7D%5C%5D%5C%5B%25%7BLOGLEVEL:loglevel%7D%20*%5C%5D%5C%5B%25%7BJAVACLASS:class%7D%20*%5C%5D%20%5C%5B%25%7BHOSTNAME:node%7D%5C%5D%20%25%7BJAVALOGMESSAGE:message%7D" -T "$ES_HOME/logs/elasticsearch.log"
|
|
----
|
|
// NOTCONSOLE
|
|
// Not converting to console because this shows how curl can be used
|
|
|
|
If the request does not encounter errors, the result will look something like
|
|
this:
|
|
|
|
[source,js]
|
|
----
|
|
{
|
|
"num_lines_analyzed" : 53,
|
|
"num_messages_analyzed" : 53,
|
|
"sample_start" : "[2018-09-27T14:39:28,518][INFO ][o.e.e.NodeEnvironment ] [node-0] using [1] data paths, mounts [[/ (/dev/disk1)]], net usable_space [165.4gb], net total_space [464.7gb], types [hfs]\n[2018-09-27T14:39:28,521][INFO ][o.e.e.NodeEnvironment ] [node-0] heap size [494.9mb], compressed ordinary object pointers [true]\n",
|
|
"charset" : "UTF-8",
|
|
"has_byte_order_marker" : false,
|
|
"format" : "semi_structured_text",
|
|
"multiline_start_pattern" : "^\\[\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}",
|
|
"grok_pattern" : "\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel} *\\]\\[%{JAVACLASS:class} *\\] \\[%{HOSTNAME:node}\\] %{JAVALOGMESSAGE:message}", <1>
|
|
"ecs_compatibility" : "disabled", <2>
|
|
"timestamp_field" : "timestamp",
|
|
"joda_timestamp_formats" : [
|
|
"ISO8601"
|
|
],
|
|
"java_timestamp_formats" : [
|
|
"ISO8601"
|
|
],
|
|
"need_client_timezone" : true,
|
|
"mappings" : {
|
|
"properties" : {
|
|
"@timestamp" : {
|
|
"type" : "date"
|
|
},
|
|
"class" : {
|
|
"type" : "keyword"
|
|
},
|
|
"loglevel" : {
|
|
"type" : "keyword"
|
|
},
|
|
"message" : {
|
|
"type" : "text"
|
|
},
|
|
"node" : {
|
|
"type" : "keyword"
|
|
}
|
|
}
|
|
},
|
|
"ingest_pipeline" : {
|
|
"description" : "Ingest pipeline created by text structure finder",
|
|
"processors" : [
|
|
{
|
|
"grok" : {
|
|
"field" : "message",
|
|
"patterns" : [
|
|
"\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel} *\\]\\[%{JAVACLASS:class} *\\] \\[%{HOSTNAME:node}\\] %{JAVALOGMESSAGE:message}"
|
|
]
|
|
}
|
|
},
|
|
{
|
|
"date" : {
|
|
"field" : "timestamp",
|
|
"timezone" : "{{ event.timezone }}",
|
|
"formats" : [
|
|
"ISO8601"
|
|
]
|
|
}
|
|
},
|
|
{
|
|
"remove" : {
|
|
"field" : "timestamp"
|
|
}
|
|
}
|
|
]
|
|
},
|
|
"field_stats" : { <3>
|
|
"class" : {
|
|
"count" : 53,
|
|
"cardinality" : 14,
|
|
"top_hits" : [
|
|
{
|
|
"value" : "o.e.p.PluginsService",
|
|
"count" : 26
|
|
},
|
|
{
|
|
"value" : "o.e.c.m.MetadataIndexTemplateService",
|
|
"count" : 8
|
|
},
|
|
{
|
|
"value" : "o.e.n.Node",
|
|
"count" : 7
|
|
},
|
|
{
|
|
"value" : "o.e.e.NodeEnvironment",
|
|
"count" : 2
|
|
},
|
|
{
|
|
"value" : "o.e.a.ActionModule",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "o.e.c.s.ClusterApplierService",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "o.e.c.s.MasterService",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "o.e.d.DiscoveryModule",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "o.e.g.GatewayService",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "o.e.l.LicenseService",
|
|
"count" : 1
|
|
}
|
|
]
|
|
},
|
|
"loglevel" : {
|
|
"count" : 53,
|
|
"cardinality" : 3,
|
|
"top_hits" : [
|
|
{
|
|
"value" : "INFO",
|
|
"count" : 51
|
|
},
|
|
{
|
|
"value" : "DEBUG",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "WARN",
|
|
"count" : 1
|
|
}
|
|
]
|
|
},
|
|
"message" : {
|
|
"count" : 53,
|
|
"cardinality" : 53,
|
|
"top_hits" : [
|
|
{
|
|
"value" : "Using REST wrapper from plugin org.elasticsearch.xpack.security.Security",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "adding template [.monitoring-alerts] for index patterns [.monitoring-alerts-6]",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "adding template [.monitoring-beats] for index patterns [.monitoring-beats-6-*]",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "adding template [.monitoring-es] for index patterns [.monitoring-es-6-*]",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "adding template [.monitoring-kibana] for index patterns [.monitoring-kibana-6-*]",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "adding template [.monitoring-logstash] for index patterns [.monitoring-logstash-6-*]",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "adding template [.triggered_watches] for index patterns [.triggered_watches*]",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "adding template [.watch-history-9] for index patterns [.watcher-history-9*]",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "adding template [.watches] for index patterns [.watches*]",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "starting ...",
|
|
"count" : 1
|
|
}
|
|
]
|
|
},
|
|
"node" : {
|
|
"count" : 53,
|
|
"cardinality" : 1,
|
|
"top_hits" : [
|
|
{
|
|
"value" : "node-0",
|
|
"count" : 53
|
|
}
|
|
]
|
|
},
|
|
"timestamp" : {
|
|
"count" : 53,
|
|
"cardinality" : 28,
|
|
"earliest" : "2018-09-27T14:39:28,518",
|
|
"latest" : "2018-09-27T14:39:37,012",
|
|
"top_hits" : [
|
|
{
|
|
"value" : "2018-09-27T14:39:29,859",
|
|
"count" : 10
|
|
},
|
|
{
|
|
"value" : "2018-09-27T14:39:29,860",
|
|
"count" : 9
|
|
},
|
|
{
|
|
"value" : "2018-09-27T14:39:29,858",
|
|
"count" : 6
|
|
},
|
|
{
|
|
"value" : "2018-09-27T14:39:28,523",
|
|
"count" : 3
|
|
},
|
|
{
|
|
"value" : "2018-09-27T14:39:34,234",
|
|
"count" : 2
|
|
},
|
|
{
|
|
"value" : "2018-09-27T14:39:28,518",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "2018-09-27T14:39:28,521",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "2018-09-27T14:39:28,522",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "2018-09-27T14:39:29,861",
|
|
"count" : 1
|
|
},
|
|
{
|
|
"value" : "2018-09-27T14:39:32,786",
|
|
"count" : 1
|
|
}
|
|
]
|
|
}
|
|
}
|
|
}
|
|
----
|
|
// NOTCONSOLE
|
|
|
|
<1> The `grok_pattern` in the output is now the overridden one supplied in the
|
|
query parameter.
|
|
<2> The ECS Grok pattern compatibility mode that was used. It is either `disabled`
|
|
(the default if not specified in the request) or `v1`.
|
|
<3> The returned `field_stats` include entries for the fields from the
|
|
overridden `grok_pattern`.
|
|
|
|
The URL escaping is hard, so if you are working interactively it is best to use
|
|
the UI!
|