diff --git a/docs/reference/analysis/analyzers/configuring.asciidoc b/docs/reference/analysis/analyzers/configuring.asciidoc index c93d800afb91..2ce13702e002 100644 --- a/docs/reference/analysis/analyzers/configuring.asciidoc +++ b/docs/reference/analysis/analyzers/configuring.asciidoc @@ -64,3 +64,38 @@ POST my_index/_analyze English stop words will be removed. The resulting terms are: `[ old, brown, cow ]` + +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "old", + "start_offset": 4, + "end_offset": 7, + "type": "", + "position": 1 + }, + { + "token": "brown", + "start_offset": 8, + "end_offset": 13, + "type": "", + "position": 2 + }, + { + "token": "cow", + "start_offset": 14, + "end_offset": 17, + "type": "", + "position": 3 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + diff --git a/docs/reference/analysis/analyzers/custom-analyzer.asciidoc b/docs/reference/analysis/analyzers/custom-analyzer.asciidoc index eccd16c23bef..1707a9a399b1 100644 --- a/docs/reference/analysis/analyzers/custom-analyzer.asciidoc +++ b/docs/reference/analysis/analyzers/custom-analyzer.asciidoc @@ -84,6 +84,48 @@ POST my_index/_analyze -------------------------------- // CONSOLE +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "is", + "start_offset": 0, + "end_offset": 2, + "type": "", + "position": 0 + }, + { + "token": "this", + "start_offset": 3, + "end_offset": 7, + "type": "", + "position": 1 + }, + { + "token": "deja", + "start_offset": 11, + "end_offset": 15, + "type": "", + "position": 2 + }, + { + "token": "vu", + "start_offset": 16, + "end_offset": 22, + "type": "", + "position": 3 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + The above example produces the following terms: [source,text] @@ -119,13 +161,10 @@ PUT my_index "analyzer": { "my_custom_analyzer": { "type": "custom", - "char_filter": [ "emoticons" <1> ], - "tokenizer": "punctuation", <1> - "filter": [ "lowercase", "english_stop" <1> @@ -165,11 +204,54 @@ POST my_index/_analyze "text": "I'm a :) person, and you?" } -------------------------------------------------- +// CONSOLE <1> The `emoticon` character filter, `punctuation` tokenizer and `english_stop` token filter are custom implementations which are defined in the same index settings. 
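The parts of a custom analyzer always run in a fixed order: character filters first, then the tokenizer, then the token filters in the order they are listed. The following Python sketch is a rough illustration of that pipeline for the example above, not the actual implementation; the `.,!?` split characters and the three stop words are assumptions standing in for the `punctuation` pattern and the full `_english_` list.

[source,python]
----------------------------
import re

def emoticons_char_filter(text):
    # Stand-in for the `emoticons` mapping character filter.
    return text.replace(":)", "_happy_").replace(":(", "_sad_")

def punctuation_tokenizer(text):
    # Stand-in for the `punctuation` pattern tokenizer (assumed to split
    # on spaces and basic punctuation).
    return [t for t in re.split(r"[ .,!?]", text) if t]

def english_stop_filter(tokens):
    # A few representative English stop words, not the full `_english_` list.
    stopwords = {"a", "and", "the"}
    return [t for t in tokens if t not in stopwords]

text = "I'm a :) person, and you?"
tokens = punctuation_tokenizer(emoticons_char_filter(text))
tokens = [t.lower() for t in tokens]   # `lowercase` token filter
tokens = english_stop_filter(tokens)   # `english_stop` token filter
print(tokens)  # ["i'm", '_happy_', 'person', 'you']
----------------------------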
+///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "i'm", + "start_offset": 0, + "end_offset": 3, + "type": "word", + "position": 0 + }, + { + "token": "_happy_", + "start_offset": 6, + "end_offset": 8, + "type": "word", + "position": 2 + }, + { + "token": "person", + "start_offset": 9, + "end_offset": 15, + "type": "word", + "position": 3 + }, + { + "token": "you", + "start_offset": 21, + "end_offset": 24, + "type": "word", + "position": 5 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + The above example produces the following terms: [source,text] diff --git a/docs/reference/analysis/analyzers/fingerprint-analyzer.asciidoc b/docs/reference/analysis/analyzers/fingerprint-analyzer.asciidoc index b393c883441f..24dc92380bb0 100644 --- a/docs/reference/analysis/analyzers/fingerprint-analyzer.asciidoc +++ b/docs/reference/analysis/analyzers/fingerprint-analyzer.asciidoc @@ -36,6 +36,27 @@ POST _analyze --------------------------- // CONSOLE +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "and consistent godel is said sentence this yes", + "start_offset": 0, + "end_offset": 52, + "type": "fingerprint", + "position": 0 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + The above sentence would produce the following single term: [source,text] @@ -58,16 +79,11 @@ The `fingerprint` analyzer accepts the following parameters: The maximum token size to emit. Defaults to `255`. Tokens larger than this size will be discarded. -`preserve_original`:: - - If `true`, emits two tokens: one with ASCII-folding of terms that contain - extended characters (if any) and one with the original characters. - Defaults to `false`. - `stopwords`:: A pre-defined stop words list like `_english_` or an array containing a list of stop words. Defaults to `_none_`. + `stopwords_path`:: The path to a file containing stop words. @@ -80,8 +96,7 @@ about stop word configuration. 
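Conceptually, the `fingerprint` analyzer lowercases and ASCII-folds the input, removes punctuation and any configured stop words, then sorts and de-duplicates the remaining words and joins them with the separator. The Python sketch below approximates those steps; it is only an illustration, and the punctuation stripping and the three stop words are simplifications rather than the real token filters.

[source,python]
----------------------------
import unicodedata

def fingerprint(text, separator=" ", stopwords=()):
    # Lowercase and ASCII-fold, split into words, strip punctuation,
    # drop stop words, then sort, de-duplicate and join.
    folded = unicodedata.normalize("NFKD", text.lower())
    folded = folded.encode("ascii", "ignore").decode()
    words = [w.strip(".,!?") for w in folded.split()]
    words = [w for w in words if w and w not in stopwords]
    return separator.join(sorted(set(words)))

print(fingerprint("Yes yes, Gödel said this sentence is consistent and."))
# and consistent godel is said sentence this yes

# With a few representative English stop words removed:
print(fingerprint("Yes yes, Gödel said this sentence is consistent and.",
                  stopwords={"and", "is", "this"}))
# consistent godel said sentence yes
----------------------------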
=== Example configuration In this example, we configure the `fingerprint` analyzer to use the -pre-defined list of English stop words, and to emit a second token in -the presence of non-ASCII characters: +pre-defined list of English stop words: [source,js] ---------------------------- @@ -92,8 +107,7 @@ PUT my_index "analyzer": { "my_fingerprint_analyzer": { "type": "fingerprint", - "stopwords": "_english_", - "preserve_original": true + "stopwords": "_english_" } } } @@ -110,9 +124,30 @@ POST my_index/_analyze ---------------------------- // CONSOLE -The above example produces the following two terms: +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "consistent godel said sentence yes", + "start_offset": 0, + "end_offset": 52, + "type": "fingerprint", + "position": 0 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + +The above example produces the following term: [source,text] --------------------------- -[ consistent godel said sentence yes, consistent gödel said sentence yes ] +[ consistent godel said sentence yes ] --------------------------- diff --git a/docs/reference/analysis/analyzers/keyword-analyzer.asciidoc b/docs/reference/analysis/analyzers/keyword-analyzer.asciidoc index a0c1b1b0a6a0..cc94f3b757e3 100644 --- a/docs/reference/analysis/analyzers/keyword-analyzer.asciidoc +++ b/docs/reference/analysis/analyzers/keyword-analyzer.asciidoc @@ -25,6 +25,27 @@ POST _analyze --------------------------- // CONSOLE +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.", + "start_offset": 0, + "end_offset": 56, + "type": "word", + "position": 0 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + The above sentence would produce the following single term: [source,text] diff --git a/docs/reference/analysis/analyzers/pattern-analyzer.asciidoc b/docs/reference/analysis/analyzers/pattern-analyzer.asciidoc index 6a4ca2744163..2d5741c2b9e3 100644 --- a/docs/reference/analysis/analyzers/pattern-analyzer.asciidoc +++ b/docs/reference/analysis/analyzers/pattern-analyzer.asciidoc @@ -30,6 +30,104 @@ POST _analyze --------------------------- // CONSOLE +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "the", + "start_offset": 0, + "end_offset": 3, + "type": "word", + "position": 0 + }, + { + "token": "2", + "start_offset": 4, + "end_offset": 5, + "type": "word", + "position": 1 + }, + { + "token": "quick", + "start_offset": 6, + "end_offset": 11, + "type": "word", + "position": 2 + }, + { + "token": "brown", + "start_offset": 12, + "end_offset": 17, + "type": "word", + "position": 3 + }, + { + "token": "foxes", + "start_offset": 18, + "end_offset": 23, + "type": "word", + "position": 4 + }, + { + "token": "jumped", + "start_offset": 24, + "end_offset": 30, + "type": "word", + "position": 5 + }, + { + "token": "over", + "start_offset": 31, + "end_offset": 35, + "type": "word", + "position": 6 + }, + { + "token": "the", + "start_offset": 36, + "end_offset": 39, + "type": "word", + "position": 7 + }, + { + "token": "lazy", + "start_offset": 40, + "end_offset": 44, + "type": "word", + "position": 8 + }, + { + "token": "dog", + "start_offset": 45, + "end_offset": 48, + "type": "word", + "position": 9 + }, + { + "token": "s", + "start_offset": 49, + "end_offset": 50, + "type": "word", + "position": 10 + }, + { + "token": "bone", + 
"start_offset": 51, + "end_offset": 55, + "type": "word", + "position": 11 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + The above sentence would produce the following terms: [source,text] @@ -110,6 +208,55 @@ POST my_index/_analyze <1> The backslashes in the pattern need to be escaped when specifying the pattern as a JSON string. +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "john", + "start_offset": 0, + "end_offset": 4, + "type": "word", + "position": 0 + }, + { + "token": "smith", + "start_offset": 5, + "end_offset": 10, + "type": "word", + "position": 1 + }, + { + "token": "foo", + "start_offset": 11, + "end_offset": 14, + "type": "word", + "position": 2 + }, + { + "token": "bar", + "start_offset": 15, + "end_offset": 18, + "type": "word", + "position": 3 + }, + { + "token": "com", + "start_offset": 19, + "end_offset": 22, + "type": "word", + "position": 4 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + The above example produces the following terms: [source,text] @@ -148,6 +295,62 @@ GET my_index/_analyze -------------------------------------------------- // CONSOLE +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "moose", + "start_offset": 0, + "end_offset": 5, + "type": "word", + "position": 0 + }, + { + "token": "x", + "start_offset": 5, + "end_offset": 6, + "type": "word", + "position": 1 + }, + { + "token": "ftp", + "start_offset": 8, + "end_offset": 11, + "type": "word", + "position": 2 + }, + { + "token": "class", + "start_offset": 11, + "end_offset": 16, + "type": "word", + "position": 3 + }, + { + "token": "2", + "start_offset": 16, + "end_offset": 17, + "type": "word", + "position": 4 + }, + { + "token": "beta", + "start_offset": 18, + "end_offset": 22, + "type": "word", + "position": 5 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + The above example produces the following terms: [source,text] diff --git a/docs/reference/analysis/analyzers/simple-analyzer.asciidoc b/docs/reference/analysis/analyzers/simple-analyzer.asciidoc index 4c932bb5d3ee..a57c30d8dd62 100644 --- a/docs/reference/analysis/analyzers/simple-analyzer.asciidoc +++ b/docs/reference/analysis/analyzers/simple-analyzer.asciidoc @@ -25,6 +25,97 @@ POST _analyze --------------------------- // CONSOLE +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "the", + "start_offset": 0, + "end_offset": 3, + "type": "word", + "position": 0 + }, + { + "token": "quick", + "start_offset": 6, + "end_offset": 11, + "type": "word", + "position": 1 + }, + { + "token": "brown", + "start_offset": 12, + "end_offset": 17, + "type": "word", + "position": 2 + }, + { + "token": "foxes", + "start_offset": 18, + "end_offset": 23, + "type": "word", + "position": 3 + }, + { + "token": "jumped", + "start_offset": 24, + "end_offset": 30, + "type": "word", + "position": 4 + }, + { + "token": "over", + "start_offset": 31, + "end_offset": 35, + "type": "word", + "position": 5 + }, + { + "token": "the", + "start_offset": 36, + "end_offset": 39, + "type": "word", + "position": 6 + }, + { + "token": "lazy", + "start_offset": 40, + "end_offset": 44, + "type": "word", + "position": 7 + }, + { + "token": "dog", + "start_offset": 45, + "end_offset": 48, + "type": "word", + "position": 8 + }, + { + "token": "s", + "start_offset": 49, + "end_offset": 50, + "type": "word", + "position": 
9 + }, + { + "token": "bone", + "start_offset": 51, + "end_offset": 55, + "type": "word", + "position": 10 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + The above sentence would produce the following terms: [source,text] diff --git a/docs/reference/analysis/analyzers/standard-analyzer.asciidoc b/docs/reference/analysis/analyzers/standard-analyzer.asciidoc index 72292e1d40da..3b948892483b 100644 --- a/docs/reference/analysis/analyzers/standard-analyzer.asciidoc +++ b/docs/reference/analysis/analyzers/standard-analyzer.asciidoc @@ -33,6 +33,97 @@ POST _analyze --------------------------- // CONSOLE +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "the", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "2", + "start_offset": 4, + "end_offset": 5, + "type": "", + "position": 1 + }, + { + "token": "quick", + "start_offset": 6, + "end_offset": 11, + "type": "", + "position": 2 + }, + { + "token": "brown", + "start_offset": 12, + "end_offset": 17, + "type": "", + "position": 3 + }, + { + "token": "foxes", + "start_offset": 18, + "end_offset": 23, + "type": "", + "position": 4 + }, + { + "token": "jumped", + "start_offset": 24, + "end_offset": 30, + "type": "", + "position": 5 + }, + { + "token": "over", + "start_offset": 31, + "end_offset": 35, + "type": "", + "position": 6 + }, + { + "token": "the", + "start_offset": 36, + "end_offset": 39, + "type": "", + "position": 7 + }, + { + "token": "lazy", + "start_offset": 40, + "end_offset": 44, + "type": "", + "position": 8 + }, + { + "token": "dog's", + "start_offset": 45, + "end_offset": 50, + "type": "", + "position": 9 + }, + { + "token": "bone", + "start_offset": 51, + "end_offset": 55, + "type": "", + "position": 10 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + The above sentence would produce the following terms: [source,text] @@ -98,6 +189,89 @@ POST my_index/_analyze ---------------------------- // CONSOLE +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "2", + "start_offset": 4, + "end_offset": 5, + "type": "", + "position": 1 + }, + { + "token": "quick", + "start_offset": 6, + "end_offset": 11, + "type": "", + "position": 2 + }, + { + "token": "brown", + "start_offset": 12, + "end_offset": 17, + "type": "", + "position": 3 + }, + { + "token": "foxes", + "start_offset": 18, + "end_offset": 23, + "type": "", + "position": 4 + }, + { + "token": "jumpe", + "start_offset": 24, + "end_offset": 29, + "type": "", + "position": 5 + }, + { + "token": "d", + "start_offset": 29, + "end_offset": 30, + "type": "", + "position": 6 + }, + { + "token": "over", + "start_offset": 31, + "end_offset": 35, + "type": "", + "position": 7 + }, + { + "token": "lazy", + "start_offset": 40, + "end_offset": 44, + "type": "", + "position": 9 + }, + { + "token": "dog's", + "start_offset": 45, + "end_offset": 50, + "type": "", + "position": 10 + }, + { + "token": "bone", + "start_offset": 51, + "end_offset": 55, + "type": "", + "position": 11 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + The above example produces the following terms: [source,text] diff --git a/docs/reference/analysis/analyzers/stop-analyzer.asciidoc b/docs/reference/analysis/analyzers/stop-analyzer.asciidoc index ada9022a2870..e40436342d78 100644 --- a/docs/reference/analysis/analyzers/stop-analyzer.asciidoc +++ 
b/docs/reference/analysis/analyzers/stop-analyzer.asciidoc @@ -29,6 +29,83 @@ POST _analyze --------------------------- // CONSOLE +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "quick", + "start_offset": 6, + "end_offset": 11, + "type": "word", + "position": 1 + }, + { + "token": "brown", + "start_offset": 12, + "end_offset": 17, + "type": "word", + "position": 2 + }, + { + "token": "foxes", + "start_offset": 18, + "end_offset": 23, + "type": "word", + "position": 3 + }, + { + "token": "jumped", + "start_offset": 24, + "end_offset": 30, + "type": "word", + "position": 4 + }, + { + "token": "over", + "start_offset": 31, + "end_offset": 35, + "type": "word", + "position": 5 + }, + { + "token": "lazy", + "start_offset": 40, + "end_offset": 44, + "type": "word", + "position": 7 + }, + { + "token": "dog", + "start_offset": 45, + "end_offset": 48, + "type": "word", + "position": 8 + }, + { + "token": "s", + "start_offset": 49, + "end_offset": 50, + "type": "word", + "position": 9 + }, + { + "token": "bone", + "start_offset": 51, + "end_offset": 55, + "type": "word", + "position": 10 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + The above sentence would produce the following terms: [source,text] @@ -87,6 +164,76 @@ POST my_index/_analyze ---------------------------- // CONSOLE +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "quick", + "start_offset": 6, + "end_offset": 11, + "type": "word", + "position": 1 + }, + { + "token": "brown", + "start_offset": 12, + "end_offset": 17, + "type": "word", + "position": 2 + }, + { + "token": "foxes", + "start_offset": 18, + "end_offset": 23, + "type": "word", + "position": 3 + }, + { + "token": "jumped", + "start_offset": 24, + "end_offset": 30, + "type": "word", + "position": 4 + }, + { + "token": "lazy", + "start_offset": 40, + "end_offset": 44, + "type": "word", + "position": 7 + }, + { + "token": "dog", + "start_offset": 45, + "end_offset": 48, + "type": "word", + "position": 8 + }, + { + "token": "s", + "start_offset": 49, + "end_offset": 50, + "type": "word", + "position": 9 + }, + { + "token": "bone", + "start_offset": 51, + "end_offset": 55, + "type": "word", + "position": 10 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + The above example produces the following terms: [source,text] diff --git a/docs/reference/analysis/analyzers/whitespace-analyzer.asciidoc b/docs/reference/analysis/analyzers/whitespace-analyzer.asciidoc index 0dce8db1c993..f95e5c6e4ab6 100644 --- a/docs/reference/analysis/analyzers/whitespace-analyzer.asciidoc +++ b/docs/reference/analysis/analyzers/whitespace-analyzer.asciidoc @@ -25,6 +25,90 @@ POST _analyze --------------------------- // CONSOLE +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "The", + "start_offset": 0, + "end_offset": 3, + "type": "word", + "position": 0 + }, + { + "token": "2", + "start_offset": 4, + "end_offset": 5, + "type": "word", + "position": 1 + }, + { + "token": "QUICK", + "start_offset": 6, + "end_offset": 11, + "type": "word", + "position": 2 + }, + { + "token": "Brown-Foxes", + "start_offset": 12, + "end_offset": 23, + "type": "word", + "position": 3 + }, + { + "token": "jumped", + "start_offset": 24, + "end_offset": 30, + "type": "word", + "position": 4 + }, + { + "token": "over", + "start_offset": 31, + "end_offset": 35, + "type": "word", + "position": 5 + }, + { + 
"token": "the", + "start_offset": 36, + "end_offset": 39, + "type": "word", + "position": 6 + }, + { + "token": "lazy", + "start_offset": 40, + "end_offset": 44, + "type": "word", + "position": 7 + }, + { + "token": "dog's", + "start_offset": 45, + "end_offset": 50, + "type": "word", + "position": 8 + }, + { + "token": "bone.", + "start_offset": 51, + "end_offset": 56, + "type": "word", + "position": 9 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + The above sentence would produce the following terms: [source,text] diff --git a/docs/reference/analysis/charfilters.asciidoc b/docs/reference/analysis/charfilters.asciidoc index c9f5805284cc..cd24f5bf5717 100644 --- a/docs/reference/analysis/charfilters.asciidoc +++ b/docs/reference/analysis/charfilters.asciidoc @@ -1,16 +1,36 @@ [[analysis-charfilters]] == Character Filters -Character filters are used to preprocess the string of -characters before it is passed to the <>. -A character filter may be used to strip out HTML markup, or to convert -`"&"` characters to the word `"and"`. +_Character filters_ are used to preprocess the stream of characters before it +is passed to the <>. -Elasticsearch has built in characters filters which can be -used to build <>. +A character filter receives the original text as a stream of characters and +can transform the stream by adding, removing, or changing characters. For +instance, a character filter could be used to convert Arabic numerals +(٠‎١٢٣٤٥٦٧٨‎٩‎) into their Latin equivalents (0123456789), or to strip HTML +elements like `` from the stream. -include::charfilters/mapping-charfilter.asciidoc[] + +Elasticsearch has a number of built in character filters which can be used to build +<>. + +<>:: + +The `html_strip` character filter strips out HTML elements like `` and +decodes HTML entities like `&`. + +<>:: + +The `mapping` character filter replaces any occurrences of the specified +strings with the specified replacements. + +<>:: + +The `pattern_replace` character filter replaces any characters matching a +regular expression with the specified replacement. include::charfilters/htmlstrip-charfilter.asciidoc[] +include::charfilters/mapping-charfilter.asciidoc[] + include::charfilters/pattern-replace-charfilter.asciidoc[] diff --git a/docs/reference/analysis/charfilters/htmlstrip-charfilter.asciidoc b/docs/reference/analysis/charfilters/htmlstrip-charfilter.asciidoc index f12238a36ad6..3d8b187d7724 100644 --- a/docs/reference/analysis/charfilters/htmlstrip-charfilter.asciidoc +++ b/docs/reference/analysis/charfilters/htmlstrip-charfilter.asciidoc @@ -1,5 +1,135 @@ [[analysis-htmlstrip-charfilter]] === HTML Strip Char Filter -A char filter of type `html_strip` stripping out HTML elements from an -analyzed text. +The `html_strip` character filter strips HTML elements from the text and +replaces HTML entities with their decoded value (e.g. replacing `&` with +`&`). + +[float] +=== Example output + +[source,js] +--------------------------- +POST _analyze +{ + "tokenizer": "keyword", <1> + "char_filter": [ "html_strip" ], + "text": "

<p>I&apos;m so <b>happy</b>!</p>
" +} +--------------------------- +// CONSOLE +<1> The <> returns a single term. + +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "\nI'm so happy!\n", + "start_offset": 0, + "end_offset": 32, + "type": "word", + "position": 0 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + +The above example returns the term: + +[source,js] +--------------------------- +[ \nI'm so happy!\n ] +--------------------------- + +The same example with the `standard` tokenizer would return the following terms: + +[source,js] +--------------------------- +[ I'm, so, happy ] +--------------------------- + +[float] +=== Configuration + +The `html_strip` character filter accepts the following parameter: + +[horizontal] +`escaped_tags`:: + + An array of HTML tags which should not be stripped from the original text. + +[float] +=== Example configuration + +In this example, we configure the `html_strip` character filter to leave `` +tags in place: + +[source,js] +---------------------------- +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "keyword", + "char_filter": ["my_char_filter"] + } + }, + "char_filter": { + "my_char_filter": { + "type": "html_strip", + "escaped_tags": ["b"] + } + } + } + } +} + +GET _cluster/health?wait_for_status=yellow + +POST my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "

<p>I&apos;m so <b>happy</b>!</p>
" +} +---------------------------- +// CONSOLE + +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "\nI'm so happy!\n", + "start_offset": 0, + "end_offset": 32, + "type": "word", + "position": 0 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + +The above example produces the following term: + +[source,text] +--------------------------- +[ \nI'm so happy!\n ] +--------------------------- + + + diff --git a/docs/reference/analysis/charfilters/mapping-charfilter.asciidoc b/docs/reference/analysis/charfilters/mapping-charfilter.asciidoc index 14c316dcac5f..ed90e9f6ab65 100644 --- a/docs/reference/analysis/charfilters/mapping-charfilter.asciidoc +++ b/docs/reference/analysis/charfilters/mapping-charfilter.asciidoc @@ -1,42 +1,202 @@ [[analysis-mapping-charfilter]] === Mapping Char Filter -A char filter of type `mapping` replacing characters of an analyzed text -with given mapping. +The `mapping` character filter accepts a map of keys and values. Whenever it +encounters a string of characters that is the same as a key, it replaces them +with the value associated with that key. + +Matching is greedy; the longest pattern matching at a given point wins. +Replacements are allowed to be the empty string. + +[float] +=== Configuration + +The `mapping` character filter accepts the following parameters: [horizontal] `mappings`:: - A list of mappings to use. + A array of mappings, with each element having the form `key => value`. `mappings_path`:: - A path, relative to the `config` directory, to a mappings file - configuration. + A path, either absolute or relative to the `config` directory, to a UTF-8 + encoded text mappings file containing a `key => value` mapping per line. -Here is a sample configuration: +Either the `mappings` or `mappings_path` parameter must be provided. + +[float] +=== Example configuration + +In this example, we configure the `mapping` character filter to replace Arabic +numerals with their Latin equivalents: [source,js] --------------------------------------------------- +---------------------------- +PUT my_index { - "index" : { - "analysis" : { - "char_filter" : { - "my_mapping" : { - "type" : "mapping", - "mappings" : [ - "ph => f", - "qu => k" - ] - } - }, - "analyzer" : { - "custom_with_char_filter" : { - "tokenizer" : "standard", - "char_filter" : ["my_mapping"] - } - } + "settings": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "keyword", + "char_filter": [ + "my_char_filter" + ] } + }, + "char_filter": { + "my_char_filter": { + "type": "mapping", + "mappings": [ + "٠ => 0", + "١ => 1", + "٢ => 2", + "٣ => 3", + "٤ => 4", + "٥ => 5", + "٦ => 6", + "٧ => 7", + "٨ => 8", + "٩ => 9" + ] + } + } } + } } --------------------------------------------------- + +GET _cluster/health?wait_for_status=yellow + +POST my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "My license plate is ٢٥٠١٥" +} +---------------------------- +// CONSOLE + +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "My license plate is 25015", + "start_offset": 0, + "end_offset": 25, + "type": "word", + "position": 0 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + +The above example produces the following term: + +[source,text] +--------------------------- +[ My license plate is 25015 ] +--------------------------- + +Keys and values can be strings with multiple characters. 
The following +example replaces the `:)` and `:(` emoticons with a text equivalent: + +[source,js] +---------------------------- +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "standard", + "char_filter": [ + "my_char_filter" + ] + } + }, + "char_filter": { + "my_char_filter": { + "type": "mapping", + "mappings": [ + ":) => _happy_", + ":( => _sad_" + ] + } + } + } + } +} + +GET _cluster/health?wait_for_status=yellow + +POST my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "I'm delighted about it :(" +} +---------------------------- +// CONSOLE + + +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "I'm", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "delighted", + "start_offset": 4, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "about", + "start_offset": 14, + "end_offset": 19, + "type": "", + "position": 2 + }, + { + "token": "it", + "start_offset": 20, + "end_offset": 22, + "type": "", + "position": 3 + }, + { + "token": "_sad_", + "start_offset": 23, + "end_offset": 25, + "type": "", + "position": 4 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + +The above example produces the following terms: + +[source,text] +--------------------------- +[ I'm, delighted, about, it, _sad_ ] +--------------------------- diff --git a/docs/reference/analysis/charfilters/pattern-replace-charfilter.asciidoc b/docs/reference/analysis/charfilters/pattern-replace-charfilter.asciidoc index e3b85fd7bd14..72adefa5aecf 100644 --- a/docs/reference/analysis/charfilters/pattern-replace-charfilter.asciidoc +++ b/docs/reference/analysis/charfilters/pattern-replace-charfilter.asciidoc @@ -1,37 +1,249 @@ [[analysis-pattern-replace-charfilter]] === Pattern Replace Char Filter -The `pattern_replace` char filter allows the use of a regex to -manipulate the characters in a string before analysis. The regular -expression is defined using the `pattern` parameter, and the replacement -string can be provided using the `replacement` parameter (supporting -referencing the original text, as explained -http://docs.oracle.com/javase/6/docs/api/java/util/regex/Matcher.html#appendReplacement(java.lang.StringBuffer,%20java.lang.String)[here]). -For more information check the -http://lucene.apache.org/core/4_3_1/analyzers-common/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.html[lucene -documentation] +The `pattern_replace` character filter uses a regular expression to match +characters which should be replaced with the specified replacement string. +The replacement string can refer to capture groups in the regular expression. -Here is a sample configuration: +[float] +=== Configuration + +The `pattern_replace` character filter accepts the following parameters: + +[horizontal] +`pattern`:: + + A http://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html[Java regular expression]. Required. + +`replacement`:: + + The replacement string, which can reference capture groups using the + `$1`..`$9` syntax, as explained + http://docs.oracle.com/javase/8/docs/api/java/util/regex/Matcher.html#appendReplacement-java.lang.StringBuffer-java.lang.String-[here]. 
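The pattern and replacement can be prototyped outside Elasticsearch with any compatible regex engine. As a rough illustration, the dash-in-numbers example from the next section can be expressed with Python's `re.sub` (Python writes group references as `\1` or `\g<1>` instead of `$1`):

[source,python]
----------------------------
import re

# "(\d+)-(?=\d)" matches a run of digits followed by a dash that has
# another digit after it; the replacement keeps the captured digits and
# turns the dash into an underscore.
print(re.sub(r"(\d+)-(?=\d)", r"\1_", "My credit card is 123-456-789"))
# My credit card is 123_456_789
----------------------------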
+ +[float] +=== Example configuration + +In this example, we configure the `pattern_replace` character filter to +replace any embedded dashes in numbers with underscores, i.e `123-456-789` -> +`123_456_789`: [source,js] --------------------------------------------------- +---------------------------- +PUT my_index { - "index" : { - "analysis" : { - "char_filter" : { - "my_pattern":{ - "type":"pattern_replace", - "pattern":"sample(.*)", - "replacement":"replacedSample $1" - } - }, - "analyzer" : { - "custom_with_char_filter" : { - "tokenizer" : "standard", - "char_filter" : ["my_pattern"] - } - } + "settings": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "standard", + "char_filter": [ + "my_char_filter" + ] } + }, + "char_filter": { + "my_char_filter": { + "type": "pattern_replace", + "pattern": "(\\d+)-(?=\\d)", + "replacement": "$1_" + } + } } + } } --------------------------------------------------- + +GET _cluster/health?wait_for_status=yellow + +POST my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "My credit card is 123-456-789" +} +---------------------------- +// CONSOLE +// TEST[skip:Test interprets $1 as a stashed variable] + +The above example produces the following term: + +[source,text] +--------------------------- +[ My, credit, card, is 123_456_789 ] +--------------------------- + + +WARNING: Using a replacement string that changes the length of the original +text will work for search purposes, but will result in incorrect highlighting, +as can be seen in the following example. + +This example inserts a space whenever it encounters a lower-case letter +followed by an upper-case letter (i.e. `fooBarBaz` -> `foo Bar Baz`), allowing +camelCase words to be queried individually: + +[source,js] +---------------------------- +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "standard", + "char_filter": [ + "my_char_filter" + ], + "filter": [ + "lowercase" + ] + } + }, + "char_filter": { + "my_char_filter": { + "type": "pattern_replace", + "pattern": "(?<=\\p{Lower})(?=\\p{Upper})", + "replacement": " " + } + } + } + }, + "mappings": { + "my_type": { + "properties": { + "text": { + "type": "text", + "analyzer": "my_analyzer" + } + } + } + } +} + +GET _cluster/health?wait_for_status=yellow + +POST my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "The fooBarBaz method" +} +---------------------------- +// CONSOLE + +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "the", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "foo", + "start_offset": 4, + "end_offset": 6, + "type": "", + "position": 1 + }, + { + "token": "bar", + "start_offset": 7, + "end_offset": 9, + "type": "", + "position": 2 + }, + { + "token": "baz", + "start_offset": 10, + "end_offset": 13, + "type": "", + "position": 3 + }, + { + "token": "method", + "start_offset": 14, + "end_offset": 20, + "type": "", + "position": 4 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + +The above returns the following terms: + +[source,js] +---------------------------- +[ the, foo, bar, baz, method ] +---------------------------- + +Querying for `bar` will find the document correctly, but highlighting on the +result will produce incorrect highlights, because our character filter changed +the length of the original text: + +[source,js] +---------------------------- +PUT my_index/my_doc/1?refresh +{ + "text": "The 
fooBarBaz method" +} + +GET my_index/_search +{ + "query": { + "match": { + "text": "bar" + } + }, + "highlight": { + "fields": { + "text": {} + } + } +} +---------------------------- +// CONSOLE +// TEST[continued] + +The output from the above is: + +[source,js] +---------------------------- +{ + "timed_out": false, + "took": $body.took, + "_shards": { + "total": 5, + "successful": 5, + "failed": 0 + }, + "hits": { + "total": 1, + "max_score": 0.4375, + "hits": [ + { + "_index": "my_index", + "_type": "my_doc", + "_id": "1", + "_score": 0.4375, + "_source": { + "text": "The fooBarBaz method" + }, + "highlight": { + "text": [ + "The fooBarBaz method" <1> + ] + } + } + ] + } +} +---------------------------- +// TESTRESPONSE[s/"took".*/"took": "$body.took",/] +<1> Note the incorrect highlight. diff --git a/docs/reference/analysis/tokenizers.asciidoc b/docs/reference/analysis/tokenizers.asciidoc index 46c02f9a4fcb..b30822b6a0b8 100644 --- a/docs/reference/analysis/tokenizers.asciidoc +++ b/docs/reference/analysis/tokenizers.asciidoc @@ -1,34 +1,136 @@ [[analysis-tokenizers]] == Tokenizers -Tokenizers are used to break a string down into a stream of terms -or tokens. A simple tokenizer might split the string up into terms -wherever it encounters whitespace or punctuation. +A _tokenizer_ receives a stream of characters, breaks it up into individual +_tokens_ (usually individual words), and outputs a stream of _tokens_. For +instance, a <> tokenizer breaks +text into tokens whenever it sees any whitespace. It would convert the text +`"Quick brown fox!"` into the terms `[Quick, brown, fox!]`. + +The tokenizer is also responsible for recording the order or _position_ of +each term (used for phrase and word proximity queries) and the start and end +_character offsets_ of the original word which the term represents (used for +highlighting search snippets). + +Elasticsearch has a number of built in tokenizers which can be used to build +<>. + +[float] +=== Word Oriented Tokenizers + +The following tokenizers are usually used for tokenizing full text into +individual words: + +<>:: + +The `standard` tokenizer divides text into terms on word boundaries, as +defined by the Unicode Text Segmentation algorithm. It removes most +punctuation symbols. It is the best choice for most languages. + +<>:: + +The `letter` tokenizer divides text into terms whenever it encounters a +character which is not a letter. + +<>:: + +The `lowercase` tokenizer, like the `letter` tokenizer, divides text into +terms whenever it encounters a character which is not a letter, but it also +lowercases all terms. + +<>:: + +The `whitespace` tokenizer divides text into terms whenever it encounters any +whitespace character. + +<>:: + +The `uax_url_email` tokenizer is like the `standard` tokenizer except that it +recognises URLs and email addresses as single tokens. + +<>:: + +The `classic` tokenizer is a grammar based tokenizer for the English Language. + +<>:: + +The `thai` tokenizer segments Thai text into words. + +[float] +=== Partial Word Tokenizers + +These tokenizers break up text or words into small fragments, for partial word +matching: + +<>:: + +The `ngram` tokenizer can break up text into words when it encounters any of +a list of specified characters (e.g. whitespace or punctuation), then it returns +n-grams of each word: a sliding window of continuous letters, e.g. `quick` -> +`[qu, ui, ic, ck]`. 
+ +<>:: + +The `edge_ngram` tokenizer can break up text into words when it encounters any of +a list of specified characters (e.g. whitespace or punctuation), then it returns +n-grams of each word which are anchored to the start of the word, e.g. `quick` -> +`[q, qu, qui, quic, quick]`. + + +[float] +=== Structured Text Tokenizers + +The following tokenizers are usually used with structured text like +identifiers, email addresses, zip codes, and paths, rather than with full +text: + +<>:: + +The `keyword` tokenizer is a ``noop'' tokenizer that accepts whatever text it +is given and outputs the exact same text as a single term. It can be combined +with token filters like <> to +normalise the analysed terms. + +<>:: + +The `pattern` tokenizer uses a regular expression to either split text into +terms whenever it matches a word separator, or to capture matching text as +terms. + +<>:: + +The `path_hierarchy` tokenizer takes a hierarchical value like a filesystem +path, splits on the path separator, and emits a term for each component in the +tree, e.g. `/foo/bar/baz` -> `[/foo, /foo/bar, /foo/bar/baz ]`. + + + -Elasticsearch has a number of built in tokenizers which can be -used to build <>. include::tokenizers/standard-tokenizer.asciidoc[] -include::tokenizers/edgengram-tokenizer.asciidoc[] - -include::tokenizers/keyword-tokenizer.asciidoc[] - include::tokenizers/letter-tokenizer.asciidoc[] include::tokenizers/lowercase-tokenizer.asciidoc[] -include::tokenizers/ngram-tokenizer.asciidoc[] - include::tokenizers/whitespace-tokenizer.asciidoc[] -include::tokenizers/pattern-tokenizer.asciidoc[] - include::tokenizers/uaxurlemail-tokenizer.asciidoc[] -include::tokenizers/pathhierarchy-tokenizer.asciidoc[] - include::tokenizers/classic-tokenizer.asciidoc[] include::tokenizers/thai-tokenizer.asciidoc[] + +include::tokenizers/ngram-tokenizer.asciidoc[] + +include::tokenizers/edgengram-tokenizer.asciidoc[] + + +include::tokenizers/keyword-tokenizer.asciidoc[] + +include::tokenizers/pattern-tokenizer.asciidoc[] + +include::tokenizers/pathhierarchy-tokenizer.asciidoc[] + + diff --git a/docs/reference/analysis/tokenizers/classic-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/classic-tokenizer.asciidoc index 9b6315cec960..45d4ad415264 100644 --- a/docs/reference/analysis/tokenizers/classic-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/classic-tokenizer.asciidoc @@ -1,19 +1,269 @@ [[analysis-classic-tokenizer]] === Classic Tokenizer -A tokenizer of type `classic` providing grammar based tokenizer that is -a good tokenizer for English language documents. This tokenizer has -heuristics for special treatment of acronyms, company names, email addresses, -and internet host names. However, these rules don't always work, and -the tokenizer doesn't work well for most languages other than English. +The `classic` tokenizer is a grammar based tokenizer that is good for English +language documents. This tokenizer has heuristics for special treatment of +acronyms, company names, email addresses, and internet host names. However, +these rules don't always work, and the tokenizer doesn't work well for most +languages other than English: + +* It splits words at most punctuation characters, removing punctuation. However, a + dot that's not followed by whitespace is considered part of a token. + +* It splits words at hyphens, unless there's a number in the token, in which case + the whole token is interpreted as a product number and is not split. 
+ +* It recognizes email addresses and internet hostnames as one token. + +[float] +=== Example output + +[source,js] +--------------------------- +POST _analyze +{ + "tokenizer": "classic", + "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone." +} +--------------------------- +// CONSOLE + +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "The", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "2", + "start_offset": 4, + "end_offset": 5, + "type": "", + "position": 1 + }, + { + "token": "QUICK", + "start_offset": 6, + "end_offset": 11, + "type": "", + "position": 2 + }, + { + "token": "Brown", + "start_offset": 12, + "end_offset": 17, + "type": "", + "position": 3 + }, + { + "token": "Foxes", + "start_offset": 18, + "end_offset": 23, + "type": "", + "position": 4 + }, + { + "token": "jumped", + "start_offset": 24, + "end_offset": 30, + "type": "", + "position": 5 + }, + { + "token": "over", + "start_offset": 31, + "end_offset": 35, + "type": "", + "position": 6 + }, + { + "token": "the", + "start_offset": 36, + "end_offset": 39, + "type": "", + "position": 7 + }, + { + "token": "lazy", + "start_offset": 40, + "end_offset": 44, + "type": "", + "position": 8 + }, + { + "token": "dog's", + "start_offset": 45, + "end_offset": 50, + "type": "", + "position": 9 + }, + { + "token": "bone", + "start_offset": 51, + "end_offset": 55, + "type": "", + "position": 10 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + +The above sentence would produce the following terms: + +[source,text] +--------------------------- +[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ] +--------------------------- + +[float] +=== Configuration + +The `classic` tokenizer accepts the following parameters: + +[horizontal] +`max_token_length`:: + + The maximum token length. If a token is seen that exceeds this length then + it is split at `max_token_length` intervals. Defaults to `255`. + +[float] +=== Example configuration + +In this example, we configure the `classic` tokenizer to have a +`max_token_length` of 5 (for demonstration purposes): + +[source,js] +---------------------------- +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "my_tokenizer" + } + }, + "tokenizer": { + "my_tokenizer": { + "type": "classic", + "max_token_length": 5 + } + } + } + } +} + +GET _cluster/health?wait_for_status=yellow + +POST my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone." 
+} +---------------------------- +// CONSOLE + +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "The", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "2", + "start_offset": 4, + "end_offset": 5, + "type": "", + "position": 1 + }, + { + "token": "QUICK", + "start_offset": 6, + "end_offset": 11, + "type": "", + "position": 2 + }, + { + "token": "Brown", + "start_offset": 12, + "end_offset": 17, + "type": "", + "position": 3 + }, + { + "token": "Foxes", + "start_offset": 18, + "end_offset": 23, + "type": "", + "position": 4 + }, + { + "token": "over", + "start_offset": 31, + "end_offset": 35, + "type": "", + "position": 6 + }, + { + "token": "the", + "start_offset": 36, + "end_offset": 39, + "type": "", + "position": 7 + }, + { + "token": "lazy", + "start_offset": 40, + "end_offset": 44, + "type": "", + "position": 8 + }, + { + "token": "dog's", + "start_offset": 45, + "end_offset": 50, + "type": "", + "position": 9 + }, + { + "token": "bone", + "start_offset": 51, + "end_offset": 55, + "type": "", + "position": 10 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + +The above example produces the following terms: + +[source,text] +--------------------------- +[ The, 2, QUICK, Brown, Foxes, jumpe, d, over, the, lazy, dog's, bone ] +--------------------------- -The following are settings that can be set for a `classic` tokenizer -type: -[cols="<,<",options="header",] -|======================================================================= -|Setting |Description -|`max_token_length` |The maximum token length. If a token is seen that -exceeds this length then it is discarded. Defaults to `255`. -|======================================================================= diff --git a/docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc index 41cc23379408..2328354998e6 100644 --- a/docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc @@ -1,80 +1,323 @@ [[analysis-edgengram-tokenizer]] === Edge NGram Tokenizer -A tokenizer of type `edgeNGram`. +The `edge_ngram` tokenizer first breaks text down into words whenever it +encounters one of a list of specified characters, then it emits +https://en.wikipedia.org/wiki/N-gram[N-grams] of each word where the start of +the N-gram is anchored to the beginning of the word. -This tokenizer is very similar to `nGram` but only keeps n-grams which -start at the beginning of a token. +Edge N-Grams are useful for _search-as-you-type_ queries. -The following are settings that can be set for a `edgeNGram` tokenizer -type: +TIP: When you need _search-as-you-type_ for text which has a widely known +order, such as movie or song titles, the +<> is a much more efficient +choice than edge N-grams. Edge N-grams have the advantage when trying to +autocomplete words that can appear in any order. -[cols="<,<,<",options="header",] -|======================================================================= -|Setting |Description |Default value -|`min_gram` |Minimum size in codepoints of a single n-gram |`1`. +[float] +=== Example output -|`max_gram` |Maximum size in codepoints of a single n-gram |`2`. 
+With the default settings, the `edge_ngram` tokenizer treats the initial text as a +single token and produces N-grams with minimum length `1` and maximum length +`2`: -|`token_chars` | Characters classes to keep in the -tokens, Elasticsearch will split on characters that don't belong to any -of these classes. |`[]` (Keep all characters) -|======================================================================= +[source,js] +--------------------------- +POST _analyze +{ + "tokenizer": "edge_ngram", + "text": "Quick Fox" +} +--------------------------- +// CONSOLE + +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "Q", + "start_offset": 0, + "end_offset": 1, + "type": "word", + "position": 0 + }, + { + "token": "Qu", + "start_offset": 0, + "end_offset": 2, + "type": "word", + "position": 1 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// -`token_chars` accepts the following character classes: +The above sentence would produce the following terms: + +[source,text] +--------------------------- +[ Q, Qu ] +--------------------------- + +NOTE: These default gram lengths are almost entirely useless. You need to +configure the `edge_ngram` before using it. + +[float] +=== Configuration + +The `edge_ngram` tokenizer accepts the following parameters: [horizontal] -`letter`:: for example `a`, `b`, `ï` or `京` -`digit`:: for example `3` or `7` -`whitespace`:: for example `" "` or `"\n"` -`punctuation`:: for example `!` or `"` -`symbol`:: for example `$` or `√` +`min_gram`:: + Minimum length of characters in a gram. Defaults to `1`. + +`max_gram`:: + Maximum length of characters in a gram. Defaults to `2`. + +`token_chars`:: + + Character classes that should be included in a token. Elasticsearch + will split on characters that don't belong to the classes specified. + Defaults to `[]` (keep all characters). ++ +Character classes may be any of the following: ++ +* `letter` -- for example `a`, `b`, `ï` or `京` +* `digit` -- for example `3` or `7` +* `whitespace` -- for example `" "` or `"\n"` +* `punctuation` -- for example `!` or `"` +* `symbol` -- for example `$` or `√` [float] -==== Example +=== Example configuration + +In this example, we configure the `edge_ngram` tokenizer to treat letters and +digits as tokens, and to produce grams with minimum length `2` and maximum +length `10`: [source,js] --------------------------------------------------- - curl -XPUT 'localhost:9200/test' -d ' - { - "settings" : { - "analysis" : { - "analyzer" : { - "my_edge_ngram_analyzer" : { - "tokenizer" : "my_edge_ngram_tokenizer" - } - }, - "tokenizer" : { - "my_edge_ngram_tokenizer" : { - "type" : "edgeNGram", - "min_gram" : "2", - "max_gram" : "5", - "token_chars": [ "letter", "digit" ] - } - } - } +---------------------------- +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "my_tokenizer" } - }' + }, + "tokenizer": { + "my_tokenizer": { + "type": "edge_ngram", + "min_gram": 2, + "max_gram": 10, + "token_chars": [ + "letter", + "digit" + ] + } + } + } + } +} - curl 'localhost:9200/test/_analyze?pretty=1&analyzer=my_edge_ngram_analyzer' -d 'FC Schalke 04' - # FC, Sc, Sch, Scha, Schal, 04 --------------------------------------------------- +GET _cluster/health?wait_for_status=yellow -[float] -==== `side` deprecated +POST my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "2 Quick Foxes." 
+} +---------------------------- +// CONSOLE -There used to be a `side` parameter up to `0.90.1` but it is now deprecated. In -order to emulate the behavior of `"side" : "BACK"` a -<> should be used together -with the <>. The -`edgeNGram` filter must be enclosed in `reverse` filters like this: +///////////////////// [source,js] --------------------------------------------------- - "filter" : ["reverse", "edgeNGram", "reverse"] --------------------------------------------------- +---------------------------- +{ + "tokens": [ + { + "token": "Qu", + "start_offset": 2, + "end_offset": 4, + "type": "word", + "position": 0 + }, + { + "token": "Qui", + "start_offset": 2, + "end_offset": 5, + "type": "word", + "position": 1 + }, + { + "token": "Quic", + "start_offset": 2, + "end_offset": 6, + "type": "word", + "position": 2 + }, + { + "token": "Quick", + "start_offset": 2, + "end_offset": 7, + "type": "word", + "position": 3 + }, + { + "token": "Fo", + "start_offset": 8, + "end_offset": 10, + "type": "word", + "position": 4 + }, + { + "token": "Fox", + "start_offset": 8, + "end_offset": 11, + "type": "word", + "position": 5 + }, + { + "token": "Foxe", + "start_offset": 8, + "end_offset": 12, + "type": "word", + "position": 6 + }, + { + "token": "Foxes", + "start_offset": 8, + "end_offset": 13, + "type": "word", + "position": 7 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + +The above example produces the following terms: + +[source,text] +--------------------------- +[ Qu, Qui, Quic, Quick, Fo, Fox, Foxe, Foxes ] +--------------------------- + +Usually we recommend using the same `analyzer` at index time and at search +time. In the case of the `edge_ngram` tokenizer, the advice is different. It +only makes sense to use the `edge_ngram` tokenizer at index time, to ensure +that partial words are available for matching in the index. At search time, +just search for the terms the user has typed in, for instance: `Quick Fo`. + +Below is an example of how to set up a field for _search-as-you-type_: + +[source,js] +----------------------------------- +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "autocomplete": { + "tokenizer": "autocomplete", + "filter": [ + "lowercase" + ] + }, + "autocomplete_search": { + "tokenizer": "lowercase" + } + }, + "tokenizer": { + "autocomplete": { + "type": "edge_ngram", + "min_gram": 2, + "max_gram": 10, + "token_chars": [ + "letter" + ] + } + } + } + }, + "mappings": { + "doc": { + "properties": { + "title": { + "type": "text", + "analyzer": "autocomplete", + "search_analyzer": "autocomplete_search" + } + } + } + } +} + +PUT my_index/doc/1 +{ + "title": "Quick Foxes" <1> +} + +POST my_index/_refresh + +GET my_index/_search +{ + "query": { + "match": { + "title": { + "query": "Quick Fo", <2> + "operator": "and" + } + } + } +} +----------------------------------- +// CONSOLE + +<1> The `autocomplete` analyzer indexes the terms `[qu, qui, quic, quick, fo, fox, foxe, foxes]`. +<2> The `autocomplete_search` analyzer searches for the terms `[quick, fo]`, both of which appear in the index. 
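To see why the query matches, it helps to lay the two analyzers side by side. The Python sketch below mimics them; it only illustrates the term overlap and is not how Elasticsearch analyzes or executes the query.

[source,python]
----------------------------
def edge_ngrams(word, min_gram=2, max_gram=10):
    # Edge n-grams are anchored to the start of the word.
    return [word[:n] for n in range(min_gram, min(max_gram, len(word)) + 1)]

# Index time: the `autocomplete` analyzer edge-n-grams and lowercases each word.
indexed = {gram.lower() for word in "Quick Foxes".split()
           for gram in edge_ngrams(word)}
# {'qu', 'qui', 'quic', 'quick', 'fo', 'fox', 'foxe', 'foxes'}

# Search time: the `autocomplete_search` analyzer only splits and lowercases.
query_terms = [term.lower() for term in "Quick Fo".split()]

# With "operator": "and", the document matches because every query term
# is one of the indexed terms.
print(all(term in indexed for term in query_terms))  # True
----------------------------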
+ +///////////////////// + +[source,js] +---------------------------- +{ + "took": $body.took, + "timed_out": false, + "_shards": { + "total": 5, + "successful": 5, + "failed": 0 + }, + "hits": { + "total": 1, + "max_score": 0.44194174, + "hits": [ + { + "_index": "my_index", + "_type": "doc", + "_id": "1", + "_score": 0.44194174, + "_source": { + "title": "Quick Foxes" + } + } + ] + } +} +---------------------------- +// TESTRESPONSE[s/"took".*/"took": "$body.took",/] +///////////////////// -which essentially reverses the token, builds front `EdgeNGrams` and reverses -the ngram again. This has the same effect as the previous `"side" : "BACK"` setting. diff --git a/docs/reference/analysis/tokenizers/keyword-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/keyword-tokenizer.asciidoc index ad1652466be9..27515516fe5f 100644 --- a/docs/reference/analysis/tokenizers/keyword-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/keyword-tokenizer.asciidoc @@ -1,15 +1,60 @@ [[analysis-keyword-tokenizer]] === Keyword Tokenizer -A tokenizer of type `keyword` that emits the entire input as a single -output. +The `keyword` tokenizer is a ``noop'' tokenizer that accepts whatever text it +is given and outputs the exact same text as a single term. It can be combined +with token filters to normalise output, e.g. lower-casing email addresses. -The following are settings that can be set for a `keyword` tokenizer -type: +[float] +=== Example output -[cols="<,<",options="header",] -|======================================================= -|Setting |Description -|`buffer_size` |The term buffer size. Defaults to `256`. -|======================================================= +[source,js] +--------------------------- +POST _analyze +{ + "tokenizer": "keyword", + "text": "New York" +} +--------------------------- +// CONSOLE + +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "New York", + "start_offset": 0, + "end_offset": 8, + "type": "word", + "position": 0 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + +The above sentence would produce the following term: + +[source,text] +--------------------------- +[ New York ] +--------------------------- + +[float] +=== Configuration + +The `keyword` tokenizer accepts the following parameters: + +[horizontal] +`buffer_size`:: + + The number of characters read into the term buffer in a single pass. + Defaults to `256`. The term buffer will grow by this size until all the + text has been consumed. It is advisable not to change this setting. diff --git a/docs/reference/analysis/tokenizers/letter-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/letter-tokenizer.asciidoc index 03025ccd3036..7423a68732d2 100644 --- a/docs/reference/analysis/tokenizers/letter-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/letter-tokenizer.asciidoc @@ -1,7 +1,123 @@ [[analysis-letter-tokenizer]] === Letter Tokenizer -A tokenizer of type `letter` that divides text at non-letters. That's to -say, it defines tokens as maximal strings of adjacent letters. Note, -this does a decent job for most European languages, but does a terrible -job for some Asian languages, where words are not separated by spaces. +The `letter` tokenizer breaks text into terms whenever it encounters a +character which is not a letter. It does a reasonable job for most European +languages, but does a terrible job for some Asian languages, where words are +not separated by spaces. 
+ +[float] +=== Example output + +[source,js] +--------------------------- +POST _analyze +{ + "tokenizer": "letter", + "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone." +} +--------------------------- +// CONSOLE + +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "The", + "start_offset": 0, + "end_offset": 3, + "type": "word", + "position": 0 + }, + { + "token": "QUICK", + "start_offset": 6, + "end_offset": 11, + "type": "word", + "position": 1 + }, + { + "token": "Brown", + "start_offset": 12, + "end_offset": 17, + "type": "word", + "position": 2 + }, + { + "token": "Foxes", + "start_offset": 18, + "end_offset": 23, + "type": "word", + "position": 3 + }, + { + "token": "jumped", + "start_offset": 24, + "end_offset": 30, + "type": "word", + "position": 4 + }, + { + "token": "over", + "start_offset": 31, + "end_offset": 35, + "type": "word", + "position": 5 + }, + { + "token": "the", + "start_offset": 36, + "end_offset": 39, + "type": "word", + "position": 6 + }, + { + "token": "lazy", + "start_offset": 40, + "end_offset": 44, + "type": "word", + "position": 7 + }, + { + "token": "dog", + "start_offset": 45, + "end_offset": 48, + "type": "word", + "position": 8 + }, + { + "token": "s", + "start_offset": 49, + "end_offset": 50, + "type": "word", + "position": 9 + }, + { + "token": "bone", + "start_offset": 51, + "end_offset": 55, + "type": "word", + "position": 10 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + +The above sentence would produce the following terms: + +[source,text] +--------------------------- +[ The, QUICK, Brown, Foxes, jumped, over, the, lazy, dog, s, bone ] +--------------------------- + +[float] +=== Configuration + +The `letter` tokenizer is not configurable. diff --git a/docs/reference/analysis/tokenizers/lowercase-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/lowercase-tokenizer.asciidoc index 0cdbbc387a4a..5aad28b43948 100644 --- a/docs/reference/analysis/tokenizers/lowercase-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/lowercase-tokenizer.asciidoc @@ -1,15 +1,128 @@ [[analysis-lowercase-tokenizer]] === Lowercase Tokenizer -A tokenizer of type `lowercase` that performs the function of -<> and -<> together. It divides text at non-letters and converts -them to lower case. While it is functionally equivalent to the -combination of -<> and -<>, there is a performance advantage to doing the two -tasks at once, hence this (redundant) implementation. + +The `lowercase` toknenizer, like the +<> breaks text into terms +whenever it encounters a character which is not a letter, but it also +lowecases all terms. It is functionally equivalent to the +<> combined with the +<>, but is more +efficient as it performs both steps in a single pass. + + +[float] +=== Example output + +[source,js] +--------------------------- +POST _analyze +{ + "tokenizer": "lowercase", + "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone." 
+} +--------------------------- +// CONSOLE + +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "the", + "start_offset": 0, + "end_offset": 3, + "type": "word", + "position": 0 + }, + { + "token": "quick", + "start_offset": 6, + "end_offset": 11, + "type": "word", + "position": 1 + }, + { + "token": "brown", + "start_offset": 12, + "end_offset": 17, + "type": "word", + "position": 2 + }, + { + "token": "foxes", + "start_offset": 18, + "end_offset": 23, + "type": "word", + "position": 3 + }, + { + "token": "jumped", + "start_offset": 24, + "end_offset": 30, + "type": "word", + "position": 4 + }, + { + "token": "over", + "start_offset": 31, + "end_offset": 35, + "type": "word", + "position": 5 + }, + { + "token": "the", + "start_offset": 36, + "end_offset": 39, + "type": "word", + "position": 6 + }, + { + "token": "lazy", + "start_offset": 40, + "end_offset": 44, + "type": "word", + "position": 7 + }, + { + "token": "dog", + "start_offset": 45, + "end_offset": 48, + "type": "word", + "position": 8 + }, + { + "token": "s", + "start_offset": 49, + "end_offset": 50, + "type": "word", + "position": 9 + }, + { + "token": "bone", + "start_offset": 51, + "end_offset": 55, + "type": "word", + "position": 10 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + +The above sentence would produce the following terms: + +[source,text] +--------------------------- +[ the, quick, brown, foxes, jumped, over, the, lazy, dog, s, bone ] +--------------------------- + +[float] +=== Configuration + +The `lowercase` tokenizer is not configurable. diff --git a/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc index 23e6bc52dda7..cf45da0627e7 100644 --- a/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc @@ -1,57 +1,306 @@ [[analysis-ngram-tokenizer]] === NGram Tokenizer -A tokenizer of type `nGram`. +The `ngram` tokenizer first breaks text down into words whenever it encounters +one of a list of specified characters, then it emits +https://en.wikipedia.org/wiki/N-gram[N-grams] of each word of the specified +length. -The following are settings that can be set for a `nGram` tokenizer type: - -[cols="<,<,<",options="header",] -|======================================================================= -|Setting |Description |Default value -|`min_gram` |Minimum size in codepoints of a single n-gram |`1`. - -|`max_gram` |Maximum size in codepoints of a single n-gram |`2`. - -|`token_chars` |Characters classes to keep in the -tokens, Elasticsearch will split on characters that don't belong to any -of these classes. |`[]` (Keep all characters) -|======================================================================= - -`token_chars` accepts the following character classes: - -[horizontal] -`letter`:: for example `a`, `b`, `ï` or `京` -`digit`:: for example `3` or `7` -`whitespace`:: for example `" "` or `"\n"` -`punctuation`:: for example `!` or `"` -`symbol`:: for example `$` or `√` +N-grams are like a sliding window that moves across the word - a continuous +sequence of characters of the specified length. They are useful for querying +languages that don't use spaces or that have long compound words, like German. 
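+
+To see how this helps with long compound words, consider the following sketch
+(not part of the original examples; the index, analyzer and tokenizer names,
+and the sample word, are made up for illustration). The `title` field is
+indexed with tri-grams, so a `match` query for `tasche` with `operator` set to
+`and` should find the document containing `Handtasche`, because every
+tri-gram of the query also occurs in the indexed word:
+
+[source,js]
+----------------------------
+PUT my_ngram_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_trigram_analyzer": {
+          "tokenizer": "my_trigram_tokenizer",
+          "filter": [ "lowercase" ]
+        }
+      },
+      "tokenizer": {
+        "my_trigram_tokenizer": {
+          "type": "ngram",
+          "min_gram": 3,
+          "max_gram": 3,
+          "token_chars": [ "letter", "digit" ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "doc": {
+      "properties": {
+        "title": {
+          "type": "text",
+          "analyzer": "my_trigram_analyzer"
+        }
+      }
+    }
+  }
+}
+
+PUT my_ngram_index/doc/1?refresh
+{
+  "title": "Handtasche"
+}
+
+GET my_ngram_index/_search
+{
+  "query": {
+    "match": {
+      "title": {
+        "query": "tasche",
+        "operator": "and"
+      }
+    }
+  }
+}
+----------------------------
+// CONSOLE
+
+With `operator` left at its default of `or`, a single shared gram would be
+enough to match, which is usually too permissive for this kind of substring
+search.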
[float] -==== Example +=== Example output + +With the default settings, the `ngram` tokenizer treats the initial text as a +single token and produces N-grams with minimum length `1` and maximum length +`2`: [source,js] --------------------------------------------------- - curl -XPUT 'localhost:9200/test' -d ' - { - "settings" : { - "analysis" : { - "analyzer" : { - "my_ngram_analyzer" : { - "tokenizer" : "my_ngram_tokenizer" - } - }, - "tokenizer" : { - "my_ngram_tokenizer" : { - "type" : "nGram", - "min_gram" : "2", - "max_gram" : "3", - "token_chars": [ "letter", "digit" ] - } - } - } - } - }' +--------------------------- +POST _analyze +{ + "tokenizer": "ngram", + "text": "Quick Fox" +} +--------------------------- +// CONSOLE + +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "Q", + "start_offset": 0, + "end_offset": 1, + "type": "word", + "position": 0 + }, + { + "token": "Qu", + "start_offset": 0, + "end_offset": 2, + "type": "word", + "position": 1 + }, + { + "token": "u", + "start_offset": 1, + "end_offset": 2, + "type": "word", + "position": 2 + }, + { + "token": "ui", + "start_offset": 1, + "end_offset": 3, + "type": "word", + "position": 3 + }, + { + "token": "i", + "start_offset": 2, + "end_offset": 3, + "type": "word", + "position": 4 + }, + { + "token": "ic", + "start_offset": 2, + "end_offset": 4, + "type": "word", + "position": 5 + }, + { + "token": "c", + "start_offset": 3, + "end_offset": 4, + "type": "word", + "position": 6 + }, + { + "token": "ck", + "start_offset": 3, + "end_offset": 5, + "type": "word", + "position": 7 + }, + { + "token": "k", + "start_offset": 4, + "end_offset": 5, + "type": "word", + "position": 8 + }, + { + "token": "k ", + "start_offset": 4, + "end_offset": 6, + "type": "word", + "position": 9 + }, + { + "token": " ", + "start_offset": 5, + "end_offset": 6, + "type": "word", + "position": 10 + }, + { + "token": " F", + "start_offset": 5, + "end_offset": 7, + "type": "word", + "position": 11 + }, + { + "token": "F", + "start_offset": 6, + "end_offset": 7, + "type": "word", + "position": 12 + }, + { + "token": "Fo", + "start_offset": 6, + "end_offset": 8, + "type": "word", + "position": 13 + }, + { + "token": "o", + "start_offset": 7, + "end_offset": 8, + "type": "word", + "position": 14 + }, + { + "token": "ox", + "start_offset": 7, + "end_offset": 9, + "type": "word", + "position": 15 + }, + { + "token": "x", + "start_offset": 8, + "end_offset": 9, + "type": "word", + "position": 16 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + +The above sentence would produce the following terms: + +[source,text] +--------------------------- +[ Q, Qu, u, ui, i, ic, c, ck, k, "k ", " ", " F", F, Fo, o, ox, x ] +--------------------------- + +[float] +=== Configuration + +The `ngram` tokenizer accepts the following parameters: + +[horizontal] +`min_gram`:: + Minimum length of characters in a gram. Defaults to `1`. + +`max_gram`:: + Maximum length of characters in a gram. Defaults to `2`. + +`token_chars`:: + + Character classes that should be included in a token. Elasticsearch + will split on characters that don't belong to the classes specified. + Defaults to `[]` (keep all characters). 
++ +Character classes may be any of the following: ++ +* `letter` -- for example `a`, `b`, `ï` or `京` +* `digit` -- for example `3` or `7` +* `whitespace` -- for example `" "` or `"\n"` +* `punctuation` -- for example `!` or `"` +* `symbol` -- for example `$` or `√` + +TIP: It usually makes sense to set `min_gram` and `max_gram` to the same +value. The smaller the length, the more documents will match but the lower +the quality of the matches. The longer the length, the more specific the +matches. A tri-gram (length `3`) is a good place to start. + +[float] +=== Example configuration + +In this example, we configure the `ngram` tokenizer to treat letters and +digits as tokens, and to produce tri-grams (grams of length `3`): + +[source,js] +---------------------------- +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "my_tokenizer" + } + }, + "tokenizer": { + "my_tokenizer": { + "type": "ngram", + "min_gram": 3, + "max_gram": 3, + "token_chars": [ + "letter", + "digit" + ] + } + } + } + } +} + +GET _cluster/health?wait_for_status=yellow + +POST my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "2 Quick Foxes." +} +---------------------------- +// CONSOLE + +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "Qui", + "start_offset": 2, + "end_offset": 5, + "type": "word", + "position": 0 + }, + { + "token": "uic", + "start_offset": 3, + "end_offset": 6, + "type": "word", + "position": 1 + }, + { + "token": "ick", + "start_offset": 4, + "end_offset": 7, + "type": "word", + "position": 2 + }, + { + "token": "Fox", + "start_offset": 8, + "end_offset": 11, + "type": "word", + "position": 3 + }, + { + "token": "oxe", + "start_offset": 9, + "end_offset": 12, + "type": "word", + "position": 4 + }, + { + "token": "xes", + "start_offset": 10, + "end_offset": 13, + "type": "word", + "position": 5 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + +The above example produces the following terms: + +[source,text] +--------------------------- +[ Qui, uic, ick, Fox, oxe, xes ] +--------------------------- + - curl 'localhost:9200/test/_analyze?pretty=1&analyzer=my_ngram_analyzer' -d 'FC Schalke 04' - # FC, Sc, Sch, ch, cha, ha, hal, al, alk, lk, lke, ke, 04 --------------------------------------------------- diff --git a/docs/reference/analysis/tokenizers/pathhierarchy-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/pathhierarchy-tokenizer.asciidoc index e6876f55bc66..b656e67eaec7 100644 --- a/docs/reference/analysis/tokenizers/pathhierarchy-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/pathhierarchy-tokenizer.asciidoc @@ -1,32 +1,175 @@ [[analysis-pathhierarchy-tokenizer]] === Path Hierarchy Tokenizer -The `path_hierarchy` tokenizer takes something like this: +The `path_hierarchy` tokenizer takes a hierarchical value like a filesystem +path, splits on the path separator, and emits a term for each component in the +tree. 
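+
+This makes the tokenizer a natural fit for fields that hold file or URL
+paths: because every ancestor path is indexed as its own term, a query for a
+directory matches all documents stored beneath it. The sketch below is
+illustrative only (the index, analyzer and field names are made up); it keeps
+the search side unanalysed by using the `keyword` analyzer as
+`search_analyzer`, so a search for `/var/log` should also find
+`/var/log/syslog`:
+
+[source,js]
+----------------------------
+PUT my_paths_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_path_analyzer": {
+          "tokenizer": "path_hierarchy"
+        }
+      }
+    }
+  },
+  "mappings": {
+    "doc": {
+      "properties": {
+        "file_path": {
+          "type": "text",
+          "analyzer": "my_path_analyzer",
+          "search_analyzer": "keyword"
+        }
+      }
+    }
+  }
+}
+
+PUT my_paths_index/doc/1?refresh
+{
+  "file_path": "/var/log/syslog"
+}
+
+GET my_paths_index/_search
+{
+  "query": {
+    "match": {
+      "file_path": "/var/log"
+    }
+  }
+}
+----------------------------
+// CONSOLE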
-------------------------- -/something/something/else -------------------------- +[float] +=== Example output -And produces tokens: +[source,js] +--------------------------- +POST _analyze +{ + "tokenizer": "path_hierarchy", + "text": "/one/two/three" +} +--------------------------- +// CONSOLE -------------------------- -/something -/something/something -/something/something/else -------------------------- +///////////////////// -[cols="<,<",options="header",] -|======================================================================= -|Setting |Description -|`delimiter` |The character delimiter to use, defaults to `/`. +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "/one", + "start_offset": 0, + "end_offset": 4, + "type": "word", + "position": 0 + }, + { + "token": "/one/two", + "start_offset": 0, + "end_offset": 8, + "type": "word", + "position": 0 + }, + { + "token": "/one/two/three", + "start_offset": 0, + "end_offset": 14, + "type": "word", + "position": 0 + } + ] +} +---------------------------- +// TESTRESPONSE -|`replacement` |An optional replacement character to use. Defaults to -the `delimiter`. +///////////////////// -|`buffer_size` |The buffer size to use, defaults to `1024`. -|`reverse` |Generates tokens in reverse order, defaults to `false`. -|`skip` |Controls initial tokens to skip, defaults to `0`. -|======================================================================= +The above text would produce the following terms: + +[source,text] +--------------------------- +[ /one, /one/two, /one/two/three ] +--------------------------- + +[float] +=== Configuration + +The `path_hierarchy` tokenizer accepts the following parameters: + +[horizontal] +`delimiter`:: + The character to use as the path separator. Defaults to `/`. + +`replacement`:: + An optional replacement character to use for the delimiter. + Defaults to the `delimiter`. + +`buffer_size`:: + The number of characters read into the term buffer in a single pass. + Defaults to `1024`. The term buffer will grow by this size until all the + text has been consumed. It is advisable not to change this setting. + +`reverse`:: + If set to `true`, emits the tokens in reverse order. Defaults to `false`. + +`skip`:: + The number of initial tokens to skip. Defaults to `0`. + +[float] +=== Example configuration + +In this example, we configure the `path_hierarchy` tokenizer to split on `-` +characters, and to replace them with `/`. 
The first two tokens are skipped: + +[source,js] +---------------------------- +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "my_tokenizer" + } + }, + "tokenizer": { + "my_tokenizer": { + "type": "path_hierarchy", + "delimiter": "-", + "replacement": "/", + "skip": 2 + } + } + } + } +} + +GET _cluster/health?wait_for_status=yellow + +POST my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "one-two-three-four-five" +} +---------------------------- +// CONSOLE + +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "/three", + "start_offset": 7, + "end_offset": 13, + "type": "word", + "position": 0 + }, + { + "token": "/three/four", + "start_offset": 7, + "end_offset": 18, + "type": "word", + "position": 0 + }, + { + "token": "/three/four/five", + "start_offset": 7, + "end_offset": 23, + "type": "word", + "position": 0 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + +The above example produces the following terms: + +[source,text] +--------------------------- +[ /three, /three/four, /three/four/five ] +--------------------------- + +If we were to set `reverse` to `true`, it would produce the following: + +[source,text] +--------------------------- +[ one/two/three/, two/three/, three/ ] +--------------------------- diff --git a/docs/reference/analysis/tokenizers/pattern-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/pattern-tokenizer.asciidoc index 9a1484561952..ca902a4e5f2e 100644 --- a/docs/reference/analysis/tokenizers/pattern-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/pattern-tokenizer.asciidoc @@ -1,38 +1,268 @@ [[analysis-pattern-tokenizer]] === Pattern Tokenizer -A tokenizer of type `pattern` that can flexibly separate text into terms -via a regular expression. Accepts the following settings: +The `pattern` tokenizer uses a regular expression to either split text into +terms whenever it matches a word separator, or to capture matching text as +terms. -[cols="<,<",options="header",] -|====================================================================== -|Setting |Description -|`pattern` |The regular expression pattern, defaults to `\W+`. -|`flags` |The regular expression flags. -|`group` |Which group to extract into tokens. Defaults to `-1` (split). -|====================================================================== +The default pattern is `\W+`, which splits text whenever it encounters +non-word characters. -*IMPORTANT*: The regular expression should match the *token separators*, -not the tokens themselves. +[float] +=== Example output -********************************************* -Note that you may need to escape `pattern` string literal according to -your client language rules. For example, in many programming languages -a string literal for `\W+` pattern is written as `"\\W+"`. -There is nothing special about `pattern` (you may have to escape other -string literals as well); escaping `pattern` is common just because it -often contains characters that should be escaped. -********************************************* +[source,js] +--------------------------- +POST _analyze +{ + "tokenizer": "pattern", + "text": "The foo_bar_size's default is 5." +} +--------------------------- +// CONSOLE -`group` set to `-1` (the default) is equivalent to "split". Using group ->= 0 selects the matching group as the token. 
For example, if you have:
+/////////////////////
-------------------------
-pattern = '([^']+)'
-group   = 0
-input   = aaa 'bbb' 'ccc'
-------------------------
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "The",
+      "start_offset": 0,
+      "end_offset": 3,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "foo_bar_size",
+      "start_offset": 4,
+      "end_offset": 16,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "s",
+      "start_offset": 17,
+      "end_offset": 18,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "default",
+      "start_offset": 19,
+      "end_offset": 26,
+      "type": "word",
+      "position": 3
+    },
+    {
+      "token": "is",
+      "start_offset": 27,
+      "end_offset": 29,
+      "type": "word",
+      "position": 4
+    },
+    {
+      "token": "5",
+      "start_offset": 30,
+      "end_offset": 31,
+      "type": "word",
+      "position": 5
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
-the output will be two tokens: `'bbb'` and `'ccc'` (including the `'`
-marks). With the same input but using group=1, the output would be:
-`bbb` and `ccc` (no `'` marks).
+/////////////////////
+
+
+The above sentence would produce the following terms:
+
+[source,text]
+---------------------------
+[ The, foo_bar_size, s, default, is, 5 ]
+---------------------------
+
+[float]
+=== Configuration
+
+The `pattern` tokenizer accepts the following parameters:
+
+[horizontal]
+`pattern`::
+
+    A http://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html[Java regular expression], defaults to `\W+`.
+
+`flags`::
+
+    Java regular expression http://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#field.summary[flags].
+    Flags should be pipe-separated, e.g. `"CASE_INSENSITIVE|COMMENTS"`.
+
+`group`::
+
+    Which capture group to extract as tokens. Defaults to `-1` (split).
+
+[float]
+=== Example configuration
+
+In this example, we configure the `pattern` tokenizer to break text into
+tokens when it encounters commas:
+
+[source,js]
+----------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_analyzer": {
+          "tokenizer": "my_tokenizer"
+        }
+      },
+      "tokenizer": {
+        "my_tokenizer": {
+          "type": "pattern",
+          "pattern": ","
+        }
+      }
+    }
+  }
+}
+
+GET _cluster/health?wait_for_status=yellow
+
+POST my_index/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "comma,separated,values"
+}
+----------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "comma",
+      "start_offset": 0,
+      "end_offset": 5,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "separated",
+      "start_offset": 6,
+      "end_offset": 15,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "values",
+      "start_offset": 16,
+      "end_offset": 22,
+      "type": "word",
+      "position": 2
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
+The above example produces the following terms:
+
+[source,text]
+---------------------------
+[ comma, separated, values ]
+---------------------------
+
+In the next example, we configure the `pattern` tokenizer to capture values
+enclosed in double quotes (ignoring embedded escaped quotes `\"`).
The regex +itself looks like this: + + "((?:\\"|[^"]|\\")*)" + +And reads as follows: + +* A literal `"` +* Start capturing: +** A literal `\"` OR any character except `"` +** Repeat until no more characters match +* A literal closing `"` + +When the pattern is specified in JSON, the `"` and `\` characters need to be +escaped, so the pattern ends up looking like: + + \"((?:\\\\\"|[^\"]|\\\\\")+)\" + +[source,js] +---------------------------- +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "my_tokenizer" + } + }, + "tokenizer": { + "my_tokenizer": { + "type": "pattern", + "pattern": "\"((?:\\\\\"|[^\"]|\\\\\")+)\"", + "group": 1 + } + } + } + } +} + +GET _cluster/health?wait_for_status=yellow + +POST my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "\"value\", \"value with embedded \\\" quote\"" +} +---------------------------- +// CONSOLE + +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "value", + "start_offset": 1, + "end_offset": 6, + "type": "word", + "position": 0 + }, + { + "token": "value with embedded \\\" quote", + "start_offset": 10, + "end_offset": 38, + "type": "word", + "position": 1 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + +The above example produces the following two terms: + +[source,text] +--------------------------- +[ value, value with embedded \" quote ] +--------------------------- diff --git a/docs/reference/analysis/tokenizers/standard-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/standard-tokenizer.asciidoc index 42dbe5a864ac..ee052529b43d 100644 --- a/docs/reference/analysis/tokenizers/standard-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/standard-tokenizer.asciidoc @@ -1,18 +1,274 @@ [[analysis-standard-tokenizer]] === Standard Tokenizer -A tokenizer of type `standard` providing grammar based tokenizer that is -a good tokenizer for most European language documents. The tokenizer -implements the Unicode Text Segmentation algorithm, as specified in -http://unicode.org/reports/tr29/[Unicode Standard Annex #29]. +The `standard` tokenizer provides grammar based tokenization (based on the +Unicode Text Segmentation algorithm, as specified in +http://unicode.org/reports/tr29/[Unicode Standard Annex #29]) and works well +for most languages. + +[float] +=== Example output + +[source,js] +--------------------------- +POST _analyze +{ + "tokenizer": "standard", + "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone." 
+} +--------------------------- +// CONSOLE + +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "The", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "2", + "start_offset": 4, + "end_offset": 5, + "type": "", + "position": 1 + }, + { + "token": "QUICK", + "start_offset": 6, + "end_offset": 11, + "type": "", + "position": 2 + }, + { + "token": "Brown", + "start_offset": 12, + "end_offset": 17, + "type": "", + "position": 3 + }, + { + "token": "Foxes", + "start_offset": 18, + "end_offset": 23, + "type": "", + "position": 4 + }, + { + "token": "jumped", + "start_offset": 24, + "end_offset": 30, + "type": "", + "position": 5 + }, + { + "token": "over", + "start_offset": 31, + "end_offset": 35, + "type": "", + "position": 6 + }, + { + "token": "the", + "start_offset": 36, + "end_offset": 39, + "type": "", + "position": 7 + }, + { + "token": "lazy", + "start_offset": 40, + "end_offset": 44, + "type": "", + "position": 8 + }, + { + "token": "dog's", + "start_offset": 45, + "end_offset": 50, + "type": "", + "position": 9 + }, + { + "token": "bone", + "start_offset": 51, + "end_offset": 55, + "type": "", + "position": 10 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + +The above sentence would produce the following terms: + +[source,text] +--------------------------- +[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ] +--------------------------- + +[float] +=== Configuration + +The `standard` tokenizer accepts the following parameters: + +[horizontal] +`max_token_length`:: + + The maximum token length. If a token is seen that exceeds this length then + it is split at `max_token_length` intervals. Defaults to `255`. + +[float] +=== Example configuration + +In this example, we configure the `standard` tokenizer to have a +`max_token_length` of 5 (for demonstration purposes): + +[source,js] +---------------------------- +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "my_tokenizer" + } + }, + "tokenizer": { + "my_tokenizer": { + "type": "standard", + "max_token_length": 5 + } + } + } + } +} + +GET _cluster/health?wait_for_status=yellow + +POST my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone." 
+} +---------------------------- +// CONSOLE + +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "The", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "2", + "start_offset": 4, + "end_offset": 5, + "type": "", + "position": 1 + }, + { + "token": "QUICK", + "start_offset": 6, + "end_offset": 11, + "type": "", + "position": 2 + }, + { + "token": "Brown", + "start_offset": 12, + "end_offset": 17, + "type": "", + "position": 3 + }, + { + "token": "Foxes", + "start_offset": 18, + "end_offset": 23, + "type": "", + "position": 4 + }, + { + "token": "jumpe", + "start_offset": 24, + "end_offset": 29, + "type": "", + "position": 5 + }, + { + "token": "d", + "start_offset": 29, + "end_offset": 30, + "type": "", + "position": 6 + }, + { + "token": "over", + "start_offset": 31, + "end_offset": 35, + "type": "", + "position": 7 + }, + { + "token": "the", + "start_offset": 36, + "end_offset": 39, + "type": "", + "position": 8 + }, + { + "token": "lazy", + "start_offset": 40, + "end_offset": 44, + "type": "", + "position": 9 + }, + { + "token": "dog's", + "start_offset": 45, + "end_offset": 50, + "type": "", + "position": 10 + }, + { + "token": "bone", + "start_offset": 51, + "end_offset": 55, + "type": "", + "position": 11 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + +The above example produces the following terms: + +[source,text] +--------------------------- +[ The, 2, QUICK, Brown, Foxes, jumpe, d, over, the, lazy, dog's, bone ] +--------------------------- -The following are settings that can be set for a `standard` tokenizer -type: -[cols="<,<",options="header",] -|======================================================================= -|Setting |Description -|`max_token_length` |The maximum token length. If a token is seen that -exceeds this length then it is split at `max_token_length` intervals. Defaults to `255`. -|======================================================================= diff --git a/docs/reference/analysis/tokenizers/thai-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/thai-tokenizer.asciidoc index 06f0b6892e7f..3e9904d116e3 100644 --- a/docs/reference/analysis/tokenizers/thai-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/thai-tokenizer.asciidoc @@ -1,7 +1,106 @@ [[analysis-thai-tokenizer]] === Thai Tokenizer -A tokenizer of type `thai` that segments Thai text into words. This tokenizer -uses the built-in Thai segmentation algorithm included with Java to divide -up Thai text. Text in other languages in general will be treated the same -as `standard`. +The `thai` tokenizer segments Thai text into words, using the Thai +segmentation algorithm included with Java. Text in other languages in general +will be treated the same as the +<>. + +WARNING: This tokenizer may not be supported by all JREs. It is known to work +with Sun/Oracle and OpenJDK. If your application needs to be fully portable, +consider using the {plugins}/analysis-icu-tokenizer.html[ICU Tokenizer] instead. 
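+
+In practice the tokenizer is usually wrapped in a custom analyzer together
+with token filters. The following sketch (the index, analyzer and filter
+names are illustrative) pairs it with the `lowercase` filter and a `stop`
+filter using the predefined `_thai_` stop word list, which is roughly what
+the built-in `thai` analyzer does:
+
+[source,js]
+----------------------------
+PUT my_thai_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_thai_analyzer": {
+          "tokenizer": "thai",
+          "filter": [ "lowercase", "thai_stop" ]
+        }
+      },
+      "filter": {
+        "thai_stop": {
+          "type": "stop",
+          "stopwords": "_thai_"
+        }
+      }
+    }
+  }
+}
+
+GET _cluster/health?wait_for_status=yellow
+
+POST my_thai_index/_analyze
+{
+  "analyzer": "my_thai_analyzer",
+  "text": "การที่ได้ต้องแสดงว่างานดี"
+}
+----------------------------
+// CONSOLE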
+ +[float] +=== Example output + +[source,js] +--------------------------- +POST _analyze +{ + "tokenizer": "thai", + "text": "การที่ได้ต้องแสดงว่างานดี" +} +--------------------------- +// CONSOLE + +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "การ", + "start_offset": 0, + "end_offset": 3, + "type": "word", + "position": 0 + }, + { + "token": "ที่", + "start_offset": 3, + "end_offset": 6, + "type": "word", + "position": 1 + }, + { + "token": "ได้", + "start_offset": 6, + "end_offset": 9, + "type": "word", + "position": 2 + }, + { + "token": "ต้อง", + "start_offset": 9, + "end_offset": 13, + "type": "word", + "position": 3 + }, + { + "token": "แสดง", + "start_offset": 13, + "end_offset": 17, + "type": "word", + "position": 4 + }, + { + "token": "ว่า", + "start_offset": 17, + "end_offset": 20, + "type": "word", + "position": 5 + }, + { + "token": "งาน", + "start_offset": 20, + "end_offset": 23, + "type": "word", + "position": 6 + }, + { + "token": "ดี", + "start_offset": 23, + "end_offset": 25, + "type": "word", + "position": 7 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + +The above sentence would produce the following terms: + +[source,text] +--------------------------- +[ การ, ที่, ได้, ต้อง, แสดง, ว่า, งาน, ดี ] +--------------------------- + +[float] +=== Configuration + +The `thai` tokenizer is not configurable. diff --git a/docs/reference/analysis/tokenizers/uaxurlemail-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/uaxurlemail-tokenizer.asciidoc index 9ed28e60b91e..500a5e191f16 100644 --- a/docs/reference/analysis/tokenizers/uaxurlemail-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/uaxurlemail-tokenizer.asciidoc @@ -1,16 +1,199 @@ [[analysis-uaxurlemail-tokenizer]] -=== UAX Email URL Tokenizer +=== UAX URL Email Tokenizer -A tokenizer of type `uax_url_email` which works exactly like the -`standard` tokenizer, but tokenizes emails and urls as single tokens. +The `uax_url_email` tokenizer is like the <> except that it +recognises URLs and email addresses as single tokens. -The following are settings that can be set for a `uax_url_email` -tokenizer type: +[float] +=== Example output -[cols="<,<",options="header",] -|======================================================================= -|Setting |Description -|`max_token_length` |The maximum token length. If a token is seen that -exceeds this length then it is discarded. Defaults to `255`. 
-|======================================================================= +[source,js] +--------------------------- +POST _analyze +{ + "tokenizer": "uax_url_email", + "text": "Email me at john.smith@global-international.com" +} +--------------------------- +// CONSOLE +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "Email", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "me", + "start_offset": 6, + "end_offset": 8, + "type": "", + "position": 1 + }, + { + "token": "at", + "start_offset": 9, + "end_offset": 11, + "type": "", + "position": 2 + }, + { + "token": "john.smith@global-international.com", + "start_offset": 12, + "end_offset": 47, + "type": "", + "position": 3 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + +The above sentence would produce the following terms: + +[source,text] +--------------------------- +[ Email, me, at, john.smith@global-international.com ] +--------------------------- + +while the `standard` tokenizer would produce: + +[source,text] +--------------------------- +[ Email, me, at, john.smith, global, international.com ] +--------------------------- + +[float] +=== Configuration + +The `uax_url_email` tokenizer accepts the following parameters: + +[horizontal] +`max_token_length`:: + + The maximum token length. If a token is seen that exceeds this length then + it is split at `max_token_length` intervals. Defaults to `255`. + +[float] +=== Example configuration + +In this example, we configure the `uax_url_email` tokenizer to have a +`max_token_length` of 5 (for demonstration purposes): + +[source,js] +---------------------------- +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "my_tokenizer" + } + }, + "tokenizer": { + "my_tokenizer": { + "type": "uax_url_email", + "max_token_length": 5 + } + } + } + } +} + +GET _cluster/health?wait_for_status=yellow + +POST my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "john.smith@global-international.com" +} +---------------------------- +// CONSOLE + +///////////////////// + +[source,js] +---------------------------- +{ + "tokens": [ + { + "token": "john", + "start_offset": 0, + "end_offset": 4, + "type": "", + "position": 0 + }, + { + "token": "smith", + "start_offset": 5, + "end_offset": 10, + "type": "", + "position": 1 + }, + { + "token": "globa", + "start_offset": 11, + "end_offset": 16, + "type": "", + "position": 2 + }, + { + "token": "l", + "start_offset": 16, + "end_offset": 17, + "type": "", + "position": 3 + }, + { + "token": "inter", + "start_offset": 18, + "end_offset": 23, + "type": "", + "position": 4 + }, + { + "token": "natio", + "start_offset": 23, + "end_offset": 28, + "type": "", + "position": 5 + }, + { + "token": "nal.c", + "start_offset": 28, + "end_offset": 33, + "type": "", + "position": 6 + }, + { + "token": "om", + "start_offset": 33, + "end_offset": 35, + "type": "", + "position": 7 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + + +The above example produces the following terms: + +[source,text] +--------------------------- +[ john, smith, globa, l, inter, natio, nal.c, om ] +--------------------------- diff --git a/docs/reference/analysis/tokenizers/whitespace-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/whitespace-tokenizer.asciidoc index f0e1ce28a125..9d06ea28d55a 100644 --- a/docs/reference/analysis/tokenizers/whitespace-tokenizer.asciidoc +++ 
b/docs/reference/analysis/tokenizers/whitespace-tokenizer.asciidoc
@@ -1,4 +1,114 @@
 [[analysis-whitespace-tokenizer]]
-=== Whitespace Tokenizer
+=== Whitespace Tokenizer
 
-A tokenizer of type `whitespace` that divides text at whitespace.
+The `whitespace` tokenizer breaks text into terms whenever it encounters a
+whitespace character.
+
+[float]
+=== Example output
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer": "whitespace",
+  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
+}
+---------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "The",
+      "start_offset": 0,
+      "end_offset": 3,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "2",
+      "start_offset": 4,
+      "end_offset": 5,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "QUICK",
+      "start_offset": 6,
+      "end_offset": 11,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "Brown-Foxes",
+      "start_offset": 12,
+      "end_offset": 23,
+      "type": "word",
+      "position": 3
+    },
+    {
+      "token": "jumped",
+      "start_offset": 24,
+      "end_offset": 30,
+      "type": "word",
+      "position": 4
+    },
+    {
+      "token": "over",
+      "start_offset": 31,
+      "end_offset": 35,
+      "type": "word",
+      "position": 5
+    },
+    {
+      "token": "the",
+      "start_offset": 36,
+      "end_offset": 39,
+      "type": "word",
+      "position": 6
+    },
+    {
+      "token": "lazy",
+      "start_offset": 40,
+      "end_offset": 44,
+      "type": "word",
+      "position": 7
+    },
+    {
+      "token": "dog's",
+      "start_offset": 45,
+      "end_offset": 50,
+      "type": "word",
+      "position": 8
+    },
+    {
+      "token": "bone.",
+      "start_offset": 51,
+      "end_offset": 56,
+      "type": "word",
+      "position": 9
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
+The above sentence would produce the following terms:
+
+[source,text]
+---------------------------
+[ The, 2, QUICK, Brown-Foxes, jumped, over, the, lazy, dog's, bone. ]
+---------------------------
+
+[float]
+=== Configuration
+
+The `whitespace` tokenizer is not configurable.
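+
+Because the tokenizer leaves case and punctuation untouched (note `QUICK`,
+`Brown-Foxes` and `bone.` above), it is usually combined with token filters.
+As a minimal sketch, adding the `lowercase` filter to the same request should
+produce the same terms in lower case:
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer": "whitespace",
+  "filter": [ "lowercase" ],
+  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
+}
+---------------------------
+// CONSOLE
+
+which should yield terms like
+`[ the, 2, quick, brown-foxes, jumped, over, the, lazy, dog's, bone. ]`.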