mirror of
https://github.com/elastic/elasticsearch.git
synced 2025-06-28 09:28:55 -04:00
Move the ingest attachment processor to the default distribution (#87989)
The ingest attachment processor is currently available as a plugin. This commit moves the processor to the default distribution so it is always available.
This commit is contained in:
parent
1cef841170
commit
eed8da3919
364 changed files with 70 additions and 77 deletions
|
@ -298,16 +298,6 @@
|
||||||
"extended_plugins" : [ ],
|
"extended_plugins" : [ ],
|
||||||
"has_native_controller" : false
|
"has_native_controller" : false
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"name" : "ingest-attachment",
|
|
||||||
"version" : "7.10.0",
|
|
||||||
"elasticsearch_version" : "7.10.0",
|
|
||||||
"java_version" : "1.8",
|
|
||||||
"description" : "Ingest processor that uses Apache Tika to extract contents",
|
|
||||||
"classname" : "org.elasticsearch.ingest.attachment.IngestAttachmentPlugin",
|
|
||||||
"extended_plugins" : [ ],
|
|
||||||
"has_native_controller" : false
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"name" : "mapper-murmur3",
|
"name" : "mapper-murmur3",
|
||||||
"version" : "7.10.0",
|
"version" : "7.10.0",
|
||||||
|
|
|
@ -24,7 +24,7 @@ netty = 4.1.77.Final
|
||||||
commons_lang3 = 3.9
|
commons_lang3 = 3.9
|
||||||
|
|
||||||
# when updating this version, you need to ensure compatibility with:
|
# when updating this version, you need to ensure compatibility with:
|
||||||
# - plugins/ingest-attachment (transitive dependency, check the upstream POM)
|
# - modules/ingest-attachment (transitive dependency, check the upstream POM)
|
||||||
# - distribution/tools/plugin-cli
|
# - distribution/tools/plugin-cli
|
||||||
# - x-pack/plugin/security
|
# - x-pack/plugin/security
|
||||||
bouncycastle=1.64
|
bouncycastle=1.64
|
||||||
|
|
|
@ -159,7 +159,12 @@ public class InstallPluginAction implements Closeable {
|
||||||
* maintained so that existing user workflows that install these plugins do not need to be updated
|
* maintained so that existing user workflows that install these plugins do not need to be updated
|
||||||
* immediately.
|
* immediately.
|
||||||
*/
|
*/
|
||||||
public static final Set<String> PLUGINS_CONVERTED_TO_MODULES = Set.of("repository-azure", "repository-gcs", "repository-s3");
|
public static final Set<String> PLUGINS_CONVERTED_TO_MODULES = Set.of(
|
||||||
|
"repository-azure",
|
||||||
|
"repository-gcs",
|
||||||
|
"repository-s3",
|
||||||
|
"ingest-attachment"
|
||||||
|
);
|
||||||
|
|
||||||
static final Set<PosixFilePermission> BIN_DIR_PERMS;
|
static final Set<PosixFilePermission> BIN_DIR_PERMS;
|
||||||
static final Set<PosixFilePermission> BIN_FILES_PERMS;
|
static final Set<PosixFilePermission> BIN_FILES_PERMS;
|
||||||
|
|
|
@ -1503,7 +1503,7 @@ public class InstallPluginActionTests extends ESTestCase {
|
||||||
* instead simply print a message to the terminal.
|
* instead simply print a message to the terminal.
|
||||||
*/
|
*/
|
||||||
public void testInstallMigratedPlugins() throws Exception {
|
public void testInstallMigratedPlugins() throws Exception {
|
||||||
for (String id : List.of("repository-azure", "repository-gcs", "repository-s3")) {
|
for (String id : List.of("repository-azure", "repository-gcs", "repository-s3", "ingest-attachment")) {
|
||||||
installPlugin(id);
|
installPlugin(id);
|
||||||
assertThat(terminal.getErrorOutput(), containsString("[" + id + "] is no longer a plugin"));
|
assertThat(terminal.getErrorOutput(), containsString("[" + id + "] is no longer a plugin"));
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,9 +25,9 @@ ext.docsFileTree = fileTree(projectDir) {
|
||||||
// Broken code snippet tests
|
// Broken code snippet tests
|
||||||
exclude 'reference/graph/explore.asciidoc'
|
exclude 'reference/graph/explore.asciidoc'
|
||||||
if (BuildParams.inFipsJvm) {
|
if (BuildParams.inFipsJvm) {
|
||||||
// We don't install/support this plugin in FIPS 140
|
// We don't support this component in FIPS 140
|
||||||
exclude 'plugins/ingest-attachment.asciidoc'
|
exclude 'reference/ingest/processors/attachment.asciidoc'
|
||||||
// We can't conditionally control output, this would be missing the ingest-attachment plugin
|
// We can't conditionally control output, this would be missing the ingest-attachment component
|
||||||
exclude 'reference/cat/plugins.asciidoc'
|
exclude 'reference/cat/plugins.asciidoc'
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -105,7 +105,7 @@ testClusters.matching { it.name == "yamlRestTest"}.configureEach {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// Do not install ingest-attachment in a FIPS 140 JVM as this is not supported
|
// Do not install ingest-attachment in a FIPS 140 JVM as this is not supported
|
||||||
if (subproj.path.startsWith(':plugins:ingest-attachment') && BuildParams.inFipsJvm) {
|
if (subproj.path.startsWith(':modules:ingest-attachment') && BuildParams.inFipsJvm) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
plugin subproj.path
|
plugin subproj.path
|
||||||
|
|
5
docs/changelog/87989.yaml
Normal file
5
docs/changelog/87989.yaml
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
pr: 87989
|
||||||
|
summary: Move the ingest attachment processor to the default distribution
|
||||||
|
area: Ingest
|
||||||
|
type: enhancement
|
||||||
|
issues: []
|
|
@ -44,8 +44,6 @@ include::analysis.asciidoc[]
|
||||||
|
|
||||||
include::discovery.asciidoc[]
|
include::discovery.asciidoc[]
|
||||||
|
|
||||||
include::ingest.asciidoc[]
|
|
||||||
|
|
||||||
include::mapper.asciidoc[]
|
include::mapper.asciidoc[]
|
||||||
|
|
||||||
include::repository.asciidoc[]
|
include::repository.asciidoc[]
|
||||||
|
|
|
@ -1,16 +0,0 @@
|
||||||
[[ingest]]
|
|
||||||
== Ingest Plugins
|
|
||||||
|
|
||||||
The ingest plugins extend Elasticsearch by providing additional ingest node capabilities.
|
|
||||||
|
|
||||||
[discrete]
|
|
||||||
=== Core Ingest Plugins
|
|
||||||
|
|
||||||
The core ingest plugins are:
|
|
||||||
|
|
||||||
<<ingest-attachment>>::
|
|
||||||
|
|
||||||
The ingest attachment plugin lets Elasticsearch extract file attachments in common formats (such as PPT, XLS, and PDF) by
|
|
||||||
using the Apache text extraction library https://tika.apache.org/[Tika].
|
|
||||||
|
|
||||||
include::ingest-attachment.asciidoc[]
|
|
|
@ -145,3 +145,17 @@ include::redirects.asciidoc[tag=gcs-repo-migration]
|
||||||
=== Google Cloud Storage repository plugin
|
=== Google Cloud Storage repository plugin
|
||||||
|
|
||||||
include::redirects.asciidoc[tag=gcs-repo-migration]
|
include::redirects.asciidoc[tag=gcs-repo-migration]
|
||||||
|
|
||||||
|
[role="exclude",id="ingest-attachment"]
|
||||||
|
=== Ingest Attachment plugin
|
||||||
|
|
||||||
|
// tag::ingest-attachment-migration[]
|
||||||
|
The Ingest Attachment plugin is now included in {es}.
|
||||||
|
See the {ref}/attachment.html[Ingest Attachment] processor.
|
||||||
|
// end::ingest-attachment-migration[]
|
||||||
|
|
||||||
|
[role="exclude",id="ingest"]
|
||||||
|
=== Ingest plugins
|
||||||
|
|
||||||
|
All ingest functionality has been moved into the default distribution.
|
||||||
|
For more information refer to the {ref}/processors.html[existing processors].
|
||||||
|
|
|
@ -67,7 +67,6 @@ U7321H6 analysis-ukrainian {version_qualified} The Ukrainian Analysis plugi
|
||||||
U7321H6 discovery-azure-classic {version_qualified} The Azure Classic Discovery plugin allows to use Azure Classic API for the unicast discovery mechanism
|
U7321H6 discovery-azure-classic {version_qualified} The Azure Classic Discovery plugin allows to use Azure Classic API for the unicast discovery mechanism
|
||||||
U7321H6 discovery-ec2 {version_qualified} The EC2 discovery plugin allows to use AWS API for the unicast discovery mechanism.
|
U7321H6 discovery-ec2 {version_qualified} The EC2 discovery plugin allows to use AWS API for the unicast discovery mechanism.
|
||||||
U7321H6 discovery-gce {version_qualified} The Google Compute Engine (GCE) Discovery plugin allows to use GCE API for the unicast discovery mechanism.
|
U7321H6 discovery-gce {version_qualified} The Google Compute Engine (GCE) Discovery plugin allows to use GCE API for the unicast discovery mechanism.
|
||||||
U7321H6 ingest-attachment {version_qualified} Ingest processor that uses Apache Tika to extract contents
|
|
||||||
U7321H6 mapper-annotated-text {version_qualified} The Mapper Annotated_text plugin adds support for text fields with markup used to inject annotation tokens into the index.
|
U7321H6 mapper-annotated-text {version_qualified} The Mapper Annotated_text plugin adds support for text fields with markup used to inject annotation tokens into the index.
|
||||||
U7321H6 mapper-murmur3 {version_qualified} The Mapper Murmur3 plugin allows to compute hashes of a field's values at index-time and to store them in the index.
|
U7321H6 mapper-murmur3 {version_qualified} The Mapper Murmur3 plugin allows to compute hashes of a field's values at index-time and to store them in the index.
|
||||||
U7321H6 mapper-size {version_qualified} The Mapper Size plugin allows document to record their uncompressed size at index time.
|
U7321H6 mapper-size {version_qualified} The Mapper Size plugin allows document to record their uncompressed size at index time.
|
||||||
|
|
|
@ -29,10 +29,11 @@ installed.
|
||||||
|
|
||||||
[source,yaml]
|
[source,yaml]
|
||||||
----
|
----
|
||||||
plugin.mandatory: ingest-attachment
|
plugin.mandatory: my-ingest-plugin
|
||||||
----
|
----
|
||||||
|
|
||||||
include::processors/append.asciidoc[]
|
include::processors/append.asciidoc[]
|
||||||
|
include::processors/attachment.asciidoc[]
|
||||||
include::processors/bytes.asciidoc[]
|
include::processors/bytes.asciidoc[]
|
||||||
include::processors/circle.asciidoc[]
|
include::processors/circle.asciidoc[]
|
||||||
include::processors/community-id.asciidoc[]
|
include::processors/community-id.asciidoc[]
|
||||||
|
|
|
@ -1,23 +1,21 @@
|
||||||
[[ingest-attachment]]
|
[[attachment]]
|
||||||
=== Ingest Attachment Processor Plugin
|
=== Attachment processor
|
||||||
|
++++
|
||||||
|
<titleabbrev>Attachment</titleabbrev>
|
||||||
|
++++
|
||||||
|
|
||||||
The ingest attachment plugin lets Elasticsearch extract file attachments in common formats (such as PPT, XLS, and PDF) by
|
The attachment processor lets Elasticsearch extract file attachments in common formats (such as PPT, XLS, and PDF) by
|
||||||
using the Apache text extraction library https://tika.apache.org/[Tika].
|
using the Apache text extraction library https://tika.apache.org/[Tika].
|
||||||
|
|
||||||
You can use the ingest attachment plugin as a replacement for the mapper attachment plugin.
|
|
||||||
|
|
||||||
The source field must be a base64 encoded binary. If you do not want to incur
|
The source field must be a base64 encoded binary. If you do not want to incur
|
||||||
the overhead of converting back and forth between base64, you can use the CBOR
|
the overhead of converting back and forth between base64, you can use the CBOR
|
||||||
format instead of JSON and specify the field as a bytes array instead of a string
|
format instead of JSON and specify the field as a bytes array instead of a string
|
||||||
representation. The processor will skip the base64 decoding then.
|
representation. The processor will skip the base64 decoding then.
|
||||||
|
|
||||||
:plugin_name: ingest-attachment
|
[[using-attachment]]
|
||||||
include::install_remove.asciidoc[]
|
==== Using the attachment processor in a pipeline
|
||||||
|
|
||||||
[[using-ingest-attachment]]
|
[[attachment-options]]
|
||||||
==== Using the Attachment Processor in a Pipeline
|
|
||||||
|
|
||||||
[[ingest-attachment-options]]
|
|
||||||
.Attachment options
|
.Attachment options
|
||||||
[options="header"]
|
[options="header"]
|
||||||
|======
|
|======
|
||||||
|
@ -33,7 +31,7 @@ include::install_remove.asciidoc[]
|
||||||
|======
|
|======
|
||||||
|
|
||||||
[discrete]
|
[discrete]
|
||||||
[[ingest-attachment-json-ex]]
|
[[attachment-json-ex]]
|
||||||
==== Example
|
==== Example
|
||||||
|
|
||||||
If attaching files to JSON documents, you must first encode the file as a base64
|
If attaching files to JSON documents, you must first encode the file as a base64
|
||||||
|
@ -98,7 +96,7 @@ The document's `attachment` object contains extracted properties for the file:
|
||||||
NOTE: Keeping the binary as a field within the document might consume a lot of resources. It is highly recommended
|
NOTE: Keeping the binary as a field within the document might consume a lot of resources. It is highly recommended
|
||||||
to remove that field from the document. Set `remove_binary` to `true` to automatically remove the field.
|
to remove that field from the document. Set `remove_binary` to `true` to automatically remove the field.
|
||||||
|
|
||||||
[[ingest-attachment-fields]]
|
[[attachment-fields]]
|
||||||
==== Exported fields
|
==== Exported fields
|
||||||
|
|
||||||
The fields which might be extracted from a document are:
|
The fields which might be extracted from a document are:
|
||||||
|
@ -154,7 +152,7 @@ NOTE: Extracting contents from binary data is a resource intensive operation and
|
||||||
consumes a lot of resources. It is highly recommended to run pipelines
|
consumes a lot of resources. It is highly recommended to run pipelines
|
||||||
using this processor in a dedicated ingest node.
|
using this processor in a dedicated ingest node.
|
||||||
|
|
||||||
[[ingest-attachment-cbor]]
|
[[attachment-cbor]]
|
||||||
==== Use the attachment processor with CBOR
|
==== Use the attachment processor with CBOR
|
||||||
|
|
||||||
To avoid encoding and decoding JSON to base64, you can instead pass CBOR data to
|
To avoid encoding and decoding JSON to base64, you can instead pass CBOR data to
|
||||||
|
@ -201,7 +199,7 @@ with open(file, 'rb') as f:
|
||||||
)
|
)
|
||||||
----
|
----
|
||||||
|
|
||||||
[[ingest-attachment-extracted-chars]]
|
[[attachment-extracted-chars]]
|
||||||
==== Limit the number of extracted chars
|
==== Limit the number of extracted chars
|
||||||
|
|
||||||
To prevent extracting too many chars and overloading the node memory, the number of chars being used for extraction
|
To prevent extracting too many chars and overloading the node memory, the number of chars being used for extraction
|
||||||
|
@ -310,8 +308,8 @@ Returns this:
|
||||||
// TESTRESPONSE[s/"_seq_no": \d+/"_seq_no" : $body._seq_no/ s/"_primary_term" : 1/"_primary_term" : $body._primary_term/]
|
// TESTRESPONSE[s/"_seq_no": \d+/"_seq_no" : $body._seq_no/ s/"_primary_term" : 1/"_primary_term" : $body._primary_term/]
|
||||||
|
|
||||||
|
|
||||||
[[ingest-attachment-with-arrays]]
|
[[attachment-with-arrays]]
|
||||||
==== Using the Attachment Processor with arrays
|
==== Using the attachment processor with arrays
|
||||||
|
|
||||||
To use the attachment processor within an array of attachments the
|
To use the attachment processor within an array of attachments the
|
||||||
{ref}/foreach-processor.html[foreach processor] is required. This
|
{ref}/foreach-processor.html[foreach processor] is required. This
|
17
modules/ingest-attachment/licenses/commons-codec-NOTICE.txt
Normal file
17
modules/ingest-attachment/licenses/commons-codec-NOTICE.txt
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
Apache Commons Codec
|
||||||
|
Copyright 2002-2015 The Apache Software Foundation
|
||||||
|
|
||||||
|
This product includes software developed at
|
||||||
|
The Apache Software Foundation (http://www.apache.org/).
|
||||||
|
|
||||||
|
src/test/org/apache/commons/codec/language/DoubleMetaphoneTest.java
|
||||||
|
contains test data from http://aspell.net/test/orig/batch0.tab.
|
||||||
|
Copyright (C) 2002 Kevin Atkinson (kevina@gnu.org)
|
||||||
|
|
||||||
|
===============================================================================
|
||||||
|
|
||||||
|
The content of package org.apache.commons.codec.language.bm has been translated
|
||||||
|
from the original php source code available at http://stevemorse.org/phoneticinfo.htm
|
||||||
|
with permission from the original authors.
|
||||||
|
Original source copyright:
|
||||||
|
Copyright (c) 2008 Alexander Beider & Stephen P. Morse.
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue