Move the ingest attachment processor to the default distribution (#87989)

The ingest attachment processor is currently available as a plugin. This
commit moves the processor to the default distribution so it is always
available.
This commit is contained in:
Ryan Ernst 2022-06-27 23:10:36 -07:00 committed by GitHub
parent 1cef841170
commit eed8da3919
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
364 changed files with 70 additions and 77 deletions

View file

@ -298,16 +298,6 @@
"extended_plugins" : [ ],
"has_native_controller" : false
},
{
"name" : "ingest-attachment",
"version" : "7.10.0",
"elasticsearch_version" : "7.10.0",
"java_version" : "1.8",
"description" : "Ingest processor that uses Apache Tika to extract contents",
"classname" : "org.elasticsearch.ingest.attachment.IngestAttachmentPlugin",
"extended_plugins" : [ ],
"has_native_controller" : false
},
{
"name" : "mapper-murmur3",
"version" : "7.10.0",

View file

@ -24,7 +24,7 @@ netty = 4.1.77.Final
commons_lang3 = 3.9
# when updating this version, you need to ensure compatibility with:
# - plugins/ingest-attachment (transitive dependency, check the upstream POM)
# - modules/ingest-attachment (transitive dependency, check the upstream POM)
# - distribution/tools/plugin-cli
# - x-pack/plugin/security
bouncycastle=1.64

View file

@ -159,7 +159,12 @@ public class InstallPluginAction implements Closeable {
* maintained so that existing user workflows that install these plugins do not need to be updated
* immediately.
*/
public static final Set<String> PLUGINS_CONVERTED_TO_MODULES = Set.of("repository-azure", "repository-gcs", "repository-s3");
public static final Set<String> PLUGINS_CONVERTED_TO_MODULES = Set.of(
"repository-azure",
"repository-gcs",
"repository-s3",
"ingest-attachment"
);
static final Set<PosixFilePermission> BIN_DIR_PERMS;
static final Set<PosixFilePermission> BIN_FILES_PERMS;

View file

@ -1503,7 +1503,7 @@ public class InstallPluginActionTests extends ESTestCase {
* instead simply print a message to the terminal.
*/
public void testInstallMigratedPlugins() throws Exception {
for (String id : List.of("repository-azure", "repository-gcs", "repository-s3")) {
for (String id : List.of("repository-azure", "repository-gcs", "repository-s3", "ingest-attachment")) {
installPlugin(id);
assertThat(terminal.getErrorOutput(), containsString("[" + id + "] is no longer a plugin"));
}

View file

@ -25,9 +25,9 @@ ext.docsFileTree = fileTree(projectDir) {
// Broken code snippet tests
exclude 'reference/graph/explore.asciidoc'
if (BuildParams.inFipsJvm) {
// We don't install/support this plugin in FIPS 140
exclude 'plugins/ingest-attachment.asciidoc'
// We can't conditionally control output, this would be missing the ingest-attachment plugin
// We don't support this component in FIPS 140
exclude 'reference/ingest/processors/attachment.asciidoc'
// We can't conditionally control output, this would be missing the ingest-attachment component
exclude 'reference/cat/plugins.asciidoc'
}
}
@ -105,7 +105,7 @@ testClusters.matching { it.name == "yamlRestTest"}.configureEach {
return
}
// Do not install ingest-attachment in a FIPS 140 JVM as this is not supported
if (subproj.path.startsWith(':plugins:ingest-attachment') && BuildParams.inFipsJvm) {
if (subproj.path.startsWith(':modules:ingest-attachment') && BuildParams.inFipsJvm) {
return
}
plugin subproj.path

View file

@ -0,0 +1,5 @@
pr: 87989
summary: Move the ingest attachment processor to the default distribution
area: Ingest
type: enhancement
issues: []

View file

@ -44,8 +44,6 @@ include::analysis.asciidoc[]
include::discovery.asciidoc[]
include::ingest.asciidoc[]
include::mapper.asciidoc[]
include::repository.asciidoc[]

View file

@ -1,16 +0,0 @@
[[ingest]]
== Ingest Plugins
The ingest plugins extend Elasticsearch by providing additional ingest node capabilities.
[discrete]
=== Core Ingest Plugins
The core ingest plugins are:
<<ingest-attachment>>::
The ingest attachment plugin lets Elasticsearch extract file attachments in common formats (such as PPT, XLS, and PDF) by
using the Apache text extraction library https://tika.apache.org/[Tika].
include::ingest-attachment.asciidoc[]

View file

@ -145,3 +145,17 @@ include::redirects.asciidoc[tag=gcs-repo-migration]
=== Google Cloud Storage repository plugin
include::redirects.asciidoc[tag=gcs-repo-migration]
[role="exclude",id="ingest-attachment"]
=== Ingest Attachment plugin
// tag::ingest-attachment-migration[]
The Ingest Attachment plugin is now included in {es}.
See the {ref}/attachment.html[Ingest Attachment] processor.
// end::ingest-attachment-migration[]
[role="exclude",id="ingest"]
=== Ingest plugins
All ingest functionality has been moved into the default distribution.
For more information, refer to the {ref}/processors.html[existing processors].

View file

@ -7,8 +7,8 @@
[IMPORTANT]
====
cat APIs are only intended for human consumption using the command line or {kib}
console. They are _not_ intended for use by applications. For application
cat APIs are only intended for human consumption using the command line or {kib}
console. They are _not_ intended for use by applications. For application
consumption, use the <<cluster-nodes-info,nodes info API>>.
====
@ -67,7 +67,6 @@ U7321H6 analysis-ukrainian {version_qualified} The Ukrainian Analysis plugi
U7321H6 discovery-azure-classic {version_qualified} The Azure Classic Discovery plugin allows to use Azure Classic API for the unicast discovery mechanism
U7321H6 discovery-ec2 {version_qualified} The EC2 discovery plugin allows to use AWS API for the unicast discovery mechanism.
U7321H6 discovery-gce {version_qualified} The Google Compute Engine (GCE) Discovery plugin allows to use GCE API for the unicast discovery mechanism.
U7321H6 ingest-attachment {version_qualified} Ingest processor that uses Apache Tika to extract contents
U7321H6 mapper-annotated-text {version_qualified} The Mapper Annotated_text plugin adds support for text fields with markup used to inject annotation tokens into the index.
U7321H6 mapper-murmur3 {version_qualified} The Mapper Murmur3 plugin allows to compute hashes of a field's values at index-time and to store them in the index.
U7321H6 mapper-size {version_qualified} The Mapper Size plugin allows document to record their uncompressed size at index time.

View file

@ -29,10 +29,11 @@ installed.
[source,yaml]
----
plugin.mandatory: ingest-attachment
plugin.mandatory: my-ingest-plugin
----
include::processors/append.asciidoc[]
include::processors/attachment.asciidoc[]
include::processors/bytes.asciidoc[]
include::processors/circle.asciidoc[]
include::processors/community-id.asciidoc[]

View file

@ -1,23 +1,21 @@
[[ingest-attachment]]
=== Ingest Attachment Processor Plugin
[[attachment]]
=== Attachment processor
++++
<titleabbrev>Attachment</titleabbrev>
++++
The ingest attachment plugin lets Elasticsearch extract file attachments in common formats (such as PPT, XLS, and PDF) by
The attachment processor lets Elasticsearch extract file attachments in common formats (such as PPT, XLS, and PDF) by
using the Apache text extraction library https://tika.apache.org/[Tika].
You can use the ingest attachment plugin as a replacement for the mapper attachment plugin.
The source field must be a base64 encoded binary. If you do not want to incur
the overhead of converting back and forth between base64, you can use the CBOR
format instead of JSON and specify the field as a bytes array instead of a string
representation. In that case, the processor skips the base64 decoding.
:plugin_name: ingest-attachment
include::install_remove.asciidoc[]
[[using-attachment]]
==== Using the attachment processor in a pipeline
[[using-ingest-attachment]]
==== Using the Attachment Processor in a Pipeline
[[ingest-attachment-options]]
[[attachment-options]]
.Attachment options
[options="header"]
|======
@ -33,7 +31,7 @@ include::install_remove.asciidoc[]
|======
[discrete]
[[ingest-attachment-json-ex]]
[[attachment-json-ex]]
==== Example
If attaching files to JSON documents, you must first encode the file as a base64
@ -98,7 +96,7 @@ The document's `attachment` object contains extracted properties for the file:
NOTE: Keeping the binary as a field within the document might consume a lot of resources. It is highly recommended
to remove that field from the document. Set `remove_binary` to `true` to automatically remove the field.
[[ingest-attachment-fields]]
[[attachment-fields]]
==== Exported fields
The fields which might be extracted from a document are:
@ -154,7 +152,7 @@ NOTE: Extracting contents from binary data is a resource intensive operation and
consumes a lot of resources. It is highly recommended to run pipelines
using this processor in a dedicated ingest node.
[[ingest-attachment-cbor]]
[[attachment-cbor]]
==== Use the attachment processor with CBOR
To avoid encoding and decoding JSON to base64, you can instead pass CBOR data to
@ -201,7 +199,7 @@ with open(file, 'rb') as f:
)
----
[[ingest-attachment-extracted-chars]]
[[attachment-extracted-chars]]
==== Limit the number of extracted chars
To prevent extracting too many chars and overloading the node memory, the number of chars being used for extraction
@ -310,8 +308,8 @@ Returns this:
// TESTRESPONSE[s/"_seq_no": \d+/"_seq_no" : $body._seq_no/ s/"_primary_term" : 1/"_primary_term" : $body._primary_term/]
[[ingest-attachment-with-arrays]]
==== Using the Attachment Processor with arrays
[[attachment-with-arrays]]
==== Using the attachment processor with arrays
To use the attachment processor within an array of attachments the
{ref}/foreach-processor.html[foreach processor] is required. This

View file

@ -1867,4 +1867,4 @@ For more information about reindexing from a remote cluster, refer to
[role="exclude",id="infer-trained-model-deployment"]
=== Infer trained model deployment API
See <<infer-trained-model>>.
See <<infer-trained-model>>.

View file

@ -0,0 +1,17 @@
Apache Commons Codec
Copyright 2002-2015 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
src/test/org/apache/commons/codec/language/DoubleMetaphoneTest.java
contains test data from http://aspell.net/test/orig/batch0.tab.
Copyright (C) 2002 Kevin Atkinson (kevina@gnu.org)
===============================================================================
The content of package org.apache.commons.codec.language.bm has been translated
from the original php source code available at http://stevemorse.org/phoneticinfo.htm
with permission from the original authors.
Original source copyright:
Copyright (c) 2008 Alexander Beider & Stephen P. Morse.

Some files were not shown because too many files have changed in this diff Show more