From 1b34c88d56c69590a7b954e5ea067ec4e77cb3c2 Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Mon, 24 Aug 2020 12:00:44 -0400 Subject: [PATCH] [ML] adding docs + hlrc for data frame analysis feature_processors (#61149) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds HLRC and some docs for the new feature_processors field in Data frame analytics. Co-authored-by: Przemysław Witek Co-authored-by: Lisa Cawley --- .../client/ml/dataframe/Classification.java | 36 +++- .../client/ml/dataframe/Regression.java | 37 +++- .../preprocessing/OneHotEncoding.java | 2 +- .../MlClientDocumentationIT.java | 7 + .../ml/dataframe/ClassificationTests.java | 28 +++ .../DataFrameAnalyticsConfigTests.java | 2 + .../client/ml/dataframe/RegressionTests.java | 28 +++ .../ml/put-data-frame-analytics.asciidoc | 4 + .../apis/put-dfanalytics.asciidoc | 168 +++++++++--------- docs/reference/ml/ml-shared.asciidoc | 8 + 10 files changed, 229 insertions(+), 91 deletions(-) diff --git a/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/dataframe/Classification.java b/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/dataframe/Classification.java index 3d9241d6ec95..d5639bda9bc1 100644 --- a/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/dataframe/Classification.java +++ b/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/dataframe/Classification.java @@ -18,6 +18,8 @@ */ package org.elasticsearch.client.ml.dataframe; +import org.elasticsearch.client.ml.inference.NamedXContentObjectHelper; +import org.elasticsearch.client.ml.inference.preprocessing.PreProcessor; import org.elasticsearch.common.Nullable; import org.elasticsearch.common.ParseField; import org.elasticsearch.common.Strings; @@ -26,6 +28,7 @@ import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentParser; import java.io.IOException; +import java.util.List; import 
java.util.Locale; import java.util.Objects; @@ -53,7 +56,9 @@ public class Classification implements DataFrameAnalysis { static final ParseField CLASS_ASSIGNMENT_OBJECTIVE = new ParseField("class_assignment_objective"); static final ParseField NUM_TOP_CLASSES = new ParseField("num_top_classes"); static final ParseField RANDOMIZE_SEED = new ParseField("randomize_seed"); + static final ParseField FEATURE_PROCESSORS = new ParseField("feature_processors"); + @SuppressWarnings("unchecked") private static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>( NAME.getPreferredName(), @@ -70,7 +75,8 @@ public class Classification implements DataFrameAnalysis { (Double) a[8], (Integer) a[9], (Long) a[10], - (ClassAssignmentObjective) a[11])); + (ClassAssignmentObjective) a[11], + (List) a[12])); static { PARSER.declareString(ConstructingObjectParser.constructorArg(), DEPENDENT_VARIABLE); @@ -86,6 +92,10 @@ public class Classification implements DataFrameAnalysis { PARSER.declareLong(ConstructingObjectParser.optionalConstructorArg(), RANDOMIZE_SEED); PARSER.declareString( ConstructingObjectParser.optionalConstructorArg(), ClassAssignmentObjective::fromString, CLASS_ASSIGNMENT_OBJECTIVE); + PARSER.declareNamedObjects(ConstructingObjectParser.optionalConstructorArg(), + (p, c, n) -> p.namedObject(PreProcessor.class, n, c), + (classification) -> {}, + FEATURE_PROCESSORS); } private final String dependentVariable; @@ -100,12 +110,13 @@ public class Classification implements DataFrameAnalysis { private final ClassAssignmentObjective classAssignmentObjective; private final Integer numTopClasses; private final Long randomizeSeed; + private final List featureProcessors; private Classification(String dependentVariable, @Nullable Double lambda, @Nullable Double gamma, @Nullable Double eta, @Nullable Integer maxTrees, @Nullable Double featureBagFraction, @Nullable Integer numTopFeatureImportanceValues, @Nullable String predictionFieldName, @Nullable Double 
trainingPercent, @Nullable Integer numTopClasses, @Nullable Long randomizeSeed, - @Nullable ClassAssignmentObjective classAssignmentObjective) { + @Nullable ClassAssignmentObjective classAssignmentObjective, @Nullable List featureProcessors) { this.dependentVariable = Objects.requireNonNull(dependentVariable); this.lambda = lambda; this.gamma = gamma; @@ -118,6 +129,7 @@ public class Classification implements DataFrameAnalysis { this.classAssignmentObjective = classAssignmentObjective; this.numTopClasses = numTopClasses; this.randomizeSeed = randomizeSeed; + this.featureProcessors = featureProcessors; } @Override @@ -173,6 +185,10 @@ public class Classification implements DataFrameAnalysis { return numTopClasses; } + public List getFeatureProcessors() { + return featureProcessors; + } + @Override public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { builder.startObject(); @@ -210,6 +226,9 @@ public class Classification implements DataFrameAnalysis { if (numTopClasses != null) { builder.field(NUM_TOP_CLASSES.getPreferredName(), numTopClasses); } + if (featureProcessors != null) { + NamedXContentObjectHelper.writeNamedObjects(builder, params, true, FEATURE_PROCESSORS.getPreferredName(), featureProcessors); + } builder.endObject(); return builder; } @@ -217,7 +236,7 @@ public class Classification implements DataFrameAnalysis { @Override public int hashCode() { return Objects.hash(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction, numTopFeatureImportanceValues, - predictionFieldName, trainingPercent, randomizeSeed, numTopClasses, classAssignmentObjective); + predictionFieldName, trainingPercent, randomizeSeed, numTopClasses, classAssignmentObjective, featureProcessors); } @Override @@ -236,7 +255,8 @@ public class Classification implements DataFrameAnalysis { && Objects.equals(trainingPercent, that.trainingPercent) && Objects.equals(randomizeSeed, that.randomizeSeed) && Objects.equals(numTopClasses, 
that.numTopClasses) - && Objects.equals(classAssignmentObjective, that.classAssignmentObjective); + && Objects.equals(classAssignmentObjective, that.classAssignmentObjective) + && Objects.equals(featureProcessors, that.featureProcessors); } @Override @@ -270,6 +290,7 @@ public class Classification implements DataFrameAnalysis { private Integer numTopClasses; private Long randomizeSeed; private ClassAssignmentObjective classAssignmentObjective; + private List featureProcessors; private Builder(String dependentVariable) { this.dependentVariable = Objects.requireNonNull(dependentVariable); @@ -330,10 +351,15 @@ public class Classification implements DataFrameAnalysis { return this; } + public Builder setFeatureProcessors(List featureProcessors) { + this.featureProcessors = featureProcessors; + return this; + } + public Classification build() { return new Classification(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction, numTopFeatureImportanceValues, predictionFieldName, trainingPercent, numTopClasses, randomizeSeed, - classAssignmentObjective); + classAssignmentObjective, featureProcessors); } } } diff --git a/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/dataframe/Regression.java b/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/dataframe/Regression.java index e897dde198e0..6e9f89700139 100644 --- a/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/dataframe/Regression.java +++ b/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/dataframe/Regression.java @@ -18,6 +18,8 @@ */ package org.elasticsearch.client.ml.dataframe; +import org.elasticsearch.client.ml.inference.NamedXContentObjectHelper; +import org.elasticsearch.client.ml.inference.preprocessing.PreProcessor; import org.elasticsearch.common.Nullable; import org.elasticsearch.common.ParseField; import org.elasticsearch.common.Strings; @@ -26,6 +28,7 @@ import org.elasticsearch.common.xcontent.XContentBuilder; import 
org.elasticsearch.common.xcontent.XContentParser; import java.io.IOException; +import java.util.List; import java.util.Locale; import java.util.Objects; @@ -55,7 +58,9 @@ public class Regression implements DataFrameAnalysis { static final ParseField RANDOMIZE_SEED = new ParseField("randomize_seed"); static final ParseField LOSS_FUNCTION = new ParseField("loss_function"); static final ParseField LOSS_FUNCTION_PARAMETER = new ParseField("loss_function_parameter"); + static final ParseField FEATURE_PROCESSORS = new ParseField("feature_processors"); + @SuppressWarnings("unchecked") private static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>( NAME.getPreferredName(), @@ -72,7 +77,8 @@ public class Regression implements DataFrameAnalysis { (Double) a[8], (Long) a[9], (LossFunction) a[10], - (Double) a[11] + (Double) a[11], + (List) a[12] )); static { @@ -88,6 +94,10 @@ public class Regression implements DataFrameAnalysis { PARSER.declareLong(ConstructingObjectParser.optionalConstructorArg(), RANDOMIZE_SEED); PARSER.declareString(optionalConstructorArg(), LossFunction::fromString, LOSS_FUNCTION); PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), LOSS_FUNCTION_PARAMETER); + PARSER.declareNamedObjects(ConstructingObjectParser.optionalConstructorArg(), + (p, c, n) -> p.namedObject(PreProcessor.class, n, c), + (regression) -> {}, + FEATURE_PROCESSORS); } private final String dependentVariable; @@ -102,12 +112,13 @@ public class Regression implements DataFrameAnalysis { private final Long randomizeSeed; private final LossFunction lossFunction; private final Double lossFunctionParameter; + private final List featureProcessors; private Regression(String dependentVariable, @Nullable Double lambda, @Nullable Double gamma, @Nullable Double eta, @Nullable Integer maxTrees, @Nullable Double featureBagFraction, @Nullable Integer numTopFeatureImportanceValues, @Nullable String predictionFieldName, @Nullable Double trainingPercent, 
@Nullable Long randomizeSeed, @Nullable LossFunction lossFunction, - @Nullable Double lossFunctionParameter) { + @Nullable Double lossFunctionParameter, @Nullable List featureProcessors) { this.dependentVariable = Objects.requireNonNull(dependentVariable); this.lambda = lambda; this.gamma = gamma; @@ -120,6 +131,7 @@ public class Regression implements DataFrameAnalysis { this.randomizeSeed = randomizeSeed; this.lossFunction = lossFunction; this.lossFunctionParameter = lossFunctionParameter; + this.featureProcessors = featureProcessors; } @Override @@ -175,6 +187,10 @@ public class Regression implements DataFrameAnalysis { return lossFunctionParameter; } + public List getFeatureProcessors() { + return featureProcessors; + } + @Override public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { builder.startObject(); @@ -212,6 +228,9 @@ public class Regression implements DataFrameAnalysis { if (lossFunctionParameter != null) { builder.field(LOSS_FUNCTION_PARAMETER.getPreferredName(), lossFunctionParameter); } + if (featureProcessors != null) { + NamedXContentObjectHelper.writeNamedObjects(builder, params, true, FEATURE_PROCESSORS.getPreferredName(), featureProcessors); + } builder.endObject(); return builder; } @@ -219,7 +238,7 @@ public class Regression implements DataFrameAnalysis { @Override public int hashCode() { return Objects.hash(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction, numTopFeatureImportanceValues, - predictionFieldName, trainingPercent, randomizeSeed, lossFunction, lossFunctionParameter); + predictionFieldName, trainingPercent, randomizeSeed, lossFunction, lossFunctionParameter, featureProcessors); } @Override @@ -238,7 +257,8 @@ public class Regression implements DataFrameAnalysis { && Objects.equals(trainingPercent, that.trainingPercent) && Objects.equals(randomizeSeed, that.randomizeSeed) && Objects.equals(lossFunction, that.lossFunction) - && Objects.equals(lossFunctionParameter, 
that.lossFunctionParameter); + && Objects.equals(lossFunctionParameter, that.lossFunctionParameter) + && Objects.equals(featureProcessors, that.featureProcessors); } @Override @@ -259,6 +279,7 @@ public class Regression implements DataFrameAnalysis { private Long randomizeSeed; private LossFunction lossFunction; private Double lossFunctionParameter; + private List featureProcessors; private Builder(String dependentVariable) { this.dependentVariable = Objects.requireNonNull(dependentVariable); @@ -319,9 +340,15 @@ public class Regression implements DataFrameAnalysis { return this; } + public Builder setFeatureProcessors(List featureProcessors) { + this.featureProcessors = featureProcessors; + return this; + } + public Regression build() { return new Regression(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction, - numTopFeatureImportanceValues, predictionFieldName, trainingPercent, randomizeSeed, lossFunction, lossFunctionParameter); + numTopFeatureImportanceValues, predictionFieldName, trainingPercent, randomizeSeed, lossFunction, lossFunctionParameter, + featureProcessors); } } diff --git a/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/inference/preprocessing/OneHotEncoding.java b/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/inference/preprocessing/OneHotEncoding.java index a2121c510570..ba5d5e146475 100644 --- a/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/inference/preprocessing/OneHotEncoding.java +++ b/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/inference/preprocessing/OneHotEncoding.java @@ -114,7 +114,7 @@ public class OneHotEncoding implements PreProcessor { return Objects.hash(field, hotMap, custom); } - public Builder builder(String field) { + public static Builder builder(String field) { return new Builder(field); } diff --git a/client/rest-high-level/src/test/java/org/elasticsearch/client/documentation/MlClientDocumentationIT.java 
b/client/rest-high-level/src/test/java/org/elasticsearch/client/documentation/MlClientDocumentationIT.java index 6ea03396e827..a676e3a5a7f8 100644 --- a/client/rest-high-level/src/test/java/org/elasticsearch/client/documentation/MlClientDocumentationIT.java +++ b/client/rest-high-level/src/test/java/org/elasticsearch/client/documentation/MlClientDocumentationIT.java @@ -179,6 +179,7 @@ import org.elasticsearch.client.ml.inference.TrainedModelDefinition; import org.elasticsearch.client.ml.inference.TrainedModelDefinitionTests; import org.elasticsearch.client.ml.inference.TrainedModelInput; import org.elasticsearch.client.ml.inference.TrainedModelStats; +import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncoding; import org.elasticsearch.client.ml.inference.trainedmodel.RegressionConfig; import org.elasticsearch.client.ml.inference.trainedmodel.TargetType; import org.elasticsearch.client.ml.job.config.AnalysisConfig; @@ -3003,6 +3004,9 @@ public class MlClientDocumentationIT extends ESRestHighLevelClientTestCase { .setRandomizeSeed(1234L) // <10> .setClassAssignmentObjective(Classification.ClassAssignmentObjective.MAXIMIZE_ACCURACY) // <11> .setNumTopClasses(1) // <12> + .setFeatureProcessors(Arrays.asList(OneHotEncoding.builder("categorical_feature") // <13> + .addOneHot("cat", "cat_column") + .build())) .build(); // end::put-data-frame-analytics-classification @@ -3019,6 +3023,9 @@ public class MlClientDocumentationIT extends ESRestHighLevelClientTestCase { .setRandomizeSeed(1234L) // <10> .setLossFunction(Regression.LossFunction.MSE) // <11> .setLossFunctionParameter(1.0) // <12> + .setFeatureProcessors(Arrays.asList(OneHotEncoding.builder("categorical_feature") // <13> + .addOneHot("cat", "cat_column") + .build())) .build(); // end::put-data-frame-analytics-regression diff --git a/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/ClassificationTests.java 
b/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/ClassificationTests.java index 0970222c513b..30231feb9a78 100644 --- a/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/ClassificationTests.java +++ b/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/ClassificationTests.java @@ -18,10 +18,20 @@ */ package org.elasticsearch.client.ml.dataframe; +import org.elasticsearch.client.ml.inference.MlInferenceNamedXContentProvider; +import org.elasticsearch.client.ml.inference.preprocessing.FrequencyEncodingTests; +import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncodingTests; +import org.elasticsearch.client.ml.inference.preprocessing.TargetMeanEncodingTests; +import org.elasticsearch.common.xcontent.NamedXContentRegistry; import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.test.AbstractXContentTestCase; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.function.Predicate; +import java.util.stream.Collectors; +import java.util.stream.Stream; public class ClassificationTests extends AbstractXContentTestCase { @@ -38,9 +48,20 @@ public class ClassificationTests extends AbstractXContentTestCase randomFrom(FrequencyEncodingTests.createRandom(), + OneHotEncodingTests.createRandom(), + TargetMeanEncodingTests.createRandom())) + .limit(randomIntBetween(1, 10)) + .collect(Collectors.toList())) .build(); } + @Override + protected Predicate getRandomFieldsExcludeFilter() { + return field -> field.startsWith("feature_processors"); + } + @Override protected Classification createTestInstance() { return randomClassification(); @@ -55,4 +76,11 @@ public class ClassificationTests extends AbstractXContentTestCase namedXContent = new ArrayList<>(); + namedXContent.addAll(new MlInferenceNamedXContentProvider().getNamedXContentParsers()); + return new NamedXContentRegistry(namedXContent); + } } diff --git 
a/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/DataFrameAnalyticsConfigTests.java b/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/DataFrameAnalyticsConfigTests.java index 623e7a98cc88..4d387c800b5e 100644 --- a/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/DataFrameAnalyticsConfigTests.java +++ b/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/DataFrameAnalyticsConfigTests.java @@ -20,6 +20,7 @@ package org.elasticsearch.client.ml.dataframe; import org.elasticsearch.Version; +import org.elasticsearch.client.ml.inference.MlInferenceNamedXContentProvider; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.ByteSizeUnit; import org.elasticsearch.common.unit.ByteSizeValue; @@ -101,6 +102,7 @@ public class DataFrameAnalyticsConfigTests extends AbstractXContentTestCase namedXContent = new ArrayList<>(); namedXContent.addAll(new SearchModule(Settings.EMPTY, Collections.emptyList()).getNamedXContents()); namedXContent.addAll(new MlDataFrameAnalysisNamedXContentProvider().getNamedXContentParsers()); + namedXContent.addAll(new MlInferenceNamedXContentProvider().getNamedXContentParsers()); return new NamedXContentRegistry(namedXContent); } } diff --git a/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/RegressionTests.java b/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/RegressionTests.java index fca2f54b8a0a..24425e354721 100644 --- a/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/RegressionTests.java +++ b/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/RegressionTests.java @@ -18,10 +18,20 @@ */ package org.elasticsearch.client.ml.dataframe; +import org.elasticsearch.client.ml.inference.MlInferenceNamedXContentProvider; +import org.elasticsearch.client.ml.inference.preprocessing.FrequencyEncodingTests; +import 
org.elasticsearch.client.ml.inference.preprocessing.OneHotEncodingTests; +import org.elasticsearch.client.ml.inference.preprocessing.TargetMeanEncodingTests; +import org.elasticsearch.common.xcontent.NamedXContentRegistry; import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.test.AbstractXContentTestCase; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.function.Predicate; +import java.util.stream.Collectors; +import java.util.stream.Stream; public class RegressionTests extends AbstractXContentTestCase { @@ -37,9 +47,20 @@ public class RegressionTests extends AbstractXContentTestCase { .setTrainingPercent(randomBoolean() ? null : randomDoubleBetween(1.0, 100.0, true)) .setLossFunction(randomBoolean() ? null : randomFrom(Regression.LossFunction.values())) .setLossFunctionParameter(randomBoolean() ? null : randomDoubleBetween(1.0, Double.MAX_VALUE, true)) + .setFeatureProcessors(randomBoolean() ? null : + Stream.generate(() -> randomFrom(FrequencyEncodingTests.createRandom(), + OneHotEncodingTests.createRandom(), + TargetMeanEncodingTests.createRandom())) + .limit(randomIntBetween(1, 10)) + .collect(Collectors.toList())) .build(); } + @Override + protected Predicate getRandomFieldsExcludeFilter() { + return field -> field.startsWith("feature_processors"); + } + @Override protected Regression createTestInstance() { return randomRegression(); @@ -54,4 +75,11 @@ public class RegressionTests extends AbstractXContentTestCase { protected boolean supportsUnknownFields() { return true; } + + @Override + protected NamedXContentRegistry xContentRegistry() { + List namedXContent = new ArrayList<>(); + namedXContent.addAll(new MlInferenceNamedXContentProvider().getNamedXContentParsers()); + return new NamedXContentRegistry(namedXContent); + } } diff --git a/docs/java-rest/high-level/ml/put-data-frame-analytics.asciidoc b/docs/java-rest/high-level/ml/put-data-frame-analytics.asciidoc index 
8221dff43bbd..109aecc5a14a 100644 --- a/docs/java-rest/high-level/ml/put-data-frame-analytics.asciidoc +++ b/docs/java-rest/high-level/ml/put-data-frame-analytics.asciidoc @@ -124,6 +124,8 @@ include-tagged::{doc-tests-file}[{api}-classification] <10> The seed to be used by the random generator that picks which rows are used in training. <11> The optimization objective to target when assigning class labels. Defaults to maximize_minimum_recall. <12> The number of top classes to be reported in the results. Defaults to 2. +<13> Custom feature processors that will create new features for analysis from the included document + fields. Note, automatic categorical {ml-docs}/ml-feature-encoding.html[feature encoding] still occurs for all features. ===== Regression @@ -146,6 +148,8 @@ include-tagged::{doc-tests-file}[{api}-regression] <10> The seed to be used by the random generator that picks which rows are used in training. <11> The loss function used for regression. Defaults to `mse`. <12> An optional parameter to the loss function. +<13> Custom feature processors that will create new features for analysis from the included document +fields. Note, automatic categorical {ml-docs}/ml-feature-encoding.html[feature encoding] still occurs for all features. 
==== Analyzed fields diff --git a/docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc b/docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc index 7dbb99d3d2bb..ddc2ba673113 100644 --- a/docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc +++ b/docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc @@ -20,13 +20,13 @@ experimental[] [[ml-put-dfanalytics-prereq]] == {api-prereq-title} -If the {es} {security-features} are enabled, you must have the following +If the {es} {security-features} are enabled, you must have the following built-in roles and privileges: * `machine_learning_admin` * source indices: `read`, `view_index_metadata` * destination index: `read`, `create_index`, `manage` and `index` - + For more information, see <>, <>, and {ml-docs-setup-privileges}. @@ -34,13 +34,13 @@ For more information, see <>, <>, and NOTE: The {dfanalytics-job} remembers which roles the user who created it had at the time of creation. When you start the job, it performs the analysis using those same roles. If you provide -<>, +<>, those credentials are used instead. [[ml-put-dfanalytics-desc]] == {api-description-title} -This API creates a {dfanalytics-job} that performs an analysis on the source +This API creates a {dfanalytics-job} that performs an analysis on the source indices and stores the outcome in a destination index. If the destination index does not exist, it is created automatically when you @@ -48,7 +48,7 @@ start the job. See <>. If you supply only a subset of the {regression} or {classification} parameters, {ml-docs}/hyperparameters.html[hyperparameter optimization] occurs. It -determines a value for each of the undefined parameters. +determines a value for each of the undefined parameters. 
[[ml-put-dfanalytics-path-params]] @@ -63,8 +63,8 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=job-id-data-frame-analytics-def == {api-request-body-title} `allow_lazy_start`:: -(Optional, boolean) -Specifies whether this job can start when there is insufficient {ml} node +(Optional, boolean) +Specifies whether this job can start when there is insufficient {ml} node capacity for it to be immediately assigned to a node. The default is `false`; if a {ml} node with capacity to run the job cannot immediately be found, the API returns an error. However, this is also subject to the cluster-wide @@ -88,7 +88,7 @@ one of the following types of analysis: {classification}, {oldetection}, or The configuration information necessary to perform {ml-docs}/dfa-classification.html[{classification}]. + -TIP: Advanced parameters are for fine-tuning {classanalysis}. They are set +TIP: Advanced parameters are for fine-tuning {classanalysis}. They are set automatically by hyperparameter optimization to give the minimum validation error. It is highly recommended to use the default values unless you fully understand the function of these parameters. @@ -105,28 +105,32 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=class-assignment-objective] + include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=dependent-variable] + -The data type of the field must be numeric (`integer`, `short`, `long`, `byte`), +The data type of the field must be numeric (`integer`, `short`, `long`, `byte`), categorical (`ip` or `keyword`), or boolean. There must be no more than 30 -different values in this field. +different values in this field. 
`eta`:::: -(Optional, double) +(Optional, double) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=eta] `feature_bag_fraction`:::: -(Optional, double) +(Optional, double) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=feature-bag-fraction] +`feature_processors`:::: +(Optional, list) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=dfas-feature-processors] + `gamma`:::: -(Optional, double) +(Optional, double) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=gamma] `lambda`:::: -(Optional, double) +(Optional, double) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=lambda] `max_trees`:::: -(Optional, integer) +(Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=max-trees] `num_top_classes`:::: @@ -138,11 +142,11 @@ categories, the API reports all category probabilities. Defaults to 2. `num_top_feature_importance_values`:::: (Optional, integer) Advanced configuration option. Specifies the maximum number of -{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document to return. +{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document to return. By default, it is zero and no {feat-imp} calculation occurs. 
`prediction_field_name`:::: -(Optional, string) +(Optional, string) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=prediction-field-name] `randomize_seed`:::: @@ -164,27 +168,27 @@ The configuration information necessary to perform [%collapsible%open] ===== `compute_feature_influence`:::: -(Optional, boolean) +(Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=compute-feature-influence] - -`feature_influence_threshold`:::: -(Optional, double) + +`feature_influence_threshold`:::: +(Optional, double) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=feature-influence-threshold] `method`:::: (Optional, string) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=method] - + `n_neighbors`:::: (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=n-neighbors] - + `outlier_fraction`:::: -(Optional, double) +(Optional, double) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=outlier-fraction] - + `standardization_enabled`:::: -(Optional, boolean) +(Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=standardization-enabled] //End outlier_detection ===== @@ -194,7 +198,7 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=standardization-enabled] The configuration information necessary to perform {ml-docs}/dfa-regression.html[{regression}]. + -TIP: Advanced parameters are for fine-tuning {reganalysis}. They are set +TIP: Advanced parameters are for fine-tuning {reganalysis}. They are set automatically by hyperparameter optimization to give the minimum validation error. It is highly recommended to use the default values unless you fully understand the function of these parameters. 
@@ -217,20 +221,24 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=eta] (Optional, double) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=feature-bag-fraction] +`feature_processors`:::: +(Optional, list) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=dfas-feature-processors] + `gamma`:::: -(Optional, double) +(Optional, double) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=gamma] `lambda`:::: -(Optional, double) +(Optional, double) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=lambda] `loss_function`:::: (Optional, string) -The loss function used during {regression}. Available options are `mse` (mean -squared error), `msle` (mean squared logarithmic error), `huber` (Pseudo-Huber -loss). Defaults to `mse`. Refer to -{ml-docs}/dfa-regression.html#dfa-regression-lossfunction[Loss functions for {regression} analyses] +The loss function used during {regression}. Available options are `mse` (mean +squared error), `msle` (mean squared logarithmic error), `huber` (Pseudo-Huber +loss). Defaults to `mse`. Refer to +{ml-docs}/dfa-regression.html#dfa-regression-lossfunction[Loss functions for {regression} analyses] to learn more. `loss_function_parameter`:::: @@ -238,13 +246,13 @@ to learn more. A positive number that is used as a parameter to the `loss_function`. `max_trees`:::: -(Optional, integer) +(Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=max-trees] `num_top_feature_importance_values`:::: (Optional, integer) Advanced configuration option. Specifies the maximum number of -{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document to return. +{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document to return. By default, it is zero and no {feat-imp} calculation occurs. 
`prediction_field_name`:::: @@ -266,31 +274,31 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=training-percent] //Begin analyzed_fields `analyzed_fields`:: (Optional, object) -Specify `includes` and/or `excludes` patterns to select which fields will be -included in the analysis. The patterns specified in `excludes` are applied last, -therefore `excludes` takes precedence. In other words, if the same field is -specified in both `includes` and `excludes`, then the field will not be included +Specify `includes` and/or `excludes` patterns to select which fields will be +included in the analysis. The patterns specified in `excludes` are applied last, +therefore `excludes` takes precedence. In other words, if the same field is +specified in both `includes` and `excludes`, then the field will not be included in the analysis. + -- [[dfa-supported-fields]] The supported fields for each type of analysis are as follows: -* {oldetection-cap} requires numeric or boolean data to analyze. The algorithms -don't support missing values therefore fields that have data types other than -numeric or boolean are ignored. Documents where included fields contain missing -values, null values, or an array are also ignored. Therefore the `dest` index +* {oldetection-cap} requires numeric or boolean data to analyze. The algorithms +don't support missing values therefore fields that have data types other than +numeric or boolean are ignored. Documents where included fields contain missing +values, null values, or an array are also ignored. Therefore the `dest` index may contain documents that don't have an {olscore}. -* {regression-cap} supports fields that are numeric, `boolean`, `text`, -`keyword`, and `ip`. It is also tolerant of missing values. Fields that are -supported are included in the analysis, other fields are ignored. Documents -where included fields contain an array with two or more values are also -ignored. 
Documents in the `dest` index that don’t contain a results field are +* {regression-cap} supports fields that are numeric, `boolean`, `text`, +`keyword`, and `ip`. It is also tolerant of missing values. Fields that are +supported are included in the analysis, other fields are ignored. Documents +where included fields contain an array with two or more values are also +ignored. Documents in the `dest` index that don’t contain a results field are not included in the {reganalysis}. * {classification-cap} supports fields that are numeric, `boolean`, `text`, -`keyword`, and `ip`. It is also tolerant of missing values. Fields that are +`keyword`, and `ip`. It is also tolerant of missing values. Fields that are supported are included in the analysis, other fields are ignored. Documents -where included fields contain an array with two or more values are also ignored. +where included fields contain an array with two or more values are also ignored. Documents in the `dest` index that don’t contain a results field are not included in the {classanalysis}. {classanalysis-cap} can be improved by mapping ordinal variable values to a single number. For example, in case of age ranges, @@ -312,7 +320,7 @@ analysis. You do not need to add fields with unsupported data types to `includes`::: (Optional, array) -An array of strings that defines the fields that will be included in the +An array of strings that defines the fields that will be included in the analysis. //End analyzed_fields ==== @@ -332,16 +340,16 @@ The default value is `1`. Using more threads may decrease the time necessary to complete the analysis at the cost of using more CPU. Note that the process may use additional threads for operational functionality other than the analysis itself. - + `model_memory_limit`:: (Optional, string) -The approximate maximum amount of memory resources that are permitted for -analytical processing. The default value for {dfanalytics-jobs} is `1gb`. 
If -your `elasticsearch.yml` file contains an `xpack.ml.max_model_memory_limit` -setting, an error occurs when you try to create {dfanalytics-jobs} that have -`model_memory_limit` values greater than that setting. For more information, see +The approximate maximum amount of memory resources that are permitted for +analytical processing. The default value for {dfanalytics-jobs} is `1gb`. If +your `elasticsearch.yml` file contains an `xpack.ml.max_model_memory_limit` +setting, an error occurs when you try to create {dfanalytics-jobs} that have +`model_memory_limit` values greater than that setting. For more information, see <>. - + `source`:: (object) The configuration of how to source the analysis data. It requires an `index`. @@ -355,7 +363,7 @@ Optionally, `query` and `_source` may be specified. It can be a single index or index pattern as well as an array of indices or patterns. + -WARNING: If your source indices contain documents with the same IDs, only the +WARNING: If your source indices contain documents with the same IDs, only the document that is indexed last appears in the destination index. `query`::: @@ -376,7 +384,7 @@ included in the analysis. `includes`:::: (array) An array of strings that defines the fields that will be included in the destination. - + `excludes`:::: (array) An array of strings that defines the fields that will be excluded from the destination. @@ -389,8 +397,8 @@ the destination. [[ml-put-dfanalytics-example-preprocess]] === Preprocessing actions example -The following example shows how to limit the scope of the analysis to certain -fields, specify excluded fields in the destination index, and use a query to +The following example shows how to limit the scope of the analysis to certain +fields, specify excluded fields in the destination index, and use a query to filter your data before analysis. 
[source,console] @@ -403,7 +411,7 @@ PUT _ml/data_frame/analytics/model-flight-delays-pre ], "query": { <2> "range": { - "DistanceKilometers": { + "DistanceKilometers": { "gt": 0 } } @@ -428,7 +436,7 @@ PUT _ml/data_frame/analytics/model-flight-delays-pre }, "analyzed_fields": { <5> "includes": [], - "excludes": [ + "excludes": [ "FlightNum" ] }, @@ -438,29 +446,29 @@ PUT _ml/data_frame/analytics/model-flight-delays-pre // TEST[skip:setup kibana sample data] <1> Source index to analyze. -<2> This query filters out entire documents that will not be present in the +<2> This query filters out entire documents that will not be present in the destination index. -<3> The `_source` object defines fields in the dataset that will be included or -excluded in the destination index. -<4> Defines the destination index that contains the results of the analysis and -the fields of the source index specified in the `_source` object. Also defines +<3> The `_source` object defines fields in the data set that will be included or +excluded in the destination index. +<4> Defines the destination index that contains the results of the analysis and +the fields of the source index specified in the `_source` object. Also defines the name of the `results_field`. -<5> Specifies fields to be included in or excluded from the analysis. This does -not affect whether the fields will be present in the destination index, only +<5> Specifies fields to be included in or excluded from the analysis. This does +not affect whether the fields will be present in the destination index, only affects whether they are used in the analysis. -In this example, we can see that all the fields of the source index are included -in the destination index except `FlightDelay` and `FlightDelayType` because -these are defined as excluded fields by the `excludes` parameter of the -`_source` object. 
The `FlightNum` field is included in the destination index, -however it is not included in the analysis because it is explicitly specified as +In this example, we can see that all the fields of the source index are included +in the destination index except `FlightDelay` and `FlightDelayType` because +these are defined as excluded fields by the `excludes` parameter of the +`_source` object. The `FlightNum` field is included in the destination index, +however it is not included in the analysis because it is explicitly specified as excluded field by the `excludes` parameter of the `analyzed_fields` object. [[ml-put-dfanalytics-example-od]] === {oldetection-cap} example -The following example creates the `loganalytics` {dfanalytics-job}, the analysis +The following example creates the `loganalytics` {dfanalytics-job}, the analysis type is `outlier_detection`: [source,console] @@ -524,7 +532,7 @@ The API returns the following result: [[ml-put-dfanalytics-example-r]] === {regression-cap} examples -The following example creates the `house_price_regression_analysis` +The following example creates the `house_price_regression_analysis` {dfanalytics-job}, the analysis type is `regression`: [source,console] @@ -537,7 +545,7 @@ PUT _ml/data_frame/analytics/house_price_regression_analysis "dest": { "index": "house_price_predictions" }, - "analysis": + "analysis": { "regression": { "dependent_variable": "price" @@ -613,7 +621,7 @@ PUT _ml/data_frame/analytics/student_performance_mathematics_0.3 [[ml-put-dfanalytics-example-c]] === {classification-cap} example -The following example creates the `loan_classification` {dfanalytics-job}, the +The following example creates the `loan_classification` {dfanalytics-job}, the analysis type is `classification`: [source,console] diff --git a/docs/reference/ml/ml-shared.asciidoc b/docs/reference/ml/ml-shared.asciidoc index 88de83639944..2a3dea8aca4d 100644 --- a/docs/reference/ml/ml-shared.asciidoc +++ b/docs/reference/ml/ml-shared.asciidoc 
@@ -522,6 +522,14 @@ The fraction of features that is used when selecting a random bag for each candidate split. end::dfas-feature-bag-fraction[] +tag::dfas-feature-processors[] +A collection of feature preprocessors that modify one or more included fields. +The analysis uses the resulting one or more features instead of the +original document field. Multiple `feature_processors` entries can refer to the +same document fields. +Note that automatic categorical {ml-docs}/ml-feature-encoding.html[feature encoding] still occurs. +end::dfas-feature-processors[] + tag::dfas-iteration[] The number of iterations on the analysis. end::dfas-iteration[]