From 1b34c88d56c69590a7b954e5ea067ec4e77cb3c2 Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Mon, 24 Aug 2020 12:00:44 -0400 Subject: [PATCH] [ML] adding docs + hlrc for data frame analysis feature_processors (#61149) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds HLRC and some docs for the new feature_processors field in Data frame analytics. Co-authored-by: Przemysław Witek Co-authored-by: Lisa Cawley --- .../client/ml/dataframe/Classification.java | 36 +++- .../client/ml/dataframe/Regression.java | 37 +++- .../preprocessing/OneHotEncoding.java | 2 +- .../MlClientDocumentationIT.java | 7 + .../ml/dataframe/ClassificationTests.java | 28 +++ .../DataFrameAnalyticsConfigTests.java | 2 + .../client/ml/dataframe/RegressionTests.java | 28 +++ .../ml/put-data-frame-analytics.asciidoc | 4 + .../apis/put-dfanalytics.asciidoc | 168 +++++++++--------- docs/reference/ml/ml-shared.asciidoc | 8 + 10 files changed, 229 insertions(+), 91 deletions(-) diff --git a/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/dataframe/Classification.java b/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/dataframe/Classification.java index 3d9241d6ec95..d5639bda9bc1 100644 --- a/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/dataframe/Classification.java +++ b/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/dataframe/Classification.java @@ -18,6 +18,8 @@ */ package org.elasticsearch.client.ml.dataframe; +import org.elasticsearch.client.ml.inference.NamedXContentObjectHelper; +import org.elasticsearch.client.ml.inference.preprocessing.PreProcessor; import org.elasticsearch.common.Nullable; import org.elasticsearch.common.ParseField; import org.elasticsearch.common.Strings; @@ -26,6 +28,7 @@ import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentParser; import java.io.IOException; +import java.util.List; import 
java.util.Locale; import java.util.Objects; @@ -53,7 +56,9 @@ public class Classification implements DataFrameAnalysis { static final ParseField CLASS_ASSIGNMENT_OBJECTIVE = new ParseField("class_assignment_objective"); static final ParseField NUM_TOP_CLASSES = new ParseField("num_top_classes"); static final ParseField RANDOMIZE_SEED = new ParseField("randomize_seed"); + static final ParseField FEATURE_PROCESSORS = new ParseField("feature_processors"); + @SuppressWarnings("unchecked") private static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>( NAME.getPreferredName(), @@ -70,7 +75,8 @@ public class Classification implements DataFrameAnalysis { (Double) a[8], (Integer) a[9], (Long) a[10], - (ClassAssignmentObjective) a[11])); + (ClassAssignmentObjective) a[11], + (List) a[12])); static { PARSER.declareString(ConstructingObjectParser.constructorArg(), DEPENDENT_VARIABLE); @@ -86,6 +92,10 @@ public class Classification implements DataFrameAnalysis { PARSER.declareLong(ConstructingObjectParser.optionalConstructorArg(), RANDOMIZE_SEED); PARSER.declareString( ConstructingObjectParser.optionalConstructorArg(), ClassAssignmentObjective::fromString, CLASS_ASSIGNMENT_OBJECTIVE); + PARSER.declareNamedObjects(ConstructingObjectParser.optionalConstructorArg(), + (p, c, n) -> p.namedObject(PreProcessor.class, n, c), + (classification) -> {}, + FEATURE_PROCESSORS); } private final String dependentVariable; @@ -100,12 +110,13 @@ public class Classification implements DataFrameAnalysis { private final ClassAssignmentObjective classAssignmentObjective; private final Integer numTopClasses; private final Long randomizeSeed; + private final List featureProcessors; private Classification(String dependentVariable, @Nullable Double lambda, @Nullable Double gamma, @Nullable Double eta, @Nullable Integer maxTrees, @Nullable Double featureBagFraction, @Nullable Integer numTopFeatureImportanceValues, @Nullable String predictionFieldName, @Nullable Double 
trainingPercent, @Nullable Integer numTopClasses, @Nullable Long randomizeSeed, - @Nullable ClassAssignmentObjective classAssignmentObjective) { + @Nullable ClassAssignmentObjective classAssignmentObjective, @Nullable List featureProcessors) { this.dependentVariable = Objects.requireNonNull(dependentVariable); this.lambda = lambda; this.gamma = gamma; @@ -118,6 +129,7 @@ public class Classification implements DataFrameAnalysis { this.classAssignmentObjective = classAssignmentObjective; this.numTopClasses = numTopClasses; this.randomizeSeed = randomizeSeed; + this.featureProcessors = featureProcessors; } @Override @@ -173,6 +185,10 @@ public class Classification implements DataFrameAnalysis { return numTopClasses; } + public List getFeatureProcessors() { + return featureProcessors; + } + @Override public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { builder.startObject(); @@ -210,6 +226,9 @@ public class Classification implements DataFrameAnalysis { if (numTopClasses != null) { builder.field(NUM_TOP_CLASSES.getPreferredName(), numTopClasses); } + if (featureProcessors != null) { + NamedXContentObjectHelper.writeNamedObjects(builder, params, true, FEATURE_PROCESSORS.getPreferredName(), featureProcessors); + } builder.endObject(); return builder; } @@ -217,7 +236,7 @@ public class Classification implements DataFrameAnalysis { @Override public int hashCode() { return Objects.hash(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction, numTopFeatureImportanceValues, - predictionFieldName, trainingPercent, randomizeSeed, numTopClasses, classAssignmentObjective); + predictionFieldName, trainingPercent, randomizeSeed, numTopClasses, classAssignmentObjective, featureProcessors); } @Override @@ -236,7 +255,8 @@ public class Classification implements DataFrameAnalysis { && Objects.equals(trainingPercent, that.trainingPercent) && Objects.equals(randomizeSeed, that.randomizeSeed) && Objects.equals(numTopClasses, 
that.numTopClasses) - && Objects.equals(classAssignmentObjective, that.classAssignmentObjective); + && Objects.equals(classAssignmentObjective, that.classAssignmentObjective) + && Objects.equals(featureProcessors, that.featureProcessors); } @Override @@ -270,6 +290,7 @@ public class Classification implements DataFrameAnalysis { private Integer numTopClasses; private Long randomizeSeed; private ClassAssignmentObjective classAssignmentObjective; + private List featureProcessors; private Builder(String dependentVariable) { this.dependentVariable = Objects.requireNonNull(dependentVariable); @@ -330,10 +351,15 @@ public class Classification implements DataFrameAnalysis { return this; } + public Builder setFeatureProcessors(List featureProcessors) { + this.featureProcessors = featureProcessors; + return this; + } + public Classification build() { return new Classification(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction, numTopFeatureImportanceValues, predictionFieldName, trainingPercent, numTopClasses, randomizeSeed, - classAssignmentObjective); + classAssignmentObjective, featureProcessors); } } } diff --git a/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/dataframe/Regression.java b/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/dataframe/Regression.java index e897dde198e0..6e9f89700139 100644 --- a/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/dataframe/Regression.java +++ b/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/dataframe/Regression.java @@ -18,6 +18,8 @@ */ package org.elasticsearch.client.ml.dataframe; +import org.elasticsearch.client.ml.inference.NamedXContentObjectHelper; +import org.elasticsearch.client.ml.inference.preprocessing.PreProcessor; import org.elasticsearch.common.Nullable; import org.elasticsearch.common.ParseField; import org.elasticsearch.common.Strings; @@ -26,6 +28,7 @@ import org.elasticsearch.common.xcontent.XContentBuilder; import 
org.elasticsearch.common.xcontent.XContentParser; import java.io.IOException; +import java.util.List; import java.util.Locale; import java.util.Objects; @@ -55,7 +58,9 @@ public class Regression implements DataFrameAnalysis { static final ParseField RANDOMIZE_SEED = new ParseField("randomize_seed"); static final ParseField LOSS_FUNCTION = new ParseField("loss_function"); static final ParseField LOSS_FUNCTION_PARAMETER = new ParseField("loss_function_parameter"); + static final ParseField FEATURE_PROCESSORS = new ParseField("feature_processors"); + @SuppressWarnings("unchecked") private static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>( NAME.getPreferredName(), @@ -72,7 +77,8 @@ public class Regression implements DataFrameAnalysis { (Double) a[8], (Long) a[9], (LossFunction) a[10], - (Double) a[11] + (Double) a[11], + (List) a[12] )); static { @@ -88,6 +94,10 @@ public class Regression implements DataFrameAnalysis { PARSER.declareLong(ConstructingObjectParser.optionalConstructorArg(), RANDOMIZE_SEED); PARSER.declareString(optionalConstructorArg(), LossFunction::fromString, LOSS_FUNCTION); PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), LOSS_FUNCTION_PARAMETER); + PARSER.declareNamedObjects(ConstructingObjectParser.optionalConstructorArg(), + (p, c, n) -> p.namedObject(PreProcessor.class, n, c), + (regression) -> {}, + FEATURE_PROCESSORS); } private final String dependentVariable; @@ -102,12 +112,13 @@ public class Regression implements DataFrameAnalysis { private final Long randomizeSeed; private final LossFunction lossFunction; private final Double lossFunctionParameter; + private final List featureProcessors; private Regression(String dependentVariable, @Nullable Double lambda, @Nullable Double gamma, @Nullable Double eta, @Nullable Integer maxTrees, @Nullable Double featureBagFraction, @Nullable Integer numTopFeatureImportanceValues, @Nullable String predictionFieldName, @Nullable Double trainingPercent, 
@Nullable Long randomizeSeed, @Nullable LossFunction lossFunction, - @Nullable Double lossFunctionParameter) { + @Nullable Double lossFunctionParameter, @Nullable List featureProcessors) { this.dependentVariable = Objects.requireNonNull(dependentVariable); this.lambda = lambda; this.gamma = gamma; @@ -120,6 +131,7 @@ public class Regression implements DataFrameAnalysis { this.randomizeSeed = randomizeSeed; this.lossFunction = lossFunction; this.lossFunctionParameter = lossFunctionParameter; + this.featureProcessors = featureProcessors; } @Override @@ -175,6 +187,10 @@ public class Regression implements DataFrameAnalysis { return lossFunctionParameter; } + public List getFeatureProcessors() { + return featureProcessors; + } + @Override public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { builder.startObject(); @@ -212,6 +228,9 @@ public class Regression implements DataFrameAnalysis { if (lossFunctionParameter != null) { builder.field(LOSS_FUNCTION_PARAMETER.getPreferredName(), lossFunctionParameter); } + if (featureProcessors != null) { + NamedXContentObjectHelper.writeNamedObjects(builder, params, true, FEATURE_PROCESSORS.getPreferredName(), featureProcessors); + } builder.endObject(); return builder; } @@ -219,7 +238,7 @@ public class Regression implements DataFrameAnalysis { @Override public int hashCode() { return Objects.hash(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction, numTopFeatureImportanceValues, - predictionFieldName, trainingPercent, randomizeSeed, lossFunction, lossFunctionParameter); + predictionFieldName, trainingPercent, randomizeSeed, lossFunction, lossFunctionParameter, featureProcessors); } @Override @@ -238,7 +257,8 @@ public class Regression implements DataFrameAnalysis { && Objects.equals(trainingPercent, that.trainingPercent) && Objects.equals(randomizeSeed, that.randomizeSeed) && Objects.equals(lossFunction, that.lossFunction) - && Objects.equals(lossFunctionParameter, 
that.lossFunctionParameter); + && Objects.equals(lossFunctionParameter, that.lossFunctionParameter) + && Objects.equals(featureProcessors, that.featureProcessors); } @Override @@ -259,6 +279,7 @@ public class Regression implements DataFrameAnalysis { private Long randomizeSeed; private LossFunction lossFunction; private Double lossFunctionParameter; + private List featureProcessors; private Builder(String dependentVariable) { this.dependentVariable = Objects.requireNonNull(dependentVariable); @@ -319,9 +340,15 @@ public class Regression implements DataFrameAnalysis { return this; } + public Builder setFeatureProcessors(List featureProcessors) { + this.featureProcessors = featureProcessors; + return this; + } + public Regression build() { return new Regression(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction, - numTopFeatureImportanceValues, predictionFieldName, trainingPercent, randomizeSeed, lossFunction, lossFunctionParameter); + numTopFeatureImportanceValues, predictionFieldName, trainingPercent, randomizeSeed, lossFunction, lossFunctionParameter, + featureProcessors); } } diff --git a/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/inference/preprocessing/OneHotEncoding.java b/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/inference/preprocessing/OneHotEncoding.java index a2121c510570..ba5d5e146475 100644 --- a/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/inference/preprocessing/OneHotEncoding.java +++ b/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/inference/preprocessing/OneHotEncoding.java @@ -114,7 +114,7 @@ public class OneHotEncoding implements PreProcessor { return Objects.hash(field, hotMap, custom); } - public Builder builder(String field) { + public static Builder builder(String field) { return new Builder(field); } diff --git a/client/rest-high-level/src/test/java/org/elasticsearch/client/documentation/MlClientDocumentationIT.java 
b/client/rest-high-level/src/test/java/org/elasticsearch/client/documentation/MlClientDocumentationIT.java index 6ea03396e827..a676e3a5a7f8 100644 --- a/client/rest-high-level/src/test/java/org/elasticsearch/client/documentation/MlClientDocumentationIT.java +++ b/client/rest-high-level/src/test/java/org/elasticsearch/client/documentation/MlClientDocumentationIT.java @@ -179,6 +179,7 @@ import org.elasticsearch.client.ml.inference.TrainedModelDefinition; import org.elasticsearch.client.ml.inference.TrainedModelDefinitionTests; import org.elasticsearch.client.ml.inference.TrainedModelInput; import org.elasticsearch.client.ml.inference.TrainedModelStats; +import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncoding; import org.elasticsearch.client.ml.inference.trainedmodel.RegressionConfig; import org.elasticsearch.client.ml.inference.trainedmodel.TargetType; import org.elasticsearch.client.ml.job.config.AnalysisConfig; @@ -3003,6 +3004,9 @@ public class MlClientDocumentationIT extends ESRestHighLevelClientTestCase { .setRandomizeSeed(1234L) // <10> .setClassAssignmentObjective(Classification.ClassAssignmentObjective.MAXIMIZE_ACCURACY) // <11> .setNumTopClasses(1) // <12> + .setFeatureProcessors(Arrays.asList(OneHotEncoding.builder("categorical_feature") // <13> + .addOneHot("cat", "cat_column") + .build())) .build(); // end::put-data-frame-analytics-classification @@ -3019,6 +3023,9 @@ public class MlClientDocumentationIT extends ESRestHighLevelClientTestCase { .setRandomizeSeed(1234L) // <10> .setLossFunction(Regression.LossFunction.MSE) // <11> .setLossFunctionParameter(1.0) // <12> + .setFeatureProcessors(Arrays.asList(OneHotEncoding.builder("categorical_feature") // <13> + .addOneHot("cat", "cat_column") + .build())) .build(); // end::put-data-frame-analytics-regression diff --git a/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/ClassificationTests.java 
b/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/ClassificationTests.java index 0970222c513b..30231feb9a78 100644 --- a/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/ClassificationTests.java +++ b/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/ClassificationTests.java @@ -18,10 +18,20 @@ */ package org.elasticsearch.client.ml.dataframe; +import org.elasticsearch.client.ml.inference.MlInferenceNamedXContentProvider; +import org.elasticsearch.client.ml.inference.preprocessing.FrequencyEncodingTests; +import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncodingTests; +import org.elasticsearch.client.ml.inference.preprocessing.TargetMeanEncodingTests; +import org.elasticsearch.common.xcontent.NamedXContentRegistry; import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.test.AbstractXContentTestCase; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.function.Predicate; +import java.util.stream.Collectors; +import java.util.stream.Stream; public class ClassificationTests extends AbstractXContentTestCase { @@ -38,9 +48,20 @@ public class ClassificationTests extends AbstractXContentTestCase randomFrom(FrequencyEncodingTests.createRandom(), + OneHotEncodingTests.createRandom(), + TargetMeanEncodingTests.createRandom())) + .limit(randomIntBetween(1, 10)) + .collect(Collectors.toList())) .build(); } + @Override + protected Predicate getRandomFieldsExcludeFilter() { + return field -> field.startsWith("feature_processors"); + } + @Override protected Classification createTestInstance() { return randomClassification(); @@ -55,4 +76,11 @@ public class ClassificationTests extends AbstractXContentTestCase namedXContent = new ArrayList<>(); + namedXContent.addAll(new MlInferenceNamedXContentProvider().getNamedXContentParsers()); + return new NamedXContentRegistry(namedXContent); + } } diff --git 
a/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/DataFrameAnalyticsConfigTests.java b/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/DataFrameAnalyticsConfigTests.java index 623e7a98cc88..4d387c800b5e 100644 --- a/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/DataFrameAnalyticsConfigTests.java +++ b/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/DataFrameAnalyticsConfigTests.java @@ -20,6 +20,7 @@ package org.elasticsearch.client.ml.dataframe; import org.elasticsearch.Version; +import org.elasticsearch.client.ml.inference.MlInferenceNamedXContentProvider; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.ByteSizeUnit; import org.elasticsearch.common.unit.ByteSizeValue; @@ -101,6 +102,7 @@ public class DataFrameAnalyticsConfigTests extends AbstractXContentTestCase namedXContent = new ArrayList<>(); namedXContent.addAll(new SearchModule(Settings.EMPTY, Collections.emptyList()).getNamedXContents()); namedXContent.addAll(new MlDataFrameAnalysisNamedXContentProvider().getNamedXContentParsers()); + namedXContent.addAll(new MlInferenceNamedXContentProvider().getNamedXContentParsers()); return new NamedXContentRegistry(namedXContent); } } diff --git a/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/RegressionTests.java b/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/RegressionTests.java index fca2f54b8a0a..24425e354721 100644 --- a/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/RegressionTests.java +++ b/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/RegressionTests.java @@ -18,10 +18,20 @@ */ package org.elasticsearch.client.ml.dataframe; +import org.elasticsearch.client.ml.inference.MlInferenceNamedXContentProvider; +import org.elasticsearch.client.ml.inference.preprocessing.FrequencyEncodingTests; +import 
org.elasticsearch.client.ml.inference.preprocessing.OneHotEncodingTests; +import org.elasticsearch.client.ml.inference.preprocessing.TargetMeanEncodingTests; +import org.elasticsearch.common.xcontent.NamedXContentRegistry; import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.test.AbstractXContentTestCase; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.function.Predicate; +import java.util.stream.Collectors; +import java.util.stream.Stream; public class RegressionTests extends AbstractXContentTestCase { @@ -37,9 +47,20 @@ public class RegressionTests extends AbstractXContentTestCase { .setTrainingPercent(randomBoolean() ? null : randomDoubleBetween(1.0, 100.0, true)) .setLossFunction(randomBoolean() ? null : randomFrom(Regression.LossFunction.values())) .setLossFunctionParameter(randomBoolean() ? null : randomDoubleBetween(1.0, Double.MAX_VALUE, true)) + .setFeatureProcessors(randomBoolean() ? null : + Stream.generate(() -> randomFrom(FrequencyEncodingTests.createRandom(), + OneHotEncodingTests.createRandom(), + TargetMeanEncodingTests.createRandom())) + .limit(randomIntBetween(1, 10)) + .collect(Collectors.toList())) .build(); } + @Override + protected Predicate getRandomFieldsExcludeFilter() { + return field -> field.startsWith("feature_processors"); + } + @Override protected Regression createTestInstance() { return randomRegression(); @@ -54,4 +75,11 @@ public class RegressionTests extends AbstractXContentTestCase { protected boolean supportsUnknownFields() { return true; } + + @Override + protected NamedXContentRegistry xContentRegistry() { + List namedXContent = new ArrayList<>(); + namedXContent.addAll(new MlInferenceNamedXContentProvider().getNamedXContentParsers()); + return new NamedXContentRegistry(namedXContent); + } } diff --git a/docs/java-rest/high-level/ml/put-data-frame-analytics.asciidoc b/docs/java-rest/high-level/ml/put-data-frame-analytics.asciidoc index 
8221dff43bbd..109aecc5a14a 100644 --- a/docs/java-rest/high-level/ml/put-data-frame-analytics.asciidoc +++ b/docs/java-rest/high-level/ml/put-data-frame-analytics.asciidoc @@ -124,6 +124,8 @@ include-tagged::{doc-tests-file}[{api}-classification] <10> The seed to be used by the random generator that picks which rows are used in training. <11> The optimization objective to target when assigning class labels. Defaults to maximize_minimum_recall. <12> The number of top classes to be reported in the results. Defaults to 2. +<13> Custom feature processors that will create new features for analysis from the included document + fields. Note, automatic categorical {ml-docs}/ml-feature-encoding.html[feature encoding] still occurs for all features. ===== Regression @@ -146,6 +148,8 @@ include-tagged::{doc-tests-file}[{api}-regression] <10> The seed to be used by the random generator that picks which rows are used in training. <11> The loss function used for regression. Defaults to `mse`. <12> An optional parameter to the loss function. +<13> Custom feature processors that will create new features for analysis from the included document +fields. Note, automatic categorical {ml-docs}/ml-feature-encoding.html[feature encoding] still occurs for all features. 
==== Analyzed fields diff --git a/docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc b/docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc index 7dbb99d3d2bb..ddc2ba673113 100644 --- a/docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc +++ b/docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc @@ -20,13 +20,13 @@ experimental[] [[ml-put-dfanalytics-prereq]] == {api-prereq-title} -If the {es} {security-features} are enabled, you must have the following +If the {es} {security-features} are enabled, you must have the following built-in roles and privileges: * `machine_learning_admin` * source indices: `read`, `view_index_metadata` * destination index: `read`, `create_index`, `manage` and `index` - + For more information, see <>, <>, and {ml-docs-setup-privileges}. @@ -34,13 +34,13 @@ For more information, see <>, <>, and NOTE: The {dfanalytics-job} remembers which roles the user who created it had at the time of creation. When you start the job, it performs the analysis using those same roles. If you provide -<>, +<>, those credentials are used instead. [[ml-put-dfanalytics-desc]] == {api-description-title} -This API creates a {dfanalytics-job} that performs an analysis on the source +This API creates a {dfanalytics-job} that performs an analysis on the source indices and stores the outcome in a destination index. If the destination index does not exist, it is created automatically when you @@ -48,7 +48,7 @@ start the job. See <>. If you supply only a subset of the {regression} or {classification} parameters, {ml-docs}/hyperparameters.html[hyperparameter optimization] occurs. It -determines a value for each of the undefined parameters. +determines a value for each of the undefined parameters. 
[[ml-put-dfanalytics-path-params]] @@ -63,8 +63,8 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=job-id-data-frame-analytics-def == {api-request-body-title} `allow_lazy_start`:: -(Optional, boolean) -Specifies whether this job can start when there is insufficient {ml} node +(Optional, boolean) +Specifies whether this job can start when there is insufficient {ml} node capacity for it to be immediately assigned to a node. The default is `false`; if a {ml} node with capacity to run the job cannot immediately be found, the API returns an error. However, this is also subject to the cluster-wide @@ -88,7 +88,7 @@ one of the following types of analysis: {classification}, {oldetection}, or The configuration information necessary to perform {ml-docs}/dfa-classification.html[{classification}]. + -TIP: Advanced parameters are for fine-tuning {classanalysis}. They are set +TIP: Advanced parameters are for fine-tuning {classanalysis}. They are set automatically by hyperparameter optimization to give the minimum validation error. It is highly recommended to use the default values unless you fully understand the function of these parameters. @@ -105,28 +105,32 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=class-assignment-objective] + include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=dependent-variable] + -The data type of the field must be numeric (`integer`, `short`, `long`, `byte`), +The data type of the field must be numeric (`integer`, `short`, `long`, `byte`), categorical (`ip` or `keyword`), or boolean. There must be no more than 30 -different values in this field. +different values in this field. 
`eta`:::: -(Optional, double) +(Optional, double) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=eta] `feature_bag_fraction`:::: -(Optional, double) +(Optional, double) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=feature-bag-fraction] +`feature_processors`:::: +(Optional, list) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=dfas-feature-processors] + `gamma`:::: -(Optional, double) +(Optional, double) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=gamma] `lambda`:::: -(Optional, double) +(Optional, double) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=lambda] `max_trees`:::: -(Optional, integer) +(Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=max-trees] `num_top_classes`:::: @@ -138,11 +142,11 @@ categories, the API reports all category probabilities. Defaults to 2. `num_top_feature_importance_values`:::: (Optional, integer) Advanced configuration option. Specifies the maximum number of -{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document to return. +{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document to return. By default, it is zero and no {feat-imp} calculation occurs. 
`prediction_field_name`:::: -(Optional, string) +(Optional, string) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=prediction-field-name] `randomize_seed`:::: @@ -164,27 +168,27 @@ The configuration information necessary to perform [%collapsible%open] ===== `compute_feature_influence`:::: -(Optional, boolean) +(Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=compute-feature-influence] - -`feature_influence_threshold`:::: -(Optional, double) + +`feature_influence_threshold`:::: +(Optional, double) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=feature-influence-threshold] `method`:::: (Optional, string) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=method] - + `n_neighbors`:::: (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=n-neighbors] - + `outlier_fraction`:::: -(Optional, double) +(Optional, double) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=outlier-fraction] - + `standardization_enabled`:::: -(Optional, boolean) +(Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=standardization-enabled] //End outlier_detection ===== @@ -194,7 +198,7 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=standardization-enabled] The configuration information necessary to perform {ml-docs}/dfa-regression.html[{regression}]. + -TIP: Advanced parameters are for fine-tuning {reganalysis}. They are set +TIP: Advanced parameters are for fine-tuning {reganalysis}. They are set automatically by hyperparameter optimization to give the minimum validation error. It is highly recommended to use the default values unless you fully understand the function of these parameters. 
@@ -217,20 +221,24 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=eta] (Optional, double) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=feature-bag-fraction] +`feature_processors`:::: +(Optional, list) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=dfas-feature-processors] + `gamma`:::: -(Optional, double) +(Optional, double) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=gamma] `lambda`:::: -(Optional, double) +(Optional, double) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=lambda] `loss_function`:::: (Optional, string) -The loss function used during {regression}. Available options are `mse` (mean -squared error), `msle` (mean squared logarithmic error), `huber` (Pseudo-Huber -loss). Defaults to `mse`. Refer to -{ml-docs}/dfa-regression.html#dfa-regression-lossfunction[Loss functions for {regression} analyses] +The loss function used during {regression}. Available options are `mse` (mean +squared error), `msle` (mean squared logarithmic error), `huber` (Pseudo-Huber +loss). Defaults to `mse`. Refer to +{ml-docs}/dfa-regression.html#dfa-regression-lossfunction[Loss functions for {regression} analyses] to learn more. `loss_function_parameter`:::: @@ -238,13 +246,13 @@ to learn more. A positive number that is used as a parameter to the `loss_function`. `max_trees`:::: -(Optional, integer) +(Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=max-trees] `num_top_feature_importance_values`:::: (Optional, integer) Advanced configuration option. Specifies the maximum number of -{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document to return. +{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document to return. By default, it is zero and no {feat-imp} calculation occurs. 
`prediction_field_name`:::: @@ -266,31 +274,31 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=training-percent] //Begin analyzed_fields `analyzed_fields`:: (Optional, object) -Specify `includes` and/or `excludes` patterns to select which fields will be -included in the analysis. The patterns specified in `excludes` are applied last, -therefore `excludes` takes precedence. In other words, if the same field is -specified in both `includes` and `excludes`, then the field will not be included +Specify `includes` and/or `excludes` patterns to select which fields will be +included in the analysis. The patterns specified in `excludes` are applied last, +therefore `excludes` takes precedence. In other words, if the same field is +specified in both `includes` and `excludes`, then the field will not be included in the analysis. + -- [[dfa-supported-fields]] The supported fields for each type of analysis are as follows: -* {oldetection-cap} requires numeric or boolean data to analyze. The algorithms -don't support missing values therefore fields that have data types other than -numeric or boolean are ignored. Documents where included fields contain missing -values, null values, or an array are also ignored. Therefore the `dest` index +* {oldetection-cap} requires numeric or boolean data to analyze. The algorithms +don't support missing values therefore fields that have data types other than +numeric or boolean are ignored. Documents where included fields contain missing +values, null values, or an array are also ignored. Therefore the `dest` index may contain documents that don't have an {olscore}. -* {regression-cap} supports fields that are numeric, `boolean`, `text`, -`keyword`, and `ip`. It is also tolerant of missing values. Fields that are -supported are included in the analysis, other fields are ignored. Documents -where included fields contain an array with two or more values are also -ignored. 
Documents in the `dest` index that don’t contain a results field are +* {regression-cap} supports fields that are numeric, `boolean`, `text`, +`keyword`, and `ip`. It is also tolerant of missing values. Fields that are +supported are included in the analysis, other fields are ignored. Documents +where included fields contain an array with two or more values are also +ignored. Documents in the `dest` index that don’t contain a results field are not included in the {reganalysis}. * {classification-cap} supports fields that are numeric, `boolean`, `text`, -`keyword`, and `ip`. It is also tolerant of missing values. Fields that are +`keyword`, and `ip`. It is also tolerant of missing values. Fields that are supported are included in the analysis, other fields are ignored. Documents -where included fields contain an array with two or more values are also ignored. +where included fields contain an array with two or more values are also ignored. Documents in the `dest` index that don’t contain a results field are not included in the {classanalysis}. {classanalysis-cap} can be improved by mapping ordinal variable values to a single number. For example, in case of age ranges, @@ -312,7 +320,7 @@ analysis. You do not need to add fields with unsupported data types to `includes`::: (Optional, array) -An array of strings that defines the fields that will be included in the +An array of strings that defines the fields that will be included in the analysis. //End analyzed_fields ==== @@ -332,16 +340,16 @@ The default value is `1`. Using more threads may decrease the time necessary to complete the analysis at the cost of using more CPU. Note that the process may use additional threads for operational functionality other than the analysis itself. - + `model_memory_limit`:: (Optional, string) -The approximate maximum amount of memory resources that are permitted for -analytical processing. The default value for {dfanalytics-jobs} is `1gb`. 
If -your `elasticsearch.yml` file contains an `xpack.ml.max_model_memory_limit` -setting, an error occurs when you try to create {dfanalytics-jobs} that have -`model_memory_limit` values greater than that setting. For more information, see +The approximate maximum amount of memory resources that are permitted for +analytical processing. The default value for {dfanalytics-jobs} is `1gb`. If +your `elasticsearch.yml` file contains an `xpack.ml.max_model_memory_limit` +setting, an error occurs when you try to create {dfanalytics-jobs} that have +`model_memory_limit` values greater than that setting. For more information, see <>. - + `source`:: (object) The configuration of how to source the analysis data. It requires an `index`. @@ -355,7 +363,7 @@ Optionally, `query` and `_source` may be specified. It can be a single index or index pattern as well as an array of indices or patterns. + -WARNING: If your source indices contain documents with the same IDs, only the +WARNING: If your source indices contain documents with the same IDs, only the document that is indexed last appears in the destination index. `query`::: @@ -376,7 +384,7 @@ included in the analysis. `includes`:::: (array) An array of strings that defines the fields that will be included in the destination. - + `excludes`:::: (array) An array of strings that defines the fields that will be excluded from the destination. @@ -389,8 +397,8 @@ the destination. [[ml-put-dfanalytics-example-preprocess]] === Preprocessing actions example -The following example shows how to limit the scope of the analysis to certain -fields, specify excluded fields in the destination index, and use a query to +The following example shows how to limit the scope of the analysis to certain +fields, specify excluded fields in the destination index, and use a query to filter your data before analysis. 
[source,console] @@ -403,7 +411,7 @@ PUT _ml/data_frame/analytics/model-flight-delays-pre ], "query": { <2> "range": { - "DistanceKilometers": { + "DistanceKilometers": { "gt": 0 } } @@ -428,7 +436,7 @@ PUT _ml/data_frame/analytics/model-flight-delays-pre }, "analyzed_fields": { <5> "includes": [], - "excludes": [ + "excludes": [ "FlightNum" ] }, @@ -438,29 +446,29 @@ PUT _ml/data_frame/analytics/model-flight-delays-pre // TEST[skip:setup kibana sample data] <1> Source index to analyze. -<2> This query filters out entire documents that will not be present in the +<2> This query filters out entire documents that will not be present in the destination index. -<3> The `_source` object defines fields in the dataset that will be included or -excluded in the destination index. -<4> Defines the destination index that contains the results of the analysis and -the fields of the source index specified in the `_source` object. Also defines +<3> The `_source` object defines fields in the data set that will be included or +excluded in the destination index. +<4> Defines the destination index that contains the results of the analysis and +the fields of the source index specified in the `_source` object. Also defines the name of the `results_field`. -<5> Specifies fields to be included in or excluded from the analysis. This does -not affect whether the fields will be present in the destination index, only +<5> Specifies fields to be included in or excluded from the analysis. This does +not affect whether the fields will be present in the destination index, only affects whether they are used in the analysis. -In this example, we can see that all the fields of the source index are included -in the destination index except `FlightDelay` and `FlightDelayType` because -these are defined as excluded fields by the `excludes` parameter of the -`_source` object. 
The `FlightNum` field is included in the destination index, -however it is not included in the analysis because it is explicitly specified as +In this example, we can see that all the fields of the source index are included +in the destination index except `FlightDelay` and `FlightDelayType` because +these are defined as excluded fields by the `excludes` parameter of the +`_source` object. The `FlightNum` field is included in the destination index, +however it is not included in the analysis because it is explicitly specified as excluded field by the `excludes` parameter of the `analyzed_fields` object. [[ml-put-dfanalytics-example-od]] === {oldetection-cap} example -The following example creates the `loganalytics` {dfanalytics-job}, the analysis +The following example creates the `loganalytics` {dfanalytics-job}, the analysis type is `outlier_detection`: [source,console] @@ -524,7 +532,7 @@ The API returns the following result: [[ml-put-dfanalytics-example-r]] === {regression-cap} examples -The following example creates the `house_price_regression_analysis` +The following example creates the `house_price_regression_analysis` {dfanalytics-job}, the analysis type is `regression`: [source,console] @@ -537,7 +545,7 @@ PUT _ml/data_frame/analytics/house_price_regression_analysis "dest": { "index": "house_price_predictions" }, - "analysis": + "analysis": { "regression": { "dependent_variable": "price" @@ -613,7 +621,7 @@ PUT _ml/data_frame/analytics/student_performance_mathematics_0.3 [[ml-put-dfanalytics-example-c]] === {classification-cap} example -The following example creates the `loan_classification` {dfanalytics-job}, the +The following example creates the `loan_classification` {dfanalytics-job}, the analysis type is `classification`: [source,console] diff --git a/docs/reference/ml/ml-shared.asciidoc b/docs/reference/ml/ml-shared.asciidoc index 88de83639944..2a3dea8aca4d 100644 --- a/docs/reference/ml/ml-shared.asciidoc +++ b/docs/reference/ml/ml-shared.asciidoc 
@@ -522,6 +522,14 @@ The fraction of features that is used when selecting a random bag for each candidate split. end::dfas-feature-bag-fraction[] +tag::dfas-feature-processors[] +A collection of feature preprocessors that modify one or more included fields. +The analysis uses the resulting one or more features instead of the +original document field. Multiple `feature_processors` entries can refer to the +same document fields. +Note that automatic categorical {ml-docs}/ml-feature-encoding.html[feature encoding] still occurs. +end::dfas-feature-processors[] + tag::dfas-iteration[] The number of iterations on the analysis. end::dfas-iteration[]