[Security Solution][Elastic AI Assistant] Adds Model Evaluation Tooling (#167220)

## Summary

This PR introduces a new `internal/elastic_assistant/evaluate` route and
`Evaluation` Advanced Setting within the Assistant for benchmarking and
testing models, agents, and other aspects of the Assistant
configuration.

Enable via the `assistantModelEvaluation` experimental feature in your
`kibana.dev.yml` (and it's worth adding `discoverInTimeline` for good
measure as well! :)

```yaml
xpack.securitySolution.enableExperimental:
  ['assistantModelEvaluation', 'discoverInTimeline']
```

Then access it from the `Advanced Settings` modal within the Assistant.
To use, first select your Connectors/Models, then the corresponding
Agent configurations, then the model you would like to use for the
final evaluation and the evaluation type; if `custom`, you can also
specify the evaluation prompt that is sent to the evaluator model.
Finally, specify the `dataset` and the `output index` that the results
should be written to, then click `Perform evaluation`.

Sample datasets can be found in
`x-pack/plugins/elastic_assistant/server/lib/model_evaluator/datasets`,
and include:

* `esql_dataset.json`
* `query_dataset.json`
* `security_labs.json`
* `security_questions_dataset.json`
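Each dataset is a JSON array of `input`/`reference` pairs, which is what the `dataset` field of `PerformEvaluationParams` is parsed into. A trimmed, illustrative entry modeled on `esql_dataset.json` (not one of the bundled samples):

```ts
// An input prompt plus a reference (ground truth) answer for the evaluator to grade against.
interface DatasetEntry {
  input: string;
  reference: string;
}

const exampleDataset: DatasetEntry[] = [
  {
    input: 'Generate an ES|QL query that counts failed logins per source IP.',
    reference:
      'FROM logs-*\n| WHERE event.action == "failed_login"\n| STATS login_counts = COUNT(event.action) BY source.ip',
  },
];
```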

### Checklist

Delete any items that are not applicable to this PR.

- [X] Any text added follows [EUI's writing
guidelines](https://elastic.github.io/eui/#/guidelines/writing), uses
sentence case text and includes [i18n
support](https://github.com/elastic/kibana/blob/main/packages/kbn-i18n/README.md)

---------

Co-authored-by: kibanamachine <42973632+kibanamachine@users.noreply.github.com>

@@ -13,6 +13,7 @@ import type { Conversation, Message } from '../assistant_context/types';
import { API_ERROR } from './translations';
import { MODEL_GPT_3_5_TURBO } from '../connectorland/models/model_selector/model_selector';
import { getFormattedMessageContent } from './helpers';
import { PerformEvaluationParams } from './settings/evaluation_settings/use_perform_evaluation';
export interface FetchConnectorExecuteAction {
assistantLangChain: boolean;
@@ -199,3 +200,57 @@ export const deleteKnowledgeBase = async ({
return error as IHttpFetchError;
}
};
export interface PostEvaluationParams {
http: HttpSetup;
evalParams?: PerformEvaluationParams;
signal?: AbortSignal | undefined;
}
export interface PostEvaluationResponse {
success: boolean;
}
/**
* API call for evaluating models.
*
* @param {Object} options - The options object.
* @param {HttpSetup} options.http - HttpSetup
* @param {PerformEvaluationParams} [options.evalParams] - Params necessary for evaluation
* @param {AbortSignal} [options.signal] - AbortSignal
*
* @returns {Promise<PostEvaluationResponse | IHttpFetchError>}
*/
export const postEvaluation = async ({
http,
evalParams,
signal,
}: PostEvaluationParams): Promise<PostEvaluationResponse | IHttpFetchError> => {
try {
const path = `/internal/elastic_assistant/evaluate`;
const query = {
models: evalParams?.models.sort()?.join(','),
agents: evalParams?.agents.sort()?.join(','),
evaluationType: evalParams?.evaluationType.sort()?.join(','),
evalModel: evalParams?.evalModel.sort()?.join(','),
outputIndex: evalParams?.outputIndex,
};
const response = await http.fetch(path, {
method: 'POST',
body: JSON.stringify({
dataset: JSON.parse(evalParams?.dataset ?? '[]'),
evalPrompt: evalParams?.evalPrompt ?? '',
}),
headers: {
'Content-Type': 'application/json',
},
query,
signal,
});
return response as PostEvaluationResponse;
} catch (error) {
return error as IHttpFetchError;
}
};
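A hedged usage sketch of `postEvaluation` (the connector IDs and dataset entries are placeholders), showing how an `AbortSignal` could be threaded through to cancel an in-flight run:

```ts
import type { HttpSetup } from '@kbn/core-http-browser';
import { postEvaluation } from './api'; // path illustrative

export const runSampleEvaluation = async (http: HttpSetup) => {
  const controller = new AbortController(); // abort() to cancel the in-flight request
  const result = await postEvaluation({
    http,
    evalParams: {
      agents: ['DefaultAgentExecutor'],
      dataset: JSON.stringify([{ input: '...', reference: '...' }]), // placeholder entries
      evalModel: ['my-connector-id'], // hypothetical connector ID
      evalPrompt: '',
      evaluationType: ['correctness'],
      models: ['my-connector-id'], // hypothetical connector ID
      outputIndex: '.kibana-elastic-ai-assistant-evaluation-results',
    },
    signal: controller.signal,
  });
  // Errors are returned (not thrown) as an IHttpFetchError
  return 'success' in result ? result.success : false;
};
```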

@@ -34,6 +34,7 @@ import { AdvancedSettings } from './advanced_settings/advanced_settings';
import { ConversationSettings } from '../conversations/conversation_settings/conversation_settings';
import { TEST_IDS } from '../constants';
import { useSettingsUpdater } from './use_settings_updater/use_settings_updater';
import { EvaluationSettings } from './evaluation_settings/evaluation_settings';
const StyledEuiModal = styled(EuiModal)`
width: 800px;
@@ -45,13 +46,15 @@ export const QUICK_PROMPTS_TAB = 'QUICK_PROMPTS_TAB' as const;
export const SYSTEM_PROMPTS_TAB = 'SYSTEM_PROMPTS_TAB' as const;
export const ANONYMIZATION_TAB = 'ANONYMIZATION_TAB' as const;
export const ADVANCED_TAB = 'ADVANCED_TAB' as const;
export const EVALUATION_TAB = 'EVALUATION_TAB' as const;
export type SettingsTabs =
| typeof CONVERSATIONS_TAB
| typeof QUICK_PROMPTS_TAB
| typeof SYSTEM_PROMPTS_TAB
| typeof ANONYMIZATION_TAB
| typeof ADVANCED_TAB;
| typeof ADVANCED_TAB
| typeof EVALUATION_TAB;
interface Props {
defaultConnectorId?: string;
defaultProvider?: OpenAiProviderType;
@@ -243,6 +246,16 @@ export const AssistantSettings: React.FC<Props> = React.memo(
<EuiIcon type="advancedSettingsApp" size="l" />
</EuiKeyPadMenuItem>
)}
{assistantLangChain && (
<EuiKeyPadMenuItem
id={EVALUATION_TAB}
label={i18n.EVALUATION_MENU_ITEM}
isSelected={selectedSettingsTab === EVALUATION_TAB}
onClick={() => setSelectedSettingsTab(EVALUATION_TAB)}
>
<EuiIcon type="crossClusterReplicationApp" size="l" />
</EuiKeyPadMenuItem>
)}
</EuiKeyPadMenu>
</EuiPageSidebar>
<EuiPageBody paddingSize="none" panelled={true}>
@@ -295,6 +308,7 @@
/>
)}
{selectedSettingsTab === ADVANCED_TAB && <AdvancedSettings />}
{selectedSettingsTab === EVALUATION_TAB && <EvaluationSettings />}
</EuiSplitPanel.Inner>
<EuiSplitPanel.Inner
grow={false}

@@ -0,0 +1,368 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import React, { useCallback, useMemo, useState } from 'react';
import {
euiPaletteComplementary,
EuiFormRow,
EuiTitle,
EuiText,
EuiHorizontalRule,
EuiSpacer,
EuiComboBox,
EuiButton,
EuiComboBoxOptionOption,
EuiTextArea,
EuiFieldText,
EuiFlexItem,
EuiFlexGroup,
EuiLink,
} from '@elastic/eui';
import { FormattedMessage } from '@kbn/i18n-react';
import * as i18n from './translations';
import { useAssistantContext } from '../../../assistant_context';
import { useLoadConnectors } from '../../../connectorland/use_load_connectors';
import { getActionTypeTitle, getGenAiConfig } from '../../../connectorland/helpers';
import { PRECONFIGURED_CONNECTOR } from '../../../connectorland/translations';
import { usePerformEvaluation } from './use_perform_evaluation';
/**
* See AGENT_EXECUTOR_MAP in `x-pack/plugins/elastic_assistant/server/routes/evaluate/post_evaluate.ts`
* for the agent name -> executor mapping
*/
const DEFAULT_AGENTS = ['DefaultAgentExecutor', 'OpenAIFunctionsExecutor'];
const DEFAULT_EVAL_TYPES_OPTIONS = [
{ label: 'correctness' },
{ label: 'esql-validator', disabled: true },
{ label: 'custom', disabled: true },
];
interface Props {
onEvaluationSettingsChange?: () => void;
}
/**
* Evaluation Settings -- development-only feature for evaluating models
*/
export const EvaluationSettings: React.FC<Props> = React.memo(({ onEvaluationSettingsChange }) => {
const { actionTypeRegistry, basePath, http } = useAssistantContext();
const { data: connectors } = useLoadConnectors({ http });
const { mutate: performEvaluation, isLoading: isPerformingEvaluation } = usePerformEvaluation({
http,
});
// Connectors / Models
const [selectedModelOptions, setSelectedModelOptions] = useState<
Array<EuiComboBoxOptionOption<string>>
>([]);
const onModelOptionsChange = useCallback(
(selectedOptions: Array<EuiComboBoxOptionOption<string>>) => {
setSelectedModelOptions(selectedOptions);
},
[setSelectedModelOptions]
);
const visColorsBehindText = euiPaletteComplementary(connectors?.length ?? 0);
const modelOptions = useMemo(() => {
return (
connectors?.map((c, index) => {
const apiProvider = getGenAiConfig(c)?.apiProvider;
const connectorTypeTitle =
apiProvider ?? getActionTypeTitle(actionTypeRegistry.get(c.actionTypeId));
const connectorDetails = c.isPreconfigured ? PRECONFIGURED_CONNECTOR : connectorTypeTitle;
return {
key: c.id,
label: `${c.name} (${connectorDetails})`,
color: visColorsBehindText[index],
};
}) ?? []
);
}, [actionTypeRegistry, connectors, visColorsBehindText]);
// Agents
const [selectedAgentOptions, setSelectedAgentOptions] = useState<
Array<EuiComboBoxOptionOption<string>>
>([]);
const onAgentOptionsChange = useCallback(
(agentOptions: Array<EuiComboBoxOptionOption<string>>) => {
setSelectedAgentOptions(agentOptions);
},
[setSelectedAgentOptions]
);
const onAgentOptionsCreate = useCallback(
(searchValue: string) => {
const normalizedSearchValue = searchValue.trim();
if (!normalizedSearchValue) {
return;
}
setSelectedAgentOptions([...selectedAgentOptions, { label: normalizedSearchValue }]);
},
[selectedAgentOptions]
);
const agentOptions = useMemo(() => {
return DEFAULT_AGENTS.map((label) => ({ label }));
}, []);
// Evaluation Type
const [selectedEvaluationType, setSelectedEvaluationType] = useState<
Array<EuiComboBoxOptionOption<string>>
>([]);
const onEvaluationTypeChange = useCallback(
(evaluationType: Array<EuiComboBoxOptionOption<string>>) => {
setSelectedEvaluationType(evaluationType);
},
[setSelectedEvaluationType]
);
const onEvaluationTypeOptionsCreate = useCallback(
(searchValue: string) => {
const normalizedSearchValue = searchValue.trim();
if (!normalizedSearchValue) {
return;
}
setSelectedEvaluationType([{ label: normalizedSearchValue }]);
},
[setSelectedEvaluationType]
);
const evaluationTypeOptions = useMemo(() => {
return DEFAULT_EVAL_TYPES_OPTIONS;
}, []);
// Eval Model
const [selectedEvaluatorModelOptions, setSelectedEvaluatorModelOptions] = useState<
Array<EuiComboBoxOptionOption<string>>
>([]);
const onEvaluatorModelOptionsChange = useCallback(
(selectedOptions: Array<EuiComboBoxOptionOption<string>>) => {
setSelectedEvaluatorModelOptions(selectedOptions);
},
[setSelectedEvaluatorModelOptions]
);
// Output Index
const [outputIndex, setOutputIndex] = useState('.kibana-elastic-ai-assistant-evaluation-results');
const onOutputIndexChange = useCallback(
(e) => {
setOutputIndex(e.target.value);
},
[setOutputIndex]
);
// Eval Prompt
const sampleEvalPrompt: string = `For the below input: \n\n{{input}} \n\na prediction: \n\n{{prediction}} \n\nwas made. How's it stack up against this reference: \n\n{{reference}} \n\nReturn output in a succinct sentence ranking on a simple grading rubric focused on correctness.`;
const [evalPrompt, setEvalPrompt] = useState<string>(sampleEvalPrompt);
const onEvalPromptChange = useCallback(
(e) => {
setEvalPrompt(e.target.value);
},
[setEvalPrompt]
);
// Dataset
const sampleDataset = [
{
input:
'I want to see a query for metrics-apm*, filtering on metricset.name:transaction and metricset.interval:1m, showing the average duration (via transaction.duration.histogram), in 50 buckets. Only return the ESQL query, and do not wrap in a codeblock.',
reference:
'FROM metrics-apm*\n| WHERE metricset.name == ""transaction"" AND metricset.interval == ""1m""\n| EVAL bucket = AUTO_BUCKET(transaction.duration.histogram, 50, <start-date>, <end-date>)\n| STATS avg_duration = AVG(transaction.duration.histogram) BY bucket',
},
];
const [datasetText, setDatasetText] = useState<string>(JSON.stringify(sampleDataset, null, 2));
const onDatasetTextChange = useCallback(
(e) => {
setDatasetText(e.target.value);
},
[setDatasetText]
);
const isPerformEvaluationDisabled =
selectedModelOptions.length === 0 ||
selectedAgentOptions.length === 0 ||
selectedEvaluatorModelOptions.length === 0 ||
selectedEvaluationType.length === 0 ||
datasetText.length === 0 ||
outputIndex.length === 0;
// Perform Evaluation Button
const handlePerformEvaluation = useCallback(() => {
const evalParams = {
models: selectedModelOptions.flatMap((option) => option.key ?? []),
agents: selectedAgentOptions.map((option) => option.label),
dataset: datasetText,
evalModel: selectedEvaluatorModelOptions.flatMap((option) => option.key ?? []),
evalPrompt,
evaluationType: selectedEvaluationType.map((option) => option.label),
outputIndex,
};
performEvaluation(evalParams);
}, [
datasetText,
evalPrompt,
outputIndex,
performEvaluation,
selectedAgentOptions,
selectedEvaluationType,
selectedEvaluatorModelOptions,
selectedModelOptions,
]);
const discoverLink = `${basePath}/app/discover#/?_g=(filters:!(),refreshInterval:(pause:!t,value:60000),time:(from:now-7d%2Fd,to:now))&_a=(columns:!('@timestamp',evaluationId,totalAgents,totalInput,totalRequests,input,reference,prediction,evaluation.value,evaluation.reasoning,predictionResponse.value.connector_id),filters:!(),grid:(columns:('@timestamp':(width:212),evaluationId:(width:285),totalAgents:(width:111),totalInput:(width:98),totalRequests:(width:121))),index:'6d9ba861-a76b-4d31-90f4-dfb8f01b78bd',interval:auto,query:(esql:'from%20.kibana-elastic-ai-assistant-evaluation-results%20%0A%7C%20keep%20@timestamp,%20evaluationId,%20totalAgents,%20totalInput,%20totalRequests,%20input,%20reference,%20prediction,%20evaluation.value,%20evaluation.reasoning,%20predictionResponse.value.connector_id%0A%7C%20sort%20@timestamp%20desc%0A%7C%20limit%20100%0A%0A%0A'),sort:!(!('@timestamp',desc)))`;
return (
<>
<EuiTitle size={'s'}>
<h2>{i18n.SETTINGS_TITLE}</h2>
</EuiTitle>
<EuiSpacer size="xs" />
<EuiText size={'s'}>{i18n.SETTINGS_DESCRIPTION}</EuiText>
<EuiHorizontalRule margin={'s'} />
<EuiFormRow
display="rowCompressed"
label={i18n.CONNECTORS_LABEL}
helpText={i18n.CONNECTORS_DESCRIPTION}
>
<EuiComboBox
aria-label={'model-selector'}
compressed
options={modelOptions}
selectedOptions={selectedModelOptions}
onChange={onModelOptionsChange}
/>
</EuiFormRow>
<EuiFormRow
display="rowCompressed"
label={i18n.AGENTS_LABEL}
helpText={i18n.AGENTS_DESCRIPTION}
>
<EuiComboBox
aria-label={'agent-selector'}
compressed
onCreateOption={onAgentOptionsCreate}
options={agentOptions}
selectedOptions={selectedAgentOptions}
onChange={onAgentOptionsChange}
/>
</EuiFormRow>
<EuiFormRow
display="rowCompressed"
label={i18n.EVALUATOR_MODEL_LABEL}
helpText={i18n.EVALUATOR_MODEL_DESCRIPTION}
>
<EuiComboBox
aria-label={'evaluator-model-select'}
compressed
options={modelOptions}
selectedOptions={selectedEvaluatorModelOptions}
singleSelection={{ asPlainText: true }}
onChange={onEvaluatorModelOptionsChange}
/>
</EuiFormRow>
<EuiFormRow
display="rowCompressed"
label={i18n.EVALUATION_TYPE_LABEL}
helpText={i18n.EVALUATION_TYPE_DESCRIPTION}
>
<EuiComboBox
aria-label={'evaluation-type-select'}
compressed
onChange={onEvaluationTypeChange}
onCreateOption={onEvaluationTypeOptionsCreate}
options={evaluationTypeOptions}
selectedOptions={selectedEvaluationType}
singleSelection={{ asPlainText: true }}
/>
</EuiFormRow>
<EuiFormRow
display="rowCompressed"
label={i18n.EVALUATION_PROMPT_LABEL}
fullWidth
helpText={i18n.EVALUATION_PROMPT_DESCRIPTION}
>
<EuiTextArea
aria-label={'evaluation-prompt-textarea'}
compressed
disabled={selectedEvaluationType[0]?.label !== 'custom'}
fullWidth
onChange={onEvalPromptChange}
value={evalPrompt}
/>
</EuiFormRow>
<EuiFormRow
display="rowCompressed"
label={i18n.EVALUATOR_DATASET_LABEL}
fullWidth
helpText={i18n.EVALUATOR_DATASET_DESCRIPTION}
>
<EuiTextArea
aria-label={'evaluation-dataset-textarea'}
compressed
fullWidth
onChange={onDatasetTextChange}
value={datasetText}
/>
</EuiFormRow>
<EuiFormRow
display="rowCompressed"
label={i18n.EVALUATOR_OUTPUT_INDEX_LABEL}
fullWidth
helpText={i18n.EVALUATOR_OUTPUT_INDEX_DESCRIPTION}
>
<EuiFieldText
value={outputIndex}
onChange={onOutputIndexChange}
aria-label="evaluation-output-index-textfield"
/>
</EuiFormRow>
<EuiHorizontalRule />
<EuiFlexGroup alignItems="center">
<EuiFlexItem grow={false}>
<EuiButton
size="s"
type="submit"
isDisabled={isPerformEvaluationDisabled}
isLoading={isPerformingEvaluation}
onClick={handlePerformEvaluation}
fill
>
{i18n.PERFORM_EVALUATION}
</EuiButton>
</EuiFlexItem>
<EuiFlexItem>
<EuiText color={'subdued'} size={'xs'}>
<FormattedMessage
defaultMessage="Fun Facts: Watch the Kibana server logs for progress, and {funFacts} to view the results in Discover once complete. Will take (many) minutes depending on dataset, and closing this dialog will cancel the evaluation!"
id="xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorFunFactText"
values={{
funFacts: (
<EuiLink external href={discoverLink} target="_blank">
{i18n.EVALUATOR_FUN_FACT_DISCOVER_LINK}
</EuiLink>
),
}}
/>
</EuiText>
</EuiFlexItem>
</EuiFlexGroup>
<EuiSpacer size="s" />
</>
);
});
EvaluationSettings.displayName = 'EvaluationSettings';

@@ -0,0 +1,137 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { i18n } from '@kbn/i18n';
export const SETTINGS_TITLE = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.settingsTitle',
{
defaultMessage: 'Evaluation',
}
);
export const SETTINGS_DESCRIPTION = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.settingsDescription',
{
defaultMessage:
'Not-so-secret dev UI for evaluating sample datasets against models/agents/more...',
}
);
export const CONNECTORS_LABEL = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.connectorsLabel',
{
defaultMessage: 'Connectors / Models',
}
);
export const CONNECTORS_DESCRIPTION = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.connectorsDescription',
{
defaultMessage: 'Select whichever models you want to evaluate the dataset against',
}
);
export const AGENTS_LABEL = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.agentsLabel',
{
defaultMessage: 'Agents',
}
);
export const AGENTS_DESCRIPTION = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.agentsDescription',
{
defaultMessage: 'Select the agents (i.e. RAG algos) to evaluate the dataset against',
}
);
export const EVALUATOR_MODEL_LABEL = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorModelLabel',
{
defaultMessage: 'Evaluator Model',
}
);
export const EVALUATOR_MODEL_DESCRIPTION = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorModelDescription',
{
defaultMessage: 'Model to perform the final evaluation with',
}
);
export const EVALUATION_TYPE_LABEL = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationTypeLabel',
{
defaultMessage: 'Evaluation type',
}
);
export const EVALUATION_TYPE_DESCRIPTION = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationTypeDescription',
{
defaultMessage:
'Type of evaluation to perform, e.g. "correctness", "esql-validator", or "custom", where you provide your own evaluation prompt',
}
);
export const EVALUATION_PROMPT_LABEL = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationPromptLabel',
{
defaultMessage: 'Evaluation prompt',
}
);
export const EVALUATION_PROMPT_DESCRIPTION = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationPromptDescription',
{
defaultMessage:
'Prompt template given `input`, `reference` and `prediction` template variables',
}
);
export const EVALUATOR_OUTPUT_INDEX_LABEL = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorOutputIndexLabel',
{
defaultMessage: 'Output index',
}
);
export const EVALUATOR_OUTPUT_INDEX_DESCRIPTION = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorOutputIndexDescription',
{
defaultMessage:
'Index to write results to. Must be prefixed with ".kibana-elastic-ai-assistant-"',
}
);
export const EVALUATOR_DATASET_LABEL = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorDatasetLabel',
{
defaultMessage: 'Dataset',
}
);
export const EVALUATOR_DATASET_DESCRIPTION = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorDatasetDescription',
{
defaultMessage:
'Sample dataset to evaluate. Array of objects with "input" and "reference" properties',
}
);
export const PERFORM_EVALUATION = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.performEvaluationTitle',
{
defaultMessage: 'Perform evaluation...',
}
);
export const EVALUATOR_FUN_FACT_DISCOVER_LINK = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorFunFactDiscoverLinkText',
{
defaultMessage: 'click here',
}
);

@@ -0,0 +1,62 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { useMutation } from '@tanstack/react-query';
import type { HttpSetup, IHttpFetchError, ResponseErrorBody } from '@kbn/core-http-browser';
import type { IToasts } from '@kbn/core-notifications-browser';
import { i18n } from '@kbn/i18n';
import { postEvaluation } from '../../api';
const PERFORM_EVALUATION_MUTATION_KEY = ['elastic-assistant', 'perform-evaluation'];
export interface UsePerformEvaluationParams {
http: HttpSetup;
toasts?: IToasts;
}
export interface PerformEvaluationParams {
agents: string[];
dataset: string;
evalModel: string[];
evalPrompt: string;
evaluationType: string[];
models: string[];
outputIndex: string;
}
/**
* Hook for performing model evaluations
*
* @param {Object} options - The options object.
* @param {HttpSetup} options.http - HttpSetup
* @param {IToasts} [options.toasts] - IToasts
*
* @returns {useMutation} mutation hook for performing the evaluation
*/
export const usePerformEvaluation = ({ http, toasts }: UsePerformEvaluationParams) => {
return useMutation(
PERFORM_EVALUATION_MUTATION_KEY,
(evalParams?: PerformEvaluationParams | void) => {
// Optional params workaround: see: https://github.com/TanStack/query/issues/1077#issuecomment-1431247266
return postEvaluation({ http, evalParams: evalParams ?? undefined });
},
{
onError: (error: IHttpFetchError<ResponseErrorBody>) => {
if (error.name !== 'AbortError') {
toasts?.addError(
error.body && error.body.message ? new Error(error.body.message) : error,
{
title: i18n.translate('xpack.elasticAssistant.evaluation.evaluationError', {
defaultMessage: 'Error performing evaluation...',
}),
}
);
}
},
}
);
};
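An illustrative consumer of the hook (not part of this PR), mirroring how `EvaluationSettings` wires the mutation to its submit button:

```tsx
import React from 'react';
import { EuiButton } from '@elastic/eui';
import type { HttpSetup } from '@kbn/core-http-browser';
import { PerformEvaluationParams, usePerformEvaluation } from './use_perform_evaluation';

// The button's loading state is driven by the mutation's isLoading flag.
export const EvaluateButton: React.FC<{ http: HttpSetup; evalParams: PerformEvaluationParams }> = ({
  http,
  evalParams,
}) => {
  const { mutate: performEvaluation, isLoading } = usePerformEvaluation({ http });
  return (
    <EuiButton fill isLoading={isLoading} onClick={() => performEvaluation(evalParams)} size="s">
      {'Perform evaluation'}
    </EuiButton>
  );
};
```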

@@ -56,6 +56,13 @@ export const ADVANCED_MENU_ITEM = i18n.translate(
}
);
export const EVALUATION_MENU_ITEM = i18n.translate(
'xpack.elasticAssistant.assistant.settings.settingsEvaluationMenuItemTitle',
{
defaultMessage: 'Evaluation',
}
);
export const ADD_SYSTEM_PROMPT_MODAL_TITLE = i18n.translate(
'xpack.elasticAssistant.assistant.settings.modalTitle',
{

@@ -32,6 +32,7 @@ const ContextWrapper: React.FC = ({ children }) => (
augmentMessageCodeBlocks={jest.fn()}
baseAllow={[]}
baseAllowReplacement={[]}
basePath={'https://localhost:5601/kbn'}
defaultAllow={[]}
defaultAllowReplacement={[]}
docLinks={{

@@ -56,6 +56,7 @@ export interface AssistantProviderProps {
baseAllowReplacement: string[];
defaultAllow: string[];
defaultAllowReplacement: string[];
basePath: string;
basePromptContexts?: PromptContextTemplate[];
baseQuickPrompts?: QuickPrompt[];
baseSystemPrompts?: Prompt[];
@@ -92,6 +93,7 @@ export interface UseAssistantContext {
docLinks: Omit<DocLinksStart, 'links'>;
defaultAllow: string[];
defaultAllowReplacement: string[];
basePath: string;
basePromptContexts: PromptContextTemplate[];
baseQuickPrompts: QuickPrompt[];
baseSystemPrompts: Prompt[];
@@ -139,6 +141,7 @@ export const AssistantProvider: React.FC<AssistantProviderProps> = ({
defaultAllow,
defaultAllowReplacement,
docLinks,
basePath,
basePromptContexts = [],
baseQuickPrompts = [],
baseSystemPrompts = BASE_SYSTEM_PROMPTS,
@@ -258,6 +261,7 @@ export const AssistantProvider: React.FC<AssistantProviderProps> = ({
allSystemPrompts: localStorageSystemPrompts ?? [],
baseAllow: uniq(baseAllow),
baseAllowReplacement: uniq(baseAllowReplacement),
basePath,
basePromptContexts,
baseQuickPrompts,
baseSystemPrompts,
@@ -293,6 +297,7 @@ export const AssistantProvider: React.FC<AssistantProviderProps> = ({
augmentMessageCodeBlocks,
baseAllow,
baseAllowReplacement,
basePath,
basePromptContexts,
baseQuickPrompts,
baseSystemPrompts,

@@ -76,6 +76,7 @@ export const TestProvidersComponent: React.FC<Props> = ({
augmentMessageCodeBlocks={jest.fn().mockReturnValue([])}
baseAllow={[]}
baseAllowReplacement={[]}
basePath={'https://localhost:5601/kbn'}
defaultAllow={[]}
defaultAllowReplacement={[]}
docLinks={{

@@ -50,6 +50,7 @@ export const TestProvidersComponent: React.FC<Props> = ({ children, isILMAvailab
augmentMessageCodeBlocks={jest.fn()}
baseAllow={[]}
baseAllowReplacement={[]}
basePath={'https://localhost:5601/kbn'}
defaultAllow={[]}
defaultAllowReplacement={[]}
docLinks={{

@@ -14,3 +14,6 @@ export const POST_ACTIONS_CONNECTOR_EXECUTE = `${BASE_PATH}/actions/connector/{c
// Knowledge Base
export const KNOWLEDGE_BASE = `${BASE_PATH}/knowledge_base/{resource?}`;
// Model Evaluation
export const EVALUATE = `${BASE_PATH}/evaluate`;

@@ -0,0 +1,10 @@
{
"author": "Elastic",
"name": "@kbn/elastic-assistant-plugin",
"version": "1.0.0",
"private": true,
"license": "Elastic License 2.0",
"scripts": {
"evaluate-model": "node ./scripts/model_evaluator"
}
}

@@ -0,0 +1,9 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
require('../../../../src/setup_node_env');
require('./model_evaluator_script').evaluateModels();

@@ -0,0 +1,54 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import yargs from 'yargs/yargs';
import { ToolingLog } from '@kbn/tooling-log';
export const AVAILABLE_MODELS = ['gpt-3.5', 'gpt-4'] as const;
/**
* Work in progress developer script for evaluating models against datasets.
*
* Companion to the `elastic_assistant/evaluate` endpoint for running evaluations
* in the CLI using `yarn evaluate-model`.
*
* TODO: Finalize inputs and call to `performEvaluation`
*/
export const evaluateModels = () => {
const logger = new ToolingLog({
level: 'info',
writeTo: process.stdout,
});
logger.info('Starting model evaluator script');
yargs(process.argv.slice(2))
.command(
'*',
'Evaluate an input dataset against connectors / models + agents',
(y) =>
y
.option('agents', {
describe: 'Agents to evaluate the dataset against',
demandOption: false,
string: true,
})
.option('models', {
describe: 'Models to evaluate the dataset against',
default: 'gpt-3.5' as const,
choices: AVAILABLE_MODELS,
})
.showHelpOnFail(false),
(argv) => {
// performEvaluation({ dataset: DEFAULT_DATASET, logger }).catch((err) => {
// logger.error(err);
// // eslint-disable-next-line no-process-exit
// process.exit(1);
// });
}
)
.parse();
};
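Until the `performEvaluation` call above is filled in, the script only parses its arguments; the intended invocation per the yargs options would be along the lines of `yarn evaluate-model --models gpt-4 --agents DefaultAgentExecutor` from the plugin directory.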

@@ -5,35 +5,26 @@
* 2.0.
*/
import { ElasticsearchClient, KibanaRequest, Logger } from '@kbn/core/server';
import type { PluginStartContract as ActionsPluginStart } from '@kbn/actions-plugin/server';
import { initializeAgentExecutorWithOptions } from 'langchain/agents';
import { RetrievalQAChain } from 'langchain/chains';
import { BufferMemory, ChatMessageHistory } from 'langchain/memory';
import { BaseMessage } from 'langchain/schema';
import { ChainTool, Tool } from 'langchain/tools';
import { ElasticsearchStore } from '../elasticsearch_store/elasticsearch_store';
import { RequestBody, ResponseBody } from '../types';
import { ActionsClientLlm } from '../llm/actions_client_llm';
import { KNOWLEDGE_BASE_INDEX_PATTERN } from '../../../routes/knowledge_base/constants';
import type { AgentExecutorParams, AgentExecutorResponse } from '../executors/types';
export const callAgentExecutor = async ({
actions,
connectorId,
esClient,
langChainMessages,
llmType,
logger,
request,
}: {
actions: ActionsPluginStart;
connectorId: string;
esClient: ElasticsearchClient;
langChainMessages: BaseMessage[];
logger: Logger;
request: KibanaRequest<unknown, unknown, RequestBody>;
}): Promise<ResponseBody> => {
const llm = new ActionsClientLlm({ actions, connectorId, request, logger });
}: AgentExecutorParams): AgentExecutorResponse => {
const llm = new ActionsClientLlm({ actions, connectorId, request, llmType, logger });
const pastMessages = langChainMessages.slice(0, -1); // all but the last message
const latestMessage = langChainMessages.slice(-1); // the last message

@@ -0,0 +1,72 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { initializeAgentExecutorWithOptions } from 'langchain/agents';
import { RetrievalQAChain } from 'langchain/chains';
import { BufferMemory, ChatMessageHistory } from 'langchain/memory';
import { ChainTool, Tool } from 'langchain/tools';
import { ElasticsearchStore } from '../elasticsearch_store/elasticsearch_store';
import { ActionsClientLlm } from '../llm/actions_client_llm';
import { KNOWLEDGE_BASE_INDEX_PATTERN } from '../../../routes/knowledge_base/constants';
import type { AgentExecutorParams, AgentExecutorResponse } from './types';
/**
* This is an agent executor to be used with the model evaluation API for benchmarking.
* Currently just a copy of `callAgentExecutor`, but using the `openai-functions` agent type.
*
* NOTE: This is not to be used in production as-is, and must be used with an OpenAI ConnectorId
*/
export const callOpenAIFunctionsExecutor = async ({
actions,
connectorId,
esClient,
langChainMessages,
llmType,
logger,
request,
}: AgentExecutorParams): AgentExecutorResponse => {
const llm = new ActionsClientLlm({ actions, connectorId, request, llmType, logger });
const pastMessages = langChainMessages.slice(0, -1); // all but the last message
const latestMessage = langChainMessages.slice(-1); // the last message
const memory = new BufferMemory({
chatHistory: new ChatMessageHistory(pastMessages),
memoryKey: 'chat_history', // this is the key expected by https://github.com/langchain-ai/langchainjs/blob/a13a8969345b0f149c1ca4a120d63508b06c52a5/langchain/src/agents/initialize.ts#L166
inputKey: 'input',
outputKey: 'output',
returnMessages: true,
});
// ELSER backed ElasticsearchStore for Knowledge Base
const esStore = new ElasticsearchStore(esClient, KNOWLEDGE_BASE_INDEX_PATTERN, logger);
const chain = RetrievalQAChain.fromLLM(llm, esStore.asRetriever());
const tools: Tool[] = [
new ChainTool({
name: 'esql-language-knowledge-base',
description:
'Call this for knowledge on how to build an ESQL query, or answer questions about the ES|QL query language.',
chain,
}),
];
const executor = await initializeAgentExecutorWithOptions(tools, llm, {
agentType: 'openai-functions',
memory,
verbose: false,
});
await executor.call({ input: latestMessage[0].content });
return {
connector_id: connectorId,
data: llm.getActionResultData(), // the response from the actions framework
status: 'ok',
};
};

@@ -0,0 +1,29 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { PluginStartContract as ActionsPluginStart } from '@kbn/actions-plugin/server';
import { ElasticsearchClient } from '@kbn/core-elasticsearch-server';
import { BaseMessage } from 'langchain/schema';
import { Logger } from '@kbn/logging';
import { KibanaRequest } from '@kbn/core-http-server';
import { RequestBody, ResponseBody } from '../types';
export interface AgentExecutorParams {
actions: ActionsPluginStart;
connectorId: string;
esClient: ElasticsearchClient;
langChainMessages: BaseMessage[];
llmType?: string;
logger: Logger;
request: KibanaRequest<unknown, unknown, RequestBody>;
}
export type AgentExecutorResponse = Promise<ResponseBody>;
export type AgentExecutor = (params: AgentExecutorParams) => AgentExecutorResponse;
export type AgentExecutorEvaluator = (langChainMessages: BaseMessage[]) => AgentExecutorResponse;
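The `AgentExecutor` type is what the `AGENT_EXECUTOR_MAP` referenced earlier (in `post_evaluate.ts`) is keyed over. A hedged sketch of that mapping, assuming the two executors added in this PR (import paths illustrative):

```ts
import { callAgentExecutor } from '../execute_custom_llm_chain'; // path illustrative
import { callOpenAIFunctionsExecutor } from './openai_functions_executor'; // path illustrative
import type { AgentExecutor } from './types';

// Maps the agent names selectable in the Evaluation UI (see DEFAULT_AGENTS)
// to their executor implementations.
export const AGENT_EXECUTOR_MAP: Record<string, AgentExecutor> = {
  DefaultAgentExecutor: callAgentExecutor,
  OpenAIFunctionsExecutor: callOpenAIFunctionsExecutor,
};
```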

@@ -87,6 +87,18 @@ describe('ActionsClientLlm', () => {
expect(actionsClientLlm._llmType()).toEqual('ActionsClientLlm');
});
it('returns the expected LLM type when overridden', () => {
const actionsClientLlm = new ActionsClientLlm({
actions: mockActions,
connectorId,
llmType: 'special-llm-type',
logger: mockLogger,
request: mockRequest,
});
expect(actionsClientLlm._llmType()).toEqual('special-llm-type');
});
});
describe('_call', () => {

@@ -22,14 +22,20 @@ export class ActionsClientLlm extends LLM {
#request: KibanaRequest<unknown, unknown, RequestBody>;
#actionResultData: string;
// Local `llmType` as it can change and needs to be accessed by abstract `_llmType()` method
// Not using getter as `this._llmType()` is called in the constructor via `super({})`
protected llmType: string;
constructor({
actions,
connectorId,
llmType,
logger,
request,
}: {
actions: ActionsPluginStart;
connectorId: string;
llmType?: string;
logger: Logger;
request: KibanaRequest<unknown, unknown, RequestBody>;
}) {
@@ -37,6 +43,7 @@ export class ActionsClientLlm extends LLM {
this.#actions = actions;
this.#connectorId = connectorId;
this.llmType = llmType ?? LLM_TYPE;
this.#logger = logger;
this.#request = request;
this.#actionResultData = '';
@@ -47,14 +54,21 @@ }
}
_llmType() {
return LLM_TYPE;
return this.llmType;
}
// Model type needs to be `base_chat_model` to work with LangChain OpenAI Tools
// We may want to make this configurable (ala _llmType) if different agents end up requiring different model types
// See: https://github.com/langchain-ai/langchainjs/blob/fb699647a310c620140842776f4a7432c53e02fa/langchain/src/agents/openai/index.ts#L185
_modelType() {
return 'base_chat_model';
}
async _call(prompt: string): Promise<string> {
// convert the Langchain prompt to an assistant message:
const assistantMessage = getMessageContentAndRole(prompt);
this.#logger.debug(
`ActionsClientLlm#_call assistantMessage:\n ${JSON.stringify(assistantMessage)} `
`ActionsClientLlm#_call assistantMessage:\n${JSON.stringify(assistantMessage)} `
);
// create a new connector request body with the assistant message:
const requestBody = {

@@ -0,0 +1,126 @@
[
{
"input": "Generate an ES|QL query that will count the number of connections made to external IP addresses, broken down by user. If the count is greater than 100 for a specific user, add a new field called \"follow_up\" that contains a value of \"true\", otherwise, it should contain \"false\". The user names should also be enriched with their respective group names.",
"reference": "FROM logs-*\n| WHERE NOT CIDR_MATCH(destination.ip, \"10.0.0.0/8\", \"172.16.0.0/12\", \"192.168.0.0/16\")\n| STATS destcount = COUNT(destination.ip) by user.name, host.name\n| ENRICH ldap_lookup_new ON user.name\n| WHERE group.name IS NOT NULL\n| EVAL follow_up = CASE(\n destcount >= 100, \"true\",\n \"false\")\n| SORT destcount desc\n| KEEP destcount, host.name, user.name, group.name, follow_up"
},
{
"input": "Generate an ES|QL query that will parse the DNS registered domain from a DNS query, count the number distinct DNS queries being made per DNS registered domain and filter for when the distinct count is greater than 5. The query should sort the results by the distinct count of queries in descending order.",
"reference": "from logs-*\n| grok dns.question.name \"%{DATA}\\\\.%{GREEDYDATA:dns.question.registered_domain:string}\"\n| stats unique_queries = count_distinct(dns.question.name) by dns.question.registered_domain, process.name\n| where unique_queries > 5\n| sort unique_queries desc"
},
{
"input": "Generate an ES|QL query that will filter all logs for those containing windows event codes, count them by host name, and enrich the codes with a description of what the code means (via an enrichment policy called \"win_events\"). The results should be sorted by the count of each code in descending order",
"reference": "from logs-*\n| where event.code is not null\n| stats event_code_count = count(event.code) by event.code,host.name\n| enrich win_events on event.code with EVENT_DESCRIPTION\n| where EVENT_DESCRIPTION is not null and host.name is not null\n| rename EVENT_DESCRIPTION as event.description\n| sort event_code_count desc\n| keep event_code_count,event.code,host.name,event.description\""
},
{
"input": "Generate an ES|QL query that will filter for file creation events. Count the number of file interactions by the process interacting with the file, and the host name. From the process name field, parse and output two new fields that represent the process and the process extension separately. Calculate the length of the process name and filter for events where the length is greater than 15 characters. Sort the result based on the process length and filecount in descending order. Limit the results to the top 10.",
"reference": "from logs-*\n| where event.category == \"file\" and event.action == \"creation\"\n| stats filecount = count(file.name) by process.name,host.name\n| dissect process.name \"%{process}.%{extension}\" \n| eval proclength = length(process.name)\n| where proclength > 10 \n| sort filecount,proclength desc\n| limit 10 \n| keep host.name,process.name,filecount,process,extension,fullproc,proclength"
},
{
"input": "Generate an ES|QL query that will look for all process events for the process \"curl.exe\". Calculate the sum of outbund bytes for this process by the destination address. Output the results in KB, also sorted by KB in descending order. Limit to the top 10 results.",
"reference": "from logs-*\n| where process.name == \"curl.exe\"\n| stats bytes = sum(destination.bytes) by destination.address\n| eval kb = bytes/1024\n| sort kb desc\n| limit 10\n| keep kb,destination.address"
},
{
"input": "I want to see a query for metrics-apm*, filtering on metricset.name:transaction and metricset.interval:1m, showing the average duration (via transaction.duration.histogram), in 50 buckets.",
"reference": "FROM metrics-apm*\n| WHERE metricset.name == \"transaction\" AND metricset.interval == \"1m\"\n| EVAL bucket = AUTO_BUCKET(transaction.duration.histogram, 50, <start-date>, <end-date>)\n| STATS avg_duration = AVG(transaction.duration.histogram) BY bucket"
},
{
"input": "For standard Elastic ECS compliant packetbeat data view, create an ES|QL query that shows the top 10 unique domains by doc count",
"reference": "FROM packetbeat-*\n| STATS doc_count = COUNT(destination.domain) BY destination.domain\n| SORT doc_count DESC\n| LIMIT 10"
},
{
"input": "From employees, I want to see the 5 earliest employees (hire_date), I want to display only the month and the year that they were hired in and their employee number (emp_no). Format the date as e.g. \"September 2019\". Only show the query",
"reference": "FROM employees\n| EVAL hire_date_formatted = DATE_FORMAT(hire_date, \"MMMM yyyy\")\n| SORT hire_date\n| KEEP emp_no, hire_date_formatted\n| LIMIT 5"
},
{
"input": "From employees, I want to sort the documents by salary, and then return 10 results per page, and then see the second page",
"reference": "Pagination is not supported"
},
{
"input": "My logs data (ECS) is in `logs-*`. Show me a query that gets the average CPU per host, limit it to the top 10 results, in 1m buckets, and only include the last 15m.",
"reference": "FROM logs-*\n| WHERE @timestamp >= NOW() - 15 minutes\n| EVAL bucket = DATE_TRUNC(1 minute, @timestamp)\n| STATS avg_cpu = AVG(system.cpu.total.norm.pct) BY bucket, host.name\n| LIMIT 10"
},
{
"input": "I want to show a list of services with APM data. My data is in `traces-apm*`. I want to show the average transaction duration, the success rate (by dividing event.outcome:failure by event.outcome:failure+success), and total amount of requests. As a time range, select the last 24 hours. Just show me the query.",
"reference": "FROM traces-apm*\n| WHERE @timestamp >= NOW() - 24 hours\n| EVAL successful = CASE(event.outcome == \"success\", 1, 0),\n failed = CASE(event.outcome == \"failure\", 1, 0)\n| STATS success_rate = AVG(successful), \n avg_duration = AVG(transaction.duration), \n total_requests = COUNT(transaction.id) BY service.name"
},
{
"input": "from `metricbeat*`, I want to see the percentage of CPU time normalized by the number of CPU cores, broken down by hostname. the fields are system.cpu.user.pct, system.cpu.system.pct, and system.cpu.cores. just show me the query",
"reference": "FROM metricbeat*\n| EVAL cpu_pct_normalized = (system.cpu.user.pct + system.cpu.system.pct) / system.cpu.cores\n| STATS AVG(cpu_pct_normalized) BY host.name"
},
{
"input": "I want to see a query that does the following: extract the query duration from postgres log messages, and calculate the avg",
"reference": "FROM postgres-logs\n| DISSECT message \"%{} duration: %{query_duration} ms\"\n| EVAL query_duration_num = TO_DOUBLE(query_duration)\n| STATS avg_duration = AVG(query_duration_num)"
},
{
"input": "From `nyc_taxis`, give me the top 10 results where the drop off time was between 6am and 10am. Just give me the query.",
"reference": "FROM nyc_taxis\n| WHERE DATE_EXTRACT(drop_off_time, \"hour\") >= 6 AND DATE_EXTRACT(drop_off_time, \"hour\") < 10\n| LIMIT 10"
},
{
"input": "Generate an ES|QL query that will search for the following use case: \nDetecting Failed Logins from a Single IP",
"reference": "FROM logs-*\n| WHERE event.action == \"failed_login\" \n| STATS login_counts = COUNT(event.action) by source.ip\n| WHERE login_counts > 5"
},
{
"input": "Generate an ES|QL query that will search for the following use case: \nDetecting Large Data Transfers\n\n",
"reference": "FROM logs-*\n| WHERE network.bytes > 1000000\n| KEEP source.ip, destination.ip, network.bytes"
},
{
"input": "Generate an ES|QL query that will search for the following use case: \nIdentifying Rare User Agents\n\n\n\n",
"reference": "FROM logs-*\n| STATS user_agent_count = COUNT(user_agent.original) by user_agent.original\n| WHERE user_agent_count < 5\n| KEEP user_agent.original"
},
{
"input": "Generate an ES|QL query that will search for the following use case: \nDetecting Potential Beaconing Activity\n\n\n\n",
"reference": "FROM logs-*\n| STATS domain_requests = COUNT(url.domain) by source.ip, domain\n| WHERE domain_requests > 100\n| KEEP source.ip, url.domain"
},
{
"input": "Generate an ES|QL query that will search for the following use case: \nIdentifying Uncommon Processes",
"reference": "FROM logs-*\n| STATS process_count = COUNT(process.name) by process.name\n| WHERE process_count < 3\n| KEEP process.name"
},
{
"input": "Generate an ES|QL query that will search for the following use case: \nLocating Suspicious Outbound Connections",
"reference": "FROM logs-*\n| WHERE destination.port NOT IN (80, 443) and direction == \"outbound\"\n| KEEP source.ip, destination.ip, destination.port"
},
{
"input": "Generate an ES|QL query that will search for the following use case: \nIdentifying Processes Running from Temporary Directories",
"reference": "FROM logs-*\n| WHERE process.working_directory RLIKE \"/tmp.*\"\n| KEEP process.name, process.working_directory"
},
{
"input": "Generate an ES|QL query that will search for the following use case: \nDetecting Connections to Non-Standard SSH Ports\n\n",
"reference": "FROM logs-*\n| WHERE destination.port NOT IN (22) AND process.name == \"ssh\"\n| KEEP source.ip, destination.ip, destination.port"
},
{
"input": "Generate an ES|QL query that will search for the following use case: \nIdentifying Potential Phishing Domains\n\n",
"reference": "FROM logs-*\n| WHERE url.domain RLIKE \".*paypa1.*|.*banking.*\"\n| KEEP source.ip, url.domain"
},
{
"input": "Generate an ES|QL query that will search for the following use case: \nLocating Suspicious Concatenated Strings in Logs (Possible Script or Malware)",
"reference": "FROM logs-*\n| WHERE CONCAT(\"evil\", \"payload\") IN log.message\n| KEEP log.message"
},
{
"input": "Generate an ES|QL query that will search for the following use case: \nUncovering Connections to Non-Standard SSH Ports Outside of Local Network",
"reference": "FROM logs-*\n| WHERE NOT CIDR_MATCH(destination.ip, \"10.0.0.0/8\", \"192.168.0.0/16\")\n| KEEP source.ip, destination.ip, network.bytes"
},
{
"input": "Generate an ES|QL query that will search for the following use case: \nUncovering Connections to Non-Standard SSH Ports Outside of Local Network",
"reference": "FROM logs-*\n| WHERE CIDR_MATCH(ip, \"10.0.0.0/8\", \"192.168.0.0/16\")\n| DISSECT user_agent \"%{browser_name}/%{browser_version} (%{os_name}; %{os_version})\""
},
{
"input": "Generate an ES|QL query that will search for the following use case: \nIdentifying Potential PowerShell Exploits\n\n",
"reference": "FROM logs-*\n| WHERE process.name == \"powershell.exe\" and process.command_line RLIKE \".*EncodedCommand.*\"\n| KEEP host.name, user.name, command_line, @timestamp"
},
{
"input": "Generate an ES|QL query that will :\nCategorize Data Transfer Sizes\n\n",
"reference": "FROM logs-*\n| EVAL transfer_category = CASE(\n bytes_transferred < 1000, \"Small\",\n bytes_transferred < 100000, \"Medium\",\n \"Large\")\n| STATS transfer_count = COUNT(transfer_category) by transfer_category\n| KEEP transfer_category, transfer_count"
},
{
"input": "Generate an ES|QL query that will:\nCategorizing Failed Logins by Reason",
"reference": "FROM logs-*\n| WHERE event.action == \"failed_login\"\n| EVAL failure_reason = CASE(\n error_code == \"404\", \"Not Found\",\n error_code == \"403\", \"Forbidden\",\n \"Other\")\n| STATS failure_count = COUNT(failure_reason) by failure_reason\n| KEEP failure_reason, failure_count"
},
{
"input": "Generate an ES|QL query that will search for the following use case: \nIdentifying Distinct IP counts Accessing Rare Endpoints",
"reference": "FROM logs-*\n| WHERE url.path RLIKE \"/admin/*\"\n| STATS unique_count = COUNT_DISTINCT(source.ip) by source.ip\n| KEEP unique_count, source.ip"
},
{
"input": "Generate an ES|QL query that will:\nIdentify Data Exfiltration by Monitoring Outgoing Data Size",
"reference": "FROM logs-*\n| WHERE network.direction == \"outbound\"\n| STATS data_size = SUM(bytes_transferred) by destination.ip\n| WHERE data_size > 1000000\n| KEEP destination.ip, data_size"
}
]

@@ -0,0 +1,38 @@
[
{
"input": "Generate an EQL Query to detect data exfiltration attempts on linux systems. The query should be immediately usable in an Elastic security timeline or detection rule.",
"reference": "This query looks for a sequence of events, first for a shell process (e.g., bash, zsh, etc.) execution, and then for an outbound network connection from that process within a specified list of ports commonly used for data exfiltration. \n\nsequence by process.entity_id\n [process\n where process.name : (\"bash\", \"sh\", \"zsh\", \"dash\", \"ksh\", \"tcsh\", \"fish\", \"csh\", \"pwsh\")\n ]\n [network\n where network.direction == \"outbound\" and\n destination.port : (20, 21, 22, 25, 80, 110, 143, 443, 465, 587, 993, 995, 3389, 5601)\n ]"
},
{
"input": "Generate an EQL query which will allow me to detect brute force attempts against my web servers.The query should be immediately usable in an Elastic security timeline or detection rule.",
"reference": "This query looks for a sequence of events where three consecutive failed authentication attempts from the same source IP are followed by a successful login within a 5-minute span.\n\n sequence by source.ip with maxspan=5m\n [ authentication where event.outcome == \"failure\" ] with runs=3\n [ authentication where event.outcome == \"success\" ]\n"
},
{
"input": "Generate an EQL query that will detect suspicious process activity from Microsoft office programs. The query should be immediately usable in an Elastic security timeline or detection rule.",
"reference": "Identifies suspicious child processes of frequently targeted Microsoft Office applications (Word, PowerPoint, Excel). These child processes are often launched during exploitation of Office applications or from documents with malicious macros.\n\nprocess where host.os.type == \"windows\" and event.type == \"start\" and\n process.parent.name : (\"eqnedt32.exe\", \"excel.exe\", \"fltldr.exe\", \"msaccess.exe\", \"mspub.exe\", \"powerpnt.exe\", \"winword.exe\", \"outlook.exe\") and\n process.name : (\"Microsoft.Workflow.Compiler.exe\", \"arp.exe\", \"atbroker.exe\", \"bginfo.exe\", \"bitsadmin.exe\", \"cdb.exe\", \"certutil.exe\",\n \"cmd.exe\", \"cmstp.exe\", \"control.exe\", \"cscript.exe\", \"csi.exe\", \"dnx.exe\", \"dsget.exe\", \"dsquery.exe\", \"forfiles.exe\",\n \"fsi.exe\", \"ftp.exe\", \"gpresult.exe\", \"hostname.exe\", \"ieexec.exe\", \"iexpress.exe\", \"installutil.exe\", \"ipconfig.exe\",\n \"mshta.exe\", \"msxsl.exe\", \"nbtstat.exe\", \"net.exe\", \"net1.exe\", \"netsh.exe\", \"netstat.exe\", \"nltest.exe\", \"odbcconf.exe\",\n \"ping.exe\", \"powershell.exe\", \"pwsh.exe\", \"qprocess.exe\", \"quser.exe\", \"qwinsta.exe\", \"rcsi.exe\", \"reg.exe\", \"regasm.exe\",\n \"regsvcs.exe\", \"regsvr32.exe\", \"sc.exe\", \"schtasks.exe\", \"systeminfo.exe\", \"tasklist.exe\", \"tracert.exe\", \"whoami.exe\",\n \"wmic.exe\", \"wscript.exe\", \"xwizard.exe\", \"explorer.exe\", \"rundll32.exe\", \"hh.exe\", \"msdt.exe\")"
},
{
"input": "Generate an EQL query that will detect the enumeration of files and directories using built-in tools. The query should be immediately usable in an Elastic security timeline or detection rule.",
"reference": "Enumeration of files and directories using built-in tools. Adversaries may use the information discovered to plan follow-on activity.\n\n\nsequence by agent.id, user.name with maxspan=1m\n[process where event.type in (\"start\", \"process_started\") and\n ((process.name : \"cmd.exe\" or process.pe.original_file_name == \"Cmd.Exe\") and process.args : \"dir\") or\n process.name : \"tree.com\"]\n[process where event.type in (\"start\", \"process_started\") and\n ((process.name : \"cmd.exe\" or process.pe.original_file_name == \"Cmd.Exe\") and process.args : \"dir\") or\n process.name : \"tree.com\"]\n[process where event.type in (\"start\", \"process_started\") and\n ((process.name : \"cmd.exe\" or process.pe.original_file_name == \"Cmd.Exe\") and process.args : \"dir\") or\n process.name : \"tree.com\"]"
},
{
"input": "Generate an EQL query that will detect unusual child proceses of RunDLL32. The query should be immediately usable in an Elastic security timeline or detection rule.",
"reference": "Identifies child processes of unusual instances of RunDLL32 where the command line parameters were suspicious. Misuse of RunDLL32 could indicate malicious activity.\n\nsequence with maxspan=1h\n [process where host.os.type == \"windows\" and event.type == \"start\" and\n (process.name : \"rundll32.exe\" or process.pe.original_file_name == \"RUNDLL32.EXE\") and\n process.args_count == 1\n ] by process.entity_id\n [process where host.os.type == \"windows\" and event.type == \"start\" and process.parent.name : \"rundll32.exe\"\n ] by process.parent.entity_id"
},
{
"input": "Generate an EQL query that will detect Multiple Logon Failures Followed by Logon Success. The query should be immediately usable in an Elastic security timeline or detection rule.",
"reference": "Identifies multiple logon failures followed by a successful one from the same source address. Adversaries will often brute force login attempts across multiple users with a common or known password, in an attempt to gain access to accounts.\n\nsequence by winlog.computer_name, source.ip with maxspan=5s\n [authentication where event.action == \"logon-failed\" and\n /* event 4625 need to be logged */\n winlog.logon.type : \"Network\" and\n source.ip != null and source.ip != \"127.0.0.1\" and source.ip != \"::1\" and\n not user.name : (\"ANONYMOUS LOGON\", \"-\", \"*$\") and not user.domain == \"NT AUTHORITY\" and\n\n /* noisy failure status codes often associated to authentication misconfiguration */\n not winlog.event_data.Status : (\"0xC000015B\", \"0XC000005E\", \"0XC0000133\", \"0XC0000192\")] with runs=5\n [authentication where event.action == \"logged-in\" and\n /* event 4624 need to be logged */\n winlog.logon.type : \"Network\" and\n source.ip != null and source.ip != \"127.0.0.1\" and source.ip != \"::1\" and\n not user.name : (\"ANONYMOUS LOGON\", \"-\", \"*$\") and not user.domain == \"NT AUTHORITY\"]"
},
{
"input": "Generate an EQL query that will detect potential sudo hijacking. The query should be immediately usable in an Elastic security timeline or detection rule.",
"reference": "Identifies the creation of a sudo binary located at /usr/bin/sudo. Attackers may hijack the default sudo binary and replace it with a custom binary or script that can read the user's password in clear text to escalate privileges or enable persistence onto the system every time the sudo binary is executed.\n\nfile where event.type in (\"creation\", \"file_create_event\") and file.path == \"/usr/bin/sudo\""
},
{
"input": "Generate an EQL query that will detect Tampering of Bash Command-Line History. The query should be immediately usable in an Elastic security timeline or detection rule.",
"reference": "Adversaries may attempt to clear or disable the Bash command-line history in an attempt to evade detection or forensic investigations.\n\nprocess where event.type in (\"start\", \"process_started\") and\n (\n ((process.args : (\"rm\", \"echo\") or\n (process.args : \"ln\" and process.args : \"-sf\" and process.args : \"/dev/null\") or\n (process.args : \"truncate\" and process.args : \"-s0\"))\n and process.args : (\".bash_history\", \"/root/.bash_history\", \"/home/*/.bash_history\",\"/Users/.bash_history\", \"/Users/*/.bash_history\",\n \".zsh_history\", \"/root/.zsh_history\", \"/home/*/.zsh_history\", \"/Users/.zsh_history\", \"/Users/*/.zsh_history\")) or\n (process.name : \"history\" and process.args : \"-c\") or\n (process.args : \"export\" and process.args : (\"HISTFILE=/dev/null\", \"HISTFILESIZE=0\")) or\n (process.args : \"unset\" and process.args : \"HISTFILE\") or\n (process.args : \"set\" and process.args : \"history\" and process.args : \"+o\")\n )"
},
{
"input": "Generate an EQL query that will detect an Attempt to Remove File Quarantine Attribute on macOS. The query should be immediately usable in an Elastic security timeline or detection rule.",
"reference": "Identifies a potential Gatekeeper bypass. In macOS, when applications or programs are downloaded from the internet, there is a quarantine flag set on the file. This attribute is read by Apple's Gatekeeper defense program at execution time. An adversary may disable this attribute to evade defenses.\n\n\nprocess where host.os.type == \"macos\" and event.type in (\"start\", \"process_started\") and\n process.name : \"xattr\" and\n (\n (process.args : \"com.apple.quarantine\" and process.args : (\"-d\", \"-w\")) or\n (process.args : \"-c\") or\n (process.command_line : (\"/bin/bash -c xattr -c *\", \"/bin/zsh -c xattr -c *\", \"/bin/sh -c xattr -c *\"))\n ) and not process.args_count > 12"
}
]

@@ -0,0 +1,38 @@
[
{
"input": "What is BLISTER malware?",
"reference": "BLISTER, a malware loader initially discovered by Elastic Security Labs in 2021 and associated with financially-motivated intrusions, is a testament to this trend as it continues to develop additional capabilities. Two years after its initial discovery, BLISTER continues to receive updates while flying under the radar, gaining momentum as an emerging threat. Recent findings from Palo Altos Unit 42 describe an updated SOCGHOLISH infection chain used to distribute BLISTER and deploy a payload from MYTHIC, an open-source Command and Control (C2) framework.\n\nView the full research on Elastic Security Labs"
},
{
"input": "What detection rules does Elastic Security Labs provide for ICEDID?",
"reference": "ICEDID is a malware family first described in 2017 by IBM X-force researchers and is associated with the theft of login credentials, banking information, and other personal information. ICEDID has always been a prevalent family, but has achieved even more growth since EMOTETs temporary disruption in early 2021. ICEDID has been linked to the distribution of other distinct malware families including DarkVNC and COBALT STRIKE. Regular industry reporting, including research publications like this one, help mitigate this threat.\n\nDetections and preventions\nDetection logic\nEnumeration of Administrator Accounts\nCommand Shell Activity Started via RunDLL32\nSecurity Software Discovery using WMIC\nSuspicious Execution from a Mounted Device\nWindows Network Enumeration\nUnusual DLL Extension Loaded by Rundll32 or Regsvr32\nSuspicious Windows Script Interpreter Child Process\nRunDLL32 with Unusual Arguments\nPreventions (source: https://github.com/elastic/protections-artifacts/)\nMalicious Behavior Detection Alert: Command Shell Activity\nMemory Threat Detection Alert: Shellcode Injection\nMalicious Behavior Detection Alert: Unusual DLL Extension Loaded by Rundll32 or Regsvr32\nMalicious Behavior Detection Alert: Suspicious Windows Script Interpreter Child Process\nMalicious Behavior Detection Alert: RunDLL32 with Unusual Arguments\nMalicious Behavior Detection Alert: Windows Script Execution from Archive File\nYARA\nElastic Security has created multiple YARA rules related to the different stages/components within ICEDID infection, these can be found in the signature linked below: - Windows.Trojan.ICEDID\n\nMore information can be found in this Elastic Security Labs post."
},
{
"input": "Can you provide a code analysis breakdown of the SPECTRALVIPER malware variant?",
"reference": "SPECTRALVIPER code analysis\nOverview\nDuring our investigation, we observed a previously-undiscovered backdoor malware family that were naming SPECTRALVIPER. SPECTRALVIPER is a 64-bit Windows backdoor coded in C++ and heavily obfuscated. It operates with two distinct communication modes, allowing it to receive messages either via HTTP or a Windows named pipe.\n\nThrough our analysis, we have identified the following capabilities:\n\nPE loading/Injection : SPECTRALVIPER can load and inject executable files, supporting both x86 and x64 architectures. This capability enables it to execute malicious code within legitimate processes.\nToken Impersonation : The malware possesses the ability to impersonate security tokens, granting it elevated privileges and bypassing certain security measures. This enables unauthorized access and manipulation of sensitive resources.\nFile downloading/uploading : SPECTRALVIPER can download and upload files to and from the compromised system. This allows the attacker to exfiltrate data or deliver additional malicious payloads to the infected machine.\nFile/directory manipulation : The backdoor is capable of manipulating files and directories on the compromised system. This includes creating, deleting, modifying, and moving files or directories, providing the attacker with extensive control over the victim's file system.\n\nFor more information, visit the Elastic Security Labs post."
},
{
"input": "Can you describe the threat that PHOREAL malware presents to an organization?",
"reference": "PHOREAL/RIZZO is a backdoor allowing initial victim characterization and follow-on post-exploitation operations to compromise the confidentiality of organizations data. It has been reported in other research as being used exclusively by APT32 (AKA SeaLotus, OceanLotus, APT-C-00, Group G0050).\n\nFor more information, see this Elastic Security Labs post."
},
{
"input": "Can you give an example of ransomware that attempts to wipe a host master boot record?",
"reference": "One such example observed by Elastic Security Labs is WhisperGate, part of the Bleeding Bear malware campaign. Ths SHA256 hash of this sample is a196c6b8ffcb97ffb276d04f354696e2391311db3841ae16c8c9f56f36a38e92. For more information, please view the detailed post on Elastic Security Labs."
},
{
"input": "How does Elastic Security Labs use kernel call stacks to detect threats living in memory?",
"reference": "With Elastic Security 8.8, Elastic Security Labs added new kernel call stack based detections which provide improved efficacy against in-memory threats. A call stack is the ordered sequence of functions that are executed to achieve a behavior of a program. It shows in detail which functions (and their associated modules) were executed to lead to a behavior like a new file or process being created. Knowing a behaviors call stack, we can build detections with detailed contextual information about what a program is doing and how its doing it. The new call stack based detection capability leverages our existing deep in-line kernel visibility for the most common system behaviors (process, file, registry, library, etc). With each event, we capture the call stack for the activity. This is later enriched with module information, symbols, and evidence of suspicious activity. This gives Elastic Defend procmon-like visibility in real-time, powering advanced preventions for in-memory tradecraft.\n\nView this post for more information."
},
{
"input": "What were some of Elastic Security Labs key findings in their 2023 Global Threat Report?",
"reference": "Impairing defenses by tampering with cloud logging functionality was one of the most common techniques observed in the later part of 2022 and continues into 2023 - This likely impacted visibility of other techniques due to missing data sources, and is potentially a reaction to improvements in cloud logging - XMRig prevalence exploded on MacOS, likely as a result of macroeconomic conditions. The full 2023 Spring report can be found here."
},
{
"input": "Is Elastic Security Labs tracking any malware that targets macOS systems?",
"reference": "Yes, one such malware variant is RUSTBUCKET. RUSTBUCKET adds persistence capabilities not previously observed and, at the time of reporting, is undetected by VirusTotal signature engines. Elastic Defend behavioral and prebuilt detection rules provide protection and visibility for users. We have also released a signature to prevent this malware execution.\n\nThe research into REF9135 used host, binary, and network analysis to identify and attribute intrusions observed by this research team, and other intelligence groups, with high confidence to the Lazarus Group; a cybercrime and espionage organization operated by the Democratic Peoples Republic of North Korea (DPRK).\n\nView this post for the full research article."
},
{
"input": "How does Elastic Defend help agaist threats like BPFDoor?",
"reference": "BPFDoor is a backdoor payload specifically crafted for Linux. Its purpose is for long-term persistence in order to gain re-entry into a previously or actively compromised target environment. It notably utilizes BPF along with a number of other techniques to achieve this goal, taking great care to be as efficient and stealthy as possible. \n\nElastic Security Labs has several detections that can detect and prevent BPDoor. They have also published Yara signatures for it.\n\nThe following Elastic Detection Rules will identify BPFDoor activity:\n\nAbnormal Process ID or Lock File Created\nBinary Executed from Shared Memory Directory\n\nThe yara signature can be found here.\n\nMore details on BPFDoor can be found in this Elastic Security Labs post."
}
]

View file

@ -0,0 +1,54 @@
[
{
"input": "How do I install Elastic Agent to collect events from my windows systems?",
"reference": "To install an Elastic Agent and enroll it in Fleet:\n\nIn Kibana, go to Fleet > Agents, and click Add agent.\nIn the Add agent flyout, select an existing agent policy or create a new one. If you create a new policy, Fleet generates a new Fleet enrollment token.\nFor on-premises deployments, you can dedicate a policy to all the agents in the network boundary and configure that policy to include a specific Fleet Server (or a cluster of Fleet Servers).\nMake sure Enroll in Fleet is selected.\nDownload, install, and enroll the Elastic Agent on your host by selecting your host operating system and following the Install Elastic Agent on your host step."
},
{
"input": "How would I run Malware prevention via Elastic Defend on my macOS systems?",
"reference": "Elastic Defend malware prevention detects and stops malicious attacks by using a machine learning model that looks for static attributes to determine if a file is malicious or benign. You will need to create a fleet policy with the Elastic Defend integration.\n\nBy default, malware protection is enabled on Windows, macOS, and Linux hosts. To disable malware protection, switch the Malware protections enabled toggle off.\n\nMalware protection levels are:\n\nDetect: Detects malware on the host and generates an alert. The agent will not block malware. You must pay attention to and analyze any malware alerts that are generated.\nPrevent (Default): Detects malware on the host, blocks it from executing, and generates an alert.\nSelect Notify user to send a push notification in the host operating system when activity is detected or prevented. Notifications are enabled by default for the Prevent option.\n\nPlatinum and Enterprise customers can customize these notifications using the Elastic Security {action} {filename} syntax.\nMalware protection also allows you to manage a blocklist to prevent specified applications from running on hosts, extending the list of processes that Elastic Defend considers malicious. Use the Blocklist enabled toggle to enable or disable this feature for all hosts associated with the integration policy. To configure the blocklist, refer to Blocklist.\n\nWhen Prevent is enabled for malware protection, Elastic Defend will quarantine any malicious file it finds. Specifically Elastic Defend will remove the file from its current location, encrypt it with the encryption key ELASTIC, move it to a different folder, and rename it as a GUID string, such as 318e70c2-af9b-4c3a-939d-11410b9a112c.\n\nThe quarantine folder location varies by operating system:\n\nmacOS: /System/Volumes/Data/.equarantine\nLinux: .equarantine at the root of the mount point of the file being quarantined\nWindows - Elastic Defend versions 8.5 and later: [DriveLetter:]\\.quarantine, unless the files are from the C: drive. These files are moved to C:\\Program Files\\Elastic\\Endpoint\\state\\.equarantine.\nWindows - Elastic Defend versions 8.4 and earlier: [DriveLetter:]\\.quarantine, for any drive\nTo restore a quarantined file to its original state and location, add an exception to the rule that identified the file as malicious. If the exception wouldve stopped the rule from identifying the file as malicious, Elastic Defend restores the file.\n\nVisit the documentation for further guidance."
},
{
"input": "How do I build a visualization in Elastic Security to display to top 10 processes with outbound traffic over time?",
"reference": "Visualization in Elastic Security are powered by Lens. To build a new Lens visualization, go the the Analytics menu in Kibana, select \"visualize library\" and click on the \"create visualization\" button. Select the \"Lens\" tile. Pick your chart type, and in the Lens layer, \n\nSelect the correct data view that contains your network/process events from the top right. The \"Horizontal axis\" field should contain \"@timestamp\". The vertical axis should contain a metric which is the \"sum of destination.bytes\". The breakdown section should be the \"top 10 values of process.name\". Visit the lens documentation for further information."
},
{
"input": "How do I configure a generated alert to send an e-mail notification to my security operations center in Elastic Security?",
"reference": "This is acheived by using alert actions on the Elastic Security detection rule in question.\n\nNavigate to the Alerts page in Elastic Security and click on the \"Manage Rules\" button. Select the detection rule(s) you would like to set up th e-mail action for. Select the connector type (in this case, e-mail), set its action frequency and any additional conditions. Add the body content that you would like to include in any e-mails that are sent. When ready, save the changes to the rule. Visit the documentation for more information."
},
{
"input": "What would be the best way to add an exception for my Elastic Security rules not to trigger when the host of any given event is within a specified list?",
"reference": "Value lists hold multiple values of the same Elasticsearch data type, such as IP addresses, which are used to determine when an exception prevents an alert from being generated. You can use value lists to define exceptions for detection rules; however, you cannot use value lists to define endpoint rule exceptions.\n\nAfter creating value lists, you can use is in list and is not in list operators to define exceptions.\n\nTo create a value list:\n\nPrepare a txt or csv file with all the values you want to use for determining exceptions from a single list. If you use a txt file, new lines act as delimiters.\nGo to Manage → Rules.\nClick Import value lists. The Import value lists window opens.\nSelect the list type (Keywords, IP addresses, IP ranges, or Text) from the Type of value list drop-down.\nDrag or select the csv or txt file that contains the values.\nClick Import value list.\n\nFor more information on value lists, please reference the documentation"
},
{
"input": "I need to ingest GCP Audit logs for use within Elastic Security. Please provide instructions on how to do this.",
"reference": "There is a pre built Elastic Agent that will allow you to ingest GCP Audit logs easily. Once the audit logs have been configured within the Google cloud console accordingly, proceed to the Fleet management menu in kibana. Go to integrations, and add the \"Google Cloud Platform (GCP) Audit logs\" integration to a new or existing policy. You will need to supply the credentials for the specfic Google Cloud Project, the pub-sub topic and subscription name. Once that is done, the changes will be applied to the hosts that are assigned to the policy you just added the integration to. Reference the documentation for further information about this integration."
},
{
"input": "Please provide instructions on how I can add screenshots to a case within Elastic Security",
"reference": "Once you have your screenshots saved as files on your local system, you can upload these via the files tab within an Elastic Security case. You can set file types and sizes by configuring your Kibana case settings.\n\nWhen you add a file, a comment is added to the case activity log. To view an image, click its name in the activity or file list.\n\nImages with supported image types can be rendered within the case simply by clicking on the file name. Reference the documentation for further information."
},
{
"input": "How can I terminate/kill a malicious process that I've identified on one of my hosts via Elastic Defend?",
"reference": "Elastic Defend allows you to run response actions via the dedicated response console.\n\nLaunch the response console from any of the following places in Elastic Security:\n\nEndpoints page → Actions menu (…​) → Respond\nEndpoint details flyout → Take action → Respond\nAlert details flyout → Take action → Respond\n\nTo perform an action on the endpoint, enter a response action command in the input area at the bottom of the console, then press Return. Output from the action is displayed in the console.\n\nIf a host is unavailable, pending actions will execute once the host comes online. Pending actions expire after two weeks and can be tracked in the response actions history.\n\nActivity in the response console is persistent, so you can navigate away from the page and any pending actions youve submitted will continue to run. To confirm that an action completed, return to the response console to view the console output or check the response actions history.\n\nUse the \"kill-process\" command to terminate a process. You must include one of the following parameters to identify the process to terminate:\n\n--pid : A process ID (PID) representing the process to terminate.\n--entityId : An entity ID representing the process to terminate.\nRequired privilege: Process Operations\n\nExample: kill-process --pid 123 --comment \"Terminate suspicious process\"\n\nPlease reference the documentation for more information."
},
{
"input": "I need to adjust the data retention policy for my instance of Elastic Security. Please provide instructions on how to do this.",
"reference": "Assuming you are using the default policy for Elastic Agent data streams, you can change the default policy as you require.\n\nTo view the logs policy in Kibana:\n\nOpen the menu and go to Stack Management > Index Lifecycle Policies.\nSelect Include managed system policies.\nSelect the logs policy.\n\nThe default logs policy is designed to prevent the creation of many tiny daily indices. You can modify the policy to meet your performance requirements and manage resource usage.\n\nAs an example, to activate the warm phase, tick the option for the Warm Phase. and click Advanced settings.\n\nSet Move data into phase when to 30 days old. This moves indices to the warm tier 30 days after rollover.\nEnable Set replicas and change Number of replicas to 1.\nEnable Force merge data and set Number of segments to 1.\n\nYou can change each phase of the policy in the same way. For more information, view the documentation."
},
{
"input": "Can you explain what a \"New terms\" rule type is in context of the Elastic Security Detection engine?",
"reference": "A new terms rule generates an alert for each new term detected in source documents within a specified time range. You can also detect a combination of up to three new terms (for example, a host.ip and host.id that have never been observed together before). View the documentation for more information about this rule type."
},
{
"input": "What threat intelligence feeds can I use natively within Elastic Security?",
"reference": "Elastic Security supports several threat intelligence feeds natively via Elastic Agent integrations. Here is the full list of providers. Each provider can have several different feeds supported by the integration:\n\n- AbuseCH\n- AlienVault OTX\n- Anomali\n- Cyberark\n- Cybersixgill\n- Maltiverse\n- Rapid7 Threat Command\n- Recorded Future\n- Threat Quotient\n\nElastic Agent can also integrate with threat intelligence platforms such as MISP and Collective Intelligence Framework."
},
{
"input": "When a specific alert triggers, I need to collect the security patches that are installed on the system at that point in time. How would I do this within Elastic Security?",
"reference": "Elastic Security has native support for OSQuery Management via Elastic Agent. OSQuery can be invoked to run a specified query whenever an alert rule triggers. In this case, OSQuery can be queried for the table within its schema for security patches.\n\nYou can add Osquery Response Actions to new or existing custom query rules. Queries run every time the rule executes.\n\nChoose one of the following:\n\nNew rule: When you are on the last step of custom query rule creation, go to the Response Actions section and click the Osquery icon.\nExisting rule: Edit the rules settings, then go to the Actions tab. In the tab, click the Osquery icon under the Response Actions section.\n\n\nSpecify whether you want to set up a single live query or a pack:\n\nQuery: Select a saved query or enter a new one. After you enter the query, you can expand the Advanced section to view or set mapped ECS fields included in the results from the live query. Mapping ECS fields is optional.\n\nYou can use placeholder fields to dynamically add alert data to your query.\nPack: Select from available query packs. After you select a pack, all of the queries in the pack are displayed.\n\nClick the Osquery icon to add more live queries (optional).\nClick Create & enable rule (for a new rule) or Save changes (for existing rules) to finish adding the queries.\n\nIn this case, the query in question will be - \"\"select * from patches where description == \"\"Security Update\"\"\"\"\n\nFor more information about running OSQuery responses, view the documentation."
},
{
"input": "Where would I raise an issue/request for Elastic Security detections and preventions?",
"reference": "Issues and requests for Elastic Security detections can be raised in https://github.com/elastic/detection-rules. The repository for protections is https://github.com/elastic/protections-artifacts."
}
]

View file

@ -0,0 +1,146 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { loadEvaluator } from 'langchain/evaluation';
import { LLM } from 'langchain/llms/base';
import { ChainValues, HumanMessage } from 'langchain/schema';
import { chunk } from 'lodash/fp';
import { Logger } from '@kbn/core/server';
import { ToolingLog } from '@kbn/tooling-log';
import { asyncForEach } from '@kbn/std';
import { AgentExecutorEvaluator } from '../langchain/executors/types';
import { Dataset } from '../../schemas/evaluate/post_evaluate';
import { callAgentWithRetry, getMessageFromLangChainResponse, wait } from './utils';
import { ResponseBody } from '../langchain/types';
export interface PerformEvaluationParams {
agentExecutorEvaluators: AgentExecutorEvaluator[];
dataset: Dataset;
evaluationId: string;
evaluatorModel: LLM;
evaluationPrompt?: string;
evaluationType: string;
maxConcurrency?: number;
logger: Logger | ToolingLog;
}
export interface EvaluationResult {
'@timestamp': string;
evaluation: ChainValues;
evaluationId: string;
input: string;
prediction: string;
predictionResponse: PromiseSettledResult<ResponseBody>;
reference: string;
}
export interface EvaluationSummary {
'@timestamp': string;
evaluationStart: number;
evaluationEnd: number;
evaluationId: string;
evaluationDuration: number;
totalAgents: number;
totalRequests: number;
totalInput: number;
}
/**
* Evaluates a dataset based on an evaluation rubric. Takes a dataset of input/reference pairs,
* and fetches the output (prediction) of the input against the provided agent executors.
* Then evaluates all three using the provided evaluation rubric.
*/
export const performEvaluation = async ({
agentExecutorEvaluators,
dataset,
evaluationId,
evaluatorModel,
evaluationPrompt,
evaluationType,
maxConcurrency = 3,
logger,
}: PerformEvaluationParams) => {
const startTime = new Date().getTime();
const evaluationResults: EvaluationResult[] = [];
const predictionRequests = dataset.flatMap(({ input, reference }) =>
agentExecutorEvaluators.map((agent) => ({
input,
reference,
// Defer the agent call so requests start only when their chunk is processed,
// keeping in-flight concurrency bounded by `maxConcurrency`
request: () => callAgentWithRetry({ agent, messages: [new HumanMessage(input)], logger }),
}))
);
logger.info(`Total prediction requests: ${predictionRequests.length}`);
logger.info(`Chunk size: ${maxConcurrency}`);
logger.info('Fetching predictions...');
const requestChunks = chunk(maxConcurrency, predictionRequests);
await asyncForEach(requestChunks, async (c, i) => {
logger.info(`Prediction request chunk: ${i + 1} of ${requestChunks.length}`);
// Note, order is kept between chunk and dataset, and is preserved w/ Promise.allSettled
const chunkResults = await Promise.allSettled(c.map((r) => r.request()));
logger.info(`Prediction request chunk ${i + 1} response:\n${JSON.stringify(chunkResults)}`);
chunkResults.forEach((response, chunkResultIndex) =>
evaluationResults.push({
'@timestamp': new Date().toISOString(),
input: c[chunkResultIndex].input,
reference: c[chunkResultIndex].reference,
evaluationId,
evaluation: {},
prediction: getMessageFromLangChainResponse(response),
predictionResponse: response,
})
);
});
logger.info(`Prediction results:\n${JSON.stringify(evaluationResults)}`);
logger.info('Performing evaluation...');
logger.info(`Evaluation model: ${evaluatorModel._llmType()}`);
if (evaluationType === 'correctness') {
logger.info('Evaluation type: correctness');
const evaluator = await loadEvaluator('labeled_criteria', {
criteria: 'correctness',
llm: evaluatorModel,
});
await asyncForEach(evaluationResults, async ({ input, prediction, reference }, index) => {
// TODO: Rate limit evaluator calls, though haven't seen any `429`'s yet in testing datasets up to 10 w/ azure/bedrock
const evaluation = await evaluator.evaluateStrings({
input,
prediction,
reference,
});
evaluationResults[index].evaluation = evaluation;
await wait(1000);
});
} else if (evaluationType === 'esql-validator') {
logger.info('Evaluation type: esql-validator');
// TODO: Implement esql-validator here
} else if (evaluationType === 'custom') {
logger.info('Evaluation type: custom');
// TODO: Implement custom evaluation here
}
const endTime = new Date().getTime();
const evaluationSummary: EvaluationSummary = {
evaluationId,
'@timestamp': new Date().toISOString(),
evaluationStart: startTime,
evaluationEnd: endTime,
evaluationDuration: endTime - startTime,
totalAgents: agentExecutorEvaluators.length,
totalInput: dataset.length,
totalRequests: predictionRequests.length,
};
logger.info(`Final results:\n${JSON.stringify(evaluationResults, null, 2)}`);
return { evaluationResults, evaluationSummary };
};
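For quick local iteration, the evaluator can be smoke-tested without a real connector. A minimal sketch, assuming `FakeListLLM` is exported from `langchain/llms/fake` in the pinned LangChain version and that `ResponseBody` carries the actions-framework `connector_id`/`data`/`status` fields; the canned evaluator response is shaped so the `labeled_criteria` output parser can find a verdict:

```ts
import { FakeListLLM } from 'langchain/llms/fake';
import { ToolingLog } from '@kbn/tooling-log';
import { performEvaluation } from './evaluation';
import type { ResponseBody } from '../langchain/types';

// Hypothetical stub agent: always "predicts" the same canned answer.
const stubAgent = async (): Promise<ResponseBody> =>
  ({ connector_id: 'stub', data: 'BLISTER is a malware loader.', status: 'ok' } as ResponseBody);

const { evaluationResults, evaluationSummary } = await performEvaluation({
  agentExecutorEvaluators: [stubAgent],
  dataset: [
    { input: 'What is BLISTER malware?', reference: 'BLISTER is a malware loader...', prediction: undefined },
  ],
  evaluationId: 'local-smoke-test',
  evaluatorModel: new FakeListLLM({ responses: ['The submission matches the reference.\nY'] }),
  evaluationType: 'correctness',
  logger: new ToolingLog({ level: 'info', writeTo: process.stdout }),
});
console.log(evaluationSummary, evaluationResults.length);
```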

View file

@ -0,0 +1,115 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { MappingTypeMapping } from '@elastic/elasticsearch/lib/api/types';
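// Mappings for the evaluation output index. Both the per-result documents and the
// single run-summary document are written to the same index, so fields for both
// document shapes are defined here.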
export const evaluationIndexMappings: MappingTypeMapping = {
properties: {
'@timestamp': {
type: 'date',
},
evaluation: {
properties: {
reasoning: {
type: 'text',
fields: {
keyword: {
type: 'keyword',
ignore_above: 1024,
},
},
},
score: {
type: 'long',
},
value: {
type: 'text',
},
},
},
evaluationId: {
type: 'text',
fields: {
keyword: {
type: 'keyword',
ignore_above: 1024,
},
},
},
evaluationStart: {
type: 'long',
},
evaluationEnd: {
type: 'long',
},
evaluationDuration: {
type: 'long',
},
input: {
type: 'text',
fields: {
keyword: {
type: 'keyword',
ignore_above: 1024,
},
},
},
prediction: {
type: 'text',
fields: {
keyword: {
type: 'keyword',
ignore_above: 1024,
},
},
},
predictionResponse: {
properties: {
status: {
type: 'text',
},
value: {
properties: {
connector_id: {
type: 'text',
},
data: {
type: 'text',
fields: {
keyword: {
type: 'keyword',
ignore_above: 1024,
},
},
},
status: {
type: 'text',
},
},
},
},
},
reference: {
type: 'text',
fields: {
keyword: {
type: 'keyword',
ignore_above: 1024,
},
},
},
totalAgents: {
type: 'long',
},
totalInput: {
type: 'long',
},
totalRequests: {
type: 'long',
},
},
};
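Once results are written, they can be pulled back out with a standard search. A hedged read-side sketch (node address, auth, and the run id are placeholders; the index name matches `MODEL_EVALUATION_RESULTS_INDEX_PATTERN` from the plugin constants):

```ts
import { Client } from '@elastic/elasticsearch';

// Hypothetical: list one run's results, best evaluation score first.
const client = new Client({ node: 'http://localhost:9200' });
const response = await client.search({
  index: '.kibana-elastic-ai-assistant-evaluation-results',
  query: { term: { 'evaluationId.keyword': '<your-evaluation-id>' } },
  sort: [{ 'evaluation.score': { order: 'desc' } }],
  size: 100,
});
console.log(response.hits.hits.map((hit) => hit._source));
```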

View file

@ -0,0 +1,104 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { ElasticsearchClient } from '@kbn/core-elasticsearch-server';
import { Logger } from '@kbn/logging';
import { ToolingLog } from '@kbn/tooling-log';
import { evaluationIndexMappings as mappings } from './mappings';
import { EvaluationResult, EvaluationSummary } from '../evaluation';
interface SetupIndexParams {
esClient: ElasticsearchClient;
index: string;
logger: Logger | ToolingLog;
}
/**
* Sets up the output index for the model evaluator. Creates the index with
* mappings if it does not already exist
*
* @param {Object} options - The options object.
* @param {ElasticsearchClient} options.esClient Elasticsearch client
* @param {string} options.index Name of the output index
*
* @returns {Promise<boolean>} True if index exists or created successfully
*/
export const setupEvaluationIndex = async ({
esClient,
index,
logger,
}: SetupIndexParams): Promise<boolean> => {
// Check if index exists
const indexExists = await esClient.indices.exists({ index });
if (indexExists) {
logger.info(`Index "${index}" already exists`);
return true;
}
// Create the index with default eval mappings if it does not exist
const settings = {};
const response = await esClient.indices.create({
index,
mappings,
settings,
});
if (response.acknowledged) {
logger.info(`Created index "${index}"`);
} else {
logger.error(`Error creating index "${index}"`);
}
return response.acknowledged;
};
interface IndexEvaluationsParams {
esClient: ElasticsearchClient;
evaluationResults: EvaluationResult[];
evaluationSummary: EvaluationSummary;
index: string;
logger: Logger | ToolingLog;
}
/**
* Indexes evaluation results into the output index
* @param {Object} options - The options object.
* @param {ElasticsearchClient} options.esClient Elasticsearch client
* @param {EvaluationResult[]} options.evaluationResults Individual eval results
* @param {EvaluationSummary} options.evaluationSummary Summary of eval
* @param {string} options.index Name of the output index
*
* @returns {Promise<boolean>} True if documents created successfully
*/
export const indexEvaluations = async ({
esClient,
evaluationResults,
evaluationSummary,
index,
logger,
}: IndexEvaluationsParams): Promise<boolean> => {
try {
logger.info(`Writing evaluations...`);
const response = await esClient.helpers.bulk({
datasource: evaluationResults,
onDocument() {
return { index: { _index: index } };
},
});
logger.info(`Evaluations bulk index response:\n${JSON.stringify(response)}`);
logger.info(`Writing summary...`);
const summaryResponse = await esClient.index({ index, document: evaluationSummary });
logger.info(`Summary index response:\n${JSON.stringify(summaryResponse)}`);
return true;
} catch (e) {
logger.error('Error indexing data into the evaluation index', e);
return false;
}
};

View file

@ -0,0 +1,110 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { Logger } from '@kbn/logging';
import { ToolingLog } from '@kbn/tooling-log';
import { BaseMessage } from 'langchain/schema';
import { ResponseBody } from '../langchain/types';
import { AgentExecutorEvaluator } from '../langchain/executors/types';
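/** Promise-based sleep; used to honor Retry-After delays and space out evaluator calls */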
export const wait = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
export interface CallAgentWithRetryParams {
agent: AgentExecutorEvaluator;
messages: BaseMessage[];
logger: Logger | ToolingLog;
maxRetries?: number;
}
export const callAgentWithRetry = async ({
agent,
messages,
logger,
maxRetries = 3,
}: CallAgentWithRetryParams) => {
for (let attempt = 0; attempt < maxRetries; attempt++) {
try {
return await agent(messages);
} catch (error) {
// Check for 429, and then if there is a retry-after header
const { isRateLimitError, retryAfter } = parseErrorMessage(error);
if (isRateLimitError) {
logger.error(
"callAgentWithRetry: Slow down! You're going too fast! 429 detected! Retrying after..."
);
if (retryAfter != null) {
logger.error(`${retryAfter} seconds`);
await wait(retryAfter * 1000);
// eslint-disable-next-line no-continue
continue;
}
}
// If not 429 or there is no retry-after header, reject the promise
logger.error(`Error calling agent:\n${error}`);
return Promise.reject(error);
}
}
logger.error(`callAgentWithRetry: Max retries reached: ${maxRetries}`);
// Reject and keep going!
// eslint-disable-next-line prefer-promise-reject-errors
return Promise.reject(`callAgentWithRetry: Max retries reached: ${maxRetries}`);
};
export const getMessageFromLangChainResponse = (
response: PromiseSettledResult<ResponseBody>
): string => {
if (response.status === 'fulfilled' && response.value.data != null) {
return getFormattedMessageContent(response.value.data);
}
return 'error';
};
/**
* Lifted from `x-pack/packages/kbn-elastic-assistant/impl/assistant/helpers.ts`
* TODO: Move this to a shared location
*
* When `content` is a JSON string, prefixed with "```json\n"
* and suffixed with "\n```", this function will attempt to parse it and return
* the `action_input` property if it exists.
*/
export const getFormattedMessageContent = (content: string): string => {
const formattedContentMatch = content.match(/```json\n([\s\S]+)\n```/);
if (formattedContentMatch) {
try {
const parsedContent = JSON.parse(formattedContentMatch[1]);
return parsedContent.action_input ?? content;
} catch {
// we don't want to throw an error here, so we'll fall back to the original content
}
}
return content;
};
/**
* Parse an error message coming back from the agent via the actions frameworks to determine if it is
* a rate limit error and extract the retry after delay.
*
* Note: Could be simplified by instrumenting agents w/ callback where there's access to the actual response
* @param error
*/
export const parseErrorMessage = (
error: Error
): { isRateLimitError: boolean; retryAfter: number | null } => {
const errorMessage: string = error.message;
const rateLimitRegex = /Status code: 429.*?Please retry after (\d+) seconds/;
const match = errorMessage.match(rateLimitRegex);
// If there is a match, return the parsed delay; otherwise, return an indication that it is not a 429 error.
if (match && match[1]) {
return { isRateLimitError: true, retryAfter: parseInt(match[1], 10) };
} else {
return { isRateLimitError: false, retryAfter: null };
}
};
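For reference, hedged examples of what the parser accepts (the error strings are illustrative, modeled on the regex above):

```ts
// Matches the rate-limit pattern: the retry delay is extracted from the message.
parseErrorMessage(new Error('Status code: 429. Please retry after 13 seconds'));
// => { isRateLimitError: true, retryAfter: 13 }

// Anything else is treated as a hard failure and rejected by callAgentWithRetry.
parseErrorMessage(new Error('Status code: 500. Internal server error'));
// => { isRateLimitError: false, retryAfter: null }
```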

View file

@ -25,6 +25,7 @@ import {
deleteKnowledgeBaseRoute,
getKnowledgeBaseStatusRoute,
postActionsConnectorExecuteRoute,
postEvaluateRoute,
postKnowledgeBaseRoute,
} from './routes';
@ -72,10 +73,14 @@ export class ElasticAssistantPlugin
)
);
// Knowledge Base
deleteKnowledgeBaseRoute(router);
getKnowledgeBaseStatusRoute(router);
postKnowledgeBaseRoute(router);
// Actions Connector Execute (LLM Wrapper)
postActionsConnectorExecuteRoute(router);
// Evaluate
postEvaluateRoute(router);
return {
actions: plugins.actions,
};

View file

@ -0,0 +1,170 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { IRouter, KibanaRequest, Logger } from '@kbn/core/server';
import { transformError } from '@kbn/securitysolution-es-utils';
import { v4 as uuidv4 } from 'uuid';
import { buildResponse } from '../../lib/build_response';
import { buildRouteValidation } from '../../schemas/common';
import { ElasticAssistantRequestHandlerContext } from '../../types';
import { EVALUATE } from '../../../common/constants';
import { PostEvaluateBody, PostEvaluatePathQuery } from '../../schemas/evaluate/post_evaluate';
import { performEvaluation } from '../../lib/model_evaluator/evaluation';
import { callAgentExecutor } from '../../lib/langchain/execute_custom_llm_chain';
import { callOpenAIFunctionsExecutor } from '../../lib/langchain/executors/openai_functions_executor';
import { AgentExecutor, AgentExecutorEvaluator } from '../../lib/langchain/executors/types';
import { ActionsClientLlm } from '../../lib/langchain/llm/actions_client_llm';
import {
indexEvaluations,
setupEvaluationIndex,
} from '../../lib/model_evaluator/output_index/utils';
import { getLlmType } from './utils';
import { RequestBody } from '../../lib/langchain/types';
/**
* To support additional Agent Executors from the UI, add them to this map
* and reference your specific AgentExecutor function
*/
const AGENT_EXECUTOR_MAP: Record<string, AgentExecutor> = {
DefaultAgentExecutor: callAgentExecutor,
OpenAIFunctionsExecutor: callOpenAIFunctionsExecutor,
};
export const postEvaluateRoute = (router: IRouter<ElasticAssistantRequestHandlerContext>) => {
router.post(
{
path: EVALUATE,
validate: {
body: buildRouteValidation(PostEvaluateBody),
query: buildRouteValidation(PostEvaluatePathQuery),
},
},
async (context, request, response) => {
// TODO: Limit route based on experimental feature
const resp = buildResponse(response);
const logger: Logger = (await context.elasticAssistant).logger;
const { evalModel, evaluationType, outputIndex } = request.query;
const { dataset, evalPrompt } = request.body;
const connectorIds = request.query.models?.split(',') || [];
const agentNames = request.query.agents?.split(',') || [];
const evaluationId = uuidv4();
logger.info('postEvaluateRoute:');
logger.info(`request.query:\n${JSON.stringify(request.query, null, 2)}`);
logger.info(`request.body:\n${JSON.stringify(request.body, null, 2)}`);
logger.info(`Evaluation ID: ${evaluationId}`);
const totalExecutions = connectorIds.length * agentNames.length * dataset.length;
logger.info('Creating agents:');
logger.info(`\tconnectors/models: ${connectorIds.length}`);
logger.info(`\tagents: ${agentNames.length}`);
logger.info(`\tdataset: ${dataset.length}`);
logger.warn(`\ttotal baseline agent executions: ${totalExecutions}`);
if (totalExecutions > 50) {
logger.warn(
`Total baseline agent executions > 50! This may take a while, and cost some money...`
);
}
try {
// Get the actions plugin start contract from the request context for the agents
const actions = (await context.elasticAssistant).actions;
// Fetch all connectors from the actions plugin, so we can set the appropriate `llmType` on ActionsClientLlm
const actionsClient = await actions.getActionsClientWithRequest(request);
const connectors = await actionsClient.getBulk({
ids: connectorIds,
throwIfSystemAction: false,
});
// Get a scoped esClient for passing to the agents for retrieval, and
// writing results to the output index
const esClient = (await context.core).elasticsearch.client.asCurrentUser;
// Skeleton request to satisfy `subActionParams` spread in `ActionsClientLlm`
const skeletonRequest: KibanaRequest<unknown, unknown, RequestBody> = {
...request,
body: {
params: {
subAction: 'invokeAI',
subActionParams: {
messages: [],
},
},
},
};
// Create an array of executor functions to call in batches
// One for each connector/model + agent combination
// Hoist `langChainMessages` so they can be batched by dataset.input in the evaluator
const agents: AgentExecutorEvaluator[] = [];
connectorIds.forEach((connectorId) => {
agentNames.forEach((agentName) => {
logger.info(`Creating agent: ${connectorId} + ${agentName}`);
const llmType = getLlmType(connectorId, connectors);
agents.push((langChainMessages) =>
AGENT_EXECUTOR_MAP[agentName]({
actions,
connectorId,
esClient,
langChainMessages,
llmType,
logger,
request: skeletonRequest,
})
);
});
});
logger.info(`Agents created: ${agents.length}`);
const evaluatorModel = new ActionsClientLlm({
actions,
connectorId: evalModel,
request: skeletonRequest,
logger,
});
const { evaluationResults, evaluationSummary } = await performEvaluation({
agentExecutorEvaluators: agents,
dataset,
evaluationId,
evaluatorModel,
evaluationPrompt: evalPrompt,
evaluationType,
logger,
});
logger.info(`Writing evaluation results to index: ${outputIndex}`);
await setupEvaluationIndex({ esClient, index: outputIndex, logger });
await indexEvaluations({
esClient,
evaluationResults,
evaluationSummary,
index: outputIndex,
logger,
});
return response.ok({
body: { success: true },
});
} catch (err) {
logger.error(err);
const error = transformError(err);
return resp.error({
body: error.message,
statusCode: error.statusCode,
});
}
}
);
};
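With the `assistantModelEvaluation` flag enabled, the route can be exercised end to end against a local Kibana. A hedged sketch (connector ids, credentials, and host are placeholders; not part of this PR):

```ts
// Hypothetical smoke test using global fetch (Node 18+) and basic auth.
const query = new URLSearchParams({
  models: '<connector-id-1>,<connector-id-2>',
  agents: 'DefaultAgentExecutor,OpenAIFunctionsExecutor',
  evalModel: '<evaluator-connector-id>',
  evaluationType: 'correctness',
  outputIndex: '.kibana-elastic-ai-assistant-evaluation-results',
});

const response = await fetch(`http://localhost:5601/internal/elastic_assistant/evaluate?${query}`, {
  method: 'POST',
  headers: {
    'Content-Type': 'application/json',
    'kbn-xsrf': 'true',
    Authorization: `Basic ${Buffer.from('elastic:changeme').toString('base64')}`,
  },
  body: JSON.stringify({
    dataset: [{ input: 'What is BLISTER malware?', reference: 'BLISTER is a malware loader...' }],
  }),
});
console.log(response.status, await response.json()); // 200 { success: true } when the run completes
```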

View file

@ -0,0 +1,31 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { OpenAiProviderType } from '@kbn/stack-connectors-plugin/common/gen_ai/constants';
import type { ActionResult } from '@kbn/actions-plugin/server';
/**
* Returns the LangChain `llmType` for the given connectorId/connectors
*
* @param connectorId
* @param connectors
*/
export const getLlmType = (connectorId: string, connectors: ActionResult[]): string | undefined => {
const connector = connectors.find((c) => c.id === connectorId);
// Note: Pre-configured connectors do not have an accessible `apiProvider` field
const apiProvider = (connector?.config?.apiProvider as string) ?? undefined;
if (apiProvider === OpenAiProviderType.OpenAi) {
// See: https://github.com/langchain-ai/langchainjs/blob/fb699647a310c620140842776f4a7432c53e02fa/langchain/src/agents/openai/index.ts#L185
return 'openai';
}
// TODO: Add support for AWS Bedrock Connector once merged
// Note: There doesn't appear to be a difference between the Azure and OpenAI LLM types, so TBD for the functions agent on Azure
// See: https://github.com/langchain-ai/langchainjs/blob/fb699647a310c620140842776f4a7432c53e02fa/langchain/src/llms/openai.ts#L539
return undefined;
};

View file

@ -5,7 +5,13 @@
* 2.0.
*/
// Actions Connector Execute (LLM Wrapper)
export { postActionsConnectorExecuteRoute } from './post_actions_connector_execute';
// Knowledge Base
export { deleteKnowledgeBaseRoute } from './knowledge_base/delete_knowledge_base';
export { getKnowledgeBaseStatusRoute } from './knowledge_base/get_knowledge_base_status';
export { postActionsConnectorExecuteRoute } from './post_actions_connector_execute';
export { postKnowledgeBaseRoute } from './knowledge_base/post_knowledge_base';
// Evaluate
export { postEvaluateRoute } from './evaluate/post_evaluate';

View file

@ -7,6 +7,8 @@
// Note: using default ELSER model ID so when setup by user in UI, all defaults can be accepted and everything works
export const ELSER_MODEL_ID = '.elser_model_1';
export const MODEL_EVALUATION_RESULTS_INDEX_PATTERN =
'.kibana-elastic-ai-assistant-evaluation-results';
export const KNOWLEDGE_BASE_INDEX_PATTERN = '.kibana-elastic-ai-assistant-kb';
export const KNOWLEDGE_BASE_INGEST_PIPELINE = '.kibana-elastic-ai-assistant-kb-ingest-pipeline';
// Query for determining if ESQL docs have been loaded, searches for a specific doc. Intended for the ElasticsearchStore.similaritySearch()

View file

@ -0,0 +1,51 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import * as t from 'io-ts';
/** Validates Output Index starts with `.kibana-elastic-ai-assistant-` */
const outputIndex = new t.Type<string, string, unknown>(
'OutputIndexPrefixed',
(input): input is string =>
typeof input === 'string' && input.startsWith('.kibana-elastic-ai-assistant-'),
(input, context) =>
typeof input === 'string' && input.startsWith('.kibana-elastic-ai-assistant-')
? t.success(input)
: t.failure(
input,
context,
`Type error: Output Index does not start with '.kibana-elastic-ai-assistant-'`
),
t.identity
);
/** Validates the URL path of a POST request to the `/evaluate` endpoint */
export const PostEvaluatePathQuery = t.type({
agents: t.string,
evaluationType: t.string,
evalModel: t.string,
outputIndex,
models: t.string,
});
export type DatasetItem = t.TypeOf<typeof DatasetItem>;
export const DatasetItem = t.type({
input: t.string,
reference: t.string,
prediction: t.union([t.string, t.undefined]),
});
export type Dataset = t.TypeOf<typeof Dataset>;
export const Dataset = t.array(DatasetItem);
/** Validates the body of a POST request to the `/evaluate` endpoint */
export const PostEvaluateBody = t.type({
dataset: Dataset,
evalPrompt: t.union([t.string, t.undefined]),
});
export type PostEvaluateBodyInputs = t.TypeOf<typeof PostEvaluateBody>;
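Since these are plain io-ts codecs, they can also be used to sanity-check a hand-edited dataset file before uploading it. A small sketch:

```ts
import { isLeft } from 'fp-ts/lib/Either';
import { Dataset } from './post_evaluate';

// Hypothetical pre-flight check for a local dataset file.
const candidate: unknown = JSON.parse(
  '[{"input": "What is BLISTER malware?", "reference": "A malware loader..."}]'
);
const decoded = Dataset.decode(candidate);
if (isLeft(decoded)) {
  throw new Error('Dataset does not match the expected input/reference shape');
}
console.log(`Dataset ok: ${decoded.right.length} entries`);
```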

View file

@ -7,6 +7,7 @@
"common/**/*",
"server/lib/**/*",
"server/**/*",
"scripts/**/*.ts",
// must declare *.json explicitly per https://github.com/microsoft/TypeScript/issues/25636
"server/**/*.json",
"../../../typings/**/*"
@ -25,6 +26,11 @@
"@kbn/core-logging-server-mocks",
"@kbn/utility-types-jest",
"@kbn/utility-types",
"@kbn/tooling-log",
"@kbn/core-elasticsearch-server",
"@kbn/logging",
"@kbn/std",
"@kbn/stack-connectors-plugin",
],
"exclude": [
"target/**/*",

View file

@ -83,6 +83,11 @@ export const allowedExperimentalValues = Object.freeze({
**/
alertsPageFiltersEnabled: true,
/**
* Enables the Assistant Model Evaluation advanced setting and API endpoint, introduced in `8.11.0`.
*/
assistantModelEvaluation: false,
/*
* Enables the new user details flyout displayed on the Alerts page and timeline.
*

View file

@ -7,7 +7,7 @@
import React, { useCallback } from 'react';
import { i18n } from '@kbn/i18n';
import { AssistantProvider as ElasticAssistantProvider } from '@kbn/elastic-assistant';
import { useKibana } from '../common/lib/kibana';
import { useBasePath, useKibana } from '../common/lib/kibana';
import { useAssistantTelemetry } from './use_assistant_telemetry';
import { getComments } from './get_comments';
import { augmentMessageCodeBlocks, LOCAL_STORAGE_KEY } from './helpers';
@ -19,6 +19,7 @@ import { BASE_SECURITY_SYSTEM_PROMPTS } from './content/prompts/system';
import { useAnonymizationStore } from './use_anonymization_store';
import { useAssistantAvailability } from './use_assistant_availability';
import { APP_ID } from '../../common/constants';
import { useIsExperimentalFeatureEnabled } from '../common/hooks/use_experimental_features';
const ASSISTANT_TITLE = i18n.translate('xpack.securitySolution.assistant.title', {
defaultMessage: 'Elastic AI Assistant',
@ -33,6 +34,8 @@ export const AssistantProvider: React.FC = ({ children }) => {
triggersActionsUi: { actionTypeRegistry },
docLinks: { ELASTIC_WEBSITE_URL, DOC_LINK_VERSION },
} = useKibana().services;
const basePath = useBasePath();
const isModelEvaluationEnabled = useIsExperimentalFeatureEnabled('assistantModelEvaluation');
const { conversations, setConversations } = useConversationStore();
const getInitialConversation = useCallback(() => {
@ -52,13 +55,16 @@ export const AssistantProvider: React.FC = ({ children }) => {
actionTypeRegistry={actionTypeRegistry}
augmentMessageCodeBlocks={augmentMessageCodeBlocks}
assistantAvailability={assistantAvailability}
assistantLangChain={false}
// NOTE: the `assistantLangChain` and `assistantModelEvaluation` experimental features will be coupled until the upcoming
// Knowledge Base UI updates, which will remove the `assistantLangChain` feature flag in favor of a UI feature toggle
assistantLangChain={isModelEvaluationEnabled}
assistantTelemetry={assistantTelemetry}
defaultAllow={defaultAllow}
defaultAllowReplacement={defaultAllowReplacement}
docLinks={{ ELASTIC_WEBSITE_URL, DOC_LINK_VERSION }}
baseAllow={DEFAULT_ALLOW}
baseAllowReplacement={DEFAULT_ALLOW_REPLACEMENT}
basePath={basePath}
basePromptContexts={Object.values(PROMPT_CONTEXTS)}
baseQuickPrompts={BASE_SECURITY_QUICK_PROMPTS}
baseSystemPrompts={BASE_SECURITY_SYSTEM_PROMPTS}

View file

@ -38,6 +38,7 @@ export const MockAssistantProviderComponent: React.FC<Props> = ({ children }) =>
augmentMessageCodeBlocks={jest.fn(() => [])}
baseAllow={[]}
baseAllowReplacement={[]}
basePath={'https://localhost:5601/kbn'}
defaultAllow={[]}
docLinks={{
ELASTIC_WEBSITE_URL: 'https://www.elastic.co/',