[Security Solution] [Elastic AI Assistant] Adds APM instrumentation and LangSmith test data integration (#171153)

## Summary

This PR instruments the Elastic AI Assistant with the Kibana APM agent,
enabling tracing of retrievers, LLMs, chains, and tools, which can then
be viewed within the Observability app. This PR also improves the
Assistant Model Evaluation tooling by adding support for pulling and
running test datasets from LangSmith.
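
Under the hood, each agent executor now registers an `APMTracer` as a LangChain callback and wraps the call in a parent assistant span; the parent transaction/trace IDs are returned so the UI can link a message to its trace. Below is a condensed sketch of that wiring, taken from the executor changes in this diff (`executor`, `latestMessage`, `logger`, and `traceOptions` come from the surrounding executor code):

```
// Condensed from this PR's agent executor: trace LangChain runs to APM and
// report the parent transaction/trace IDs back to the client.
const apmTracer = new APMTracer({ projectName: traceOptions?.projectName ?? 'default' }, logger);

let traceData;
await withAssistantSpan(DEFAULT_AGENT_EXECUTOR_ID, async (span) => {
  if (span?.transaction?.ids['transaction.id'] != null && span?.ids['trace.id'] != null) {
    // Parent transaction/trace IDs, returned so the UI can render `View APM trace`
    traceData = {
      transaction_id: span.transaction.ids['transaction.id'],
      trace_id: span.ids['trace.id'],
    };
  }
  return executor.call(
    { input: latestMessage[0].content },
    {
      // APM tracing always; LangSmith tracers are included when configured via traceOptions
      callbacks: [apmTracer, ...(traceOptions?.tracers ?? [])],
      runName: DEFAULT_AGENT_EXECUTOR_ID,
      tags: traceOptions?.tags ?? [],
    }
  );
});
```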


If the `assistantModelEvaluation` experimental feature flag is enabled,
and an APM server is configured, messages that have a corresponding
trace will have an additional `View APM trace` action:

<p align="center">
<img width="500"
src="https://github.com/elastic/kibana/assets/2946766/e0b372ee-139a-4eed-8b09-f01dd88c72b0"
/>
</p>

Viewing the trace, you can see a breakdown of the time spent in each
retriever, LLM, chain, and tool:
<p align="center">
<img width="500"
src="https://github.com/elastic/kibana/assets/2946766/f7cbd4bc-207c-4c88-a032-70a8de4f9b9a"
/>
</p>

Additionally, the Evaluation interface has been updated to support
adding metadata like `Project Name` and `Run Name`, and pulling test
datasets from LangSmith. Predictions can now also be run without having
to run an Evaluation, so datasets can quickly be run for manual
analysis.

<p align="center">
<img width="500"
src="https://github.com/elastic/kibana/assets/2946766/acebf719-29fd-4fcc-aef1-99fd00ca800a"
/>
</p>


<p align="center">
<img width="500"
src="https://github.com/elastic/kibana/assets/2946766/7081d993-cbe0-4465-a734-ff9be14d7d0d"
/>
</p>



## Testing
### Configuring APM

First, enable the `assistantModelEvaluation` experimental feature flag
by adding the following to your `kibana.dev.yml`:

```
xpack.securitySolution.enableExperimental: [ 'assistantModelEvaluation' ]
```

Next, you'll need an APM server to collect the traces. You can either
[follow the documentation for
installing](https://www.elastic.co/guide/en/apm/guide/current/installing.html)
the released artifact, or [run from
source](https://github.com/elastic/apm-server#apm-server-development)
and set up using the [provided quickstart
guide](https://www.elastic.co/guide/en/apm/guide/current/apm-quick-start.html)
(be sure to install the APM Server integration so the necessary
indices are created!). Once your APM server is running, add your APM
server configuration to your `kibana.dev.yml` as well:

```
# APM
elastic.apm:
  active: true
  environment: 'SpongBox5002c™'
  serverUrl: 'http://localhost:8200'
  transactionSampleRate: 1.0
  breakdownMetrics: true
  spanStackTraceMinDuration: 10ms
  # Disables Kibana RUM
  servicesOverrides.kibana-frontend.active: false
```

> [!NOTE]
> If connecting to a cloud APM server (like our [ai-assistant apm
> deployment](https://ai-assistant-apm-do-not-delete.kb.us-central1.gcp.cloud.es.io/)),
> follow [these
> steps](https://www.elastic.co/guide/en/apm/guide/current/api-key.html#create-an-api-key)
> to create an API key, then set it via `apiKey` and set your
> `serverUrl` as shown in the APM Integration details within Fleet. Note
> that the `View APM trace` button within the UI will link to your local
> instance, not the cloud instance.
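
For example, a cloud configuration might look like the following (a sketch only; the `serverUrl` is a placeholder for your deployment's APM endpoint):

```
# APM (cloud deployment)
elastic.apm:
  active: true
  apiKey: '<api-key-created-above>'
  serverUrl: 'https://<your-deployment>.apm.us-central1.gcp.cloud.es.io:443'
  transactionSampleRate: 1.0
```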

> [!NOTE]
> If you're an Elastic developer running Kibana from source, you can
> enable APM as above and _not_ include a `serverUrl`; your traces will
> be sent to the https://kibana-cloud-apm.elastic.dev cluster. Note that
> the `View APM trace` button within the UI will link to your local
> instance, not the cloud instance.

### Configuring LangSmith

If you want to push traces to LangSmith, or leverage any datasets you
may have hosted in a project, all you need to do is configure a few
environment variables and then start the Kibana server. See the
[LangSmith Traces
documentation](https://docs.smith.langchain.com/tracing) for details, or
just add the env variables below to enable it:

```
# LangChain LangSmith
export LANGCHAIN_TRACING_V2=true
export LANGCHAIN_ENDPOINT="https://api.smith.langchain.com"
export LANGCHAIN_API_KEY=""
export LANGCHAIN_PROJECT="8.12 ESQL Query Generation"
```
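
As a quick sanity check that your credentials and dataset name resolve, here is a minimal sketch using the `langsmith` JS client (the `Client`/`listExamples` API is assumed from the `langsmith` package added in this PR; the dataset name is hypothetical):

```
// Minimal sketch: verify a hosted dataset can be pulled by name.
// `new Client()` reads LANGCHAIN_API_KEY / LANGCHAIN_ENDPOINT from the environment.
import { Client } from 'langsmith';

async function main() {
  const client = new Client();
  for await (const example of client.listExamples({ datasetName: 'ESQL Query Generation' })) {
    console.log(example.inputs, example.outputs);
  }
}

main().catch(console.error);
```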

---------

Co-authored-by: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
30 changed files with 1339 additions and 287 deletions


@ -958,7 +958,8 @@
"jsonwebtoken": "^9.0.0",
"jsts": "^1.6.2",
"kea": "^2.6.0",
"langchain": "^0.0.151",
"langchain": "^0.0.186",
"langsmith": "^0.0.48",
"launchdarkly-js-client-sdk": "^3.1.4",
"launchdarkly-node-server-sdk": "^7.0.3",
"load-json-file": "^6.2.0",


@ -319,6 +319,9 @@ describe('API tests', () => {
evalParams: {
agents: ['not', 'alphabetical'],
dataset: '{}',
datasetName: 'Test Dataset',
projectName: 'Test Project Name',
runName: 'Test Run Name',
evalModel: ['not', 'alphabetical'],
evalPrompt: 'evalPrompt',
evaluationType: ['not', 'alphabetical'],
@ -336,9 +339,12 @@ describe('API tests', () => {
query: {
models: 'alphabetical,not',
agents: 'alphabetical,not',
datasetName: 'Test Dataset',
evaluationType: 'alphabetical,not',
evalModel: 'alphabetical,not',
outputIndex: 'outputIndex',
projectName: 'Test Project Name',
runName: 'Test Run Name',
},
signal: undefined,
});


@ -26,6 +26,10 @@ export interface FetchConnectorExecuteResponse {
response: string | ReadableStreamDefaultReader<Uint8Array>;
isError: boolean;
isStream: boolean;
traceData?: {
transactionId: string;
traceId: string;
};
}
export const fetchConnectorExecuteAction = async ({
@ -112,6 +116,10 @@ export const fetchConnectorExecuteAction = async ({
status: string;
data: string;
service_message?: string;
trace_data?: {
transaction_id: string;
trace_id: string;
};
}>(`/internal/elastic_assistant/actions/connector/${apiConfig?.connectorId}/_execute`, {
method: 'POST',
body: JSON.stringify(requestBody),
@ -133,10 +141,21 @@ export const fetchConnectorExecuteAction = async ({
isStream: false,
};
}
// Only add traceData if it exists in the response
const traceData =
response.trace_data?.trace_id != null && response.trace_data?.transaction_id != null
? {
traceId: response.trace_data?.trace_id,
transactionId: response.trace_data?.transaction_id,
}
: undefined;
return {
response: assistantLangChain ? getFormattedMessageContent(response.data) : response.data,
isError: false,
isStream: false,
traceData,
};
} catch (error) {
const reader = error?.response?.body?.getReader();
@ -281,6 +300,7 @@ export interface PostEvaluationParams {
}
export interface PostEvaluationResponse {
evaluationId: string;
success: boolean;
}
@ -302,11 +322,14 @@ export const postEvaluation = async ({
try {
const path = `/internal/elastic_assistant/evaluate`;
const query = {
models: evalParams?.models.sort()?.join(','),
agents: evalParams?.agents.sort()?.join(','),
datasetName: evalParams?.datasetName,
evaluationType: evalParams?.evaluationType.sort()?.join(','),
evalModel: evalParams?.evalModel.sort()?.join(','),
outputIndex: evalParams?.outputIndex,
models: evalParams?.models.sort()?.join(','),
projectName: evalParams?.projectName,
runName: evalParams?.runName,
};
const response = await http.fetch(path, {


@ -22,6 +22,7 @@ export const getMessageFromRawResponse = (rawResponse: FetchConnectorExecuteResp
: { content: response as string }),
timestamp: dateTimeString,
isError,
traceData: rawResponse.traceData,
};
} else {
return {


@ -7,6 +7,7 @@
import React, { useCallback, useMemo, useState } from 'react';
import {
EuiAccordion,
euiPaletteComplementary,
EuiFormRow,
EuiTitle,
@ -17,12 +18,14 @@ import {
EuiButton,
EuiComboBoxOptionOption,
EuiTextArea,
EuiTextColor,
EuiFieldText,
EuiFlexItem,
EuiFlexGroup,
EuiLink,
} from '@elastic/eui';
import { css } from '@emotion/react';
import { FormattedMessage } from '@kbn/i18n-react';
import * as i18n from './translations';
import { useAssistantContext } from '../../../assistant_context';
@ -30,6 +33,8 @@ import { useLoadConnectors } from '../../../connectorland/use_load_connectors';
import { getActionTypeTitle, getGenAiConfig } from '../../../connectorland/helpers';
import { PRECONFIGURED_CONNECTOR } from '../../../connectorland/translations';
import { usePerformEvaluation } from './use_perform_evaluation';
import { getApmLink, getDiscoverLink } from './utils';
import { PostEvaluationResponse } from '../../api';
/**
* See AGENT_EXECUTOR_MAP in `x-pack/plugins/elastic_assistant/server/routes/evaluate/post_evaluate.ts`
@ -41,6 +46,7 @@ const DEFAULT_EVAL_TYPES_OPTIONS = [
{ label: 'esql-validator', disabled: true },
{ label: 'custom', disabled: true },
];
const DEFAULT_OUTPUT_INDEX = '.kibana-elastic-ai-assistant-evaluation-results';
interface Props {
onEvaluationSettingsChange?: () => void;
@ -52,10 +58,93 @@ interface Props {
export const EvaluationSettings: React.FC<Props> = React.memo(({ onEvaluationSettingsChange }) => {
const { actionTypeRegistry, basePath, http } = useAssistantContext();
const { data: connectors } = useLoadConnectors({ http });
const { mutate: performEvaluation, isLoading: isPerformingEvaluation } = usePerformEvaluation({
const {
data: evalResponse,
mutate: performEvaluation,
isLoading: isPerformingEvaluation,
} = usePerformEvaluation({
http,
});
// Run Details
// Project Name
const [projectName, setProjectName] = useState();
const onProjectNameChange = useCallback(
(e) => {
setProjectName(e.target.value);
},
[setProjectName]
);
// Run Name
const [runName, setRunName] = useState();
const onRunNameChange = useCallback(
(e) => {
setRunName(e.target.value);
},
[setRunName]
);
// Local Output Index
const [outputIndex, setOutputIndex] = useState(DEFAULT_OUTPUT_INDEX);
const onOutputIndexChange = useCallback(
(e) => {
setOutputIndex(e.target.value);
},
[setOutputIndex]
);
// Dataset
const [useLangSmithDataset, setUseLangSmithDataset] = useState(true);
const datasetToggleButton = useMemo(() => {
return (
<EuiText
size={'xs'}
css={css`
margin-top: 16px;
`}
>
{i18n.EVALUATOR_DATASET_LABEL}
{' ('}
<EuiLink
color={useLangSmithDataset ? 'primary' : 'text'}
onClick={() => setUseLangSmithDataset(true)}
>
{i18n.LANGSMITH_DATASET_LABEL}
</EuiLink>
{' / '}
<EuiLink
color={useLangSmithDataset ? 'text' : 'primary'}
onClick={() => setUseLangSmithDataset(false)}
>
{i18n.CUSTOM_DATASET_LABEL}
</EuiLink>
{')'}
</EuiText>
);
}, [useLangSmithDataset]);
const [datasetName, setDatasetName] = useState<string>();
const onDatasetNameChange = useCallback(
(e) => {
setDatasetName(e.target.value);
},
[setDatasetName]
);
const sampleDataset = [
{
input:
'As an expert user of Elastic Security, please generate an accurate and valid ESQL query to detect the use case below. Your response should be formatted to be able to use immediately in an Elastic Security timeline or detection rule. Take your time with the answer, and really make sure you check your knowledge really well on all the functions I am asking for. check it multiple times if you need to. I cannot afford for queries to be inaccurate. Assume I am using the Elastic Common Schema. Ensure the answers are formatted in a way which is easily copyable.\n\n' +
'Write an ESQL query for detecting cryptomining activity on an AWS EC2 instance.',
reference:
'FROM metrics-apm*\n| WHERE metricset.name == ""transaction"" AND metricset.interval == ""1m""\n| EVAL bucket = AUTO_BUCKET(transaction.duration.histogram, 50, <start-date>, <end-date>)\n| STATS avg_duration = AVG(transaction.duration.histogram) BY bucket',
},
];
const [datasetText, setDatasetText] = useState<string>(JSON.stringify(sampleDataset, null, 2));
const onDatasetTextChange = useCallback(
(e) => {
setDatasetText(e.target.value);
},
[setDatasetText]
);
// Predictions
// Connectors / Models
const [selectedModelOptions, setSelectedModelOptions] = useState<
Array<EuiComboBoxOptionOption<string>>
@ -109,6 +198,7 @@ export const EvaluationSettings: React.FC<Props> = React.memo(({ onEvaluationSet
return DEFAULT_AGENTS.map((label) => ({ label }));
}, []);
// Evaluation
// Evaluation Type
const [selectedEvaluationType, setSelectedEvaluationType] = useState<
Array<EuiComboBoxOptionOption<string>>
@ -146,15 +236,6 @@ export const EvaluationSettings: React.FC<Props> = React.memo(({ onEvaluationSet
[setSelectedEvaluatorModelOptions]
);
// Output Index
const [outputIndex, setOutputIndex] = useState('.kibana-elastic-ai-assistant-evaluation-results');
const onOutputIndexChange = useCallback(
(e) => {
setOutputIndex(e.target.value);
},
[setOutputIndex]
);
// Eval Prompt
const sampleEvalPrompt: string = `For the below input: \n\n{{input}} \n\na prediction: \n\n{{prediction}} \n\nwas made. How's it stack up against this reference: \n\n{{reference}} \n\nReturn output in a succinct sentence ranking on a simple grading rubric focused on correctness.`;
const [evalPrompt, setEvalPrompt] = useState<string>(sampleEvalPrompt);
@ -165,55 +246,88 @@ export const EvaluationSettings: React.FC<Props> = React.memo(({ onEvaluationSet
[setEvalPrompt]
);
// Dataset
const sampleDataset = [
{
input:
'I want to see a query for metrics-apm*, filtering on metricset.name:transaction and metricset.interval:1m, showing the average duration (via transaction.duration.histogram), in 50 buckets. Only return the ESQL query, and do not wrap in a codeblock.',
reference:
'FROM metrics-apm*\n| WHERE metricset.name == ""transaction"" AND metricset.interval == ""1m""\n| EVAL bucket = AUTO_BUCKET(transaction.duration.histogram, 50, <start-date>, <end-date>)\n| STATS avg_duration = AVG(transaction.duration.histogram) BY bucket',
},
];
const [datasetText, setDatasetText] = useState<string>(JSON.stringify(sampleDataset, null, 2));
const onDatasetTextChange = useCallback(
(e) => {
setDatasetText(e.target.value);
},
[setDatasetText]
);
// Required fields by eval API
const isPerformEvaluationDisabled =
selectedModelOptions.length === 0 ||
selectedAgentOptions.length === 0 ||
selectedEvaluatorModelOptions.length === 0 ||
selectedEvaluationType.length === 0 ||
datasetText.length === 0 ||
outputIndex.length === 0;
// Perform Evaluation Button
const handlePerformEvaluation = useCallback(() => {
const handlePerformEvaluation = useCallback(async () => {
const evalParams = {
models: selectedModelOptions.flatMap((option) => option.key ?? []),
agents: selectedAgentOptions.map((option) => option.label),
dataset: datasetText,
dataset: useLangSmithDataset ? undefined : datasetText,
datasetName: useLangSmithDataset ? datasetName : undefined,
evalModel: selectedEvaluatorModelOptions.flatMap((option) => option.key ?? []),
evalPrompt,
evaluationType: selectedEvaluationType.map((option) => option.label),
outputIndex,
projectName,
runName,
};
performEvaluation(evalParams);
}, [
datasetName,
datasetText,
evalPrompt,
outputIndex,
performEvaluation,
projectName,
runName,
selectedAgentOptions,
selectedEvaluationType,
selectedEvaluatorModelOptions,
selectedModelOptions,
useLangSmithDataset,
]);
const discoverLink = `${basePath}/app/discover#/?_g=(filters:!(),refreshInterval:(pause:!t,value:60000),time:(from:now-7d%2Fd,to:now))&_a=(columns:!('@timestamp',evaluationId,totalAgents,totalInput,totalRequests,input,reference,prediction,evaluation.value,evaluation.reasoning,predictionResponse.value.connector_id),filters:!(),grid:(columns:('@timestamp':(width:212),evaluationId:(width:285),totalAgents:(width:111),totalInput:(width:98),totalRequests:(width:121))),index:'6d9ba861-a76b-4d31-90f4-dfb8f01b78bd',interval:auto,query:(esql:'from%20.kibana-elastic-ai-assistant-evaluation-results%20%0A%7C%20keep%20@timestamp,%20evaluationId,%20totalAgents,%20totalInput,%20totalRequests,%20input,%20reference,%20prediction,%20evaluation.value,%20evaluation.reasoning,%20predictionResponse.value.connector_id%0A%7C%20sort%20@timestamp%20desc%0A%7C%20limit%20100%0A%0A%0A'),sort:!(!('@timestamp',desc)))`;
const discoverLink = useMemo(
() => getDiscoverLink(basePath, (evalResponse as PostEvaluationResponse)?.evaluationId ?? ''),
[basePath, evalResponse]
);
const apmLink = useMemo(
() => getApmLink(basePath, (evalResponse as PostEvaluationResponse)?.evaluationId ?? ''),
[basePath, evalResponse]
);
const getSection = (title: string, description: string) => (
<div>
<EuiFlexGroup gutterSize="s" alignItems="center" responsive={false}>
<EuiFlexItem>
<EuiTitle size="xs">
<h3>{title}</h3>
</EuiTitle>
</EuiFlexItem>
</EuiFlexGroup>
<EuiText size="s">
<p>
<EuiTextColor color="subdued">{description}</EuiTextColor>
</p>
</EuiText>
</div>
);
const runDetailsSection = useMemo(
() => getSection(i18n.RUN_DETAILS_TITLE, i18n.RUN_DETAILS_DESCRIPTION),
[]
);
const predictionDetailsSection = useMemo(
() => getSection(i18n.PREDICTION_DETAILS_TITLE, i18n.PREDICTION_DETAILS_DESCRIPTION),
[]
);
const evalDetailsSection = useMemo(
() => getSection(i18n.EVALUATION_DETAILS_TITLE, i18n.EVALUATION_DETAILS_DESCRIPTION),
[]
);
const buttonCss = css`
&:hover {
text-decoration: none;
}
`;
return (
<>
@ -223,113 +337,193 @@ export const EvaluationSettings: React.FC<Props> = React.memo(({ onEvaluationSet
<EuiSpacer size="xs" />
<EuiText size={'s'}>{i18n.SETTINGS_DESCRIPTION}</EuiText>
<EuiHorizontalRule margin={'s'} />
<EuiFormRow
display="rowCompressed"
label={i18n.CONNECTORS_LABEL}
helpText={i18n.CONNECTORS_DESCRIPTION}
{/* Run Details*/}
<EuiAccordion
id={i18n.RUN_DETAILS_TITLE}
arrowDisplay={'right'}
buttonContent={runDetailsSection}
buttonProps={{ paddingSize: 's', css: buttonCss }}
element="fieldset"
initialIsOpen={true}
paddingSize="s"
>
<EuiComboBox
aria-label={'model-selector'}
compressed
options={modelOptions}
selectedOptions={selectedModelOptions}
onChange={onModelOptionsChange}
/>
</EuiFormRow>
<EuiFormRow
display="rowCompressed"
label={i18n.AGENTS_LABEL}
helpText={i18n.AGENTS_DESCRIPTION}
>
<EuiComboBox
aria-label={'agent-selector'}
compressed
onCreateOption={onAgentOptionsCreate}
options={agentOptions}
selectedOptions={selectedAgentOptions}
onChange={onAgentOptionsChange}
/>
</EuiFormRow>
<EuiFormRow
display="rowCompressed"
label={i18n.EVALUATOR_MODEL_LABEL}
helpText={i18n.EVALUATOR_MODEL_DESCRIPTION}
>
<EuiComboBox
aria-label={'evaluation-type-select'}
compressed
options={modelOptions}
selectedOptions={selectedEvaluatorModelOptions}
singleSelection={{ asPlainText: true }}
onChange={onEvaluatorModelOptionsChange}
/>
</EuiFormRow>
<EuiFormRow
display="rowCompressed"
label={i18n.EVALUATION_TYPE_LABEL}
helpText={i18n.EVALUATION_TYPE_DESCRIPTION}
>
<EuiComboBox
aria-label={'evaluation-type-select'}
compressed
onChange={onEvaluationTypeChange}
onCreateOption={onEvaluationTypeOptionsCreate}
options={evaluationTypeOptions}
selectedOptions={selectedEvaluationType}
singleSelection={{ asPlainText: true }}
/>
</EuiFormRow>
<EuiFormRow
display="rowCompressed"
label={i18n.EVALUATION_PROMPT_LABEL}
fullWidth
helpText={i18n.EVALUATION_PROMPT_DESCRIPTION}
>
<EuiTextArea
aria-label={'evaluation-prompt-textarea'}
compressed
disabled={selectedEvaluationType[0]?.label !== 'custom'}
<EuiFlexGroup>
<EuiFlexItem>
<EuiFormRow
display="rowCompressed"
label={i18n.PROJECT_LABEL}
helpText={i18n.PROJECT_DESCRIPTION}
>
<EuiFieldText
aria-label="project-textfield"
compressed
onChange={onProjectNameChange}
placeholder={i18n.PROJECT_PLACEHOLDER}
value={projectName}
/>
</EuiFormRow>
</EuiFlexItem>
<EuiFlexItem>
<EuiFormRow
display="rowCompressed"
label={i18n.RUN_NAME_LABEL}
helpText={i18n.RUN_NAME_DESCRIPTION}
>
<EuiFieldText
aria-label="run-name-textfield"
compressed
onChange={onRunNameChange}
placeholder={i18n.RUN_NAME_PLACEHOLDER}
value={runName}
/>
</EuiFormRow>
</EuiFlexItem>
</EuiFlexGroup>
<EuiFormRow
display="rowCompressed"
label={datasetToggleButton}
fullWidth
onChange={onEvalPromptChange}
value={evalPrompt}
/>
</EuiFormRow>
<EuiFormRow
display="rowCompressed"
label={i18n.EVALUATOR_DATASET_LABEL}
fullWidth
helpText={i18n.EVALUATOR_DATASET_DESCRIPTION}
>
<EuiTextArea
aria-label={'evaluation-dataset-textarea'}
compressed
helpText={
useLangSmithDataset
? i18n.LANGSMITH_DATASET_DESCRIPTION
: i18n.CUSTOM_DATASET_DESCRIPTION
}
>
{useLangSmithDataset ? (
<EuiFieldText
aria-label="dataset-name-textfield"
compressed
onChange={onDatasetNameChange}
placeholder={i18n.LANGSMITH_DATASET_PLACEHOLDER}
value={datasetName}
/>
) : (
<EuiTextArea
aria-label={'evaluation-dataset-textarea'}
compressed
css={css`
min-height: 300px;
`}
fullWidth
onChange={onDatasetTextChange}
value={datasetText}
/>
)}
</EuiFormRow>
<EuiFormRow
display="rowCompressed"
label={i18n.EVALUATOR_OUTPUT_INDEX_LABEL}
fullWidth
onChange={onDatasetTextChange}
value={datasetText}
/>
</EuiFormRow>
<EuiFormRow
display="rowCompressed"
label={i18n.EVALUATOR_OUTPUT_INDEX_LABEL}
fullWidth
helpText={i18n.EVALUATOR_OUTPUT_INDEX_DESCRIPTION}
helpText={i18n.EVALUATOR_OUTPUT_INDEX_DESCRIPTION}
>
<EuiFieldText
value={outputIndex}
onChange={onOutputIndexChange}
aria-label="evaluation-output-index-textfield"
/>
</EuiFormRow>
</EuiAccordion>
<EuiHorizontalRule margin={'s'} />
{/* Prediction Details*/}
<EuiAccordion
id={i18n.PREDICTION_DETAILS_TITLE}
arrowDisplay={'right'}
buttonContent={predictionDetailsSection}
buttonProps={{ paddingSize: 's', css: buttonCss }}
element="fieldset"
initialIsOpen={true}
paddingSize="s"
>
<EuiFieldText
value={outputIndex}
onChange={onOutputIndexChange}
aria-label="evaluation-output-index-textfield"
/>
</EuiFormRow>
<EuiFormRow
display="rowCompressed"
label={i18n.CONNECTORS_LABEL}
helpText={i18n.CONNECTORS_DESCRIPTION}
>
<EuiComboBox
aria-label={'model-selector'}
compressed
options={modelOptions}
selectedOptions={selectedModelOptions}
onChange={onModelOptionsChange}
/>
</EuiFormRow>
<EuiFormRow
display="rowCompressed"
label={i18n.AGENTS_LABEL}
helpText={i18n.AGENTS_DESCRIPTION}
>
<EuiComboBox
aria-label={'agent-selector'}
compressed
onCreateOption={onAgentOptionsCreate}
options={agentOptions}
selectedOptions={selectedAgentOptions}
onChange={onAgentOptionsChange}
/>
</EuiFormRow>
</EuiAccordion>
<EuiHorizontalRule margin={'s'} />
{/* Evaluation Details*/}
<EuiAccordion
id={i18n.EVALUATION_DETAILS_TITLE}
arrowDisplay={'right'}
element="fieldset"
buttonProps={{ paddingSize: 's', css: buttonCss }}
buttonContent={evalDetailsSection}
paddingSize="s"
>
<EuiFormRow
display="rowCompressed"
label={i18n.EVALUATOR_MODEL_LABEL}
helpText={i18n.EVALUATOR_MODEL_DESCRIPTION}
>
<EuiComboBox
aria-label={'evaluation-type-select'}
compressed
options={modelOptions}
selectedOptions={selectedEvaluatorModelOptions}
singleSelection={{ asPlainText: true }}
onChange={onEvaluatorModelOptionsChange}
/>
</EuiFormRow>
<EuiFormRow
display="rowCompressed"
label={i18n.EVALUATION_TYPE_LABEL}
helpText={i18n.EVALUATION_TYPE_DESCRIPTION}
>
<EuiComboBox
aria-label={'evaluation-type-select'}
compressed
onChange={onEvaluationTypeChange}
onCreateOption={onEvaluationTypeOptionsCreate}
options={evaluationTypeOptions}
selectedOptions={selectedEvaluationType}
singleSelection={{ asPlainText: true }}
/>
</EuiFormRow>
<EuiFormRow
display="rowCompressed"
label={i18n.EVALUATION_PROMPT_LABEL}
fullWidth
helpText={i18n.EVALUATION_PROMPT_DESCRIPTION}
>
<EuiTextArea
aria-label={'evaluation-prompt-textarea'}
compressed
css={css`
min-height: 330px;
`}
disabled={selectedEvaluationType[0]?.label !== 'custom'}
fullWidth
onChange={onEvalPromptChange}
value={evalPrompt}
/>
</EuiFormRow>
</EuiAccordion>
<EuiHorizontalRule />
<EuiFlexGroup alignItems="center">
<EuiFlexItem grow={false}>
<EuiButton
@ -346,20 +540,24 @@ export const EvaluationSettings: React.FC<Props> = React.memo(({ onEvaluationSet
<EuiFlexItem>
<EuiText color={'subdued'} size={'xs'}>
<FormattedMessage
defaultMessage="Fun Facts: Watch the Kibana server logs for progress, and {funFacts} to view the results in Discover once complete. Will take (many) minutes depending on dataset, and closing this dialog will cancel the evaluation!"
defaultMessage="Fun Facts: Watch the Kibana server logs for progress, and view results in {discover} / {apm} once complete. Will take (many) minutes depending on dataset, and closing this dialog will cancel the evaluation!"
id="xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorFunFactText"
values={{
funFacts: (
discover: (
<EuiLink external href={discoverLink} target="_blank">
{i18n.EVALUATOR_FUN_FACT_DISCOVER_LINK}
</EuiLink>
),
apm: (
<EuiLink external href={apmLink} target="_blank">
{i18n.EVALUATOR_FUN_FACT_APM_LINK}
</EuiLink>
),
}}
/>
</EuiText>
</EuiFlexItem>
</EuiFlexGroup>
<EuiSpacer size="s" />
</>
);


@ -17,7 +17,93 @@ export const SETTINGS_DESCRIPTION = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.settingsDescription',
{
defaultMessage:
'Not-so-secret dev UI for evaluating sample datasets against models/agents/more...',
'Run predictions and evaluations against test data sets using different models (connectors), agents, and evaluation schemes.',
}
);
export const RUN_DETAILS_TITLE = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.runDetailsTitle',
{
defaultMessage: '🏃 Run Details',
}
);
export const RUN_DETAILS_DESCRIPTION = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.runDetailsDescription',
{
defaultMessage: 'Configure test run details like project, run name, dataset, and output index',
}
);
export const PREDICTION_DETAILS_TITLE = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.predictionDetailsTitle',
{
defaultMessage: '🔮 Predictions',
}
);
export const PREDICTION_DETAILS_DESCRIPTION = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.predictionDetailsDescription',
{
defaultMessage:
'Choose models (connectors) and corresponding agents the dataset should run against',
}
);
export const EVALUATION_DETAILS_TITLE = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationDetailsTitle',
{
defaultMessage: '🧮 Evaluation (Optional)',
}
);
export const EVALUATION_DETAILS_DESCRIPTION = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationDetailsDescription',
{
defaultMessage:
'Evaluate prediction results using a specific model (connector) and evaluation criterion',
}
);
export const PROJECT_LABEL = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.projectLabel',
{
defaultMessage: 'Project',
}
);
export const PROJECT_DESCRIPTION = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.projectDescription',
{
defaultMessage: 'LangSmith project to write results to',
}
);
export const PROJECT_PLACEHOLDER = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.projectPlaceholder',
{
defaultMessage: '8.12 Testing',
}
);
export const RUN_NAME_LABEL = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.runNameLabel',
{
defaultMessage: 'Run name',
}
);
export const RUN_NAME_DESCRIPTION = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.runNameDescription',
{
defaultMessage: 'Name for this specific test run',
}
);
export const RUN_NAME_PLACEHOLDER = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.runNamePlaceholder',
{
defaultMessage: '8.12 ESQL Query Generation',
}
);
@ -114,11 +200,39 @@ export const EVALUATOR_DATASET_LABEL = i18n.translate(
}
);
export const EVALUATOR_DATASET_DESCRIPTION = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorDatasetDescription',
export const LANGSMITH_DATASET_LABEL = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.langsmithDatasetLabel',
{
defaultMessage: 'LangSmith',
}
);
export const LANGSMITH_DATASET_DESCRIPTION = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.langsmithDatasetDescription',
{
defaultMessage: 'Name of dataset hosted on LangSmith to evaluate',
}
);
export const LANGSMITH_DATASET_PLACEHOLDER = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.langsmithDatasetPlaceholder',
{
defaultMessage: 'ESQL Query Generation',
}
);
export const CUSTOM_DATASET_LABEL = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.customDatasetLabel',
{
defaultMessage: 'Custom',
}
);
export const CUSTOM_DATASET_DESCRIPTION = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.customDatasetDescription',
{
defaultMessage:
'Sample data set to evaluate. Array of objects with "input" and "references" properties',
'Custom dataset to evaluate. Array of objects with "input" and "references" properties',
}
);
@ -132,6 +246,12 @@ export const PERFORM_EVALUATION = i18n.translate(
export const EVALUATOR_FUN_FACT_DISCOVER_LINK = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorFunFactDiscoverLinkText',
{
defaultMessage: 'click here',
defaultMessage: 'Discover',
}
);
export const EVALUATOR_FUN_FACT_APM_LINK = i18n.translate(
'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorFunFactApmLinkText',
{
defaultMessage: 'APM',
}
);


@ -20,12 +20,15 @@ export interface UsePerformEvaluationParams {
export interface PerformEvaluationParams {
agents: string[];
dataset: string;
dataset: string | undefined;
datasetName: string | undefined;
evalModel: string[];
evalPrompt: string;
evaluationType: string[];
models: string[];
outputIndex: string;
projectName: string | undefined;
runName: string | undefined;
}
/**


@ -0,0 +1,25 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
/**
* Link to Discover for viewing an evaluation
*
* @param basePath
* @param evaluationId
*/
export const getDiscoverLink = (basePath: string, evaluationId: string) => {
return `${basePath}/app/discover#/?_g=(filters:!(),refreshInterval:(pause:!t,value:60000),time:(from:now-1y%2Fd,to:now))&_a=(columns:!(evaluationId,runName,totalAgents,totalInput,totalRequests,input,reference,prediction,evaluation.value,evaluation.reasoning,connectorName,connectorName.keyword,evaluation.__run.runId,evaluation.__run.runId.keyword,evaluation.score,evaluationEnd,evaluationId.keyword,evaluationStart,input.keyword,inputExampleId,inputExampleId.keyword,evaluationDuration,prediction.keyword,predictionResponse.reason.sendToLLM,predictionResponse.status,ConnectorId,predictionResponse.value.data,predictionResponse.value.data.keyword,predictionResponse.value.status,predictionResponse.value.trace_data.trace_id,predictionResponse.value.trace_data.trace_id.keyword,predictionResponse.value.trace_data.transaction_id,predictionResponse.value.trace_data.transaction_id.keyword,reference.keyword,runName.keyword),filters:!(),grid:(columns:('@timestamp':(width:212),ConnectorId:(width:133),connectorName:(width:181),connectorName.keyword:(width:229),evaluation.__run.runId:(width:282),evaluation.__run.runId.keyword:(width:245),evaluation.reasoning:(width:336),evaluation.reasoning.keyword:(width:232),evaluation.score:(width:209),evaluation.value:(width:156),evaluationDuration:(width:174),evaluationEnd:(width:151),evaluationId:(width:130),evaluationId.keyword:(width:186),evaluationStart:(width:202),input:(width:347),input.keyword:(width:458),prediction:(width:264),prediction.keyword:(width:313),predictionResponse.value.connector_id:(width:294),predictionResponse.value.trace_data.trace_id:(width:278),predictionResponse.value.trace_data.transaction_id.keyword:(width:177),reference:(width:305),reference.keyword:(width:219),runName:(width:405),totalAgents:(width:125),totalInput:(width:111),totalRequests:(width:138))),hideChart:!t,index:ce1b41cb-6298-4612-a33c-ba85b3c18ec7,interval:auto,query:(esql:'from%20.kibana-elastic-ai-assistant-evaluation-results%20%0A%7C%20keep%20@timestamp,%20evaluationId,%20runName,%20totalAgents,%20totalInput,%20totalRequests,%20input,%20reference,%20prediction,%20evaluation.value,%20evaluation.reasoning,%20connectorName,%20*%0A%7C%20drop%20evaluation.reasoning.keyword%0A%7C%20rename%20predictionResponse.value.connector_id%20as%20ConnectorId%0A%7C%20where%20evaluationId%20%3D%3D%20%22${evaluationId}%22%0A%7C%20sort%20@timestamp%20desc%0A%7C%20limit%20100%0A%0A%0A'),rowHeight:15,sort:!(!('@timestamp',desc)))`;
};
/**
* Link to APM Trace Explorer for viewing an evaluation
* @param basePath
* @param evaluationId
*/
export const getApmLink = (basePath: string, evaluationId: string) => {
return `${basePath}/app/apm/traces/explorer/waterfall?comparisonEnabled=false&detailTab=timeline&environment=ENVIRONMENT_ALL&kuery=&query=%22labels.evaluationId%22:%20%22${evaluationId}%22&rangeFrom=now-1y&rangeTo=now&showCriticalPath=false&traceId=451662121b1f5e6c44084ad7415b9409&transactionId=5f1392fa04766025&type=kql&waterfallItemId=`;
};


@ -20,6 +20,10 @@ export interface Message {
timestamp: string;
isError?: boolean;
presentation?: MessagePresentation;
traceData?: {
transactionId: string;
traceId: string;
};
}
export interface ConversationTheme {


@ -97,9 +97,13 @@ describe('callAgentExecutor', () => {
kbResource: ESQL_RESOURCE,
});
expect(mockCall).toHaveBeenCalledWith({
input: '\n\nDo you know my name?',
});
// We don't care about the `config` argument, so we use `expect.anything()`
expect(mockCall).toHaveBeenCalledWith(
{
input: '\n\nDo you know my name?',
},
expect.anything()
);
});
it('kicks off the chain with the expected message when langChainMessages has only one entry', async () => {
@ -115,9 +119,13 @@ describe('callAgentExecutor', () => {
kbResource: ESQL_RESOURCE,
});
expect(mockCall).toHaveBeenCalledWith({
input: 'What is my name?',
});
// We don't care about the `config` argument, so we use `expect.anything()`
expect(mockCall).toHaveBeenCalledWith(
{
input: 'What is my name?',
},
expect.anything()
);
});
it('returns the expected response body', async () => {


@ -14,7 +14,16 @@ import { ElasticsearchStore } from '../elasticsearch_store/elasticsearch_store';
import { ActionsClientLlm } from '../llm/actions_client_llm';
import { KNOWLEDGE_BASE_INDEX_PATTERN } from '../../../routes/knowledge_base/constants';
import type { AgentExecutorParams, AgentExecutorResponse } from '../executors/types';
import { withAssistantSpan } from '../tracers/with_assistant_span';
import { APMTracer } from '../tracers/apm_tracer';
export const DEFAULT_AGENT_EXECUTOR_ID = 'Elastic AI Assistant Agent Executor';
/**
* The default agent executor used by the Elastic AI Assistant. Main agent/chain that wraps the ActionsClientLlm,
* sets up a conversation BufferMemory from chat history, and registers tools like the ESQLKnowledgeBaseTool.
*
*/
export const callAgentExecutor = async ({
actions,
connectorId,
@ -25,6 +34,7 @@ export const callAgentExecutor = async ({
request,
elserId,
kbResource,
traceOptions,
}: AgentExecutorParams): AgentExecutorResponse => {
const llm = new ActionsClientLlm({ actions, connectorId, request, llmType, logger });
@ -58,12 +68,14 @@ export const callAgentExecutor = async ({
// Create a chain that uses the ELSER backed ElasticsearchStore, override k=10 for esql query generation for now
const chain = RetrievalQAChain.fromLLM(llm, esStore.asRetriever(10));
// TODO: Dependency inject these tools
const tools: Tool[] = [
new ChainTool({
name: 'esql-language-knowledge-base',
name: 'ESQLKnowledgeBaseTool',
description:
'Call this for knowledge on how to build an ESQL query, or answer questions about the ES|QL query language.',
chain,
tags: ['esql', 'query-generation', 'knowledge-base'],
}),
];
@ -73,11 +85,37 @@ export const callAgentExecutor = async ({
verbose: false,
});
await executor.call({ input: latestMessage[0].content });
// Sets up tracer for tracing executions to APM. See x-pack/plugins/elastic_assistant/server/lib/langchain/tracers/README.mdx
// If LangSmith env vars are set, executions will be traced there as well. See https://docs.smith.langchain.com/tracing
const apmTracer = new APMTracer({ projectName: traceOptions?.projectName ?? 'default' }, logger);
let traceData;
// Wrap executor call with an APM span for instrumentation
await withAssistantSpan(DEFAULT_AGENT_EXECUTOR_ID, async (span) => {
if (span?.transaction?.ids['transaction.id'] != null && span?.ids['trace.id'] != null) {
traceData = {
// Transactions ID since this span is the parent
transaction_id: span.transaction.ids['transaction.id'],
trace_id: span.ids['trace.id'],
};
span.addLabels({ evaluationId: traceOptions?.evaluationId });
}
return executor.call(
{ input: latestMessage[0].content },
{
callbacks: [apmTracer, ...(traceOptions?.tracers ?? [])],
runName: DEFAULT_AGENT_EXECUTOR_ID,
tags: traceOptions?.tags ?? [],
}
);
});
return {
connector_id: connectorId,
data: llm.getActionResultData(), // the response from the actions framework
trace_data: traceData,
status: 'ok',
};
};


@ -14,6 +14,11 @@ import { ElasticsearchStore } from '../elasticsearch_store/elasticsearch_store';
import { ActionsClientLlm } from '../llm/actions_client_llm';
import { KNOWLEDGE_BASE_INDEX_PATTERN } from '../../../routes/knowledge_base/constants';
import type { AgentExecutorParams, AgentExecutorResponse } from './types';
import { withAssistantSpan } from '../tracers/with_assistant_span';
import { APMTracer } from '../tracers/apm_tracer';
export const OPEN_AI_FUNCTIONS_AGENT_EXECUTOR_ID =
'Elastic AI Assistant Agent Executor (OpenAI Functions)';
/**
* This is an agent executor to be used with the model evaluation API for benchmarking.
@ -25,11 +30,13 @@ export const callOpenAIFunctionsExecutor = async ({
actions,
connectorId,
esClient,
elserId,
langChainMessages,
llmType,
logger,
request,
elserId,
kbResource,
traceOptions,
}: AgentExecutorParams): AgentExecutorResponse => {
const llm = new ActionsClientLlm({ actions, connectorId, request, llmType, logger });
@ -45,15 +52,32 @@ export const callOpenAIFunctionsExecutor = async ({
});
// ELSER backed ElasticsearchStore for Knowledge Base
const esStore = new ElasticsearchStore(esClient, KNOWLEDGE_BASE_INDEX_PATTERN, logger, elserId);
const chain = RetrievalQAChain.fromLLM(llm, esStore.asRetriever());
const esStore = new ElasticsearchStore(
esClient,
KNOWLEDGE_BASE_INDEX_PATTERN,
logger,
elserId,
kbResource
);
const modelExists = await esStore.isModelInstalled();
if (!modelExists) {
throw new Error(
'Please ensure ELSER is configured to use the Knowledge Base, otherwise disable the Knowledge Base in Advanced Settings to continue.'
);
}
// Create a chain that uses the ELSER backed ElasticsearchStore, override k=10 for esql query generation for now
const chain = RetrievalQAChain.fromLLM(llm, esStore.asRetriever(10));
// TODO: Dependency inject these tools
const tools: Tool[] = [
new ChainTool({
name: 'esql-language-knowledge-base',
name: 'ESQLKnowledgeBaseTool',
description:
'Call this for knowledge on how to build an ESQL query, or answer questions about the ES|QL query language.',
chain,
tags: ['esql', 'query-generation', 'knowledge-base'],
}),
];
@ -63,11 +87,37 @@ export const callOpenAIFunctionsExecutor = async ({
verbose: false,
});
await executor.call({ input: latestMessage[0].content });
// Sets up tracer for tracing executions to APM. See x-pack/plugins/elastic_assistant/server/lib/langchain/tracers/README.mdx
// If LangSmith env vars are set, executions will be traced there as well. See https://docs.smith.langchain.com/tracing
const apmTracer = new APMTracer({ projectName: traceOptions?.projectName ?? 'default' }, logger);
let traceData;
// Wrap executor call with an APM span for instrumentation
await withAssistantSpan(OPEN_AI_FUNCTIONS_AGENT_EXECUTOR_ID, async (span) => {
if (span?.transaction?.ids['transaction.id'] != null && span?.ids['trace.id'] != null) {
traceData = {
// Transactions ID since this span is the parent
transaction_id: span.transaction.ids['transaction.id'],
trace_id: span.ids['trace.id'],
};
span.addLabels({ evaluationId: traceOptions?.evaluationId });
}
return executor.call(
{ input: latestMessage[0].content },
{
callbacks: [apmTracer, ...(traceOptions?.tracers ?? [])],
runName: OPEN_AI_FUNCTIONS_AGENT_EXECUTOR_ID,
tags: traceOptions?.tags ?? [],
}
);
});
return {
connector_id: connectorId,
data: llm.getActionResultData(), // the response from the actions framework
trace_data: traceData,
status: 'ok',
};
};


@ -10,6 +10,7 @@ import { ElasticsearchClient } from '@kbn/core-elasticsearch-server';
import { BaseMessage } from 'langchain/schema';
import { Logger } from '@kbn/logging';
import { KibanaRequest } from '@kbn/core-http-server';
import type { LangChainTracer } from 'langchain/callbacks';
import { RequestBody, ResponseBody } from '../types';
export interface AgentExecutorParams {
@ -22,10 +23,31 @@ export interface AgentExecutorParams {
logger: Logger;
request: KibanaRequest<unknown, unknown, RequestBody>;
elserId?: string;
traceOptions?: TraceOptions;
}
export type AgentExecutorResponse = Promise<ResponseBody>;
export type AgentExecutor = (params: AgentExecutorParams) => AgentExecutorResponse;
export type AgentExecutorEvaluator = (langChainMessages: BaseMessage[]) => AgentExecutorResponse;
export type AgentExecutorEvaluator = (
langChainMessages: BaseMessage[],
exampleId?: string
) => AgentExecutorResponse;
export interface AgentExecutorEvaluatorWithMetadata {
agentEvaluator: AgentExecutorEvaluator;
metadata: {
connectorName: string;
runName: string;
};
}
export interface TraceOptions {
evaluationId?: string;
exampleId?: string;
projectName?: string;
runName?: string;
tags?: string[];
tracers?: LangChainTracer[];
}


@ -5,6 +5,7 @@
* 2.0.
*/
import { v4 as uuidv4 } from 'uuid';
import { KibanaRequest, Logger } from '@kbn/core/server';
import type { PluginStartContract as ActionsPluginStart } from '@kbn/actions-plugin/server';
import { LLM } from 'langchain/llms/base';
@ -15,12 +16,22 @@ import { RequestBody } from '../types';
const LLM_TYPE = 'ActionsClientLlm';
interface ActionsClientLlmParams {
actions: ActionsPluginStart;
connectorId: string;
llmType?: string;
logger: Logger;
request: KibanaRequest<unknown, unknown, RequestBody>;
traceId?: string;
}
export class ActionsClientLlm extends LLM {
#actions: ActionsPluginStart;
#connectorId: string;
#logger: Logger;
#request: KibanaRequest<unknown, unknown, RequestBody>;
#actionResultData: string;
#traceId: string;
// Local `llmType` as it can change and needs to be accessed by abstract `_llmType()` method
// Not using getter as `this._llmType()` is called in the constructor via `super({})`
@ -29,20 +40,16 @@ export class ActionsClientLlm extends LLM {
constructor({
actions,
connectorId,
traceId = uuidv4(),
llmType,
logger,
request,
}: {
actions: ActionsPluginStart;
connectorId: string;
llmType?: string;
logger: Logger;
request: KibanaRequest<unknown, unknown, RequestBody>;
}) {
}: ActionsClientLlmParams) {
super({});
this.#actions = actions;
this.#connectorId = connectorId;
this.#traceId = traceId;
this.llmType = llmType ?? LLM_TYPE;
this.#logger = logger;
this.#request = request;
@ -68,7 +75,9 @@ export class ActionsClientLlm extends LLM {
// convert the Langchain prompt to an assistant message:
const assistantMessage = getMessageContentAndRole(prompt);
this.#logger.debug(
`ActionsClientLlm#_call assistantMessage:\n${JSON.stringify(assistantMessage)} `
`ActionsClientLlm#_call\ntraceId: ${this.#traceId}\nassistantMessage:\n${JSON.stringify(
assistantMessage
)} `
);
// create a new connector request body with the assistant message:
const requestBody = {


@ -0,0 +1,68 @@
### Tracing LangChain Retrievers, LLMs, Chains, and Tools using Elastic APM and LangSmith
This document describes how to trace LangChain retrievers, LLMs, chains, and tools using Elastic APM and LangSmith.
If the `assistantModelEvaluation` experimental feature flag is enabled, and an APM server is configured, messages that have a corresponding trace will have an additional `View APM trace` action in the message title bar:
<p align="center">
<img width="500" src="https://github.com/elastic/kibana/assets/2946766/e0b372ee-139a-4eed-8b09-f01dd88c72b0" />
</p>
Viewing the trace you can see a breakdown of the time spent in each retriever, llm, chain, and tool:
<p align="center">
<img width="500" src="https://github.com/elastic/kibana/assets/2946766/f7cbd4bc-207c-4c88-a032-70a8de4f9b9a" />
</p>
The Evaluation interface has been updated to support adding additional metadata like `Project Name`, `Run Name`, and pulling test datasets from LangSmith. Predictions can now also be run without having to run an Evaluation, so datasets can quickly be run for manual analysis.
<p align="center">
<img width="500" src="https://github.com/elastic/kibana/assets/2946766/acebf719-29fd-4fcc-aef1-99fd00ca800a" />
</p>
<p align="center">
<img width="500" src="https://github.com/elastic/kibana/assets/2946766/7081d993-cbe0-4465-a734-ff9be14d7d0d" />
</p>
### Configuring APM
First, enable the `assistantModelEvaluation` experimental feature flag by adding the following to your `kibana.dev.yml`:
```
xpack.securitySolution.enableExperimental: [ 'assistantModelEvaluation' ]
```
Next, you'll need an APM server to collect the traces. You can either [follow the documentation for installing](https://www.elastic.co/guide/en/apm/guide/current/installing.html) the released artifact, or [run from source](https://github.com/elastic/apm-server#apm-server-development) and set up using the [quickstart guide provided](https://www.elastic.co/guide/en/apm/guide/current/apm-quick-start.html) (be sure to install the APM Server integration to ensure the necessary indices are created!). Once your APM server is running, add your APM server configuration to your `kibana.dev.yml` as well using the following:
```
# APM
elastic.apm:
active: true
environment: 'SpongBox5002c™'
serverUrl: 'http://localhost:8200'
transactionSampleRate: 1.0
breakdownMetrics: true
spanStackTraceMinDuration: 10ms
# Disables Kibana RUM
servicesOverrides.kibana-frontend.active: false
```
> [!NOTE]
> If connecting to a cloud APM server (like our [ai-assistant apm deployment](https://ai-assistant-apm-do-not-delete.kb.us-central1.gcp.cloud.es.io/)), follow [these steps](https://www.elastic.co/guide/en/apm/guide/current/api-key.html#create-an-api-key) to create an API key, and then set it via `apiKey` and also set your `serverUrl` as shown in the APM Integration details within fleet. Note that the `View APM trace` button within the UI will link to your local instance, not the cloud instance.
> [!NOTE]
> If you're an Elastic developer running Kibana from source, you can just enable APM as above, and _not_ include a `serverUrl`, and your traces will be sent to the https://kibana-cloud-apm.elastic.dev cluster. Note that the `View APM trace` button within the UI will link to your local instance, not the cloud instance.
### Configuring LangSmith
If wanting to push traces to LangSmith, or leverage any datasets that you may have hosted in a project, all you need to do is configure a few environment variables, and then start the kibana server. See the [LangSmith Traces documentation](https://docs.smith.langchain.com/tracing) for details, or just add the below env variables to enable:
```
# LangChain LangSmith
export LANGCHAIN_TRACING_V2=true
export LANGCHAIN_ENDPOINT="https://api.smith.langchain.com"
export LANGCHAIN_API_KEY=""
export LANGCHAIN_PROJECT="8.12 ESQL Query Generation"
```


@ -0,0 +1,148 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { BaseCallbackHandlerInput, BaseTracer, Run } from 'langchain/callbacks';
import agent from 'elastic-apm-node';
import type { Logger } from '@kbn/core/server';
export interface LangChainTracerFields extends BaseCallbackHandlerInput {
exampleId?: string;
projectName?: string;
}
type Span = Exclude<typeof agent.currentSpan, undefined | null>;
/**
* APMTracer is a tracer that uses the Elastic APM agent to trace langchain retrievers, llms, chains, and tools.
*/
export class APMTracer extends BaseTracer implements LangChainTracerFields {
name = 'apm_tracer';
projectName?: string;
exampleId?: string;
logger: Logger;
retrieverSpans: Span[] = [];
llmSpans: Span[] = [];
chainSpans: Span[] = [];
toolSpans: Span[] = [];
constructor(fields: LangChainTracerFields = {}, logger: Logger) {
super(fields);
const { exampleId, projectName } = fields;
this.projectName = projectName ?? 'default';
this.exampleId = exampleId;
this.logger = logger;
}
protected async persistRun(_run: Run): Promise<void> {}
/**
* LangChain Run's contain a lot of useful information, so here we unpack as much of it as we can
* into labels that can be added to the corresponding span. Stringifying outputs at the moment since
* the Run schema is a loose KVMap, but we should more elegantly unpack relevant data that we find useful
*
* See BaseRun interface Run extends from
*
* @param run
* @protected
*/
protected _getLabelsFromRun(run: Run): agent.Labels {
try {
return {
tags: JSON.stringify(run.tags),
outputs: JSON.stringify(run.outputs),
events: JSON.stringify(run.events),
inputs: JSON.stringify(run.inputs),
};
} catch (e) {
this.logger.error(`Error parsing run into labels:\n${e}`);
return {};
}
}
protected createAndAddSpanFromRun(run: Run, spans: Span[]) {
const span = agent.startSpan(run.name) ?? undefined;
if (span) {
span.addLabels(this._getLabelsFromRun(run));
spans.push(span);
}
}
async onRetrieverStart(run: Run): Promise<void> {
this.logger.debug(`onRetrieverStart: run:\n${JSON.stringify(run, null, 2)}`);
this.createAndAddSpanFromRun(run, this.retrieverSpans);
}
async onRetrieverEnd(run: Run): Promise<void> {
this.logger.debug(`onRetrieverEnd: run:\n${JSON.stringify(run, null, 2)}`);
const span = this.retrieverSpans.pop();
if (span != null) {
span.addLabels(this._getLabelsFromRun(run));
span.end();
}
}
async onRetrieverError(run: Run): Promise<void> {
this.logger.debug(`onRetrieverError: run:\n${JSON.stringify(run, null, 2)}`);
}
async onLLMStart(run: Run): Promise<void> {
this.logger.debug(`onLLMStart: run:\n${JSON.stringify(run, null, 2)}`);
this.createAndAddSpanFromRun(run, this.llmSpans);
}
async onLLMEnd(run: Run): Promise<void> {
this.logger.debug(`onLLMEnd: run:\n${JSON.stringify(run, null, 2)}`);
const span = this.llmSpans.pop();
if (span != null) {
span.addLabels(this._getLabelsFromRun(run));
span.end();
}
}
async onLLMError(run: Run): Promise<void> {
this.logger.debug(`onLLMError: run:\n${JSON.stringify(run, null, 2)}`);
}
async onChainStart(run: Run): Promise<void> {
this.logger.debug(`onChainStart: run:\n${JSON.stringify(run, null, 2)}`);
this.createAndAddSpanFromRun(run, this.chainSpans);
}
async onChainEnd(run: Run): Promise<void> {
this.logger.debug(`onChainEnd: run:\n${JSON.stringify(run, null, 2)}`);
const span = this.chainSpans.pop();
if (span != null) {
span.addLabels(this._getLabelsFromRun(run));
span.end();
}
}
async onChainError(run: Run): Promise<void> {
this.logger.debug(`onChainError: run:\n${JSON.stringify(run, null, 2)}`);
}
async onToolStart(run: Run): Promise<void> {
this.logger.debug(`onToolStart: run:\n${JSON.stringify(run, null, 2)}`);
this.createAndAddSpanFromRun(run, this.toolSpans);
}
async onToolEnd(run: Run): Promise<void> {
this.logger.debug(`onToolEnd: run:\n${JSON.stringify(run, null, 2)}`);
const span = this.toolSpans.pop();
if (span != null) {
span.addLabels(this._getLabelsFromRun(run));
span.end();
}
}
async onToolError(run: Run): Promise<void> {
this.logger.debug(`onToolError: run:\n${JSON.stringify(run, null, 2)}`);
}
}


@ -0,0 +1,36 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import type { SpanOptions } from '@kbn/apm-utils';
import { withSpan } from '@kbn/apm-utils';
import type agent from 'elastic-apm-node';
type Span = Exclude<typeof agent.currentSpan, undefined | null>;
/**
* This is a thin wrapper around withSpan from @kbn/apm-utils, which sets
* span type to 'elasticAssistant' by default. This span type is used to
* distinguish assistant spans from everything else when inspecting traces.
*
* Use this method to capture information about the execution of a specific
* code path and highlight it in APM UI.
*
* @param optionsOrName Span name or span options object
* @param cb Code block you want to measure
*
* @returns Whatever the measured code block returns
*/
export const withAssistantSpan = <T>(
optionsOrName: SpanOptions | string,
cb: (span?: Span) => Promise<T>
) =>
withSpan<T>(
{
type: 'elasticAssistant',
...(typeof optionsOrName === 'string' ? { name: optionsOrName } : optionsOrName),
},
cb
);


@ -13,4 +13,8 @@ export interface ResponseBody {
status: string;
data: string;
connector_id: string;
trace_data?: {
transaction_id: string;
trace_id: string;
};
}


@ -8,34 +8,40 @@
import { loadEvaluator } from 'langchain/evaluation';
import { LLM } from 'langchain/llms/base';
import { ChainValues, HumanMessage } from 'langchain/schema';
import { chunk } from 'lodash/fp';
import { chunk as createChunks } from 'lodash/fp';
import { Logger } from '@kbn/core/server';
import { ToolingLog } from '@kbn/tooling-log';
import { asyncForEach } from '@kbn/std';
import { AgentExecutorEvaluator } from '../langchain/executors/types';
import { LangChainTracer, RunCollectorCallbackHandler } from 'langchain/callbacks';
import { AgentExecutorEvaluatorWithMetadata } from '../langchain/executors/types';
import { Dataset } from '../../schemas/evaluate/post_evaluate';
import { callAgentWithRetry, getMessageFromLangChainResponse, wait } from './utils';
import { callAgentWithRetry, getMessageFromLangChainResponse } from './utils';
import { ResponseBody } from '../langchain/types';
import { isLangSmithEnabled, writeLangSmithFeedback } from '../../routes/evaluate/utils';
export interface PerformEvaluationParams {
agentExecutorEvaluators: AgentExecutorEvaluator[];
agentExecutorEvaluators: AgentExecutorEvaluatorWithMetadata[];
dataset: Dataset;
evaluationId: string;
evaluatorModel: LLM;
evaluatorModel?: LLM;
evaluationPrompt?: string;
evaluationType: string;
maxConcurrency?: number;
evaluationType?: string;
logger: Logger | ToolingLog;
maxConcurrency?: number;
runName?: string;
}
export interface EvaluationResult {
'@timestamp': string;
connectorName: string;
evaluation: ChainValues;
evaluationId: string;
input: string;
inputExampleId?: string | undefined;
langSmithLink?: string | undefined;
prediction: string;
predictionResponse: PromiseSettledResult<ResponseBody>;
reference: string;
runName: string;
}
export interface EvaluationSummary {
@ -44,6 +50,8 @@ export interface EvaluationSummary {
evaluationEnd: number;
evaluationId: string;
evaluationDuration: number;
langSmithLink?: string | undefined;
runName: string;
totalAgents: number;
totalRequests: number;
totalInput: number;
@ -61,45 +69,85 @@ export const performEvaluation = async ({
evaluatorModel,
evaluationPrompt,
evaluationType,
maxConcurrency = 3,
maxConcurrency = 1,
logger,
runName = 'default-run-name',
}: PerformEvaluationParams) => {
const startTime = new Date().getTime();
const evaluationResults: EvaluationResult[] = [];
const predictionRequests = dataset.flatMap(({ input, reference }) =>
agentExecutorEvaluators.map((agent) => ({
input,
reference,
request: callAgentWithRetry({ agent, messages: [new HumanMessage(input)], logger }),
}))
const predictionRequests = dataset.flatMap(({ input, reference, id: exampleId }) =>
agentExecutorEvaluators.map(
({ agentEvaluator: agent, metadata: { connectorName, runName: agentRunName } }) => ({
connectorName,
input,
reference,
exampleId,
request: () =>
callAgentWithRetry({ agent, exampleId, messages: [new HumanMessage(input)], logger }),
runName: agentRunName,
})
)
);
const requestChunks = createChunks(maxConcurrency, predictionRequests);
const totalChunks = requestChunks.length;
logger.info(`Total prediction requests: ${predictionRequests.length}`);
logger.info(`Chunk size: ${maxConcurrency}`);
logger.info(`Chunk size (maxConcurrency): ${maxConcurrency}`);
logger.info(`Total chunks: ${totalChunks}`);
logger.info('Fetching predictions...');
const requestChunks = chunk(maxConcurrency, predictionRequests);
await asyncForEach(requestChunks, async (c, i) => {
logger.info(`Prediction request chunk: ${i + 1} of ${requestChunks.length}`);
while (requestChunks.length) {
const chunk = requestChunks.shift() ?? [];
const chunkNumber = totalChunks - requestChunks.length;
logger.info(`Prediction request chunk: ${chunkNumber} of ${totalChunks}`);
logger.debug(chunk);
// Note, order is kept between chunk and dataset, and is preserved w/ Promise.allSettled
const chunkResults = await Promise.allSettled(c.map((r) => r.request));
logger.info(`Prediction request chunk ${i + 1} response:\n${JSON.stringify(chunkResults)}`);
const chunkResults = await Promise.allSettled(chunk.map((r) => r.request()));
logger.info(
`Prediction request chunk ${chunkNumber} response:\n${JSON.stringify(chunkResults)}`
);
chunkResults.forEach((response, chunkResultIndex) =>
evaluationResults.push({
'@timestamp': new Date().toISOString(),
connectorName: chunk[chunkResultIndex].connectorName,
input: chunk[chunkResultIndex].input,
inputExampleId: chunk[chunkResultIndex].exampleId,
reference: chunk[chunkResultIndex].reference,
evaluationId,
evaluation: {},
prediction: getMessageFromLangChainResponse(response),
predictionResponse: response,
runName: chunk[chunkResultIndex].runName,
})
);
}
logger.info(`Prediction results:\n${JSON.stringify(evaluationResults)}`);
if (evaluatorModel == null) {
const endTime = new Date().getTime();
const evaluationSummary: EvaluationSummary = {
evaluationId,
'@timestamp': new Date().toISOString(),
evaluationStart: startTime,
evaluationEnd: endTime,
evaluationDuration: endTime - startTime,
runName,
totalAgents: agentExecutorEvaluators.length,
totalInput: dataset.length,
totalRequests: predictionRequests.length,
};
logger.info(`Final results:\n${JSON.stringify(evaluationResults)}`);
return { evaluationResults, evaluationSummary };
}
// Continue with the actual evaluation when an evaluator model was provided
logger.info('Performing evaluation....');
logger.info(`Evaluation model: ${evaluatorModel._llmType()}`);
@@ -109,22 +157,53 @@ export const performEvaluation = async ({
criteria: 'correctness',
llm: evaluatorModel,
});
for (const result of evaluationResults) {
const { input, inputExampleId: exampleId, prediction, reference } = result;
// Create an eval tracer so eval traces end up in the right project (`runName` in this instance, so they
// correlate with the test run). Don't supply `exampleId`, as that results in a new Dataset `Test` run being
// created and polluting the `predictions` that ran above
const evalTracer = new LangChainTracer({
projectName: runName,
});
// Create a RunCollector for uploading evals to LangSmith. There's no TS variant of `EvaluatorCallbackHandler`
// or `run_on_dataset` w/ eval config, so use `RunCollectorCallbackHandler` and then upload manually via
// client.createFeedback()
// See: https://github.com/langchain-ai/langsmith-sdk/blob/18449e5848d85ac0a320f320c37f454f949de1e1/js/src/client.ts#L1249-L1256
const runCollector = new RunCollectorCallbackHandler({ exampleId });
const evaluation = await evaluator.evaluateStrings(
{
input,
prediction,
reference,
},
{
callbacks: [...(isLangSmithEnabled() ? [evalTracer, runCollector] : [])],
tags: ['security-assistant-evaluation'],
}
);
result.evaluation = evaluation;
// Write to LangSmith
if (isLangSmithEnabled()) {
const langSmithLink = await writeLangSmithFeedback(
runCollector.tracedRuns[0],
evaluationId,
logger
);
result.langSmithLink = langSmithLink;
}
}
} else if (evaluationType === 'esql-validator') {
logger.info('Evaluation type: esql-validator');
// TODO: Implement esql-validator here
} else if (evaluationType === 'custom') {
logger.info('Evaluation type: custom');
// TODO: Implement custom evaluation here
// const llm = new ChatOpenAI({ temperature: 0, tags: ["my-llm-tag"] });
// const prompt = PromptTemplate.fromTemplate("Say {input}");
// const chain = prompt.pipe(llm).withConfig( { tags: ["my-bash-tag", "another-tag"] });
// await chain.invoke({ input: "Hello, World!"}, { tags: ["shared-tags"] });
}
const endTime = new Date().getTime();
@@ -135,12 +214,13 @@ export const performEvaluation = async ({
evaluationStart: startTime,
evaluationEnd: endTime,
evaluationDuration: endTime - startTime,
runName,
totalAgents: agentExecutorEvaluators.length,
totalInput: dataset.length,
totalRequests: predictionRequests.length,
};
logger.info(`Final results:\n${JSON.stringify(evaluationResults)}`);
return { evaluationResults, evaluationSummary };
};
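For reference, here's a minimal sketch of driving `performEvaluation` in prediction-only mode (the empty agent list, dataset values, and ids below are illustrative stand-ins, not part of this change):

```ts
import { ToolingLog } from '@kbn/tooling-log';
import { performEvaluation } from './evaluation';
import type { AgentExecutorEvaluatorWithMetadata } from '../langchain/executors/types';

const logger = new ToolingLog({ level: 'info', writeTo: process.stdout });

// One entry per connector/model + agent combination, built as in post_evaluate.ts
const agents: AgentExecutorEvaluatorWithMetadata[] = [];

async function main() {
  // With `evaluatorModel` omitted, performEvaluation returns right after the
  // prediction phase, so a dataset can be run for manual analysis only
  const { evaluationResults, evaluationSummary } = await performEvaluation({
    agentExecutorEvaluators: agents,
    dataset: [
      { id: undefined, input: 'Hello!', reference: 'Hi there!', tags: [], prediction: undefined },
    ],
    evaluationId: 'example-evaluation-id',
    logger,
    maxConcurrency: 1, // chunk size for the Promise.allSettled batches above
    runName: 'example-run',
  });
  logger.info(`Requests run: ${evaluationSummary.totalRequests}`);
  logger.debug(JSON.stringify(evaluationResults));
}

void main();
```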


@@ -15,19 +15,21 @@ export const wait = (ms: number) => new Promise((resolve) => setTimeout(resolve,
export interface CallAgentWithRetryParams {
agent: AgentExecutorEvaluator;
exampleId?: string;
messages: BaseMessage[];
logger: Logger | ToolingLog;
maxRetries?: number;
}
export const callAgentWithRetry = async ({
agent,
exampleId,
messages,
logger,
maxRetries = 3,
}: CallAgentWithRetryParams) => {
for (let attempt = 0; attempt < maxRetries; attempt++) {
try {
return await agent(messages, exampleId);
} catch (error) {
// Check for 429, and then if there is a retry-after header
const { isRateLimitError, retryAfter } = parseErrorMessage(error);
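For context, the retry/backoff pattern `callAgentWithRetry` follows can be sketched generically. This is a sketch only: it assumes `parseErrorMessage` returns `{ isRateLimitError, retryAfter }` with `retryAfter` in seconds, and the stand-in implementation of that helper below is hypothetical:

```ts
const wait = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));

// Hypothetical stand-in for the module's parseErrorMessage helper
const parseErrorMessage = (error: unknown): { isRateLimitError: boolean; retryAfter?: number } => {
  const message = error instanceof Error ? error.message : String(error);
  return { isRateLimitError: message.includes('429'), retryAfter: undefined };
};

async function withRateLimitRetry<T>(fn: () => Promise<T>, maxRetries = 3): Promise<T> {
  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      return await fn();
    } catch (error) {
      const { isRateLimitError, retryAfter } = parseErrorMessage(error);
      // Only retry rate-limit errors, and give up after the final attempt
      if (!isRateLimitError || attempt === maxRetries - 1) throw error;
      // Honor the retry-after value when present, otherwise use a default delay
      await wait((retryAfter ?? 30) * 1000);
    }
  }
  throw new Error('withRateLimitRetry: retries exhausted'); // unreachable; satisfies TS
}
```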


@@ -18,13 +18,16 @@ import { PostEvaluateBody, PostEvaluatePathQuery } from '../../schemas/evaluate/
import { performEvaluation } from '../../lib/model_evaluator/evaluation';
import { callAgentExecutor } from '../../lib/langchain/execute_custom_llm_chain';
import { callOpenAIFunctionsExecutor } from '../../lib/langchain/executors/openai_functions_executor';
import {
AgentExecutor,
AgentExecutorEvaluatorWithMetadata,
} from '../../lib/langchain/executors/types';
import { ActionsClientLlm } from '../../lib/langchain/llm/actions_client_llm';
import {
indexEvaluations,
setupEvaluationIndex,
} from '../../lib/model_evaluator/output_index/utils';
import { fetchLangSmithDataset, getConnectorName, getLangSmithTracer, getLlmType } from './utils';
import { RequestBody } from '../../lib/langchain/types';
/**
@@ -50,34 +53,41 @@ export const postEvaluateRoute = (
},
async (context, request, response) => {
// TODO: Limit route based on experimental feature
const logger: Logger = (await context.elasticAssistant).logger;
try {
const evaluationId = uuidv4();
const {
evalModel,
evaluationType,
outputIndex,
datasetName,
projectName = 'default',
runName = evaluationId,
} = request.query;
const { dataset: customDataset = [], evalPrompt } = request.body;
const connectorIds = request.query.models?.split(',') || [];
const agentNames = request.query.agents?.split(',') || [];
const dataset =
datasetName != null ? await fetchLangSmithDataset(datasetName, logger) : customDataset;
logger.info('postEvaluateRoute:');
logger.info(`request.query:\n${JSON.stringify(request.query, null, 2)}`);
logger.info(`request.body:\n${JSON.stringify(request.body, null, 2)}`);
logger.info(`Evaluation ID: ${evaluationId}`);
const totalExecutions = connectorIds.length * agentNames.length * dataset.length;
logger.info('Creating agents:');
logger.info(`\tconnectors/models: ${connectorIds.length}`);
logger.info(`\tagents: ${agentNames.length}`);
logger.info(`\tdataset: ${dataset.length}`);
logger.warn(`\ttotal baseline agent executions: ${totalExecutions}`);
if (totalExecutions > 50) {
logger.warn(
`Total baseline agent executions > 50! This may take a while, and cost some money...`
);
}
// Get the actions plugin start contract from the request context for the agents
const actions = (await context.elasticAssistant).actions;
@@ -112,35 +122,58 @@ export const postEvaluateRoute = (
// Create an array of executor functions to call in batches
// One for each connector/model + agent combination
// Hoist `langChainMessages` so they can be batched by dataset.input in the evaluator
const agents: AgentExecutorEvaluatorWithMetadata[] = [];
connectorIds.forEach((connectorId) => {
agentNames.forEach((agentName) => {
logger.info(`Creating agent: ${connectorId} + ${agentName}`);
const llmType = getLlmType(connectorId, connectors);
const connectorName =
getConnectorName(connectorId, connectors) ?? '[unknown connector]';
const detailedRunName = `${runName} - ${connectorName} + ${agentName}`;
agents.push({
agentEvaluator: (langChainMessages, exampleId) =>
AGENT_EXECUTOR_MAP[agentName]({
actions,
connectorId,
esClient,
elserId,
langChainMessages,
llmType,
logger,
request: skeletonRequest,
kbResource: ESQL_RESOURCE,
traceOptions: {
exampleId,
projectName,
runName: detailedRunName,
evaluationId,
tags: [
'security-assistant-prediction',
...(connectorName != null ? [connectorName] : []),
runName,
],
tracers: getLangSmithTracer(detailedRunName, exampleId, logger),
},
}),
metadata: {
connectorName,
runName: detailedRunName,
},
});
});
});
logger.info(`Agents created: ${agents.length}`);
// Evaluator Model is optional to support just running predictions
const evaluatorModel =
evalModel == null || evalModel === ''
? undefined
: new ActionsClientLlm({
actions,
connectorId: evalModel,
request: skeletonRequest,
logger,
});
const { evaluationResults, evaluationSummary } = await performEvaluation({
agentExecutorEvaluators: agents,
@@ -150,6 +183,7 @@ export const postEvaluateRoute = (
evaluationPrompt: evalPrompt,
evaluationType,
logger,
runName,
});
logger.info(`Writing evaluation results to index: ${outputIndex}`);
@@ -163,14 +197,15 @@ export const postEvaluateRoute = (
});
return response.ok({
body: { evaluationId, success: true },
});
} catch (err) {
logger.error(err);
const error = transformError(err);
const resp = buildResponse(response);
return resp.error({
body: { success: false, error: error.message },
statusCode: error.statusCode,
});
}
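With `evalModel`/`evaluationType` now optional and `datasetName`/`projectName`/`runName` added, a prediction-only run can be kicked off with a single request. A hedged sketch follows; the route path, output index, connector id, and agent name are assumptions/placeholders, and internal Kibana routes also require the `kbn-xsrf` header plus credentials:

```ts
async function runPredictionsOnly() {
  const query = new URLSearchParams({
    agents: 'DefaultAgentExecutor', // placeholder AGENT_EXECUTOR_MAP key
    models: 'my-connector-id', // placeholder connector id(s), comma-separated
    datasetName: 'my-langsmith-dataset', // pull the dataset from LangSmith
    projectName: 'default',
    runName: 'smoke-test',
    outputIndex: '.kibana-elastic-ai-assistant-evaluation-results', // assumed default
    // evalModel / evaluationType omitted => predictions only, no evaluator pass
  });

  const res = await fetch(`http://localhost:5601/internal/elastic_assistant/evaluate?${query}`, {
    method: 'POST',
    headers: { 'content-type': 'application/json', 'kbn-xsrf': 'true' },
    body: JSON.stringify({}),
  });
  console.log(await res.json()); // => { evaluationId, success: true }
}

void runPredictionsOnly();
```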


@@ -5,8 +5,14 @@
* 2.0.
*/
import { Client } from 'langsmith';
import { OpenAiProviderType } from '@kbn/stack-connectors-plugin/common/openai/constants';
import type { ActionResult } from '@kbn/actions-plugin/server';
import type { Logger } from '@kbn/core/server';
import type { Run } from 'langsmith/schemas';
import { ToolingLog } from '@kbn/tooling-log';
import { LangChainTracer } from 'langchain/callbacks';
import { Dataset } from '../../schemas/evaluate/post_evaluate';
/**
* Returns the LangChain `llmType` for the given connectorId/connectors
@@ -29,3 +35,133 @@ export const getLlmType = (connectorId: string, connectors: ActionResult[]): str
return undefined;
};
/**
* Return connector name for the given connectorId/connectors
*
* @param connectorId
* @param connectors
*/
export const getConnectorName = (
connectorId: string,
connectors: ActionResult[]
): string | undefined => {
return connectors.find((c) => c.id === connectorId)?.name;
};
/**
* Fetches a dataset from LangSmith. Note that `client` will use env vars
*
* @param datasetName
* @param logger
*/
export const fetchLangSmithDataset = async (
datasetName: string | undefined,
logger: Logger
): Promise<Dataset> => {
if (datasetName === undefined || !isLangSmithEnabled()) {
throw new Error('LangSmith dataset name not provided or LangSmith not enabled');
}
try {
const client = new Client();
const examples = [];
for await (const example of client.listExamples({ datasetName })) {
examples.push(example);
}
// Convert to internal Dataset type -- TODO: add generic support for the different LangSmith test dataset formats
const dataset: Dataset = examples.map((example) => ({
id: example.id,
input: example.inputs.input as string,
reference: (example.outputs?.output as string) ?? '',
tags: [], // TODO: Consider adding tags from example data, e.g.: `datasetId:${example.dataset_id}`, `exampleName:${example.name}`
prediction: undefined,
}));
return dataset;
} catch (e) {
logger.error(`Error fetching dataset from LangSmith: ${e.message}`);
return [];
}
};
/**
* Write Feedback to LangSmith for a given Run
*
* @param run
* @param evaluationId
* @param logger
*/
export const writeLangSmithFeedback = async (
run: Run,
evaluationId: string,
logger: Logger | ToolingLog
): Promise<string> => {
try {
const client = new Client();
const feedback = {
score: run.feedback_stats?.score,
value: run.feedback_stats?.value,
correction: run.feedback_stats?.correction,
comment: run.feedback_stats?.comment,
sourceInfo: run.feedback_stats?.sourceInfo,
feedbackSourceType: run.feedback_stats?.feedbackSourceType,
sourceRunId: run.feedback_stats?.sourceRunId,
feedbackId: run.feedback_stats?.feedbackId,
eager: run.feedback_stats?.eager,
};
await client.createFeedback(run.id, evaluationId, feedback);
const runUrl = await client.getRunUrl({ run });
return runUrl;
} catch (e) {
logger.error(`Error writing feedback to LangSmith: ${e.message}`);
return '';
}
};
/**
* Returns a custom LangChainTracer which adds the `exampleId` so Dataset `Test` runs are written to LangSmith.
* If `exampleId` is present (and a corresponding example exists in LangSmith), the trace is written to the
* Dataset's `Tests` section; otherwise it is written to the `Project` provided
*
* @param projectName Name of project to trace results to
* @param exampleId Dataset exampleId to associate trace with
* @param logger
*/
export const getLangSmithTracer = (
projectName: string | undefined,
exampleId: string | undefined,
logger: Logger | ToolingLog
): LangChainTracer[] => {
try {
if (!isLangSmithEnabled()) {
return [];
}
const lcTracer = new LangChainTracer({
projectName: projectName ?? 'default', // Shows as the 'test' run's 'name' in langsmith ui
exampleId,
});
return [lcTracer];
} catch (e) {
// Note: creating a tracer can fail if the LangSmith env vars are not set correctly
logger.error(`Error creating LangSmith tracer: ${e.message}`);
}
return [];
};
/**
* Returns true if LangSmith/tracing is enabled
*/
export const isLangSmithEnabled = (): boolean => {
try {
// Just checks whether an apiKey is available; if there's a better way to check for enabled state that isn't an env var, please update
const config = Client.getDefaultClientConfig();
return config.apiKey != null;
} catch (e) {
return false;
}
};
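Taken together, the helpers above compose roughly like this. A sketch only: it assumes the standard `LANGCHAIN_API_KEY`/`LANGCHAIN_TRACING_V2` env vars drive the `langsmith` Client, and the dataset/project names are placeholders:

```ts
import { ToolingLog } from '@kbn/tooling-log';
import { fetchLangSmithDataset, getLangSmithTracer, isLangSmithEnabled } from './utils';

const logger = new ToolingLog({ level: 'info', writeTo: process.stdout });

async function runDataset() {
  if (!isLangSmithEnabled()) {
    return; // no API key available via env vars, nothing to trace against
  }
  const dataset = await fetchLangSmithDataset('my-langsmith-dataset', logger);
  for (const example of dataset) {
    // With `exampleId` set, traces land in the dataset's `Tests` section;
    // without it, they land in the named project
    const tracers = getLangSmithTracer('my-project', example.id, logger);
    // ...pass `tracers` as LangChain `callbacks` when invoking the agent/chain
  }
}

void runDataset();
```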


@@ -26,16 +26,21 @@ const outputIndex = new t.Type<string, string, unknown>(
/** Validates the URL path of a POST request to the `/evaluate` endpoint */
export const PostEvaluatePathQuery = t.type({
agents: t.string,
datasetName: t.union([t.string, t.undefined]),
evaluationType: t.union([t.string, t.undefined]),
evalModel: t.union([t.string, t.undefined]),
models: t.string,
outputIndex,
projectName: t.union([t.string, t.undefined]),
runName: t.union([t.string, t.undefined]),
});
export type DatasetItem = t.TypeOf<typeof DatasetItem>;
export const DatasetItem = t.type({
id: t.union([t.string, t.undefined]),
input: t.string,
reference: t.string,
tags: t.union([t.array(t.string), t.undefined]),
prediction: t.union([t.string, t.undefined]),
});
@@ -44,7 +49,7 @@ export const Dataset = t.array(DatasetItem);
/** Validates the body of a POST request to the `/evaluate` endpoint */
export const PostEvaluateBody = t.type({
dataset: t.union([Dataset, t.undefined]),
evalPrompt: t.union([t.string, t.undefined]),
});


@@ -29,9 +29,9 @@
"@kbn/tooling-log",
"@kbn/core-elasticsearch-server",
"@kbn/logging",
"@kbn/std",
"@kbn/stack-connectors-plugin",
"@kbn/ml-plugin",
"@kbn/apm-utils",
],
"exclude": [
"target/**/*",


@@ -12,13 +12,14 @@ import React, { useCallback } from 'react';
import { useDispatch } from 'react-redux';
import { useAssistantContext } from '@kbn/elastic-assistant/impl/assistant_context';
import { useBasePath, useKibana, useToasts } from '../../common/lib/kibana';
import type { Note } from '../../common/lib/note';
import { appActions } from '../../common/store/actions';
import { TimelineId } from '../../../common/types';
import { updateAndAssociateNode } from '../../timelines/components/notes/helpers';
import { timelineActions } from '../../timelines/store/timeline';
import * as i18n from './translations';
import { useIsExperimentalFeatureEnabled } from '../../common/hooks/use_experimental_features';
interface Props {
message: Message;
@@ -26,8 +27,10 @@
const CommentActionsComponent: React.FC<Props> = ({ message }) => {
const toasts = useToasts();
const basePath = useBasePath();
const { cases } = useKibana().services;
const dispatch = useDispatch();
const isModelEvaluationEnabled = useIsExperimentalFeatureEnabled('assistantModelEvaluation');
const { showAssistantOverlay } = useAssistantContext();
@@ -75,8 +78,39 @@ const CommentActionsComponent: React.FC<Props> = ({ message }) => {
});
}, [content, selectCaseModal, showAssistantOverlay]);
// Note: This feature is behind the `isModelEvaluationEnabled` FF. If ever released, this URL should be configurable
// as APM data may not go to the same cluster where the Kibana instance is running
// Links to the experimental trace explorer page
// Note: There's a bug with URL params being rewritten, so we must specify 'query' to filter on transaction id
// See: https://github.com/elastic/kibana/issues/171368
const apmTraceLink =
message.traceData != null
? `${basePath}/app/apm/traces/explorer/waterfall?comparisonEnabled=false&detailTab=timeline&environment=ENVIRONMENT_ALL&kuery=&query=transaction.id:%20${message.traceData.transactionId}&rangeFrom=now-1y/d&rangeTo=now&showCriticalPath=false&traceId=${message.traceData.traceId}&transactionId=${message.traceData.transactionId}&type=kql&waterfallItemId=`
: undefined;
// Use this link for routing to the services/transactions view which provides a slightly different view
// const apmTraceLink =
// message.traceData != null
// ? `${basePath}/app/apm/services/kibana/transactions/view?kuery=&rangeFrom=now-1y&rangeTo=now&environment=ENVIRONMENT_ALL&serviceGroup=&comparisonEnabled=true&traceId=${message.traceData.traceId}&transactionId=${message.traceData.transactionId}&transactionName=POST%20/internal/elastic_assistant/actions/connector/?/_execute&transactionType=request&offset=1d&latencyAggregationType=avg`
// : undefined;
return (
// APM Trace support is currently behind the Model Evaluation feature flag until wider testing is performed
<EuiFlexGroup alignItems="center" gutterSize="none">
{isModelEvaluationEnabled && apmTraceLink != null && (
<EuiFlexItem grow={false}>
<EuiToolTip position="top" content={i18n.VIEW_APM_TRACE}>
<EuiButtonIcon
aria-label={i18n.VIEW_APM_TRACE}
color="primary"
iconType="apmTrace"
href={apmTraceLink}
target={'_blank'}
/>
</EuiToolTip>
</EuiFlexItem>
)}
<EuiFlexItem grow={false}>
<EuiToolTip position="top" content={i18n.ADD_NOTE_TO_TIMELINE}>
<EuiButtonIcon


@@ -14,6 +14,13 @@ export const ADDED_NOTE_TO_TIMELINE = i18n.translate(
}
);
export const VIEW_APM_TRACE = i18n.translate(
'xpack.securitySolution.assistant.commentActions.viewAPMTraceLabel',
{
defaultMessage: 'View APM Trace for this message',
}
);
export const ADD_MESSAGE_CONTENT_AS_TIMELINE_NOTE = i18n.translate(
'xpack.securitySolution.assistant.commentActions.addMessageContentAsTimelineNoteAriaLabel',
{


@@ -12842,7 +12842,6 @@
"xpack.dataVisualizer.table.expandRowScreenMsg": "Développer la ligne",
"xpack.dataVisualizer.title": "Charger un fichier",
"xpack.elasticAssistant.assistant.connectors.connectorMissingCallout.calloutDescription": "Sélectionnez un connecteur ci-dessus ou depuis {link} pour continuer",
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorFunFactText": "Anecdotes : Surveillez les logs de serveur Kibana pour visualiser la progression et {funFacts} pour afficher les résultats dans Discover une fois l'opération terminée. Cette opération prendra (plusieurs) minutes, selon la quantité de données. Si vous fermez cette fenêtre, vous interromprez l'évaluation.",
"xpack.elasticAssistant.assistant.settings.knowledgeBasedSettings.knowledgeBaseDescription": "Configurez ELSER dans {machineLearning} pour commencer. {seeDocs}",
"xpack.elasticAssistant.assistant.settings.knowledgeBaseSettings.knowledgeBaseInstalledDescription": "Initialisé sur \"{kbIndexPattern}\"",
"xpack.elasticAssistant.assistant.technicalPreview.tooltipContent": "Les réponses des systèmes d'IA ne sont pas toujours tout à fait exactes. Pour en savoir plus sur la fonctionnalité d'assistant et son utilisation, consultez {documentationLink}.",
@@ -12970,7 +12969,6 @@
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationPromptLabel": "Invite d'évaluation",
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationTypeDescription": "Type d'évaluation à effectuer, par exemple \"correctness\" \"esql-validator\" ou \"custom\", et fournit votre propre invite d'évaluation",
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationTypeLabel": "Type d'évaluation",
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorDatasetDescription": "Exemple d'ensemble de données à évaluer. Tableau avec des objets aux propriétés \"input\" (entrée) et \"references\" (références)",
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorDatasetLabel": "Ensemble de données",
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorFunFactDiscoverLinkText": "cliquez ici",
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorModelDescription": "Modèle avec lequel effectuer l'évaluation finale",


@@ -12855,7 +12855,6 @@
"xpack.dataVisualizer.table.expandRowScreenMsg": "行を展開",
"xpack.dataVisualizer.title": "ファイルをアップロード",
"xpack.elasticAssistant.assistant.connectors.connectorMissingCallout.calloutDescription": "上または{link}からコネクターを選択して続行します。",
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorFunFactText": "興味深い事実Kibanaサーバーのログで進行状況を確認し、完了したら{funFacts}でDiscoverに結果を表示します。データセットによっては何分もかかります。このダイアログを閉じると評価はキャンセルされます",
"xpack.elasticAssistant.assistant.settings.knowledgeBasedSettings.knowledgeBaseDescription": "始めるには、{machineLearning}でELSERを設定してください。{seeDocs}",
"xpack.elasticAssistant.assistant.settings.knowledgeBaseSettings.knowledgeBaseInstalledDescription": "`{kbIndexPattern}`に初期化しました",
"xpack.elasticAssistant.assistant.technicalPreview.tooltipContent": "AIシステムからの応答は、必ずしも完全に正確であるとは限りません。アシスタント機能とその使用方法の詳細については、{documentationLink}を参照してください。",
@@ -12983,7 +12982,6 @@
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationPromptLabel": "評価プロンプト",
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationTypeDescription": "実行する評価のタイプ(例:\"correctness\" \"esql-validator\"、または\"custom\")。独自の評価プロンプトを指定します",
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationTypeLabel": "評価タイプ",
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorDatasetDescription": "評価するサンプルデータセット。\"input\"プロパティと\"references\"プロパティを含むオブジェクトの配列",
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorDatasetLabel": "データセット",
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorFunFactDiscoverLinkText": "ここをクリック",
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorModelDescription": "で最終評価を実行するモデル",


@@ -12855,7 +12855,6 @@
"xpack.dataVisualizer.table.expandRowScreenMsg": "展开行",
"xpack.dataVisualizer.title": "上传文件",
"xpack.elasticAssistant.assistant.connectors.connectorMissingCallout.calloutDescription": "在上方或从 {link} 中选择连接器以继续",
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorFunFactText": "有趣的事实:一旦完成,请查看 Kibana 服务器日志以了解进度,并 {funFacts} 以在 Discover 中查看结果。将花费(许多)分钟,具体取决于数据集,而且,关闭此对话框将取消评估!",
"xpack.elasticAssistant.assistant.settings.knowledgeBasedSettings.knowledgeBaseDescription": "在 {machineLearning} 中配置 ELSER 以开始。{seeDocs}",
"xpack.elasticAssistant.assistant.settings.knowledgeBaseSettings.knowledgeBaseInstalledDescription": "已初始化为 `{kbIndexPattern}`",
"xpack.elasticAssistant.assistant.technicalPreview.tooltipContent": "来自 AI 系统的响应可能不会始终完全准确。有关辅助功能及其用法的详细信息,请参阅 {documentationLink}。",
@@ -12983,7 +12982,6 @@
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationPromptLabel": "评估提示",
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationTypeDescription": "要执行的评估类型如“正确性”、“esql 验证器”或“定制”,并提供您自己的评估提示",
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationTypeLabel": "评估类型",
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorDatasetDescription": "要评估的样例数据集。具有“input”和“references”属性的对象数组",
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorDatasetLabel": "数据集",
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorFunFactDiscoverLinkText": "单击此处",
"xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorModelDescription": "要执行最后评估的模型",


@@ -20716,10 +20716,10 @@ kuler@^2.0.0:
resolved "https://registry.yarnpkg.com/kuler/-/kuler-2.0.0.tgz#e2c570a3800388fb44407e851531c1d670b061b3"
integrity sha512-Xq9nH7KlWZmXAtodXDDRE7vs6DU1gTU8zYDHDiWLSip45Egwq3plLHzPn27NgvzL2r1LMPC1vdqh98sQxtqj4A==
langchain@^0.0.186:
version "0.0.186"
resolved "https://registry.yarnpkg.com/langchain/-/langchain-0.0.186.tgz#59753972764d7ee4cf3e00dca1d74c95659bccbd"
integrity sha512-uXDipmw9aUrUmDNcFr2XH9ORmshWIlIb/qFKneS1K3X5upMUg7TSbaBxqV9WxuuenLUSYaoTcTy7P/pKkbqXPg==
dependencies:
"@anthropic-ai/sdk" "^0.6.2"
ansi-styles "^5.0.0"
@@ -20732,16 +20732,15 @@ langchain@^0.0.151:
js-yaml "^4.1.0"
jsonpointer "^5.0.1"
langchainhub "~0.0.6"
langsmith "~0.0.31"
langsmith "~0.0.48"
ml-distance "^4.0.0"
object-hash "^3.0.0"
openai "~4.4.0"
openai "^4.17.0"
openapi-types "^12.1.3"
p-queue "^6.6.2"
p-retry "4"
uuid "^9.0.0"
yaml "^2.2.1"
zod "^3.21.4"
zod "^3.22.3"
zod-to-json-schema "^3.20.4"
langchainhub@~0.0.6:
@@ -20749,10 +20748,10 @@ langchainhub@~0.0.6:
resolved "https://registry.yarnpkg.com/langchainhub/-/langchainhub-0.0.6.tgz#9d2d06e4ce0807b4e8a31e19611f57aef990b54d"
integrity sha512-SW6105T+YP1cTe0yMf//7kyshCgvCTyFBMTgH2H3s9rTAR4e+78DA/BBrUL/Mt4Q5eMWui7iGuAYb3pgGsdQ9w==
langsmith@^0.0.48, langsmith@~0.0.48:
version "0.0.48"
resolved "https://registry.yarnpkg.com/langsmith/-/langsmith-0.0.48.tgz#3a9a8ce257271ddb43d01ebf585c4370a3a3ba79"
integrity sha512-s0hW8iZ90Q9XLTnDK0Pgee245URV3b1cXQjPDj5OKm1+KN7iSK1pKx+4CO7RcFLz58Ixe7Mt+mVcomYqUuryxQ==
dependencies:
"@types/uuid" "^9.0.1"
commander "^10.0.1"
@@ -23174,11 +23173,6 @@ object-hash@^1.3.0, object-hash@^1.3.1:
resolved "https://registry.yarnpkg.com/object-hash/-/object-hash-1.3.1.tgz#fde452098a951cb145f039bb7d455449ddc126df"
integrity sha512-OSuu/pU4ENM9kmREg0BdNrUDIl1heYa4mBZacJc+vVWz4GtAwu7jO8s4AIt2aGRUTqxykpWzI3Oqnsm13tTMDA==
object-identity-map@^1.0.2:
version "1.0.2"
resolved "https://registry.yarnpkg.com/object-identity-map/-/object-identity-map-1.0.2.tgz#2b4213a4285ca3a8cd2e696782c9964f887524e7"
@@ -23385,10 +23379,10 @@ openai@^3.3.0:
axios "^0.26.0"
form-data "^4.0.0"
openai@^4.17.0:
version "4.17.5"
resolved "https://registry.yarnpkg.com/openai/-/openai-4.17.5.tgz#096655741965656ec969731e97d4bce880112d66"
integrity sha512-SDgA933/QOjISCgWRc/JQhY1HweYZ6FOie3bWrCpj09FA5xIlaomldbyzICHNjtkh7SWEmGYFjRHIDtuwr+eTw==
dependencies:
"@types/node" "^18.11.18"
"@types/node-fetch" "^2.6.4"
@@ -23398,6 +23392,7 @@ openai@~4.4.0:
form-data-encoder "1.7.2"
formdata-node "^4.3.2"
node-fetch "^2.6.7"
web-streams-polyfill "^3.2.1"
openapi-types@^10.0.0:
version "10.0.0"
@@ -31377,7 +31372,7 @@ zod-to-json-schema@^3.20.4:
resolved "https://registry.yarnpkg.com/zod-to-json-schema/-/zod-to-json-schema-3.21.4.tgz#de97c5b6d4a25e9d444618486cb55c0c7fb949fd"
integrity sha512-fjUZh4nQ1s6HMccgIeE0VP4QG/YRGPmyjO9sAh890aQKPEk3nqbfUXhMFaC+Dr5KvYBm8BCyvfpZf2jY9aGSsw==
zod@^3.22.3:
version "3.22.3"
resolved "https://registry.yarnpkg.com/zod/-/zod-3.22.3.tgz#2fbc96118b174290d94e8896371c95629e87a060"
integrity sha512-EjIevzuJRiRPbVH4mGc8nApb/lVLKVpmUhAaR5R5doKGfAnGJ6Gr3CViAVjP+4FWSxCsybeWQdcgCtbX+7oZug==