Mirror of https://github.com/AppFlowy-IO/AppFlowy-Cloud.git, synced 2025-04-17 18:44:42 -04:00
chore: set deployment id for azure embedding (#1322)
* chore: fix audit
* chore: update audit config
* chore: fix azure embedding
* chore: adjust ai config
* fix: do not generate embedding when all chunk content is empty
This commit is contained in: parent 2c1c820e68, commit 3181b17d60
16 changed files with 115 additions and 56 deletions
Cargo.lock (generated, 8 changes)

@@ -4594,9 +4594,9 @@ dependencies = [
 
 [[package]]
 name = "openssl"
-version = "0.10.66"
+version = "0.10.72"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1"
+checksum = "fedfea7d58a1f73118430a55da6a286e7b044961736ce96a16a17068ea25e5da"
 dependencies = [
  "bitflags 2.6.0",
  "cfg-if",
@@ -4635,9 +4635,9 @@ dependencies = [
 
 [[package]]
 name = "openssl-sys"
-version = "0.9.103"
+version = "0.9.107"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6"
+checksum = "8288979acd84749c744a9014b4382d42b8f7b2592847b5afb2ed29e5d16ede07"
 dependencies = [
  "cc",
  "libc",
Audit config:

@@ -1,2 +1,6 @@
 [advisories]
-ignore = ["RUSTSEC-2024-0384"]
+ignore = [
+  "RUSTSEC-2024-0384",
+  "RUSTSEC-2025-0012",
+  "RUSTSEC-2024-0436",
+]
deploy.env (10 changes)

@@ -174,7 +174,17 @@ NGINX_PORT=80
 NGINX_TLS_PORT=443
 
 # AppFlowy AI
+# Standard OpenAI API:
+# Set your API key here if you are using the standard OpenAI API.
 AI_OPENAI_API_KEY=
+
+# Azure-hosted OpenAI API:
+# If you're using a self-hosted OpenAI API via Azure, leave AI_OPENAI_API_KEY empty
+# and set the following Azure-specific variables instead. If both are set, the standard OpenAI API will be used.
+AI_AZURE_OPENAI_API_KEY=
+AI_AZURE_OPENAI_API_BASE=
+AI_AZURE_OPENAI_API_VERSION=
+
 AI_ANTHROPIC_API_KEY=
 AI_SERVER_PORT=5001
 AI_SERVER_HOST=ai
dev.env (10 changes)

@@ -117,7 +117,17 @@ GF_SECURITY_ADMIN_PASSWORD=password
 CLOUDFLARE_TUNNEL_TOKEN=
 
 # AppFlowy AI
+# Standard OpenAI API:
+# Set your API key here if you are using the standard OpenAI API.
 AI_OPENAI_API_KEY=
+
+# Azure-hosted OpenAI API:
+# If you're using a self-hosted OpenAI API via Azure, leave AI_OPENAI_API_KEY empty
+# and set the following Azure-specific variables instead. If both are set, the standard OpenAI API will be used.
+AI_AZURE_OPENAI_API_KEY=
+AI_AZURE_OPENAI_API_BASE=
+AI_AZURE_OPENAI_API_VERSION=
+
 AI_ANTHROPIC_API_KEY=
 AI_SERVER_PORT=5001
 AI_SERVER_HOST=localhost
@@ -231,6 +231,15 @@ pub enum EmbeddingModel {
 }
 
 impl EmbeddingModel {
+  /// Returns the default embedding model used in this system.
+  ///
+  /// This model is hardcoded and used to generate embeddings whose dimensions are
+  /// reflected in the PostgreSQL database schema. Changing the default model may
+  /// require a migration to create a new table with the appropriate dimensions.
+  pub fn default_model() -> Self {
+    EmbeddingModel::TextEmbedding3Small
+  }
+
   pub fn supported_models() -> &'static [&'static str] {
     &[
       "text-embedding-ada-002",
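The doc comment above is the heart of this change: the embedding dimension is baked into the PostgreSQL schema, so the model choice must live in exactly one place. A minimal self-contained sketch of the pattern, assuming text-embedding-3-small's default of 1536 dimensions (the real enum has more variants and helpers):

// Simplified stand-in for the EmbeddingModel enum in this commit.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum EmbeddingModel {
  TextEmbedding3Small,
}

impl EmbeddingModel {
  // Single source of truth; everything else derives the model from here.
  fn default_model() -> Self {
    EmbeddingModel::TextEmbedding3Small
  }

  // Assumption: text-embedding-3-small produces 1536-dimensional vectors by default.
  fn default_dimensions(self) -> u32 {
    match self {
      EmbeddingModel::TextEmbedding3Small => 1536,
    }
  }
}

fn main() {
  // Call sites no longer hard-code a concrete variant.
  let model = EmbeddingModel::default_model();
  assert_eq!(model.default_dimensions(), 1536);
}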
@@ -9,5 +9,6 @@ edition = "2021"
 serde.workspace = true
 serde_json.workspace = true
 lazy_static = "1.4.0"
+# cannot upgrade to 9.3.1; it's not compatible with the gotrue token
 jsonwebtoken = "8.3.0"
 app-error = { workspace = true, features = ["gotrue_error"] }
@@ -11,7 +11,7 @@ use collab_document::document::DocumentBody;
 use collab_entity::CollabType;
 use database_entity::dto::{AFCollabEmbeddedChunk, AFCollabEmbeddings, EmbeddingContentType};
 use serde_json::json;
-use tracing::{debug, trace};
+use tracing::{debug, trace, warn};
 use twox_hash::xxhash64::Hasher;
 use uuid::Uuid;
@@ -42,6 +42,14 @@ impl Indexer for DocumentIndexer {
     paragraphs: Vec<String>,
     model: EmbeddingModel,
   ) -> Result<Vec<AFCollabEmbeddedChunk>, AppError> {
+    if paragraphs.is_empty() {
+      warn!(
+        "[Embedding] No paragraphs found in document `{}`. Skipping embedding.",
+        object_id
+      );
+
+      return Ok(vec![]);
+    }
     split_text_into_chunks(object_id, paragraphs, CollabType::Document, model)
   }
 
@@ -63,7 +71,7 @@ impl Indexer for DocumentIndexer {
       .model(embedder.model().name())
       .input(EmbeddingInput::StringArray(contents))
       .encoding_format(EncodingFormat::Float)
-      .dimensions(EmbeddingModel::TextEmbedding3Small.default_dimensions())
+      .dimensions(EmbeddingModel::default_model().default_dimensions())
       .build()
       .map_err(|err| AppError::Unhandled(err.to_string()))?;
@@ -73,7 +73,7 @@ impl EmbeddingMetrics {
   }
 
   pub fn record_gen_embedding_time(&self, num: u32, millis: u128) {
-    tracing::info!("[Embedding]: index {} collabs cost: {}ms", num, millis);
+    tracing::trace!("[Embedding]: index {} collabs cost: {}ms", num, millis);
     self.gen_embeddings_time_histogram.observe(millis as f64);
   }
 }
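The only functional change here is demoting a per-batch log line from info to trace; the histogram write is untouched. For orientation, a hedged sketch of how such a metric is typically wired with the prometheus crate (the metric name and help text are illustrative placeholders, not the actual names in AppFlowy-Cloud):

use prometheus::{Histogram, HistogramOpts, Registry};

struct EmbeddingMetrics {
  gen_embeddings_time_histogram: Histogram,
}

impl EmbeddingMetrics {
  fn register(registry: &Registry) -> prometheus::Result<Self> {
    // Placeholder metric name/help for illustration.
    let histogram = Histogram::with_opts(HistogramOpts::new(
      "gen_embeddings_time_ms",
      "Time spent generating embeddings, in milliseconds",
    ))?;
    registry.register(Box::new(histogram.clone()))?;
    Ok(Self { gen_embeddings_time_histogram: histogram })
  }

  fn record_gen_embedding_time(&self, num: u32, millis: u128) {
    // trace! instead of info!: this fires on every batch, so info was noisy.
    tracing::trace!("[Embedding]: index {} collabs cost: {}ms", num, millis);
    self.gen_embeddings_time_histogram.observe(millis as f64);
  }
}

fn main() -> prometheus::Result<()> {
  let registry = Registry::new();
  let metrics = EmbeddingMetrics::register(&registry)?;
  metrics.record_gen_embedding_time(3, 120);
  Ok(())
}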
@@ -15,6 +15,7 @@ use database::index::{
   get_collab_embedding_fragment_ids, update_collab_indexed_at, upsert_collab_embeddings,
 };
 use database::workspace::select_workspace_settings;
+use database_entity::dto::AFCollabEmbeddedChunk;
 use infra::env_util::get_env_var;
 use redis::aio::ConnectionManager;
 use serde::{Deserialize, Serialize};
@@ -355,8 +356,9 @@ async fn generate_embeddings_loop(
       }
     }
   }
+
   join_set.spawn(async move {
-    if chunks.is_empty() {
+    if is_collab_embedded_chunks_empty(&chunks) {
       return Ok(None);
     }
 
@@ -398,7 +400,9 @@ async fn generate_embeddings_loop(
           error!("Failed to send embedding record: {}", err);
         }
       },
-      Ok(None) => debug!("No embedding for collab"),
+      Ok(None) => trace!(
+        "[Embedding] Found existing embeddings; skipping embedding generation for collab"
+      ),
       Err(err) => {
         metrics.record_failed_embed_count(1);
         warn!(
@@ -429,7 +433,7 @@ pub async fn spawn_pg_write_embeddings(
       break;
     }
 
-    trace!("[Embedding] received {} embeddings to write", n);
+    trace!("[Embedding] pg received {} embeddings to write", n);
    let start = Instant::now();
    let records = buf.drain(..n).collect::<Vec<_>>();
    for record in records.iter() {
@@ -541,3 +545,9 @@ impl UnindexedData {
     }
   }
 }
+
+#[inline]
+/// All chunks are empty if all of them have no content
+pub fn is_collab_embedded_chunks_empty(chunks: &[AFCollabEmbeddedChunk]) -> bool {
+  chunks.iter().all(|chunk| chunk.content.is_none())
+}
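The new helper exists because a chunk list can be non-empty while every chunk's content is None; in this commit's flow that appears to happen when fragments already have stored embeddings, so chunks.is_empty() alone is not a sufficient guard. A self-contained sketch with a hypothetical stand-in for AFCollabEmbeddedChunk:

// Hypothetical stand-in: the real AFCollabEmbeddedChunk has more fields,
// but the guard only inspects `content`.
struct EmbeddedChunk {
  content: Option<String>,
}

/// All chunks are "empty" if none of them carries content to embed.
fn is_chunks_empty(chunks: &[EmbeddedChunk]) -> bool {
  chunks.iter().all(|chunk| chunk.content.is_none())
}

fn main() {
  // An empty slice is vacuously empty, so it is also skipped.
  assert!(is_chunks_empty(&[]));
  assert!(is_chunks_empty(&[EmbeddedChunk { content: None }]));
  assert!(!is_chunks_empty(&[
    EmbeddedChunk { content: None },
    EmbeddedChunk { content: Some("paragraph text".into()) },
  ]));
}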
@@ -8,6 +8,7 @@ pub use async_openai::types::{
   EncodingFormat,
 };
 use infra::env_util::get_env_var_opt;
+use tracing::{info, warn};
 
 #[derive(Debug, Clone)]
 pub enum AFEmbedder {
@@ -27,15 +28,31 @@ impl AFEmbedder {
   }
 
   pub fn model(&self) -> EmbeddingModel {
-    EmbeddingModel::TextEmbedding3Small
+    EmbeddingModel::default_model()
   }
 }
 
-pub fn open_ai_config() -> Option<OpenAIConfig> {
+pub fn get_open_ai_config() -> (Option<OpenAIConfig>, Option<AzureConfig>) {
+  let open_ai_config = open_ai_config();
+  let azure_ai_config = azure_open_ai_config();
+
+  if open_ai_config.is_some() {
+    info!("Using official OpenAI API");
+    if azure_ai_config.is_some() {
+      warn!("Both OpenAI and Azure OpenAI API keys are set. Using OpenAI API.");
+    }
+    return (open_ai_config, None);
+  }
+
+  info!("Using Azure OpenAI API");
+  (None, azure_ai_config)
+}
+
+fn open_ai_config() -> Option<OpenAIConfig> {
   get_env_var_opt("AI_OPENAI_API_KEY").map(|v| OpenAIConfig::default().with_api_key(v))
 }
 
-pub fn azure_open_ai_config() -> Option<AzureConfig> {
+fn azure_open_ai_config() -> Option<AzureConfig> {
   let azure_open_ai_api_key = get_env_var_opt("AI_AZURE_OPENAI_API_KEY")?;
   let azure_open_ai_api_base = get_env_var_opt("AI_AZURE_OPENAI_API_BASE")?;
   let azure_open_ai_api_version = get_env_var_opt("AI_AZURE_OPENAI_API_VERSION")?;
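get_open_ai_config() folds the two former getters into a single decision point and encodes the precedence documented in deploy.env and dev.env: a standard OpenAI key wins, Azure is the fallback. A minimal sketch of that rule with placeholder config types (the real function reads AI_OPENAI_API_KEY and the AI_AZURE_OPENAI_* variables via get_env_var_opt):

// Placeholder stand-ins for async_openai's OpenAIConfig and AzureConfig.
#[derive(Debug)]
struct OpenAIConfig;
#[derive(Debug)]
struct AzureConfig;

// Mirrors get_open_ai_config(): at most one backend is returned, and the
// standard OpenAI API takes precedence when both are configured.
fn resolve(
  openai_key: Option<&str>,
  azure_key: Option<&str>,
) -> (Option<OpenAIConfig>, Option<AzureConfig>) {
  if openai_key.is_some() {
    if azure_key.is_some() {
      eprintln!("Both OpenAI and Azure keys set; using OpenAI.");
    }
    return (Some(OpenAIConfig), None);
  }
  (None, azure_key.map(|_| AzureConfig))
}

fn main() {
  let (openai, azure) = resolve(Some("sk-..."), Some("azure-key"));
  assert!(openai.is_some() && azure.is_none());

  let (openai, azure) = resolve(None, Some("azure-key"));
  assert!(openai.is_none() && azure.is_some());
}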
@@ -1,8 +1,10 @@
 use app_error::AppError;
+use appflowy_ai_client::dto::EmbeddingModel;
 use async_openai::config::{AzureConfig, Config, OpenAIConfig};
 use async_openai::types::{CreateEmbeddingRequest, CreateEmbeddingResponse};
 use async_openai::Client;
 use tiktoken_rs::CoreBPE;
+use tracing::trace;
 
 pub const OPENAI_EMBEDDINGS_URL: &str = "https://api.openai.com/v1/embeddings";
@@ -27,7 +29,9 @@ pub struct AzureOpenAIEmbedder {
 }
 
 impl AzureOpenAIEmbedder {
-  pub fn new(config: AzureConfig) -> Self {
+  pub fn new(mut config: AzureConfig) -> Self {
+    // Make sure your Azure AI service supports the model
+    config = config.with_deployment_id(EmbeddingModel::default_model().to_string());
     let client = Client::with_config(config);
     Self { client }
   }
@@ -37,6 +41,12 @@ pub async fn async_embed<C: Config>(
   client: &Client<C>,
   request: CreateEmbeddingRequest,
 ) -> Result<CreateEmbeddingResponse, AppError> {
+  trace!(
+    "async embed with request: model:{:?}, dimension:{:?}, api_base:{}",
+    request.model,
+    request.dimensions,
+    client.config().api_base()
+  );
   let response = client
     .embeddings()
     .create(request)
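This hunk is the fix the commit title names: Azure OpenAI routes requests by deployment id rather than by the request's model field, so the client now pins the deployment to the default embedding model, and the Azure resource must have a deployment with that name. A hedged sketch of the same configuration using async-openai's AzureConfig builder, with placeholder endpoint, key, and API version:

use async_openai::config::AzureConfig;
use async_openai::Client;

fn build_azure_embedding_client() -> Client<AzureConfig> {
  let config = AzureConfig::new()
    // Placeholders: substitute your Azure resource endpoint, key, and version.
    .with_api_base("https://my-resource.openai.azure.com")
    .with_api_key("my-azure-api-key")
    .with_api_version("2024-02-15-preview")
    // The deployment id must name a deployment backed by the same model the
    // rest of the system assumes (text-embedding-3-small here).
    .with_deployment_id("text-embedding-3-small");
  Client::with_config(config)
}

fn main() {
  let _client = build_azure_embedding_client();
}

Before this change no deployment id was set, so embedding requests against Azure could target the wrong deployment, or none at all, which is presumably why Azure embedding was broken.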
@@ -21,7 +21,7 @@ use axum::response::IntoResponse;
 use axum::routing::get;
 use indexer::metrics::EmbeddingMetrics;
 use indexer::thread_pool::ThreadPoolNoAbortBuilder;
-use indexer::vector::embedder::{azure_open_ai_config, open_ai_config};
+use indexer::vector::embedder::get_open_ai_config;
 use infra::env_util::get_env_var;
 use mailer::sender::Mailer;
 use secrecy::ExposeSecret;
@@ -132,9 +132,7 @@ pub async fn create_app(listener: TcpListener, config: Config) -> Result<(), Err
       .unwrap(),
   );
 
-  let open_ai_config = open_ai_config();
-  let azure_ai_config = azure_open_ai_config();
-
+  let (open_ai_config, azure_ai_config) = get_open_ai_config();
   let indexer_config = BackgroundIndexerConfig {
     enable: appflowy_collaborate::config::get_env_var("APPFLOWY_INDEXER_ENABLED", "true")
       .parse::<bool>()
@@ -8,7 +8,9 @@ use indexer::queue::{
   ack_task, default_indexer_group_option, ensure_indexer_consumer_group,
   read_background_embed_tasks,
 };
-use indexer::scheduler::{spawn_pg_write_embeddings, UnindexedCollabTask, UnindexedData};
+use indexer::scheduler::{
+  is_collab_embedded_chunks_empty, spawn_pg_write_embeddings, UnindexedCollabTask, UnindexedData,
+};
 use indexer::thread_pool::ThreadPoolNoAbort;
 use indexer::vector::embedder::{AFEmbedder, AzureConfig, OpenAIConfig};
 use indexer::vector::open_ai;
@@ -195,14 +197,18 @@ async fn process_upcoming_tasks(
       }
     }
     join_set.spawn(async move {
-      let embeddings = indexer.embed(&embedder, chunks).await.ok()?;
-      embeddings.map(|embeddings| EmbeddingRecord {
+      if is_collab_embedded_chunks_empty(&chunks) {
+        return Ok::<_, AppError>(None);
+      }
+
+      let embeddings = indexer.embed(&embedder, chunks).await?;
+      Ok(embeddings.map(|embeddings| EmbeddingRecord {
        workspace_id: task.workspace_id,
        object_id: task.object_id,
        collab_type: task.collab_type,
        tokens_used: embeddings.tokens_consumed,
        contents: embeddings.params,
-      })
+      }))
    });
  }
 }
@@ -210,8 +216,11 @@ async fn process_upcoming_tasks(
 
   while let Some(Ok(result)) = join_set.join_next().await {
     match result {
-      None => metrics.record_failed_embed_count(1),
-      Some(record) => {
+      Err(_) => {
+        metrics.record_failed_embed_count(1);
+      },
+      Ok(None) => {},
+      Ok(Some(record)) => {
         metrics.record_embed_count(1);
         trace!(
           "[Background Embedding] send {} embedding record to write task",
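Switching the spawned task's return type from Option to Result<Option<_>, AppError> is what makes the three-way match above possible: failures, legitimate skips, and real records are no longer conflated. A small self-contained sketch of the pattern with tokio's JoinSet and simplified types:

use tokio::task::JoinSet;

#[derive(Debug)]
struct Record(u32);

#[tokio::main]
async fn main() {
  let mut join_set: JoinSet<Result<Option<Record>, String>> = JoinSet::new();
  join_set.spawn(async { Err("embedder unavailable".to_string()) }); // failure
  join_set.spawn(async { Ok(None) });                                // nothing to embed
  join_set.spawn(async { Ok(Some(Record(42))) });                    // real work

  while let Some(Ok(result)) = join_set.join_next().await {
    match result {
      // Before this commit, failure and "no work" were both None.
      Err(err) => eprintln!("failed: {err}"), // record_failed_embed_count(1)
      Ok(None) => {}                          // skip silently
      Ok(Some(record)) => println!("write {record:?}"), // record_embed_count(1)
    }
  }
}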
@@ -45,7 +45,7 @@ use collab_stream::stream_router::{StreamRouter, StreamRouterOptions};
 use database::file::s3_client_impl::{AwsS3BucketClientImpl, S3BucketStorage};
 use indexer::collab_indexer::IndexerProvider;
 use indexer::scheduler::{IndexerConfiguration, IndexerScheduler};
-use indexer::vector::embedder::{azure_open_ai_config, open_ai_config};
+use indexer::vector::embedder::get_open_ai_config;
 use infra::env_util::get_env_var;
 use mailer::sender::Mailer;
 use snowflake::Snowflake;
@@ -299,8 +299,7 @@ pub async fn init_state(config: &Config, rt_cmd_tx: CLCommandSender) -> Result<A
   let mailer = get_mailer(&config.mailer).await?;
 
   info!("Setting up Indexer scheduler...");
-  let open_ai_config = open_ai_config();
-  let azure_ai_config = azure_open_ai_config();
+  let (open_ai_config, azure_ai_config) = get_open_ai_config();
   let embedder_config = IndexerConfiguration {
     enable: get_env_var("APPFLOWY_INDEXER_ENABLED", "true")
       .parse::<bool>()
@@ -80,10 +80,10 @@ pub async fn search_document(
   metrics: &RequestMetrics,
 ) -> Result<Vec<SearchDocumentResponseItem>, AppError> {
   let embeddings_request = CreateEmbeddingRequestArgs::default()
-    .model(EmbeddingModel::TextEmbedding3Small.to_string())
+    .model(EmbeddingModel::default_model().to_string())
     .input(EmbeddingInput::String(request.query.clone()))
     .encoding_format(EncodingFormat::Float)
-    .dimensions(EmbeddingModel::TextEmbedding3Small.default_dimensions())
+    .dimensions(EmbeddingModel::default_model().default_dimensions())
     .build()
     .map_err(|err| AppError::Unhandled(err.to_string()))?;
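For reference, a runnable sketch of the same builder pattern against the plain OpenAI backend. The API key and query are placeholders, and collapsing errors into anyhow is an assumption for brevity, not this crate's choice:

use async_openai::config::OpenAIConfig;
use async_openai::types::{CreateEmbeddingRequestArgs, EmbeddingInput, EncodingFormat};
use async_openai::Client;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
  // Placeholder key; in AppFlowy-Cloud this comes from AI_OPENAI_API_KEY.
  let client = Client::with_config(OpenAIConfig::default().with_api_key("sk-placeholder"));

  let request = CreateEmbeddingRequestArgs::default()
    .model("text-embedding-3-small")
    .input(EmbeddingInput::String("how do I share a workspace?".to_string()))
    .encoding_format(EncodingFormat::Float)
    .dimensions(1536u32)
    .build()?;

  let response = client.embeddings().create(request).await?;
  println!("got {} embedding(s)", response.data.len());
  Ok(())
}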
Removed test:

@@ -1,26 +0,0 @@
-use client_api_test::{local_ai_test_enabled, TestClient};
-
-#[tokio::test]
-async fn get_local_ai_config_test() {
-  if !local_ai_test_enabled() {
-    return;
-  }
-  let test_client = TestClient::new_user().await;
-  let workspace_id = test_client.workspace_id().await;
-  let config = test_client
-    .api_client
-    .get_local_ai_config(&workspace_id, "macos")
-    .await
-    .unwrap();
-  {
-    assert!(!config.models.is_empty());
-    assert!(!config.models[0].embedding_model.download_url.is_empty());
-    assert!(config.models[0].embedding_model.file_size > 10);
-    assert!(!config.models[0].chat_model.download_url.is_empty());
-    assert!(config.models[0].chat_model.file_size > 10);
-
-    assert!(!config.plugin.version.is_empty());
-    assert!(!config.plugin.url.is_empty());
-    println!("config: {:?}", config);
-  }
-}