chore: set deployment id for azure embedding (#1322)

* chore: fix audit

* chore: update audit config

* chore: fix azure embedding

* chore: adjust ai config

* fix: do not generate embedding when all chunk content is empty
Nathan.fooo authored on 2025-04-07 14:44:22 +08:00, committed by GitHub
parent 2c1c820e68, commit 3181b17d60
GPG key ID: B5690EEEBB952194 (no known key found for this signature in database)
16 changed files with 115 additions and 56 deletions

Cargo.lock (generated)

@@ -4594,9 +4594,9 @@ dependencies = [

 [[package]]
 name = "openssl"
-version = "0.10.66"
+version = "0.10.72"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1"
+checksum = "fedfea7d58a1f73118430a55da6a286e7b044961736ce96a16a17068ea25e5da"
 dependencies = [
  "bitflags 2.6.0",
  "cfg-if",
@@ -4635,9 +4635,9 @@ dependencies = [

 [[package]]
 name = "openssl-sys"
-version = "0.9.103"
+version = "0.9.107"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6"
+checksum = "8288979acd84749c744a9014b4382d42b8f7b2592847b5afb2ed29e5d16ede07"
 dependencies = [
  "cc",
  "libc",

@@ -1,2 +1,6 @@
 [advisories]
-ignore = ["RUSTSEC-2024-0384"]
+ignore = [
+  "RUSTSEC-2024-0384",
+  "RUSTSEC-2025-0012",
+  "RUSTSEC-2024-0436",
+]

@@ -174,7 +174,17 @@ NGINX_PORT=80
 NGINX_TLS_PORT=443

 # AppFlowy AI
+# Standard OpenAI API:
+# Set your API key here if you are using the standard OpenAI API.
 AI_OPENAI_API_KEY=
+
+# Azure-hosted OpenAI API:
+# If you're using a self-hosted OpenAI API via Azure, leave AI_OPENAI_API_KEY empty
+# and set the following Azure-specific variables instead. If both are set, the standard OpenAI API will be used.
+AI_AZURE_OPENAI_API_KEY=
+AI_AZURE_OPENAI_API_BASE=
+AI_AZURE_OPENAI_API_VERSION=
+
 AI_ANTHROPIC_API_KEY=
 AI_SERVER_PORT=5001
 AI_SERVER_HOST=ai

dev.env

@@ -117,7 +117,17 @@ GF_SECURITY_ADMIN_PASSWORD=password
 CLOUDFLARE_TUNNEL_TOKEN=

 # AppFlowy AI
+# Standard OpenAI API:
+# Set your API key here if you are using the standard OpenAI API.
 AI_OPENAI_API_KEY=
+
+# Azure-hosted OpenAI API:
+# If you're using a self-hosted OpenAI API via Azure, leave AI_OPENAI_API_KEY empty
+# and set the following Azure-specific variables instead. If both are set, the standard OpenAI API will be used.
+AI_AZURE_OPENAI_API_KEY=
+AI_AZURE_OPENAI_API_BASE=
+AI_AZURE_OPENAI_API_VERSION=
+
 AI_ANTHROPIC_API_KEY=
 AI_SERVER_PORT=5001
 AI_SERVER_HOST=localhost
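
For context on how the three Azure variables above get consumed: a minimal sketch of the mapping onto async-openai's AzureConfig, mirroring the azure_open_ai_config change further down. Assumptions: std::env::var stands in for the real get_env_var_opt helper (which also treats empty values as unset), and the example value is a placeholder.

```rust
use async_openai::config::AzureConfig;

fn azure_config_from_env() -> Option<AzureConfig> {
  // e.g. AI_AZURE_OPENAI_API_BASE=https://my-resource.openai.azure.com (placeholder)
  let api_key = std::env::var("AI_AZURE_OPENAI_API_KEY").ok()?;
  let api_base = std::env::var("AI_AZURE_OPENAI_API_BASE").ok()?;
  let api_version = std::env::var("AI_AZURE_OPENAI_API_VERSION").ok()?;
  // The deployment id is intentionally not set here; this commit assigns it
  // later, when the AzureOpenAIEmbedder is constructed.
  Some(
    AzureConfig::new()
      .with_api_key(api_key)
      .with_api_base(api_base)
      .with_api_version(api_version),
  )
}
```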

@@ -231,6 +231,15 @@ pub enum EmbeddingModel {
 }

 impl EmbeddingModel {
+  /// Returns the default embedding model used in this system.
+  ///
+  /// This model is hardcoded and used to generate embeddings whose dimensions are
+  /// reflected in the PostgreSQL database schema. Changing the default model may
+  /// require a migration to create a new table with the appropriate dimensions.
+  pub fn default_model() -> Self {
+    EmbeddingModel::TextEmbedding3Small
+  }
+
   pub fn supported_models() -> &'static [&'static str] {
     &[
      "text-embedding-ada-002",

@@ -9,5 +9,6 @@ edition = "2021"
 serde.workspace = true
 serde_json.workspace = true
 lazy_static = "1.4.0"
+# cannot upgrade to 9.3.1; it's not compatible with the gotrue token
 jsonwebtoken = "8.3.0"
 app-error = { workspace = true, features = ["gotrue_error"] }

@@ -11,7 +11,7 @@ use collab_document::document::DocumentBody;
 use collab_entity::CollabType;
 use database_entity::dto::{AFCollabEmbeddedChunk, AFCollabEmbeddings, EmbeddingContentType};
 use serde_json::json;
-use tracing::{debug, trace};
+use tracing::{debug, trace, warn};
 use twox_hash::xxhash64::Hasher;
 use uuid::Uuid;
@@ -42,6 +42,14 @@ impl Indexer for DocumentIndexer {
     paragraphs: Vec<String>,
     model: EmbeddingModel,
   ) -> Result<Vec<AFCollabEmbeddedChunk>, AppError> {
+    if paragraphs.is_empty() {
+      warn!(
+        "[Embedding] No paragraphs found in document `{}`. Skipping embedding.",
+        object_id
+      );
+      return Ok(vec![]);
+    }
     split_text_into_chunks(object_id, paragraphs, CollabType::Document, model)
   }
@@ -63,7 +71,7 @@ impl Indexer for DocumentIndexer {
       .model(embedder.model().name())
       .input(EmbeddingInput::StringArray(contents))
       .encoding_format(EncodingFormat::Float)
-      .dimensions(EmbeddingModel::TextEmbedding3Small.default_dimensions())
+      .dimensions(EmbeddingModel::default_model().default_dimensions())
       .build()
       .map_err(|err| AppError::Unhandled(err.to_string()))?;

@@ -73,7 +73,7 @@ impl EmbeddingMetrics {
   }

   pub fn record_gen_embedding_time(&self, num: u32, millis: u128) {
-    tracing::info!("[Embedding]: index {} collabs cost: {}ms", num, millis);
+    tracing::trace!("[Embedding]: index {} collabs cost: {}ms", num, millis);
     self.gen_embeddings_time_histogram.observe(millis as f64);
   }
 }

@@ -15,6 +15,7 @@ use database::index::{
   get_collab_embedding_fragment_ids, update_collab_indexed_at, upsert_collab_embeddings,
 };
 use database::workspace::select_workspace_settings;
+use database_entity::dto::AFCollabEmbeddedChunk;
 use infra::env_util::get_env_var;
 use redis::aio::ConnectionManager;
 use serde::{Deserialize, Serialize};
@@ -355,8 +356,9 @@ async fn generate_embeddings_loop(
       }
     }
   }
+
   join_set.spawn(async move {
-    if chunks.is_empty() {
+    if is_collab_embedded_chunks_empty(&chunks) {
       return Ok(None);
     }
@@ -398,7 +400,9 @@ async fn generate_embeddings_loop(
           error!("Failed to send embedding record: {}", err);
         }
       },
-      Ok(None) => debug!("No embedding for collab"),
+      Ok(None) => trace!(
+        "[Embedding] Found existing embeddings; skipping embedding generation for this collab"
+      ),
       Err(err) => {
         metrics.record_failed_embed_count(1);
         warn!(
@@ -429,7 +433,7 @@ pub async fn spawn_pg_write_embeddings(
       break;
     }
-    trace!("[Embedding] received {} embeddings to write", n);
+    trace!("[Embedding] pg received {} embeddings to write", n);
     let start = Instant::now();
     let records = buf.drain(..n).collect::<Vec<_>>();
     for record in records.iter() {
@@ -541,3 +545,9 @@ impl UnindexedData {
     }
   }
 }
+
+/// Returns true when every chunk is missing content (or there are no chunks).
+#[inline]
+pub fn is_collab_embedded_chunks_empty(chunks: &[AFCollabEmbeddedChunk]) -> bool {
+  chunks.iter().all(|chunk| chunk.content.is_none())
+}
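
A note on the new helper: it is deliberately different from the old chunks.is_empty() check. A collab can produce chunks whose content is None, and those should be skipped too. A sketch of the contract, where mk is a hypothetical constructor building an AFCollabEmbeddedChunk with the given optional content (the real struct has more fields, elided here):

```rust
fn demo(mk: impl Fn(Option<String>) -> AFCollabEmbeddedChunk) {
  // No chunks at all: vacuously empty, embedding is skipped.
  assert!(is_collab_embedded_chunks_empty(&[]));
  // Every chunk lacks content: still skipped.
  assert!(is_collab_embedded_chunks_empty(&[mk(None), mk(None)]));
  // A single chunk with content is enough to proceed.
  assert!(!is_collab_embedded_chunks_empty(&[mk(None), mk(Some("text".into()))]));
}
```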

@@ -8,6 +8,7 @@ pub use async_openai::types::{
   EncodingFormat,
 };
 use infra::env_util::get_env_var_opt;
+use tracing::{info, warn};

 #[derive(Debug, Clone)]
 pub enum AFEmbedder {
@@ -27,15 +28,31 @@ impl AFEmbedder {
   }

   pub fn model(&self) -> EmbeddingModel {
-    EmbeddingModel::TextEmbedding3Small
+    EmbeddingModel::default_model()
   }
 }

-pub fn open_ai_config() -> Option<OpenAIConfig> {
+pub fn get_open_ai_config() -> (Option<OpenAIConfig>, Option<AzureConfig>) {
+  let open_ai_config = open_ai_config();
+  let azure_ai_config = azure_open_ai_config();
+  if open_ai_config.is_some() {
+    info!("Using official OpenAI API");
+    if azure_ai_config.is_some() {
+      warn!("Both OpenAI and Azure OpenAI API keys are set. Using OpenAI API.");
+    }
+    return (open_ai_config, None);
+  }
+
+  info!("Using Azure OpenAI API");
+  (None, azure_ai_config)
+}
+
+fn open_ai_config() -> Option<OpenAIConfig> {
   get_env_var_opt("AI_OPENAI_API_KEY").map(|v| OpenAIConfig::default().with_api_key(v))
 }

-pub fn azure_open_ai_config() -> Option<AzureConfig> {
+fn azure_open_ai_config() -> Option<AzureConfig> {
   let azure_open_ai_api_key = get_env_var_opt("AI_AZURE_OPENAI_API_KEY")?;
   let azure_open_ai_api_base = get_env_var_opt("AI_AZURE_OPENAI_API_BASE")?;
   let azure_open_ai_api_version = get_env_var_opt("AI_AZURE_OPENAI_API_VERSION")?;
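
Collapsing the two public getters into get_open_ai_config turns provider selection into a single decision point with explicit precedence. The behavior above, summarized as a hedged sketch (one quirk worth noting: when neither key is set, the function still logs "Using Azure OpenAI API" on its way to returning (None, None)):

```rust
// Resolution matrix implemented by get_open_ai_config():
//
//   AI_OPENAI_API_KEY | AI_AZURE_OPENAI_* | result
//   ------------------+-------------------+------------------------------
//   set               | unset             | (Some(openai), None)
//   set               | set               | (Some(openai), None) + warn
//   unset             | set               | (None, Some(azure))
//   unset             | unset             | (None, None)
fn assert_single_provider() {
  let (openai, azure) = get_open_ai_config();
  // When both keys are set, the Azure config is dropped rather than kept as
  // a fallback, so at most one provider is ever handed to the scheduler.
  assert!(!(openai.is_some() && azure.is_some()));
}
```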

@@ -1,8 +1,10 @@
 use app_error::AppError;
+use appflowy_ai_client::dto::EmbeddingModel;
 use async_openai::config::{AzureConfig, Config, OpenAIConfig};
 use async_openai::types::{CreateEmbeddingRequest, CreateEmbeddingResponse};
 use async_openai::Client;
 use tiktoken_rs::CoreBPE;
+use tracing::trace;

 pub const OPENAI_EMBEDDINGS_URL: &str = "https://api.openai.com/v1/embeddings";
@@ -27,7 +29,9 @@ pub struct AzureOpenAIEmbedder {
 }

 impl AzureOpenAIEmbedder {
-  pub fn new(config: AzureConfig) -> Self {
+  pub fn new(mut config: AzureConfig) -> Self {
+    // Make sure your Azure AI service supports the model
+    config = config.with_deployment_id(EmbeddingModel::default_model().to_string());
     let client = Client::with_config(config);
     Self { client }
   }
@@ -37,6 +41,12 @@ pub async fn async_embed<C: Config>(
   client: &Client<C>,
   request: CreateEmbeddingRequest,
 ) -> Result<CreateEmbeddingResponse, AppError> {
+  trace!(
+    "async embed with request: model:{:?}, dimension:{:?}, api_base:{}",
+    request.model,
+    request.dimensions,
+    client.config().api_base()
+  );
   let response = client
     .embeddings()
     .create(request)
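
The with_deployment_id call above is the fix named in the PR title. Azure OpenAI routes by deployment rather than by model: requests go to {api_base}/openai/deployments/{deployment_id}/embeddings?api-version=..., so the deployment id must be set and the Azure resource must expose a deployment named after the default model. A construction sketch with placeholder values:

```rust
use async_openai::config::AzureConfig;

fn build_azure_embedder() -> AzureOpenAIEmbedder {
  // Placeholder endpoint, version, and key. The Azure resource is assumed to
  // have a deployment named `text-embedding-3-small`, since new() overrides
  // any deployment id on the config with EmbeddingModel::default_model().
  let config = AzureConfig::new()
    .with_api_base("https://my-resource.openai.azure.com") // hypothetical
    .with_api_version("2024-02-15-preview") // hypothetical
    .with_api_key("<azure-api-key>");
  AzureOpenAIEmbedder::new(config)
}
```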

@@ -21,7 +21,7 @@ use axum::response::IntoResponse;
 use axum::routing::get;
 use indexer::metrics::EmbeddingMetrics;
 use indexer::thread_pool::ThreadPoolNoAbortBuilder;
-use indexer::vector::embedder::{azure_open_ai_config, open_ai_config};
+use indexer::vector::embedder::get_open_ai_config;
 use infra::env_util::get_env_var;
 use mailer::sender::Mailer;
 use secrecy::ExposeSecret;
@@ -132,9 +132,7 @@ pub async fn create_app(listener: TcpListener, config: Config) -> Result<(), Err
     .unwrap(),
   );

-  let open_ai_config = open_ai_config();
-  let azure_ai_config = azure_open_ai_config();
+  let (open_ai_config, azure_ai_config) = get_open_ai_config();
   let indexer_config = BackgroundIndexerConfig {
     enable: appflowy_collaborate::config::get_env_var("APPFLOWY_INDEXER_ENABLED", "true")
       .parse::<bool>()

@@ -8,7 +8,9 @@ use indexer::queue::{
   ack_task, default_indexer_group_option, ensure_indexer_consumer_group,
   read_background_embed_tasks,
 };
-use indexer::scheduler::{spawn_pg_write_embeddings, UnindexedCollabTask, UnindexedData};
+use indexer::scheduler::{
+  is_collab_embedded_chunks_empty, spawn_pg_write_embeddings, UnindexedCollabTask, UnindexedData,
+};
 use indexer::thread_pool::ThreadPoolNoAbort;
 use indexer::vector::embedder::{AFEmbedder, AzureConfig, OpenAIConfig};
 use indexer::vector::open_ai;
@@ -195,14 +197,18 @@ async fn process_upcoming_tasks(
       }
     }

     join_set.spawn(async move {
-      let embeddings = indexer.embed(&embedder, chunks).await.ok()?;
-      embeddings.map(|embeddings| EmbeddingRecord {
+      if is_collab_embedded_chunks_empty(&chunks) {
+        return Ok::<_, AppError>(None);
+      }
+
+      let embeddings = indexer.embed(&embedder, chunks).await?;
+      Ok(embeddings.map(|embeddings| EmbeddingRecord {
         workspace_id: task.workspace_id,
         object_id: task.object_id,
         collab_type: task.collab_type,
         tokens_used: embeddings.tokens_consumed,
         contents: embeddings.params,
-      })
+      }))
     });
   }
@@ -210,8 +216,11 @@ async fn process_upcoming_tasks(
   while let Some(Ok(result)) = join_set.join_next().await {
     match result {
-      None => metrics.record_failed_embed_count(1),
-      Some(record) => {
+      Err(_) => {
+        metrics.record_failed_embed_count(1);
+      },
+      Ok(None) => {},
+      Ok(Some(record)) => {
        metrics.record_embed_count(1);
        trace!(
          "[Background Embedding] send {} embedding record to write task",

@@ -45,7 +45,7 @@ use collab_stream::stream_router::{StreamRouter, StreamRouterOptions};
 use database::file::s3_client_impl::{AwsS3BucketClientImpl, S3BucketStorage};
 use indexer::collab_indexer::IndexerProvider;
 use indexer::scheduler::{IndexerConfiguration, IndexerScheduler};
-use indexer::vector::embedder::{azure_open_ai_config, open_ai_config};
+use indexer::vector::embedder::get_open_ai_config;
 use infra::env_util::get_env_var;
 use mailer::sender::Mailer;
 use snowflake::Snowflake;
@@ -299,8 +299,7 @@ pub async fn init_state(config: &Config, rt_cmd_tx: CLCommandSender) -> Result<A
   let mailer = get_mailer(&config.mailer).await?;

   info!("Setting up Indexer scheduler...");
-  let open_ai_config = open_ai_config();
-  let azure_ai_config = azure_open_ai_config();
+  let (open_ai_config, azure_ai_config) = get_open_ai_config();
   let embedder_config = IndexerConfiguration {
     enable: get_env_var("APPFLOWY_INDEXER_ENABLED", "true")
       .parse::<bool>()

@@ -80,10 +80,10 @@ pub async fn search_document(
   metrics: &RequestMetrics,
 ) -> Result<Vec<SearchDocumentResponseItem>, AppError> {
   let embeddings_request = CreateEmbeddingRequestArgs::default()
-    .model(EmbeddingModel::TextEmbedding3Small.to_string())
+    .model(EmbeddingModel::default_model().to_string())
     .input(EmbeddingInput::String(request.query.clone()))
     .encoding_format(EncodingFormat::Float)
-    .dimensions(EmbeddingModel::TextEmbedding3Small.default_dimensions())
+    .dimensions(EmbeddingModel::default_model().default_dimensions())
     .build()
     .map_err(|err| AppError::Unhandled(err.to_string()))?;

@@ -1,26 +0,0 @@
-use client_api_test::{local_ai_test_enabled, TestClient};
-
-#[tokio::test]
-async fn get_local_ai_config_test() {
-  if !local_ai_test_enabled() {
-    return;
-  }
-
-  let test_client = TestClient::new_user().await;
-  let workspace_id = test_client.workspace_id().await;
-  let config = test_client
-    .api_client
-    .get_local_ai_config(&workspace_id, "macos")
-    .await
-    .unwrap();
-  {
-    assert!(!config.models.is_empty());
-    assert!(!config.models[0].embedding_model.download_url.is_empty());
-    assert!(config.models[0].embedding_model.file_size > 10);
-    assert!(!config.models[0].chat_model.download_url.is_empty());
-    assert!(config.models[0].chat_model.file_size > 10);
-    assert!(!config.plugin.version.is_empty());
-    assert!(!config.plugin.url.is_empty());
-    println!("config: {:?}", config);
-  }
-}