Document paragraphs (#1119)

* chore: create embeddings by paragraphs

* chore: use document paragraphs method

* chore: document indexing by paragraphs with consistent hash

* chore: compare produced embeddings against existing ones

* chore: make the pg stored proc compare input fragments against existing embedded fragments

* chore: missing sqlx generation

* fix: appflowy worker

* chore: make sure that embeddings are only changed when the content has changed

* chore: remove partition key and recreate af_collab_embeddings_upsert migration

* chore: use pg15 on CI and update af_collab_embeddings table primary key

* chore: fix test
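
The core idea: a document is now split into paragraphs, paragraphs are grouped into chunks of at most ~8000 characters, and each chunk's fragment_id is the xxHash64 of its content. Because the id is content-addressed, re-indexing an unchanged document reproduces the same fragment ids, and those fragments can be skipped instead of re-embedded. A minimal, self-contained sketch of that chunking step (illustrative helper name; the actual implementation is group_paragraphs_by_max_content_len plus split_text_into_chunks in the indexer crate below, and it assumes the twox-hash crate added in this PR):

use twox_hash::xxhash64::Hasher;

// Group paragraphs into chunks no longer than `max_len` characters, then derive a
// content-addressed fragment id for each chunk. Identical content always yields the
// same id, which is what lets the upsert path detect unchanged fragments.
fn chunk_with_fragment_ids(paragraphs: Vec<String>, max_len: usize) -> Vec<(String, String)> {
  let mut chunks: Vec<String> = Vec::new();
  let mut current = String::new();
  for paragraph in paragraphs {
    if !current.is_empty() && current.len() + paragraph.len() > max_len {
      // current chunk is full: push it and start a new one
      chunks.push(std::mem::take(&mut current));
    }
    current.push_str(&paragraph);
  }
  if !current.is_empty() {
    chunks.push(current);
  }
  chunks
    .into_iter()
    .map(|chunk| {
      let fragment_id = format!("{:x}", Hasher::oneshot(0, chunk.as_bytes()));
      (fragment_id, chunk)
    })
    .collect()
}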

---------

Co-authored-by: Nathan <nathan@appflowy.io>
Bartosz Sypytkowski 2025-04-06 11:47:02 +02:00 committed by GitHub
parent b94d1c60b3
commit 1fd900d994
25 changed files with 601 additions and 408 deletions


@ -0,0 +1,28 @@
{
"db_name": "PostgreSQL",
"query": "\n SELECT oid, fragment_id\n FROM af_collab_embeddings\n WHERE oid = ANY($1::uuid[])\n ",
"describe": {
"columns": [
{
"ordinal": 0,
"name": "oid",
"type_info": "Uuid"
},
{
"ordinal": 1,
"name": "fragment_id",
"type_info": "Text"
}
],
"parameters": {
"Left": [
"UuidArray"
]
},
"nullable": [
false,
false
]
},
"hash": "90afca9cc8b6d4ca31e8ddf1ce466411b5034639df91b739f5cbe2af0ffb6811"
}

Cargo.lock (generated)

@@ -1888,7 +1888,7 @@ dependencies = [
 [[package]]
 name = "collab"
 version = "0.2.0"
-source = "git+https://github.com/AppFlowy-IO/AppFlowy-Collab?rev=80d1c6147d1139289c2eaadab40557cc86c0f4b6#80d1c6147d1139289c2eaadab40557cc86c0f4b6"
+source = "git+https://github.com/AppFlowy-IO/AppFlowy-Collab?rev=3b1deca704cc1d8ae4fdc9cb053d7da824d0b85b#3b1deca704cc1d8ae4fdc9cb053d7da824d0b85b"
 dependencies = [
  "anyhow",
  "arc-swap",
@@ -1913,7 +1913,7 @@ dependencies = [
 [[package]]
 name = "collab-database"
 version = "0.2.0"
-source = "git+https://github.com/AppFlowy-IO/AppFlowy-Collab?rev=80d1c6147d1139289c2eaadab40557cc86c0f4b6#80d1c6147d1139289c2eaadab40557cc86c0f4b6"
+source = "git+https://github.com/AppFlowy-IO/AppFlowy-Collab?rev=3b1deca704cc1d8ae4fdc9cb053d7da824d0b85b#3b1deca704cc1d8ae4fdc9cb053d7da824d0b85b"
 dependencies = [
  "anyhow",
  "async-trait",
@@ -1953,7 +1953,7 @@ dependencies = [
 [[package]]
 name = "collab-document"
 version = "0.2.0"
-source = "git+https://github.com/AppFlowy-IO/AppFlowy-Collab?rev=80d1c6147d1139289c2eaadab40557cc86c0f4b6#80d1c6147d1139289c2eaadab40557cc86c0f4b6"
+source = "git+https://github.com/AppFlowy-IO/AppFlowy-Collab?rev=3b1deca704cc1d8ae4fdc9cb053d7da824d0b85b#3b1deca704cc1d8ae4fdc9cb053d7da824d0b85b"
 dependencies = [
  "anyhow",
  "arc-swap",
@@ -1974,7 +1974,7 @@ dependencies = [
 [[package]]
 name = "collab-entity"
 version = "0.2.0"
-source = "git+https://github.com/AppFlowy-IO/AppFlowy-Collab?rev=80d1c6147d1139289c2eaadab40557cc86c0f4b6#80d1c6147d1139289c2eaadab40557cc86c0f4b6"
+source = "git+https://github.com/AppFlowy-IO/AppFlowy-Collab?rev=3b1deca704cc1d8ae4fdc9cb053d7da824d0b85b#3b1deca704cc1d8ae4fdc9cb053d7da824d0b85b"
 dependencies = [
  "anyhow",
  "bytes",
@@ -1994,7 +1994,7 @@ dependencies = [
 [[package]]
 name = "collab-folder"
 version = "0.2.0"
-source = "git+https://github.com/AppFlowy-IO/AppFlowy-Collab?rev=80d1c6147d1139289c2eaadab40557cc86c0f4b6#80d1c6147d1139289c2eaadab40557cc86c0f4b6"
+source = "git+https://github.com/AppFlowy-IO/AppFlowy-Collab?rev=3b1deca704cc1d8ae4fdc9cb053d7da824d0b85b#3b1deca704cc1d8ae4fdc9cb053d7da824d0b85b"
 dependencies = [
  "anyhow",
  "arc-swap",
@@ -2016,7 +2016,7 @@ dependencies = [
 [[package]]
 name = "collab-importer"
 version = "0.1.0"
-source = "git+https://github.com/AppFlowy-IO/AppFlowy-Collab?rev=80d1c6147d1139289c2eaadab40557cc86c0f4b6#80d1c6147d1139289c2eaadab40557cc86c0f4b6"
+source = "git+https://github.com/AppFlowy-IO/AppFlowy-Collab?rev=3b1deca704cc1d8ae4fdc9cb053d7da824d0b85b#3b1deca704cc1d8ae4fdc9cb053d7da824d0b85b"
 dependencies = [
  "anyhow",
  "async-recursion",
@@ -2124,7 +2124,7 @@ dependencies = [
 [[package]]
 name = "collab-user"
 version = "0.2.0"
-source = "git+https://github.com/AppFlowy-IO/AppFlowy-Collab?rev=80d1c6147d1139289c2eaadab40557cc86c0f4b6#80d1c6147d1139289c2eaadab40557cc86c0f4b6"
+source = "git+https://github.com/AppFlowy-IO/AppFlowy-Collab?rev=3b1deca704cc1d8ae4fdc9cb053d7da824d0b85b#3b1deca704cc1d8ae4fdc9cb053d7da824d0b85b"
 dependencies = [
  "anyhow",
  "collab",
@@ -3828,8 +3828,6 @@ dependencies = [
  "collab",
  "collab-document",
  "collab-entity",
- "collab-folder",
- "collab-stream",
  "database",
  "database-entity",
  "futures-util",
@@ -3846,7 +3844,7 @@ dependencies = [
  "tiktoken-rs",
  "tokio",
  "tracing",
- "unicode-segmentation",
+ "twox-hash",
  "ureq",
  "uuid",
 ]
@@ -7412,6 +7410,15 @@ dependencies = [
  "utf-8",
 ]

+[[package]]
+name = "twox-hash"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7b17f197b3050ba473acf9181f7b1d3b66d1cf7356c6cc57886662276e65908"
+dependencies = [
+ "rand 0.8.5",
+]
+
 [[package]]
 name = "typenum"
 version = "1.17.0"


@@ -303,13 +303,13 @@ lto = false
 [patch.crates-io]
 # It's diffcult to resovle different version with the same crate used in AppFlowy Frontend and the Client-API crate.
 # So using patch to workaround this issue.
-collab = { git = "https://github.com/AppFlowy-IO/AppFlowy-Collab", rev = "80d1c6147d1139289c2eaadab40557cc86c0f4b6" }
-collab-entity = { git = "https://github.com/AppFlowy-IO/AppFlowy-Collab", rev = "80d1c6147d1139289c2eaadab40557cc86c0f4b6" }
-collab-folder = { git = "https://github.com/AppFlowy-IO/AppFlowy-Collab", rev = "80d1c6147d1139289c2eaadab40557cc86c0f4b6" }
-collab-document = { git = "https://github.com/AppFlowy-IO/AppFlowy-Collab", rev = "80d1c6147d1139289c2eaadab40557cc86c0f4b6" }
-collab-user = { git = "https://github.com/AppFlowy-IO/AppFlowy-Collab", rev = "80d1c6147d1139289c2eaadab40557cc86c0f4b6" }
-collab-database = { git = "https://github.com/AppFlowy-IO/AppFlowy-Collab", rev = "80d1c6147d1139289c2eaadab40557cc86c0f4b6" }
-collab-importer = { git = "https://github.com/AppFlowy-IO/AppFlowy-Collab", rev = "80d1c6147d1139289c2eaadab40557cc86c0f4b6" }
+collab = { git = "https://github.com/AppFlowy-IO/AppFlowy-Collab", rev = "3b1deca704cc1d8ae4fdc9cb053d7da824d0b85b" }
+collab-entity = { git = "https://github.com/AppFlowy-IO/AppFlowy-Collab", rev = "3b1deca704cc1d8ae4fdc9cb053d7da824d0b85b" }
+collab-folder = { git = "https://github.com/AppFlowy-IO/AppFlowy-Collab", rev = "3b1deca704cc1d8ae4fdc9cb053d7da824d0b85b" }
+collab-document = { git = "https://github.com/AppFlowy-IO/AppFlowy-Collab", rev = "3b1deca704cc1d8ae4fdc9cb053d7da824d0b85b" }
+collab-user = { git = "https://github.com/AppFlowy-IO/AppFlowy-Collab", rev = "3b1deca704cc1d8ae4fdc9cb053d7da824d0b85b" }
+collab-database = { git = "https://github.com/AppFlowy-IO/AppFlowy-Collab", rev = "3b1deca704cc1d8ae4fdc9cb053d7da824d0b85b" }
+collab-importer = { git = "https://github.com/AppFlowy-IO/AppFlowy-Collab", rev = "3b1deca704cc1d8ae4fdc9cb053d7da824d0b85b" }

 [features]
 history = []


@@ -35,7 +35,7 @@ services:
   postgres:
     restart: on-failure
-    image: pgvector/pgvector:pg16
+    image: pgvector/pgvector:pg15
     ports:
       - "5432:5432"
     healthcheck:


@@ -21,7 +21,7 @@ services:
   postgres:
     restart: on-failure
-    image: pgvector/pgvector:pg16
+    image: pgvector/pgvector:pg15
     environment:
       - POSTGRES_USER=${POSTGRES_USER:-postgres}
       - POSTGRES_DB=${POSTGRES_DB:-postgres}


@@ -283,7 +283,7 @@ pub struct EmbeddingRequest {
   pub dimensions: i32,
 }

-#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash)]
+#[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialEq, Eq, Hash)]
 pub enum EmbeddingModel {
   #[serde(rename = "text-embedding-3-small")]
   TextEmbedding3Small,


@@ -762,7 +762,7 @@ pub struct AFCollabEmbeddedChunk {
   #[serde(with = "uuid_str")]
   pub object_id: Uuid,
   pub content_type: EmbeddingContentType,
-  pub content: String,
+  pub content: Option<String>,
   pub embedding: Option<Vec<f32>>,
   pub metadata: serde_json::Value,
   pub fragment_index: i32,


@@ -64,7 +64,7 @@ WHERE w.workspace_id = $1"#,
 struct Fragment {
   fragment_id: String,
   content_type: i32,
-  contents: String,
+  contents: Option<String>,
   embedding: Option<Vector>,
   metadata: serde_json::Value,
   fragment_index: i32,
@@ -100,9 +100,13 @@ pub async fn upsert_collab_embeddings(
 ) -> Result<(), sqlx::Error> {
   let fragments = records.into_iter().map(Fragment::from).collect::<Vec<_>>();
   tracing::trace!(
-    "[Embedding] upsert {} {} fragments",
+    "[Embedding] upsert {} {} fragments, fragment ids: {:?}",
     object_id,
-    fragments.len()
+    fragments.len(),
+    fragments
+      .iter()
+      .map(|v| v.fragment_id.clone())
+      .collect::<Vec<_>>()
   );
   sqlx::query(r#"CALL af_collab_embeddings_upsert($1, $2, $3, $4::af_fragment_v3[])"#)
     .bind(*workspace_id)
@@ -114,6 +118,35 @@ pub async fn upsert_collab_embeddings(
   Ok(())
 }

+pub async fn get_collab_embedding_fragment_ids<'a, E>(
+  tx: E,
+  collab_ids: Vec<Uuid>,
+) -> Result<HashMap<Uuid, Vec<String>>, sqlx::Error>
+where
+  E: Executor<'a, Database = Postgres>,
+{
+  let records = sqlx::query!(
+    r#"
+      SELECT oid, fragment_id
+      FROM af_collab_embeddings
+      WHERE oid = ANY($1::uuid[])
+    "#,
+    &collab_ids,
+  )
+  .fetch_all(tx)
+  .await?;
+
+  let mut fragment_ids_by_oid = HashMap::new();
+  for record in records {
+    // If your record.oid is not a String, convert it as needed.
+    fragment_ids_by_oid
+      .entry(record.oid)
+      .or_insert_with(Vec::new)
+      .push(record.fragment_id);
+  }
+  Ok(fragment_ids_by_oid)
+}
+
 pub async fn stream_collabs_without_embeddings(
   conn: &mut PoolConnection<Postgres>,
   workspace_id: Uuid,


@@ -8,12 +8,9 @@ rayon.workspace = true
 tiktoken-rs = "0.6.0"
 app-error = { workspace = true }
 appflowy-ai-client = { workspace = true, features = ["client-api"] }
-unicode-segmentation = "1.12.0"
 collab = { workspace = true }
 collab-entity = { workspace = true }
-collab-folder = { workspace = true }
 collab-document = { workspace = true }
-collab-stream = { workspace = true }
 database-entity.workspace = true
 database.workspace = true
 futures-util.workspace = true
@@ -37,3 +34,4 @@ redis = { workspace = true, features = [
 ] }
 secrecy = { workspace = true, features = ["serde"] }
 reqwest.workspace = true
+twox-hash = { version = "2.1.0", features = ["xxhash64"] }


@ -1,6 +1,6 @@
use crate::collab_indexer::Indexer; use crate::collab_indexer::Indexer;
use crate::vector::embedder::Embedder; use crate::vector::embedder::Embedder;
use crate::vector::open_ai::split_text_by_max_content_len; use crate::vector::open_ai::group_paragraphs_by_max_content_len;
use anyhow::anyhow; use anyhow::anyhow;
use app_error::AppError; use app_error::AppError;
use appflowy_ai_client::dto::{ use appflowy_ai_client::dto::{
@ -9,11 +9,11 @@ use appflowy_ai_client::dto::{
use async_trait::async_trait; use async_trait::async_trait;
use collab::preclude::Collab; use collab::preclude::Collab;
use collab_document::document::DocumentBody; use collab_document::document::DocumentBody;
use collab_document::error::DocumentError;
use collab_entity::CollabType; use collab_entity::CollabType;
use database_entity::dto::{AFCollabEmbeddedChunk, AFCollabEmbeddings, EmbeddingContentType}; use database_entity::dto::{AFCollabEmbeddedChunk, AFCollabEmbeddings, EmbeddingContentType};
use serde_json::json; use serde_json::json;
use tracing::trace; use tracing::{debug, trace};
use twox_hash::xxhash64::Hasher;
use uuid::Uuid; use uuid::Uuid;
pub struct DocumentIndexer; pub struct DocumentIndexer;
@ -23,7 +23,7 @@ impl Indexer for DocumentIndexer {
fn create_embedded_chunks_from_collab( fn create_embedded_chunks_from_collab(
&self, &self,
collab: &Collab, collab: &Collab,
embedding_model: EmbeddingModel, model: EmbeddingModel,
) -> Result<Vec<AFCollabEmbeddedChunk>, AppError> { ) -> Result<Vec<AFCollabEmbeddedChunk>, AppError> {
let object_id = collab.object_id().parse()?; let object_id = collab.object_id().parse()?;
let document = DocumentBody::from_collab(collab).ok_or_else(|| { let document = DocumentBody::from_collab(collab).ok_or_else(|| {
@ -33,29 +33,20 @@ impl Indexer for DocumentIndexer {
) )
})?; })?;
let result = document.to_plain_text(collab.transact(), false, true); let paragraphs = document.paragraphs(collab.transact());
match result { self.create_embedded_chunks_from_text(object_id, paragraphs, model)
Ok(content) => self.create_embedded_chunks_from_text(object_id, content, embedding_model),
Err(err) => {
if matches!(err, DocumentError::NoRequiredData) {
Ok(vec![])
} else {
Err(AppError::Internal(err.into()))
}
},
}
} }
fn create_embedded_chunks_from_text( fn create_embedded_chunks_from_text(
&self, &self,
object_id: Uuid, object_id: Uuid,
text: String, paragraphs: Vec<String>,
model: EmbeddingModel, model: EmbeddingModel,
) -> Result<Vec<AFCollabEmbeddedChunk>, AppError> { ) -> Result<Vec<AFCollabEmbeddedChunk>, AppError> {
split_text_into_chunks(object_id, text, CollabType::Document, &model) split_text_into_chunks(object_id, paragraphs, CollabType::Document, model)
} }
fn embed( async fn embed(
&self, &self,
embedder: &Embedder, embedder: &Embedder,
mut content: Vec<AFCollabEmbeddedChunk>, mut content: Vec<AFCollabEmbeddedChunk>,
@ -66,14 +57,16 @@ impl Indexer for DocumentIndexer {
let contents: Vec<_> = content let contents: Vec<_> = content
.iter() .iter()
.map(|fragment| fragment.content.clone()) .map(|fragment| fragment.content.clone().unwrap_or_default())
.collect(); .collect();
let resp = embedder.embed(EmbeddingRequest { let resp = embedder
input: EmbeddingInput::StringArray(contents), .async_embed(EmbeddingRequest {
model: embedder.model().name().to_string(), input: EmbeddingInput::StringArray(contents),
encoding_format: EmbeddingEncodingFormat::Float, model: embedder.model().name().to_string(),
dimensions: EmbeddingModel::TextEmbedding3Small.default_dimensions(), encoding_format: EmbeddingEncodingFormat::Float,
})?; dimensions: EmbeddingModel::TextEmbedding3Small.default_dimensions(),
})
.await?;
trace!( trace!(
"[Embedding] request {} embeddings, received {} embeddings", "[Embedding] request {} embeddings, received {} embeddings",
@ -83,15 +76,18 @@ impl Indexer for DocumentIndexer {
for embedding in resp.data { for embedding in resp.data {
let param = &mut content[embedding.index as usize]; let param = &mut content[embedding.index as usize];
let embedding: Vec<f32> = match embedding.embedding { if param.content.is_some() {
EmbeddingOutput::Float(embedding) => embedding.into_iter().map(|f| f as f32).collect(), // we only set the embedding if the content was not marked as unchanged
EmbeddingOutput::Base64(_) => { let embedding: Vec<f32> = match embedding.embedding {
return Err(AppError::OpenError( EmbeddingOutput::Float(embedding) => embedding.into_iter().map(|f| f as f32).collect(),
"Unexpected base64 encoding".to_string(), EmbeddingOutput::Base64(_) => {
)) return Err(AppError::OpenError(
}, "Unexpected base64 encoding".to_string(),
}; ))
param.embedding = Some(embedding); },
};
param.embedding = Some(embedding);
}
} }
Ok(Some(AFCollabEmbeddings { Ok(Some(AFCollabEmbeddings {
@ -100,39 +96,52 @@ impl Indexer for DocumentIndexer {
})) }))
} }
} }
fn split_text_into_chunks( fn split_text_into_chunks(
object_id: Uuid, object_id: Uuid,
content: String, paragraphs: Vec<String>,
collab_type: CollabType, collab_type: CollabType,
embedding_model: &EmbeddingModel, embedding_model: EmbeddingModel,
) -> Result<Vec<AFCollabEmbeddedChunk>, AppError> { ) -> Result<Vec<AFCollabEmbeddedChunk>, AppError> {
debug_assert!(matches!( debug_assert!(matches!(
embedding_model, embedding_model,
EmbeddingModel::TextEmbedding3Small EmbeddingModel::TextEmbedding3Small
)); ));
if content.is_empty() { if paragraphs.is_empty() {
return Ok(vec![]); return Ok(vec![]);
} }
// We assume that every token is ~4 bytes. We're going to split document content into fragments // Group paragraphs into chunks of roughly 8000 characters.
// of ~2000 tokens each. let split_contents = group_paragraphs_by_max_content_len(paragraphs, 8000);
let split_contents = split_text_by_max_content_len(content, 8000)?; let metadata = json!({
let metadata = json!({"id": object_id.to_string(), "source": "appflowy", "name": "document", "collab_type": collab_type }); "id": object_id,
Ok( "source": "appflowy",
split_contents "name": "document",
.into_iter() "collab_type": collab_type
.enumerate() });
.map(|(index, content)| AFCollabEmbeddedChunk {
fragment_id: Uuid::new_v4().to_string(), let mut seen = std::collections::HashSet::new();
let mut chunks = Vec::new();
for (index, content) in split_contents.into_iter().enumerate() {
let consistent_hash = Hasher::oneshot(0, content.as_bytes());
let fragment_id = format!("{:x}", consistent_hash);
if seen.insert(fragment_id.clone()) {
chunks.push(AFCollabEmbeddedChunk {
fragment_id,
object_id, object_id,
content_type: EmbeddingContentType::PlainText, content_type: EmbeddingContentType::PlainText,
content, content: Some(content),
embedding: None, embedding: None,
metadata: metadata.clone(), metadata: metadata.clone(),
fragment_index: index as i32, fragment_index: index as i32,
embedded_type: 0, embedded_type: 0,
}) });
.collect(), } else {
) debug!(
"[Embedding] Duplicate fragment_id detected: {}. This fragment will not be added.",
fragment_id
);
}
}
Ok(chunks)
} }


@@ -2,6 +2,7 @@ use crate::collab_indexer::DocumentIndexer;
 use crate::vector::embedder::Embedder;
 use app_error::AppError;
 use appflowy_ai_client::dto::EmbeddingModel;
+use async_trait::async_trait;
 use collab::preclude::Collab;
 use collab_entity::CollabType;
 use database_entity::dto::{AFCollabEmbeddedChunk, AFCollabEmbeddings};
@@ -11,6 +12,7 @@ use std::sync::Arc;
 use tracing::info;
 use uuid::Uuid;

+#[async_trait]
 pub trait Indexer: Send + Sync {
   fn create_embedded_chunks_from_collab(
     &self,
@@ -21,11 +23,11 @@ pub trait Indexer: Send + Sync {
   fn create_embedded_chunks_from_text(
     &self,
     object_id: Uuid,
-    text: String,
+    paragraphs: Vec<String>,
     model: EmbeddingModel,
   ) -> Result<Vec<AFCollabEmbeddedChunk>, AppError>;

-  fn embed(
+  async fn embed(
     &self,
     embedder: &Embedder,
     content: Vec<AFCollabEmbeddedChunk>,


@ -1,9 +1,7 @@
use crate::collab_indexer::{Indexer, IndexerProvider}; use crate::collab_indexer::IndexerProvider;
use crate::entity::EmbeddingRecord; use crate::entity::EmbeddingRecord;
use crate::error::IndexerError;
use crate::metrics::EmbeddingMetrics; use crate::metrics::EmbeddingMetrics;
use crate::queue::add_background_embed_task; use crate::queue::add_background_embed_task;
use crate::thread_pool::{ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
use crate::vector::embedder::Embedder; use crate::vector::embedder::Embedder;
use crate::vector::open_ai; use crate::vector::open_ai;
use app_error::AppError; use app_error::AppError;
@ -12,11 +10,11 @@ use collab::preclude::Collab;
use collab_document::document::DocumentBody; use collab_document::document::DocumentBody;
use collab_entity::CollabType; use collab_entity::CollabType;
use database::collab::CollabStorage; use database::collab::CollabStorage;
use database::index::{update_collab_indexed_at, upsert_collab_embeddings}; use database::index::{
get_collab_embedding_fragment_ids, update_collab_indexed_at, upsert_collab_embeddings,
};
use database::workspace::select_workspace_settings; use database::workspace::select_workspace_settings;
use database_entity::dto::AFCollabEmbeddedChunk;
use infra::env_util::get_env_var; use infra::env_util::get_env_var;
use rayon::prelude::*;
use redis::aio::ConnectionManager; use redis::aio::ConnectionManager;
use secrecy::{ExposeSecret, Secret}; use secrecy::{ExposeSecret, Secret};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
@ -30,6 +28,7 @@ use tokio::sync::mpsc;
use tokio::sync::mpsc::error::TrySendError; use tokio::sync::mpsc::error::TrySendError;
use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender};
use tokio::sync::RwLock as TokioRwLock; use tokio::sync::RwLock as TokioRwLock;
use tokio::task::JoinSet;
use tokio::time::timeout; use tokio::time::timeout;
use tracing::{debug, error, info, instrument, trace, warn}; use tracing::{debug, error, info, instrument, trace, warn};
use uuid::Uuid; use uuid::Uuid;
@ -38,7 +37,6 @@ pub struct IndexerScheduler {
pub(crate) indexer_provider: Arc<IndexerProvider>, pub(crate) indexer_provider: Arc<IndexerProvider>,
pub(crate) pg_pool: PgPool, pub(crate) pg_pool: PgPool,
pub(crate) storage: Arc<dyn CollabStorage>, pub(crate) storage: Arc<dyn CollabStorage>,
pub(crate) threads: Arc<ThreadPoolNoAbort>,
#[allow(dead_code)] #[allow(dead_code)]
pub(crate) metrics: Arc<EmbeddingMetrics>, pub(crate) metrics: Arc<EmbeddingMetrics>,
write_embedding_tx: UnboundedSender<EmbeddingRecord>, write_embedding_tx: UnboundedSender<EmbeddingRecord>,
@ -77,19 +75,11 @@ impl IndexerScheduler {
let (write_embedding_tx, write_embedding_rx) = unbounded_channel::<EmbeddingRecord>(); let (write_embedding_tx, write_embedding_rx) = unbounded_channel::<EmbeddingRecord>();
let (gen_embedding_tx, gen_embedding_rx) = let (gen_embedding_tx, gen_embedding_rx) =
mpsc::channel::<UnindexedCollabTask>(config.embedding_buffer_size); mpsc::channel::<UnindexedCollabTask>(config.embedding_buffer_size);
let threads = Arc::new(
ThreadPoolNoAbortBuilder::new()
.num_threads(num_thread)
.thread_name(|index| format!("create-embedding-thread-{index}"))
.build()
.unwrap(),
);
let this = Arc::new(Self { let this = Arc::new(Self {
indexer_provider, indexer_provider,
pg_pool, pg_pool,
storage, storage,
threads,
metrics, metrics,
write_embedding_tx, write_embedding_tx,
gen_embedding_tx, gen_embedding_tx,
@ -105,7 +95,7 @@ impl IndexerScheduler {
let latest_write_embedding_err = Arc::new(TokioRwLock::new(None)); let latest_write_embedding_err = Arc::new(TokioRwLock::new(None));
if this.index_enabled() { if this.index_enabled() {
tokio::spawn(spawn_rayon_generate_embeddings( tokio::spawn(generate_embeddings_loop(
gen_embedding_rx, gen_embedding_rx,
Arc::downgrade(&this), Arc::downgrade(&this),
num_thread, num_thread,
@ -258,18 +248,17 @@ impl IndexerScheduler {
CollabType::Document => { CollabType::Document => {
let txn = collab.transact(); let txn = collab.transact();
let text = DocumentBody::from_collab(collab) let text = DocumentBody::from_collab(collab)
.and_then(|body| body.to_plain_text(txn, false, true).ok()); .map(|body| body.paragraphs(txn))
.unwrap_or_default();
if let Some(text) = text { if !text.is_empty() {
if !text.is_empty() { let pending = UnindexedCollabTask::new(
let pending = UnindexedCollabTask::new( workspace_id,
workspace_id, object_id,
object_id, collab_type,
collab_type, UnindexedData::Paragraphs(text),
UnindexedData::Text(text), );
); self.embed_immediately(pending)?;
self.embed_immediately(pending)?;
}
} }
}, },
_ => { _ => {
@ -293,7 +282,7 @@ impl IndexerScheduler {
} }
} }
async fn spawn_rayon_generate_embeddings( async fn generate_embeddings_loop(
mut rx: mpsc::Receiver<UnindexedCollabTask>, mut rx: mpsc::Receiver<UnindexedCollabTask>,
scheduler: Weak<IndexerScheduler>, scheduler: Weak<IndexerScheduler>,
buffer_size: usize, buffer_size: usize,
@ -332,60 +321,99 @@ async fn spawn_rayon_generate_embeddings(
records.len() records.len()
); );
let metrics = scheduler.metrics.clone(); let metrics = scheduler.metrics.clone();
let threads = scheduler.threads.clone();
let indexer_provider = scheduler.indexer_provider.clone(); let indexer_provider = scheduler.indexer_provider.clone();
let write_embedding_tx = scheduler.write_embedding_tx.clone(); let write_embedding_tx = scheduler.write_embedding_tx.clone();
let embedder = scheduler.create_embedder(); let embedder = scheduler.create_embedder();
let result = tokio::task::spawn_blocking(move || { match embedder {
match embedder { Ok(embedder) => {
Ok(embedder) => { let params: Vec<_> = records.iter().map(|r| r.object_id).collect();
records.into_par_iter().for_each(|record| { let existing_embeddings =
let result = threads.install(|| { match get_collab_embedding_fragment_ids(&scheduler.pg_pool, params).await {
let indexer = indexer_provider.indexer_for(record.collab_type); Ok(existing_embeddings) => existing_embeddings,
match process_collab(&embedder, indexer, record.object_id, record.data, &metrics) { Err(err) => {
Ok(Some((tokens_used, contents))) => { error!("[Embedding] failed to get existing embeddings: {}", err);
if let Err(err) = write_embedding_tx.send(EmbeddingRecord { Default::default()
workspace_id: record.workspace_id, },
object_id: record.object_id, };
collab_type: record.collab_type, let mut join_set = JoinSet::new();
tokens_used, for record in records {
contents, if let Some(indexer) = indexer_provider.indexer_for(record.collab_type) {
}) { metrics.record_embed_count(1);
error!("Failed to send embedding record: {}", err); let paragraphs = match record.data {
UnindexedData::Paragraphs(paragraphs) => paragraphs,
UnindexedData::Text(text) => text.split('\n').map(|s| s.to_string()).collect(),
};
let embedder = embedder.clone();
match indexer.create_embedded_chunks_from_text(
record.object_id,
paragraphs,
embedder.model(),
) {
Ok(mut chunks) => {
if let Some(fragment_ids) = existing_embeddings.get(&record.object_id) {
for chunk in chunks.iter_mut() {
if fragment_ids.contains(&chunk.fragment_id) {
// we already had an embedding for this chunk
chunk.content = None;
chunk.embedding = None;
}
}
}
join_set.spawn(async move {
if chunks.is_empty() {
return Ok(None);
} }
},
Ok(None) => {
debug!("No embedding for collab:{}", record.object_id);
},
Err(err) => {
warn!(
"Failed to create embeddings content for collab:{}, error:{}",
record.object_id, err
);
},
}
});
if let Err(err) = result { let result = indexer.embed(&embedder, chunks).await;
error!("Failed to install a task to rayon thread pool: {}", err); match result {
Ok(Some(embeddings)) => {
let record = EmbeddingRecord {
workspace_id: record.workspace_id,
object_id: record.object_id,
collab_type: record.collab_type,
tokens_used: embeddings.tokens_consumed,
contents: embeddings.params,
};
Ok(Some(record))
},
Ok(None) => Ok(None),
Err(err) => Err(err),
}
});
},
Err(err) => {
metrics.record_failed_embed_count(1);
warn!(
"Failed to create embedded chunks for collab: {}, error:{}",
record.object_id, err
);
continue;
},
} }
}); }
}, }
Err(err) => error!("[Embedding] Failed to create embedder: {}", err), while let Some(Ok(res)) = join_set.join_next().await {
} scheduler
Ok::<_, IndexerError>(()) .metrics
}) .record_gen_embedding_time(n as u32, start.elapsed().as_millis());
.await; match res {
Ok(Some(record)) => {
match result { if let Err(err) = write_embedding_tx.send(record) {
Ok(Ok(_)) => { error!("Failed to send embedding record: {}", err);
scheduler }
.metrics },
.record_gen_embedding_time(n as u32, start.elapsed().as_millis()); Ok(None) => debug!("No embedding for collab"),
trace!("Successfully generated embeddings"); Err(err) => {
metrics.record_failed_embed_count(1);
warn!(
"Failed to create embeddings content for collab, error:{}",
err
);
},
}
}
}, },
Ok(Err(err)) => error!("Failed to generate embeddings: {}", err), Err(err) => error!("[Embedding] Failed to create embedder: {}", err),
Err(err) => error!("Failed to spawn a task to generate embeddings: {}", err),
} }
} }
} }
@ -409,7 +437,7 @@ pub async fn spawn_pg_write_embeddings(
let start = Instant::now(); let start = Instant::now();
let records = buf.drain(..n).collect::<Vec<_>>(); let records = buf.drain(..n).collect::<Vec<_>>();
for record in records.iter() { for record in records.iter() {
info!( debug!(
"[Embedding] generate collab:{} embeddings, tokens used: {}", "[Embedding] generate collab:{} embeddings, tokens used: {}",
record.object_id, record.tokens_used record.object_id, record.tokens_used
); );
@ -477,40 +505,6 @@ pub(crate) async fn batch_insert_records(
Ok(()) Ok(())
} }
/// This function must be called within the rayon thread pool.
fn process_collab(
embedder: &Embedder,
indexer: Option<Arc<dyn Indexer>>,
object_id: Uuid,
data: UnindexedData,
metrics: &EmbeddingMetrics,
) -> Result<Option<(u32, Vec<AFCollabEmbeddedChunk>)>, AppError> {
if let Some(indexer) = indexer {
let chunks = match data {
UnindexedData::Text(text) => {
indexer.create_embedded_chunks_from_text(object_id, text, embedder.model())?
},
};
if chunks.is_empty() {
return Ok(None);
}
metrics.record_embed_count(1);
let result = indexer.embed(embedder, chunks);
match result {
Ok(Some(embeddings)) => Ok(Some((embeddings.tokens_consumed, embeddings.params))),
Ok(None) => Ok(None),
Err(err) => {
metrics.record_failed_embed_count(1);
Err(err)
},
}
} else {
Ok(None)
}
}
#[derive(Debug, Serialize, Deserialize)] #[derive(Debug, Serialize, Deserialize)]
pub struct UnindexedCollabTask { pub struct UnindexedCollabTask {
pub workspace_id: Uuid, pub workspace_id: Uuid,
@ -540,12 +534,14 @@ impl UnindexedCollabTask {
#[derive(Debug, Serialize, Deserialize)] #[derive(Debug, Serialize, Deserialize)]
pub enum UnindexedData { pub enum UnindexedData {
Text(String), Text(String),
Paragraphs(Vec<String>),
} }
impl UnindexedData { impl UnindexedData {
pub fn is_empty(&self) -> bool { pub fn is_empty(&self) -> bool {
match self { match self {
UnindexedData::Text(text) => text.is_empty(), UnindexedData::Text(text) => text.is_empty(),
UnindexedData::Paragraphs(text) => text.is_empty(),
} }
} }
} }


@ -1,38 +1,31 @@
use crate::collab_indexer::IndexerProvider; use crate::collab_indexer::IndexerProvider;
use crate::entity::{EmbeddingRecord, UnindexedCollab}; use crate::entity::{EmbeddingRecord, UnindexedCollab};
use crate::scheduler::{batch_insert_records, IndexerScheduler}; use crate::scheduler::{batch_insert_records, IndexerScheduler};
use crate::thread_pool::ThreadPoolNoAbort;
use crate::vector::embedder::Embedder; use crate::vector::embedder::Embedder;
use appflowy_ai_client::dto::EmbeddingModel;
use collab::core::collab::DataSource; use collab::core::collab::DataSource;
use collab::core::origin::CollabOrigin; use collab::core::origin::CollabOrigin;
use collab::preclude::Collab; use collab::preclude::Collab;
use collab_entity::CollabType; use collab_entity::CollabType;
use database::collab::{CollabStorage, GetCollabOrigin}; use database::collab::{CollabStorage, GetCollabOrigin};
use database::index::stream_collabs_without_embeddings; use database::index::{get_collab_embedding_fragment_ids, stream_collabs_without_embeddings};
use futures_util::stream::BoxStream; use futures_util::stream::BoxStream;
use futures_util::StreamExt; use futures_util::StreamExt;
use rayon::iter::ParallelIterator; use rayon::iter::ParallelIterator;
use rayon::prelude::IntoParallelIterator; use rayon::prelude::IntoParallelIterator;
use sqlx::pool::PoolConnection; use sqlx::pool::PoolConnection;
use sqlx::Postgres; use sqlx::Postgres;
use std::collections::HashMap;
use std::sync::Arc; use std::sync::Arc;
use std::time::{Duration, Instant}; use std::time::{Duration, Instant};
use tokio::task::JoinSet;
use tracing::{error, info, trace}; use tracing::{error, info, trace};
use uuid::Uuid; use uuid::Uuid;
#[allow(dead_code)] #[allow(dead_code)]
pub(crate) async fn index_workspace(scheduler: Arc<IndexerScheduler>, workspace_id: Uuid) { pub(crate) async fn index_workspace(scheduler: Arc<IndexerScheduler>, workspace_id: Uuid) {
let weak_threads = Arc::downgrade(&scheduler.threads);
let mut retry_delay = Duration::from_secs(2); let mut retry_delay = Duration::from_secs(2);
loop { loop {
let threads = match weak_threads.upgrade() {
Some(threads) => threads,
None => {
info!("[Embedding] thread pool is dropped, stop indexing");
break;
},
};
let conn = scheduler.pg_pool.try_acquire(); let conn = scheduler.pg_pool.try_acquire();
if conn.is_none() { if conn.is_none() {
tokio::time::sleep(retry_delay).await; tokio::time::sleep(retry_delay).await;
@ -58,23 +51,17 @@ pub(crate) async fn index_workspace(scheduler: Arc<IndexerScheduler>, workspace_
continue; continue;
} }
index_then_write_embedding_to_disk( index_then_write_embedding_to_disk(&scheduler, std::mem::take(&mut unindexed_collabs)).await;
&scheduler,
threads.clone(),
std::mem::take(&mut unindexed_collabs),
)
.await;
} }
if !unindexed_collabs.is_empty() { if !unindexed_collabs.is_empty() {
index_then_write_embedding_to_disk(&scheduler, threads.clone(), unindexed_collabs).await; index_then_write_embedding_to_disk(&scheduler, unindexed_collabs).await;
} }
} }
} }
async fn index_then_write_embedding_to_disk( async fn index_then_write_embedding_to_disk(
scheduler: &Arc<IndexerScheduler>, scheduler: &Arc<IndexerScheduler>,
threads: Arc<ThreadPoolNoAbort>,
unindexed_collabs: Vec<UnindexedCollab>, unindexed_collabs: Vec<UnindexedCollab>,
) { ) {
info!( info!(
@ -87,32 +74,41 @@ async fn index_then_write_embedding_to_disk(
if let Ok(embedder) = scheduler.create_embedder() { if let Ok(embedder) = scheduler.create_embedder() {
let start = Instant::now(); let start = Instant::now();
let embeddings = create_embeddings( let object_ids = unindexed_collabs
embedder, .iter()
&scheduler.indexer_provider, .map(|v| v.object_id)
threads.clone(), .collect::<Vec<_>>();
unindexed_collabs, match get_collab_embedding_fragment_ids(&scheduler.pg_pool, object_ids).await {
) Ok(existing_embeddings) => {
.await; let embeddings = create_embeddings(
scheduler embedder,
.metrics &scheduler.indexer_provider,
.record_gen_embedding_time(embeddings.len() as u32, start.elapsed().as_millis()); unindexed_collabs,
existing_embeddings,
)
.await;
scheduler
.metrics
.record_gen_embedding_time(embeddings.len() as u32, start.elapsed().as_millis());
let write_start = Instant::now(); let write_start = Instant::now();
let n = embeddings.len(); let n = embeddings.len();
match batch_insert_records(&scheduler.pg_pool, embeddings).await { match batch_insert_records(&scheduler.pg_pool, embeddings).await {
Ok(_) => trace!( Ok(_) => trace!(
"[Embedding] upsert {} embeddings success, cost:{}ms", "[Embedding] upsert {} embeddings success, cost:{}ms",
n, n,
write_start.elapsed().as_millis() write_start.elapsed().as_millis()
), ),
Err(err) => error!("{}", err), Err(err) => error!("{}", err),
}
scheduler
.metrics
.record_write_embedding_time(write_start.elapsed().as_millis());
tokio::time::sleep(Duration::from_secs(5)).await;
},
Err(err) => error!("[Embedding] failed to get fragment ids: {}", err),
} }
scheduler
.metrics
.record_write_embedding_time(write_start.elapsed().as_millis());
tokio::time::sleep(Duration::from_secs(5)).await;
} else { } else {
trace!("[Embedding] no embeddings to process in this batch"); trace!("[Embedding] no embeddings to process in this batch");
} }
@ -160,12 +156,61 @@ async fn stream_unindexed_collabs(
}) })
.boxed() .boxed()
} }
async fn create_embeddings( async fn create_embeddings(
embedder: Embedder, embedder: Embedder,
indexer_provider: &Arc<IndexerProvider>, indexer_provider: &Arc<IndexerProvider>,
threads: Arc<ThreadPoolNoAbort>,
unindexed_records: Vec<UnindexedCollab>, unindexed_records: Vec<UnindexedCollab>,
existing_embeddings: HashMap<Uuid, Vec<String>>,
) -> Vec<EmbeddingRecord> {
// 1. use parallel iteration since computing text chunks is CPU-intensive task
let records = compute_embedding_records(
indexer_provider,
embedder.model(),
unindexed_records,
existing_embeddings,
);
// 2. use tokio JoinSet to parallelize OpenAI calls (IO-bound)
let mut join_set = JoinSet::new();
for record in records {
let indexer_provider = indexer_provider.clone();
let embedder = embedder.clone();
if let Some(indexer) = indexer_provider.indexer_for(record.collab_type) {
join_set.spawn(async move {
match indexer.embed(&embedder, record.contents).await {
Ok(embeddings) => embeddings.map(|embeddings| EmbeddingRecord {
workspace_id: record.workspace_id,
object_id: record.object_id,
collab_type: record.collab_type,
tokens_used: embeddings.tokens_consumed,
contents: embeddings.params,
}),
Err(err) => {
error!("Failed to embed collab: {}", err);
None
},
}
});
}
}
let mut results = Vec::with_capacity(join_set.len());
while let Some(Ok(Some(record))) = join_set.join_next().await {
trace!(
"[Embedding] generate collab:{} embeddings, tokens used: {}",
record.object_id,
record.tokens_used
);
results.push(record);
}
results
}
fn compute_embedding_records(
indexer_provider: &IndexerProvider,
model: EmbeddingModel,
unindexed_records: Vec<UnindexedCollab>,
existing_embeddings: HashMap<Uuid, Vec<String>>,
) -> Vec<EmbeddingRecord> { ) -> Vec<EmbeddingRecord> {
unindexed_records unindexed_records
.into_par_iter() .into_par_iter()
@ -180,8 +225,8 @@ async fn create_embeddings(
) )
.ok()?; .ok()?;
let chunks = indexer let mut chunks = indexer
.create_embedded_chunks_from_collab(&collab, embedder.model()) .create_embedded_chunks_from_collab(&collab, model)
.ok()?; .ok()?;
if chunks.is_empty() { if chunks.is_empty() {
trace!("[Embedding] {} has no embeddings", unindexed.object_id,); trace!("[Embedding] {} has no embeddings", unindexed.object_id,);
@ -192,32 +237,23 @@ async fn create_embeddings(
)); ));
} }
let result = threads.install(|| match indexer.embed(&embedder, chunks) { // compare chunks against existing fragment ids (which are content addressed) and mark these
Ok(embeddings) => embeddings.map(|embeddings| EmbeddingRecord { // which haven't changed as already embedded
workspace_id: unindexed.workspace_id, if let Some(existing_embeddings) = existing_embeddings.get(&unindexed.object_id) {
object_id: unindexed.object_id, for chunk in chunks.iter_mut() {
collab_type: unindexed.collab_type, if existing_embeddings.contains(&chunk.fragment_id) {
tokens_used: embeddings.tokens_consumed, chunk.content = None; // mark as already embedded
contents: embeddings.params, chunk.embedding = None;
}), }
Err(err) => { }
error!("Failed to embed collab: {}", err);
None
},
});
if let Ok(Some(record)) = &result {
trace!(
"[Embedding] generate collab:{} embeddings, tokens used: {}",
record.object_id,
record.tokens_used
);
} }
Some(EmbeddingRecord {
result.unwrap_or_else(|err| { workspace_id: unindexed.workspace_id,
error!("Failed to spawn a task to index collab: {}", err); object_id: unindexed.object_id,
None collab_type: unindexed.collab_type,
tokens_used: 0,
contents: chunks,
}) })
}) })
.collect::<Vec<_>>() .collect()
} }


@ -5,7 +5,6 @@ use appflowy_ai_client::dto::{EmbeddingRequest, OpenAIEmbeddingResponse};
use serde::de::DeserializeOwned; use serde::de::DeserializeOwned;
use std::time::Duration; use std::time::Duration;
use tiktoken_rs::CoreBPE; use tiktoken_rs::CoreBPE;
use unicode_segmentation::UnicodeSegmentation;
pub const OPENAI_EMBEDDINGS_URL: &str = "https://api.openai.com/v1/embeddings"; pub const OPENAI_EMBEDDINGS_URL: &str = "https://api.openai.com/v1/embeddings";
@ -184,53 +183,41 @@ pub fn split_text_by_max_tokens(
Ok(chunks) Ok(chunks)
} }
#[inline] pub fn group_paragraphs_by_max_content_len(
pub fn split_text_by_max_content_len( paragraphs: Vec<String>,
content: String,
max_content_len: usize, max_content_len: usize,
) -> Result<Vec<String>, AppError> { ) -> Vec<String> {
if content.is_empty() { if paragraphs.is_empty() {
return Ok(vec![]); return vec![];
} }
if content.len() <= max_content_len { let mut result = Vec::new();
return Ok(vec![content]); let mut current = String::new();
} for paragraph in paragraphs {
if paragraph.len() + current.len() > max_content_len {
// Content is longer than max_content_len; need to split // if we add the paragraph to the current content, it will exceed the limit
let mut result = Vec::with_capacity(1 + content.len() / max_content_len); // so we push the current content to the result set and start a new chunk
let mut fragment = String::with_capacity(max_content_len); let accumulated = std::mem::replace(&mut current, paragraph);
let mut current_len = 0; if !accumulated.is_empty() {
result.push(accumulated);
for grapheme in content.graphemes(true) {
let grapheme_len = grapheme.len();
if current_len + grapheme_len > max_content_len {
if !fragment.is_empty() {
result.push(std::mem::take(&mut fragment));
}
current_len = 0;
if grapheme_len > max_content_len {
// Push the grapheme as a fragment on its own
result.push(grapheme.to_string());
continue;
} }
} else {
// add the paragraph to the current chunk
current.push_str(&paragraph);
} }
fragment.push_str(grapheme);
current_len += grapheme_len;
} }
// Add the last fragment if it's not empty if !current.is_empty() {
if !fragment.is_empty() { result.push(current);
result.push(fragment);
} }
Ok(result)
result
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::vector::open_ai::{split_text_by_max_content_len, split_text_by_max_tokens}; use crate::vector::open_ai::{group_paragraphs_by_max_content_len, split_text_by_max_tokens};
use tiktoken_rs::cl100k_base; use tiktoken_rs::cl100k_base;
#[test] #[test]
@ -246,7 +233,7 @@ mod tests {
assert!(content.is_char_boundary(content.len())); assert!(content.is_char_boundary(content.len()));
} }
let params = split_text_by_max_content_len(content.clone(), max_tokens).unwrap(); let params = group_paragraphs_by_max_content_len(vec![content], max_tokens);
for content in params { for content in params {
assert!(content.is_char_boundary(0)); assert!(content.is_char_boundary(0));
assert!(content.is_char_boundary(content.len())); assert!(content.is_char_boundary(content.len()));
@ -283,7 +270,7 @@ mod tests {
let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap(); let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap();
assert_eq!(params.len(), 0); assert_eq!(params.len(), 0);
let params = split_text_by_max_content_len(content.clone(), max_tokens).unwrap(); let params = group_paragraphs_by_max_content_len(params, max_tokens);
assert_eq!(params.len(), 0); assert_eq!(params.len(), 0);
} }
@ -299,7 +286,7 @@ mod tests {
assert_eq!(param, emoji); assert_eq!(param, emoji);
} }
let params = split_text_by_max_content_len(content.clone(), max_tokens).unwrap(); let params = group_paragraphs_by_max_content_len(params, max_tokens);
for (param, emoji) in params.iter().zip(emojis.iter()) { for (param, emoji) in params.iter().zip(emojis.iter()) {
assert_eq!(param, emoji); assert_eq!(param, emoji);
} }
@ -317,7 +304,7 @@ mod tests {
let reconstructed_content = params.join(""); let reconstructed_content = params.join("");
assert_eq!(reconstructed_content, content); assert_eq!(reconstructed_content, content);
let params = split_text_by_max_content_len(content.clone(), max_tokens).unwrap(); let params = group_paragraphs_by_max_content_len(params, max_tokens);
let reconstructed_content: String = params.concat(); let reconstructed_content: String = params.concat();
assert_eq!(reconstructed_content, content); assert_eq!(reconstructed_content, content);
} }
@ -347,7 +334,7 @@ mod tests {
let reconstructed_content: String = params.concat(); let reconstructed_content: String = params.concat();
assert_eq!(reconstructed_content, content); assert_eq!(reconstructed_content, content);
let params = split_text_by_max_content_len(content.clone(), max_tokens).unwrap(); let params = group_paragraphs_by_max_content_len(params, max_tokens);
let reconstructed_content: String = params.concat(); let reconstructed_content: String = params.concat();
assert_eq!(reconstructed_content, content); assert_eq!(reconstructed_content, content);
} }
@ -365,7 +352,7 @@ mod tests {
let reconstructed_content: String = params.concat(); let reconstructed_content: String = params.concat();
assert_eq!(reconstructed_content, content); assert_eq!(reconstructed_content, content);
let params = split_text_by_max_content_len(content.clone(), max_tokens).unwrap(); let params = group_paragraphs_by_max_content_len(params, max_tokens);
let reconstructed_content: String = params.concat(); let reconstructed_content: String = params.concat();
assert_eq!(reconstructed_content, content); assert_eq!(reconstructed_content, content);
} }
@ -379,7 +366,7 @@ mod tests {
let reconstructed_content: String = params.concat(); let reconstructed_content: String = params.concat();
assert_eq!(reconstructed_content, content); assert_eq!(reconstructed_content, content);
let params = split_text_by_max_content_len(content.clone(), max_tokens).unwrap(); let params = group_paragraphs_by_max_content_len(params, max_tokens);
let reconstructed_content: String = params.concat(); let reconstructed_content: String = params.concat();
assert_eq!(reconstructed_content, content); assert_eq!(reconstructed_content, content);
} }
@ -393,7 +380,7 @@ mod tests {
let reconstructed_content: String = params.concat(); let reconstructed_content: String = params.concat();
assert_eq!(reconstructed_content, content); assert_eq!(reconstructed_content, content);
let params = split_text_by_max_content_len(content.clone(), max_tokens).unwrap(); let params = group_paragraphs_by_max_content_len(params, max_tokens);
let reconstructed_content: String = params.concat(); let reconstructed_content: String = params.concat();
assert_eq!(reconstructed_content, content); assert_eq!(reconstructed_content, content);
} }


@ -0,0 +1,91 @@
-- Drop existing primary key if it exists:
ALTER TABLE af_collab_embeddings
DROP CONSTRAINT IF EXISTS af_collab_embeddings_pkey;
-- Add a new composite primary key on (fragment_id, oid):
-- Currently the fragment_id is generated by hash fragment content, so fragment_id might be
-- conflicting with other fragments, but they are not in the same document.
ALTER TABLE af_collab_embeddings
ADD CONSTRAINT af_collab_embeddings_pkey
PRIMARY KEY (fragment_id, oid);
CREATE OR REPLACE PROCEDURE af_collab_embeddings_upsert(
IN p_workspace_id UUID,
IN p_oid TEXT,
IN p_tokens_used INT,
IN p_fragments af_fragment_v3[]
)
LANGUAGE plpgsql
AS
$$
BEGIN
-- Delete all fragments for p_oid that are not present in the new fragment list.
DELETE
FROM af_collab_embeddings
WHERE oid = p_oid
AND fragment_id NOT IN (
SELECT fragment_id FROM UNNEST(p_fragments) AS f
);
-- Use MERGE to update existing rows or insert new ones without causing duplicate key errors.
MERGE INTO af_collab_embeddings AS t
USING (
SELECT
f.fragment_id,
p_oid AS oid,
f.content_type,
f.contents,
f.embedding,
NOW() AS indexed_at,
f.metadata,
f.fragment_index,
f.embedder_type
FROM UNNEST(p_fragments) AS f
) AS s
ON t.oid = s.oid AND t.fragment_id = s.fragment_id
WHEN MATCHED THEN -- this fragment has not changed
UPDATE SET indexed_at = NOW()
WHEN NOT MATCHED THEN -- this fragment is new
INSERT (
fragment_id,
oid,
content_type,
content,
embedding,
indexed_at,
metadata,
fragment_index,
embedder_type
)
VALUES (
s.fragment_id,
s.oid,
s.content_type,
s.contents,
s.embedding,
NOW(),
s.metadata,
s.fragment_index,
s.embedder_type
);
-- Update the usage tracking table with an upsert.
INSERT INTO af_workspace_ai_usage(
created_at,
workspace_id,
search_requests,
search_tokens_consumed,
index_tokens_consumed
)
VALUES (
NOW()::date,
p_workspace_id,
0,
0,
p_tokens_used
)
ON CONFLICT (created_at, workspace_id)
DO UPDATE SET index_tokens_consumed = af_workspace_ai_usage.index_tokens_consumed + p_tokens_used;
END
$$;
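
A note on how the application layer pairs with this procedure (condensed from the scheduler changes in this commit): since fragment ids are hashes of the chunk content, the indexer loads the fragment ids already stored for a collab, clears the content of chunks whose ids are already present, and only embeds what is left; the MERGE above then refreshes indexed_at for the matched rows and inserts only the new ones. A small self-contained illustration of that comparison step (the types are simplified stand-ins, not the repo's AFCollabEmbeddedChunk):

use std::collections::HashSet;

// Simplified stand-in for AFCollabEmbeddedChunk: only the fields relevant to the
// comparison are modeled here.
struct Chunk {
  fragment_id: String,     // xxHash64 of the chunk content, hex-encoded
  content: Option<String>, // None means "already embedded, do not re-embed"
}

// Clear the content of chunks whose content-addressed fragment_id is already stored
// for this collab; returns how many chunks were marked as unchanged.
fn mark_unchanged_chunks(chunks: &mut [Chunk], existing_fragment_ids: &HashSet<String>) -> usize {
  let mut unchanged = 0;
  for chunk in chunks.iter_mut() {
    if existing_fragment_ids.contains(&chunk.fragment_id) {
      chunk.content = None;
      unchanged += 1;
    }
  }
  unchanged
}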


@@ -1081,8 +1081,6 @@ impl CollabPersister {
         // persisted one in the database
         self.save_attempt(&mut snapshot.collab, message_id).await?;
       }
-    } else {
-      tracing::trace!("collab {} state has not changed", self.object_id);
     }
     Ok(())
   }
@@ -1112,9 +1110,7 @@ impl CollabPersister {
     match self.collab_type {
       CollabType::Document => {
         let txn = collab.transact();
-        if let Some(text) = DocumentBody::from_collab(collab)
-          .and_then(|body| body.to_plain_text(txn, false, true).ok())
-        {
+        if let Some(text) = DocumentBody::from_collab(collab).map(|body| body.paragraphs(txn)) {
           self.index_collab_content(text);
         }
       },
@@ -1166,12 +1162,12 @@ impl CollabPersister {
     Ok(())
   }

-  fn index_collab_content(&self, text: String) {
+  fn index_collab_content(&self, paragraphs: Vec<String>) {
     let indexed_collab = UnindexedCollabTask::new(
       self.workspace_id,
       self.object_id,
       self.collab_type,
-      UnindexedData::Text(text),
+      UnindexedData::Paragraphs(paragraphs),
     );
     if let Err(err) = self
       .indexer_scheduler


@@ -10,7 +10,7 @@ fn document_plain_text() {
   let doc = getting_started_document_data().unwrap();
   let collab = Collab::new_with_origin(CollabOrigin::Server, "1", vec![], false);
   let document = Document::create_with_data(collab, doc).unwrap();
-  let text = document.to_plain_text(false, true).unwrap();
+  let text = document.paragraphs().join("");
   let expected = "Welcome to AppFlowy $ Download for macOS, Windows, and Linux link $ $ $ quick start Ask AI powered by advanced AI models: chat, search, write, and much more ✨ ❤\u{fe0f}Love AppFlowy and open source? Follow our latest product updates: Twitter : @appflowy Reddit : r/appflowy Github ";
   assert_eq!(&text, expected);
 }
@@ -20,7 +20,7 @@ fn document_plain_text_with_nested_blocks() {
   let doc = get_initial_document_data().unwrap();
   let collab = Collab::new_with_origin(CollabOrigin::Server, "1", vec![], false);
   let document = Document::create_with_data(collab, doc).unwrap();
-  let text = document.to_plain_text(false, true).unwrap();
+  let text = document.paragraphs().join("");
   let expected = "Welcome to AppFlowy! Here are the basics Here is H3 Click anywhere and just start typing. Click Enter to create a new line. Highlight any text, and use the editing menu to style your writing however you like. As soon as you type / a menu will pop up. Select different types of content blocks you can add. Type / followed by /bullet or /num to create a list. Click + New Page button at the bottom of your sidebar to add a new page. Click + next to any page title in the sidebar to quickly add a new subpage, Document , Grid , or Kanban Board . Keyboard shortcuts, markdown, and code block Keyboard shortcuts guide Markdown reference Type /code to insert a code block // This is the main function.\nfn main() {\n // Print text to the console.\n println!(\"Hello World!\");\n} This is a paragraph This is a paragraph Have a question❓ Click ? at the bottom right for help and support. This is a paragraph This is a paragraph Click ? at the bottom right for help and support. Like AppFlowy? Follow us: GitHub Twitter : @appflowy Newsletter ";
   assert_eq!(&text, expected);
 }


@@ -1003,7 +1003,7 @@ async fn process_unzip_file(
     Ok(bytes) => {
       if let Err(err) = redis_client
         .set_ex::<String, Vec<u8>, Value>(
-          encode_collab_key(&w_database_id.to_string()),
+          encode_collab_key(w_database_id.to_string()),
           bytes,
           2592000, // WorkspaceDatabase => 1 month
         )
@@ -1186,7 +1186,7 @@ async fn process_unzip_file(
     });
     if result.is_err() {
-      let _: RedisResult<Value> = redis_client.del(encode_collab_key(&w_database_id)).await;
+      let _: RedisResult<Value> = redis_client.del(encode_collab_key(w_database_id)).await;
       let _: RedisResult<Value> = redis_client
         .del(encode_collab_key(&import_task.workspace_id))
         .await;


@ -1,6 +1,6 @@
use app_error::AppError; use app_error::AppError;
use database::index::get_collabs_indexed_at; use database::index::{get_collab_embedding_fragment_ids, get_collabs_indexed_at};
use indexer::collab_indexer::{Indexer, IndexerProvider}; use indexer::collab_indexer::IndexerProvider;
use indexer::entity::EmbeddingRecord; use indexer::entity::EmbeddingRecord;
use indexer::error::IndexerError; use indexer::error::IndexerError;
use indexer::metrics::EmbeddingMetrics; use indexer::metrics::EmbeddingMetrics;
@@ -12,7 +12,6 @@ use indexer::scheduler::{spawn_pg_write_embeddings, UnindexedCollabTask, Unindex
use indexer::thread_pool::ThreadPoolNoAbort;
use indexer::vector::embedder::Embedder;
use indexer::vector::open_ai;
-use rayon::prelude::*;
use redis::aio::ConnectionManager;
use secrecy::{ExposeSecret, Secret};
use sqlx::PgPool;
@@ -20,8 +19,9 @@ use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio::sync::mpsc::{unbounded_channel, UnboundedSender};
use tokio::sync::RwLock;
+use tokio::task::JoinSet;
use tokio::time::{interval, MissedTickBehavior};
-use tracing::{error, info, trace};
+use tracing::{error, info, trace, warn};

pub struct BackgroundIndexerConfig {
  pub enable: bool,
@@ -134,7 +134,7 @@ async fn process_upcoming_tasks(
  let collab_ids: Vec<_> = tasks.iter().map(|task| task.object_id).collect();
- let indexed_collabs = get_collabs_indexed_at(&pg_pool, collab_ids)
+ let indexed_collabs = get_collabs_indexed_at(&pg_pool, collab_ids.clone())
    .await
    .unwrap_or_default();
@@ -154,36 +154,78 @@ async fn process_upcoming_tasks(
  let start = Instant::now();
  let num_tasks = tasks.len();
- tasks.into_par_iter().for_each(|task| {
-   let result = threads.install(|| {
-     if let Some(indexer) = indexer_provider.indexer_for(task.collab_type) {
-       let embedder = create_embedder(&config);
-       let result = handle_task(embedder, indexer, task);
-       match result {
-         None => metrics.record_failed_embed_count(1),
-         Some(record) => {
-           metrics.record_embed_count(1);
-           trace!(
-             "[Background Embedding] send {} embedding record to write task",
-             record.object_id
-           );
-           if let Err(err) = sender.send(record) {
-             trace!(
-               "[Background Embedding] failed to send embedding record to write task: {:?}",
-               err
-             );
-           }
-         },
-       }
-     }
-   });
-   if let Err(err) = result {
-     error!(
-       "[Background Embedding] Failed to process embedder task: {:?}",
-       err
-     );
-   }
- });
+ let existing_embeddings = get_collab_embedding_fragment_ids(&pg_pool, collab_ids)
+   .await
+   .unwrap_or_default();
+ let mut join_set = JoinSet::new();
+ for task in tasks {
+   if let Some(indexer) = indexer_provider.indexer_for(task.collab_type) {
+     let embedder = create_embedder(&config);
+     trace!(
+       "[Background Embedding] processing task: {}, content:{:?}, collab_type: {}",
+       task.object_id,
+       task.data,
+       task.collab_type
+     );
+     let paragraphs = match task.data {
+       UnindexedData::Paragraphs(paragraphs) => paragraphs,
+       UnindexedData::Text(text) => text.split('\n').map(|s| s.to_string()).collect(),
+     };
+     let mut chunks = match indexer.create_embedded_chunks_from_text(
+       task.object_id,
+       paragraphs,
+       embedder.model(),
+     ) {
+       Ok(chunks) => chunks,
+       Err(err) => {
+         warn!(
+           "[Background Embedding] failed to create embedded chunks for task: {}, error: {:?}",
+           task.object_id,
+           err
+         );
+         continue;
+       },
+     };
+     if let Some(existing_chunks) = existing_embeddings.get(&task.object_id) {
+       for chunk in chunks.iter_mut() {
+         if existing_chunks.contains(&chunk.fragment_id) {
+           chunk.content = None; // Clear content to mark unchanged chunk
+           chunk.embedding = None;
+         }
+       }
+     }
+     join_set.spawn(async move {
+       let embeddings = indexer.embed(&embedder, chunks).await.ok()?;
+       embeddings.map(|embeddings| EmbeddingRecord {
+         workspace_id: task.workspace_id,
+         object_id: task.object_id,
+         collab_type: task.collab_type,
+         tokens_used: embeddings.tokens_consumed,
+         contents: embeddings.params,
+       })
+     });
+   }
+ }
+ while let Some(Ok(result)) = join_set.join_next().await {
+   match result {
+     None => metrics.record_failed_embed_count(1),
+     Some(record) => {
+       metrics.record_embed_count(1);
+       trace!(
+         "[Background Embedding] send {} embedding record to write task",
+         record.object_id
+       );
+       if let Err(err) = sender.send(record) {
+         trace!(
+           "[Background Embedding] failed to send embedding record to write task: {:?}",
+           err
+         );
+       }
+     },
+   }
+ }
  let cost = start.elapsed().as_millis();
  metrics.record_gen_embedding_time(num_tasks as u32, cost);
}
@@ -212,32 +254,6 @@ async fn process_upcoming_tasks(
  }
}
-fn handle_task(
-  embedder: Embedder,
-  indexer: Arc<dyn Indexer>,
-  task: UnindexedCollabTask,
-) -> Option<EmbeddingRecord> {
-  trace!(
-    "[Background Embedding] processing task: {}, content:{:?}, collab_type: {}",
-    task.object_id,
-    task.data,
-    task.collab_type
-  );
-  let chunks = match task.data {
-    UnindexedData::Text(text) => indexer
-      .create_embedded_chunks_from_text(task.object_id.clone(), text, embedder.model())
-      .ok()?,
-  };
-  let embeddings = indexer.embed(&embedder, chunks).ok()?;
-  embeddings.map(|embeddings| EmbeddingRecord {
-    workspace_id: task.workspace_id,
-    object_id: task.object_id,
-    collab_type: task.collab_type,
-    tokens_used: embeddings.tokens_consumed,
-    contents: embeddings.params,
-  })
-}
fn create_embedder(config: &BackgroundIndexerConfig) -> Embedder {
  Embedder::OpenAI(open_ai::Embedder::new(
    config.open_api_key.expose_secret().clone(),
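
A minimal, self-contained sketch of the fragment-reuse idea in this worker: chunks whose fragment_id is already stored for the collab are blanked out so only new or changed fragments go back through the embedder. The Chunk struct and mark_unchanged_chunks helper below are illustrative stand-ins, not the indexer crate's real types or API.

use std::collections::{HashMap, HashSet};

// Hypothetical, simplified stand-in for the indexer's embedded-chunk type.
struct Chunk {
    fragment_id: String,         // consistent hash of the paragraph content
    content: Option<String>,     // text to embed; None means "unchanged, skip"
    embedding: Option<Vec<f32>>, // vector to upsert; None keeps the stored row
}

// Blank out chunks whose fragment id already exists in af_collab_embeddings,
// mirroring the loop over `existing_embeddings` in the worker above.
fn mark_unchanged_chunks(
    object_id: &str,
    chunks: &mut [Chunk],
    existing: &HashMap<String, HashSet<String>>,
) {
    if let Some(existing_fragments) = existing.get(object_id) {
        for chunk in chunks.iter_mut() {
            if existing_fragments.contains(&chunk.fragment_id) {
                chunk.content = None;   // nothing to send to the embedding model
                chunk.embedding = None; // keep whatever vector is already stored
            }
        }
    }
}

fn main() {
    let mut chunks = vec![
        Chunk { fragment_id: "a1".into(), content: Some("unchanged paragraph".into()), embedding: None },
        Chunk { fragment_id: "b2".into(), content: Some("edited paragraph".into()), embedding: None },
    ];
    let existing: HashMap<String, HashSet<String>> =
        HashMap::from([("doc-1".to_string(), HashSet::from(["a1".to_string()]))]);
    mark_unchanged_chunks("doc-1", &mut chunks, &existing);
    assert!(chunks[0].content.is_none()); // reused fragment: skipped
    assert!(chunks[1].content.is_some()); // changed fragment: re-embedded
}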

View file

@@ -856,12 +856,12 @@ async fn create_collab_handler(
      .can_index_workspace(&workspace_id)
      .await?
    {
-     if let Ok(text) = Document::open(collab).and_then(|doc| doc.to_plain_text(false, true)) {
+     if let Ok(paragraphs) = Document::open(collab).map(|doc| doc.paragraphs()) {
        let pending = UnindexedCollabTask::new(
          workspace_id,
          params.object_id,
          params.collab_type,
-         UnindexedData::Text(text),
+         UnindexedData::Paragraphs(paragraphs),
        );
        state
          .indexer_scheduler
@@ -958,8 +958,7 @@ async fn batch_create_collab_handler(
      Ok(_) => {
        match params.collab_type {
          CollabType::Document => {
-           let index_text =
-             Document::open(collab).and_then(|doc| doc.to_plain_text(false, true));
+           let index_text = Document::open(collab).map(|doc| doc.paragraphs());
            Some((Some(index_text), params))
          },
          _ => {
@@ -1010,12 +1009,12 @@ async fn batch_create_collab_handler(
    .flat_map(|value| match std::mem::take(&mut value.0) {
      None => None,
      Some(text) => text
-       .map(|text| {
+       .map(|paragraphs| {
          UnindexedCollabTask::new(
            workspace_id,
            value.1.object_id,
            value.1.collab_type,
-           UnindexedData::Text(text),
+           UnindexedData::Paragraphs(paragraphs),
          )
        })
        .ok(),
@@ -1826,16 +1825,18 @@ async fn update_collab_handler(
        ))
      })?;
-     if let Ok(text) = Document::open(collab).and_then(|doc| doc.to_plain_text(false, true)) {
-       let pending = UnindexedCollabTask::new(
-         workspace_id,
-         params.object_id,
-         params.collab_type,
-         UnindexedData::Text(text),
-       );
-       state
-         .indexer_scheduler
-         .index_pending_collab_one(pending, true)?;
+     if let Ok(paragraphs) = Document::open(collab).map(|doc| doc.paragraphs()) {
+       if !paragraphs.is_empty() {
+         let pending = UnindexedCollabTask::new(
+           workspace_id,
+           params.object_id,
+           params.collab_type,
+           UnindexedData::Paragraphs(paragraphs),
+         );
+         state
+           .indexer_scheduler
+           .index_pending_collab_one(pending, true)?;
+       }
      }
    },
    _ => {
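
As a rough sketch of what these handlers hand off: document updates are queued as paragraph lists, while a plain-text payload is split into lines by the background worker before chunking. The enum below is a simplified stand-in for the scheduler's UnindexedData, assuming only these two variants matter here.

// Simplified stand-in for indexer::scheduler::UnindexedData; the real type may
// carry more variants and derives.
enum UnindexedData {
    Paragraphs(Vec<String>),
    Text(String),
}

impl UnindexedData {
    // Normalize either variant into the paragraph list the embedder chunks on,
    // mirroring the fallback used by the background worker.
    fn into_paragraphs(self) -> Vec<String> {
        match self {
            UnindexedData::Paragraphs(paragraphs) => paragraphs,
            UnindexedData::Text(text) => text.split('\n').map(|s| s.to_string()).collect(),
        }
    }
}

fn main() {
    let from_doc = UnindexedData::Paragraphs(vec![
        "Getting started".to_string(),
        "Type / to open the menu".to_string(),
    ]);
    let from_text = UnindexedData::Text("Getting started\nType / to open the menu".to_string());
    // Both routes end up as the same paragraph list before chunking.
    assert_eq!(from_doc.into_paragraphs(), from_text.into_paragraphs());
}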

View file

@@ -1007,12 +1007,7 @@ fn fill_in_db_row_doc(
    })?;
  let doc = Document::open(doc_collab)
    .map_err(|err| AppError::Internal(anyhow::anyhow!("Failed to open document: {:?}", err)))?;
- let plain_text = doc.to_plain_text(true, false).map_err(|err| {
-   AppError::Internal(anyhow::anyhow!(
-     "Failed to convert document to plain text: {:?}",
-     err
-   ))
- })?;
+ let plain_text = doc.paragraphs().join("");
  row_detail.doc = Some(plain_text);
  Ok(())
}
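
A toy illustration of the replacement above, assuming Document::paragraphs() returns one String per text block with no separators: joining with an empty string yields the row's plain text directly, and it no longer carries the leading newline that the old to_plain_text path produced.

// Illustrative only: a stand-in for Document::paragraphs(); the real method is
// provided by collab-document and reads the document tree.
fn paragraphs() -> Vec<String> {
    vec!["This is a document of a database row".to_string()]
}

fn main() {
    let plain_text = paragraphs().join("");
    // No leading '\n' and no added separators in the stored row text.
    assert_eq!(plain_text, "This is a document of a database row");
}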

View file

@@ -609,9 +609,7 @@ pub async fn get_database_row_doc_changes(
    .map_err(|e| AppError::Internal(anyhow::anyhow!("Failed to create document: {:?}", e)))?;

  // if the document content is the same, there is no need to update
- if cur_doc.to_plain_text(false, false).unwrap_or_default()
-   == new_doc.to_plain_text(false, false).unwrap_or_default()
- {
+ if cur_doc.paragraphs() == new_doc.paragraphs() {
    return Ok(None);
  };


View file

@@ -257,7 +257,7 @@ Overall, Alex balances his work as a software programmer with his passion for sp
  // Simulate insert new content
  let contents = alex_banker_story();
  editor.insert_paragraphs(contents.into_iter().map(|s| s.to_string()).collect());
- let text = editor.document.to_plain_text(false, false).unwrap();
+ let text = editor.document.paragraphs().join("");
  let expected = alex_banker_story().join("");
  assert_eq!(text, expected);

View file

@@ -90,13 +90,13 @@ async fn document_full_sync_then_search_test() {
  let remote_document = test_client
    .create_document_collab(workspace_id, object_id)
    .await;
- let remote_plain_text = remote_document.to_plain_text(false, false).unwrap();
- let local_plain_text = local_document.document.to_plain_text(false, false).unwrap();
+ let remote_plain_text = remote_document.paragraphs().join("");
+ let local_plain_text = local_document.document.paragraphs().join("");
  assert_eq!(local_plain_text, remote_plain_text);

  let search_result = test_client
    .wait_unit_get_search_result(&workspace_id, "workflows", 1)
    .await;
  assert_eq!(search_result.len(), 1);
- assert_eq!(search_result[0].preview, Some("AppFlowy is an open-source project. It is an alternative to tools like Notion. AppFlowy provides full control of your data. The project is built using Flutter for the frontend. Rust powers AppFlowy's ".to_string()));
+ assert_eq!(search_result[0].preview, Some("AppFlowy is an open-source project.It is an alternative to tools like Notion.AppFlowy provides full control of your data.The project is built using Flutter for the frontend.Rust powers AppFlowy's back".to_string()));
}
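
A small sketch of why the asserted preview reads the way it does, under the assumption that the preview is built by concatenating paragraph text in order: joining paragraphs with an empty separator leaves no space between one paragraph's final period and the next paragraph's first word. The paragraph strings below are hypothetical samples, not the full test document.

fn main() {
    let paragraphs = vec![
        "AppFlowy is an open-source project.".to_string(),
        "It is an alternative to tools like Notion.".to_string(),
    ];
    let preview = paragraphs.join("");
    // Paragraph boundaries disappear, so sentences run together in the preview.
    assert_eq!(
        preview,
        "AppFlowy is an open-source project.It is an alternative to tools like Notion."
    );
}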

View file

@@ -35,7 +35,7 @@ async fn database_row_upsert_with_doc() {
    assert!(row_detail.has_doc);
    assert_eq!(
      row_detail.doc,
-     Some(String::from("\nThis is a document of a database row"))
+     Some(String::from("This is a document of a database row"))
    );
  }
  // Upsert row with another doc
@@ -57,7 +57,7 @@ async fn database_row_upsert_with_doc() {
      .unwrap()[0];
    assert_eq!(
      row_detail.doc,
-     Some(String::from("\nThis is a another document"))
+     Some(String::from("This is a another document"))
    );
  }
}
@@ -135,7 +135,7 @@ async fn database_row_upsert() {
    assert!(row_detail.has_doc);
    assert_eq!(
      row_detail.doc,
-     Some("\nThis is a document of a database row".to_string())
+     Some("This is a document of a database row".to_string())
    );
  }
}
@@ -327,6 +327,6 @@ async fn database_insert_row_with_doc() {
    assert!(row_detail.has_doc);
    assert_eq!(
      row_detail.doc,
-     Some("\nThis is a document of a database row".to_string())
+     Some("This is a document of a database row".to_string())
    );
  }