Skip to content

Commit

Permalink
Semantic Index (#10329)
Browse files Browse the repository at this point in the history
This introduces semantic indexing in Zed based on chunking text from
files in the developer's workspace and creating vector embeddings using
an embedding model. As part of this, we've created an embeddings
provider trait that allows us to work with OpenAI, a local Ollama model,
or a Zed hosted embedding.

The semantic index is built by breaking down text for known
(programming) languages into manageable chunks that are smaller than the
max token size. Each chunk is then fed to a language model to create a
high dimensional vector which is then normalized to a unit vector to
allow fast comparison with other vectors with a simple dot product.
Alongside the vector, we store the path of the file and the range within
the document where the vector was sourced from.

Zed will soon grok contextual similarity across different text snippets,
allowing for natural language search beyond keyword matching. This is
being put together both for human-based search as well as providing
results to Large Language Models to allow them to refine how they help
developers.

Remaining todo:

* [x] Change `provider` to `model` within the zed hosted embeddings
database (as its currently a combo of the provider and the model in one
name)


Release Notes:

- N/A

---------

Co-authored-by: Nathan Sobo <nathan@zed.dev>
Co-authored-by: Antonio Scandurra <me@as-cii.com>
Co-authored-by: Conrad Irwin <conrad@zed.dev>
Co-authored-by: Marshall Bowers <elliott.codes@gmail.com>
Co-authored-by: Antonio <antonio@zed.dev>
  • Loading branch information
6 people committed Apr 12, 2024
1 parent 4b40e83 commit 49371b4
Show file tree
Hide file tree
Showing 33 changed files with 2,649 additions and 41 deletions.
143 changes: 143 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ members = [
"crates/task",
"crates/tasks_ui",
"crates/search",
"crates/semantic_index",
"crates/semantic_version",
"crates/settings",
"crates/snippet",
Expand Down Expand Up @@ -253,9 +254,11 @@ derive_more = "0.99.17"
emojis = "0.6.1"
env_logger = "0.9"
futures = "0.3"
futures-batch = "0.6.1"
futures-lite = "1.13"
git2 = { version = "0.15", default-features = false }
globset = "0.4"
heed = { git = "https://github.com/meilisearch/heed", rev = "036ac23f73a021894974b9adc815bc95b3e0482a", features = ["read-txn-no-tls"] }
hex = "0.4.3"
ignore = "0.4.22"
indoc = "1"
Expand Down
4 changes: 2 additions & 2 deletions crates/channel/src/channel_store_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ async fn test_channel_messages(cx: &mut TestAppContext) {
);

assert_eq!(
channel.next_event(cx),
channel.next_event(cx).await,
ChannelChatEvent::MessagesUpdated {
old_range: 2..2,
new_count: 1,
Expand Down Expand Up @@ -317,7 +317,7 @@ async fn test_channel_messages(cx: &mut TestAppContext) {
);

assert_eq!(
channel.next_event(cx),
channel.next_event(cx).await,
ChannelChatEvent::MessagesUpdated {
old_range: 0..0,
new_count: 2,
Expand Down
9 changes: 9 additions & 0 deletions crates/collab/migrations/20240409082755_create_embeddings.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
CREATE TABLE IF NOT EXISTS "embeddings" (
"model" TEXT,
"digest" BYTEA,
"dimensions" FLOAT4[1536],
"retrieved_at" TIMESTAMP NOT NULL DEFAULT now(),
PRIMARY KEY ("model", "digest")
);

CREATE INDEX IF NOT EXISTS "idx_retrieved_at_on_embeddings" ON "embeddings" ("retrieved_at");
1 change: 1 addition & 0 deletions crates/collab/src/db/queries.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ pub mod channels;
pub mod contacts;
pub mod contributors;
pub mod dev_servers;
pub mod embeddings;
pub mod extensions;
pub mod hosted_projects;
pub mod messages;
Expand Down
94 changes: 94 additions & 0 deletions crates/collab/src/db/queries/embeddings.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
use super::*;
use time::Duration;
use time::OffsetDateTime;

impl Database {
pub async fn get_embeddings(
&self,
model: &str,
digests: &[Vec<u8>],
) -> Result<HashMap<Vec<u8>, Vec<f32>>> {
self.weak_transaction(|tx| async move {
let embeddings = {
let mut db_embeddings = embedding::Entity::find()
.filter(
embedding::Column::Model.eq(model).and(
embedding::Column::Digest
.is_in(digests.iter().map(|digest| digest.as_slice())),
),
)
.stream(&*tx)
.await?;

let mut embeddings = HashMap::default();
while let Some(db_embedding) = db_embeddings.next().await {
let db_embedding = db_embedding?;
embeddings.insert(db_embedding.digest, db_embedding.dimensions);
}
embeddings
};

if !embeddings.is_empty() {
let now = OffsetDateTime::now_utc();
let retrieved_at = PrimitiveDateTime::new(now.date(), now.time());

embedding::Entity::update_many()
.filter(
embedding::Column::Digest
.is_in(embeddings.keys().map(|digest| digest.as_slice())),
)
.col_expr(embedding::Column::RetrievedAt, Expr::value(retrieved_at))
.exec(&*tx)
.await?;
}

Ok(embeddings)
})
.await
}

pub async fn save_embeddings(
&self,
model: &str,
embeddings: &HashMap<Vec<u8>, Vec<f32>>,
) -> Result<()> {
self.weak_transaction(|tx| async move {
embedding::Entity::insert_many(embeddings.iter().map(|(digest, dimensions)| {
let now_offset_datetime = OffsetDateTime::now_utc();
let retrieved_at =
PrimitiveDateTime::new(now_offset_datetime.date(), now_offset_datetime.time());

embedding::ActiveModel {
model: ActiveValue::set(model.to_string()),
digest: ActiveValue::set(digest.clone()),
dimensions: ActiveValue::set(dimensions.clone()),
retrieved_at: ActiveValue::set(retrieved_at),
}
}))
.on_conflict(
OnConflict::columns([embedding::Column::Model, embedding::Column::Digest])
.do_nothing()
.to_owned(),
)
.exec_without_returning(&*tx)
.await?;
Ok(())
})
.await
}

pub async fn purge_old_embeddings(&self) -> Result<()> {
self.weak_transaction(|tx| async move {
embedding::Entity::delete_many()
.filter(
embedding::Column::RetrievedAt
.lte(OffsetDateTime::now_utc() - Duration::days(60)),
)
.exec(&*tx)
.await?;

Ok(())
})
.await
}
}
1 change: 1 addition & 0 deletions crates/collab/src/db/tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ pub mod channel_message_mention;
pub mod contact;
pub mod contributor;
pub mod dev_server;
pub mod embedding;
pub mod extension;
pub mod extension_version;
pub mod feature_flag;
Expand Down
18 changes: 18 additions & 0 deletions crates/collab/src/db/tables/embedding.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
use sea_orm::entity::prelude::*;
use time::PrimitiveDateTime;

#[derive(Clone, Debug, PartialEq, DeriveEntityModel)]
#[sea_orm(table_name = "embeddings")]
pub struct Model {
#[sea_orm(primary_key)]
pub model: String,
#[sea_orm(primary_key)]
pub digest: Vec<u8>,
pub dimensions: Vec<f32>,
pub retrieved_at: PrimitiveDateTime,
}

#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)]
pub enum Relation {}

impl ActiveModelBehavior for ActiveModel {}
1 change: 1 addition & 0 deletions crates/collab/src/db/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ mod buffer_tests;
mod channel_tests;
mod contributor_tests;
mod db_tests;
mod embedding_tests;
mod extension_tests;
mod feature_flag_tests;
mod message_tests;
Expand Down

0 comments on commit 49371b4

Please sign in to comment.