-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This introduces semantic indexing in Zed based on chunking text from files in the developer's workspace and creating vector embeddings using an embedding model. As part of this, we've created an embeddings provider trait that allows us to work with OpenAI, a local Ollama model, or a Zed-hosted embedding. The semantic index is built by breaking down text for known (programming) languages into manageable chunks that are smaller than the max token size. Each chunk is then fed to a language model to create a high-dimensional vector, which is then normalized to a unit vector to allow fast comparison with other vectors using a simple dot product. Alongside the vector, we store the path of the file and the range within the document where the vector was sourced from. Zed will soon grok contextual similarity across different text snippets, allowing for natural language search beyond keyword matching. This is being put together both for human-based search and for providing results to Large Language Models to allow them to refine how they help developers. Remaining todo: * [x] Change `provider` to `model` within the Zed-hosted embeddings database (as it's currently a combo of the provider and the model in one name) Release Notes: - N/A --------- Co-authored-by: Nathan Sobo <nathan@zed.dev> Co-authored-by: Antonio Scandurra <me@as-cii.com> Co-authored-by: Conrad Irwin <conrad@zed.dev> Co-authored-by: Marshall Bowers <elliott.codes@gmail.com> Co-authored-by: Antonio <antonio@zed.dev>
- Loading branch information
1 parent
4b40e83
commit 49371b4
Showing
33 changed files
with
2,649 additions
and
41 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
9 changes: 9 additions & 0 deletions
9
crates/collab/migrations/20240409082755_create_embeddings.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
-- Stores one embedding vector per (model, digest) pair.
CREATE TABLE IF NOT EXISTS "embeddings" ( | ||
-- Name of the embedding model that produced the vector.
"model" TEXT, | ||
-- Binary digest identifying the embedded content.
"digest" BYTEA, | ||
-- The embedding vector itself, declared as an array of 1536 single-precision floats.
-- NOTE(review): PostgreSQL documents that declared array sizes are not enforced — confirm
-- whether the 1536 here is meant as a constraint or only as documentation.
"dimensions" FLOAT4[1536], | ||
-- Timestamp (without time zone) of the row's last retrieval; defaults to insertion time.
"retrieved_at" TIMESTAMP NOT NULL DEFAULT now(), | ||
PRIMARY KEY ("model", "digest") | ||
); | ||
|
||
-- Index on "retrieved_at" so rows can be looked up by retrieval time.
CREATE INDEX IF NOT EXISTS "idx_retrieved_at_on_embeddings" ON "embeddings" ("retrieved_at"); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
use super::*; | ||
use time::Duration; | ||
use time::OffsetDateTime; | ||
|
||
impl Database { | ||
pub async fn get_embeddings( | ||
&self, | ||
model: &str, | ||
digests: &[Vec<u8>], | ||
) -> Result<HashMap<Vec<u8>, Vec<f32>>> { | ||
self.weak_transaction(|tx| async move { | ||
let embeddings = { | ||
let mut db_embeddings = embedding::Entity::find() | ||
.filter( | ||
embedding::Column::Model.eq(model).and( | ||
embedding::Column::Digest | ||
.is_in(digests.iter().map(|digest| digest.as_slice())), | ||
), | ||
) | ||
.stream(&*tx) | ||
.await?; | ||
|
||
let mut embeddings = HashMap::default(); | ||
while let Some(db_embedding) = db_embeddings.next().await { | ||
let db_embedding = db_embedding?; | ||
embeddings.insert(db_embedding.digest, db_embedding.dimensions); | ||
} | ||
embeddings | ||
}; | ||
|
||
if !embeddings.is_empty() { | ||
let now = OffsetDateTime::now_utc(); | ||
let retrieved_at = PrimitiveDateTime::new(now.date(), now.time()); | ||
|
||
embedding::Entity::update_many() | ||
.filter( | ||
embedding::Column::Digest | ||
.is_in(embeddings.keys().map(|digest| digest.as_slice())), | ||
) | ||
.col_expr(embedding::Column::RetrievedAt, Expr::value(retrieved_at)) | ||
.exec(&*tx) | ||
.await?; | ||
} | ||
|
||
Ok(embeddings) | ||
}) | ||
.await | ||
} | ||
|
||
pub async fn save_embeddings( | ||
&self, | ||
model: &str, | ||
embeddings: &HashMap<Vec<u8>, Vec<f32>>, | ||
) -> Result<()> { | ||
self.weak_transaction(|tx| async move { | ||
embedding::Entity::insert_many(embeddings.iter().map(|(digest, dimensions)| { | ||
let now_offset_datetime = OffsetDateTime::now_utc(); | ||
let retrieved_at = | ||
PrimitiveDateTime::new(now_offset_datetime.date(), now_offset_datetime.time()); | ||
|
||
embedding::ActiveModel { | ||
model: ActiveValue::set(model.to_string()), | ||
digest: ActiveValue::set(digest.clone()), | ||
dimensions: ActiveValue::set(dimensions.clone()), | ||
retrieved_at: ActiveValue::set(retrieved_at), | ||
} | ||
})) | ||
.on_conflict( | ||
OnConflict::columns([embedding::Column::Model, embedding::Column::Digest]) | ||
.do_nothing() | ||
.to_owned(), | ||
) | ||
.exec_without_returning(&*tx) | ||
.await?; | ||
Ok(()) | ||
}) | ||
.await | ||
} | ||
|
||
pub async fn purge_old_embeddings(&self) -> Result<()> { | ||
self.weak_transaction(|tx| async move { | ||
embedding::Entity::delete_many() | ||
.filter( | ||
embedding::Column::RetrievedAt | ||
.lte(OffsetDateTime::now_utc() - Duration::days(60)), | ||
) | ||
.exec(&*tx) | ||
.await?; | ||
|
||
Ok(()) | ||
}) | ||
.await | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
use sea_orm::entity::prelude::*; | ||
use time::PrimitiveDateTime; | ||
|
||
/// SeaORM entity model for a row of the `embeddings` table.
///
/// Both `model` and `digest` are marked as primary-key columns, so together they
/// identify a row.
#[derive(Clone, Debug, PartialEq, DeriveEntityModel)] | ||
#[sea_orm(table_name = "embeddings")] | ||
pub struct Model { | ||
/// Name of the embedding model this vector belongs to (first primary-key column).
#[sea_orm(primary_key)] | ||
pub model: String, | ||
/// Digest bytes the embedding is stored under (second primary-key column).
#[sea_orm(primary_key)] | ||
pub digest: Vec<u8>, | ||
/// The embedding vector's components.
pub dimensions: Vec<f32>, | ||
/// Date and time, without an offset, recorded in the `retrieved_at` column.
pub retrieved_at: PrimitiveDateTime, | ||
} | ||
|
||
/// Relations from this entity to other entities; the enum has no variants, so none are declared.
#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] | ||
pub enum Relation {} | ||
|
||
// Empty impl: no `ActiveModelBehavior` methods are overridden for this entity.
impl ActiveModelBehavior for ActiveModel {} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.