Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We'll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VectorStore UX improvements #2722

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -89,6 +89,7 @@ def add(
embedding_tensor: Union[str, List[str]],
return_ids: bool,
rate_limiter: Dict,
ingestion_batch_size: int,
**tensors,
):
feature_report_path(
Expand All @@ -100,6 +101,7 @@ def add(
"return_ids": return_ids,
"embedding_function": True if embedding_function is not None else False,
"embedding_data": True if embedding_data is not None else False,
"ingestion_batch_size": ingestion_batch_size,
},
token=self.token,
username=self.username,
Expand Down Expand Up @@ -144,6 +146,7 @@ def add(
embedding_tensor=embedding_tensor,
rate_limiter=rate_limiter,
logger=self.logger,
ingestion_batch_size=ingestion_batch_size,
)

if self.verbose:
Expand Down
Expand Up @@ -127,6 +127,7 @@ def add(
embedding_tensor: Union[str, List[str]],
return_ids: bool,
rate_limiter: Dict,
ingestion_batch_size: int,
**tensors,
):
pass
Expand Down
3 changes: 3 additions & 0 deletions deeplake/core/vectorstore/deeplake_vectorstore.py
Expand Up @@ -14,6 +14,7 @@
DEFAULT_VECTORSTORE_TENSORS,
MAX_BYTES_PER_MINUTE,
TARGET_BYTE_SIZE,
VECTORSTORE_EXTEND_BATCH_SIZE,
)
from deeplake.util.bugout_reporter import feature_report_path
from deeplake.util.exceptions import DeepMemoryWaitingListError
Expand Down Expand Up @@ -152,6 +153,7 @@ def add(
"bytes_per_minute": MAX_BYTES_PER_MINUTE,
"batch_byte_size": TARGET_BYTE_SIZE,
},
ingestion_batch_size: int = VECTORSTORE_EXTEND_BATCH_SIZE,
**tensors,
) -> Optional[List[str]]:
"""Adding elements to deeplake vector store.
Expand Down Expand Up @@ -226,6 +228,7 @@ def add(
embedding_tensor=embedding_tensor,
return_ids=return_ids,
rate_limiter=rate_limiter,
ingestion_batch_size=ingestion_batch_size,
**tensors,
)

Expand Down
12 changes: 7 additions & 5 deletions deeplake/core/vectorstore/vector_search/dataset/dataset.py
Expand Up @@ -465,7 +465,7 @@ def extend(
processed_tensors: Dict[str, Union[List[Any], np.ndarray]],
dataset: deeplake.core.dataset.Dataset,
rate_limiter: Dict,
_extend_batch_size: int = VECTORSTORE_EXTEND_BATCH_SIZE,
ingestion_batch_size: int,
logger=None,
):
"""
Expand All @@ -475,17 +475,17 @@ def extend(
embedding_data = [embedding_data]

if embedding_function:
number_of_batches = ceil(len(embedding_data[0]) / _extend_batch_size)
number_of_batches = ceil(len(embedding_data[0]) / ingestion_batch_size)
progressbar_str = (
f"Creating {len(embedding_data[0])} embeddings in "
f"{number_of_batches} batches of size {min(_extend_batch_size, len(embedding_data[0]))}:"
f"{number_of_batches} batches of size {min(ingestion_batch_size, len(embedding_data[0]))}:"
)

for idx in tqdm(
range(0, len(embedding_data[0]), _extend_batch_size),
range(0, len(embedding_data[0]), ingestion_batch_size),
progressbar_str,
):
batch_start, batch_end = idx, idx + _extend_batch_size
batch_start, batch_end = idx, idx + ingestion_batch_size

batched_embeddings = _compute_batched_embeddings(
embedding_function,
Expand Down Expand Up @@ -537,6 +537,7 @@ def extend_or_ingest_dataset(
embedding_tensor,
embedding_data,
rate_limiter,
ingestion_batch_size,
logger,
):
rate_limiter = populate_rate_limiter(rate_limiter)
Expand All @@ -548,6 +549,7 @@ def extend_or_ingest_dataset(
processed_tensors,
dataset,
rate_limiter,
ingestion_batch_size,
logger=logger,
)

Expand Down