Skip to content

Commit

Permalink
deepmemory fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
adolkhan committed Dec 28, 2023
1 parent d20e4d6 commit c0a76dd
Show file tree
Hide file tree
Showing 6 changed files with 91 additions and 85 deletions.
Expand Up @@ -109,7 +109,7 @@ def __init__(
self.index_params = utils.parse_index_params(index_params)
kwargs["index_params"] = self.index_params
self.num_workers = num_workers
self.creds = creds or {}
self.creds = creds
self.embedding_function = utils.create_embedding_function(embedding_function)
self.tensor_params = tensor_params
self.read_only = read_only
Expand Down
Expand Up @@ -104,6 +104,15 @@ def __init__(

self.verbose = verbose

self.deep_memory = DeepMemory(
dataset=None,
path=self.path,
token=self.token,
logger=self.logger,
embedding_function=self.embedding_function,
creds=self.creds,
)

# verifying not implemented args
self.args_verifier.verify_init_args(
cls=self,
Expand All @@ -113,6 +122,7 @@ def __init__(
creds=creds,
org_id=org_id,
other_kwargs=kwargs,
deep_memory=self.deep_memory,
)

self.client = ManagedServiceClient(token=self.token)
Expand All @@ -128,14 +138,11 @@ def __init__(
log_visualizer_link(response.path)
logger.info(response.summary)

self.deep_memory = DeepMemory(
dataset=None,
path=self.path,
token=self.token,
logger=self.logger,
embedding_function=self.embedding_function,
creds=self.creds,
)
if self.deep_memory is not None and self.embedding_function is not None:
logger.warning(
"ManagedVectorStore does not support passing embedding_function for now. "
"Embedding function will be used only for deepmemory training and inference."
)

def add(
self,
Expand Down Expand Up @@ -167,6 +174,7 @@ def add(
# verifying not implemented args
self.args_verifier.verify_add_args(
embedding_function=embedding_function,
deep_memory=self.deep_memory,
embedding_data=embedding_data,
embedding_tensor=embedding_tensor,
rate_limiter=rate_limiter,
Expand Down Expand Up @@ -215,6 +223,7 @@ def search(
return_view: bool,
deep_memory: bool,
exec_option: Optional[str] = "tensor_db",
return_tql: bool = False,
) -> Union[Dict, Dataset]:
feature_report_path(
path=self.bugout_reporting_path,
Expand Down Expand Up @@ -244,6 +253,7 @@ def search(
exec_option=exec_option,
return_view=return_view,
filter=filter,
return_tql=return_tql,
)

response = self.client.vectorstore_search(
Expand Down Expand Up @@ -374,6 +384,10 @@ def delete_by_path(
creds=creds,
)

@property
def exec_option(self):
    """Execution option used by this handler.

    Always ``"tensor_db"`` — presumably because the managed vector store
    only executes queries in the Deep Lake tensor database (TODO confirm
    against the managed-service docs).
    """
    return "tensor_db"

def _get_summary(self):
"""Returns a summary of the Managed Vector Store."""
return self.client.get_vectorstore_summary(self.path)

Check warning on line 393 in deeplake/core/vectorstore/dataset_handlers/managed_dataset_handler.py

View check run for this annotation

Codecov / codecov/patch

deeplake/core/vectorstore/dataset_handlers/managed_dataset_handler.py#L393

Added line #L393 was not covered by tests
Expand Down Expand Up @@ -426,11 +440,16 @@ def _verify_filter_is_dictionary(self):
"Only Filter Dictionary is supported for the ManagedVectorStore."
)

def _verify_kwarg_is_non_default_and_nonsupported(self, kwarg):
if self.kwargs.get(kwarg, False) is not False:
raise NotImplementedError(
f"`{kwarg}` is not supported for ManagedVectorStore for now."
)


class InitArgsVerfier(ArgsVerifierBase):
_not_implemented_args = [
"dataset",
"embedding_function",
"creds",
"org_id",
]
Expand All @@ -447,6 +466,14 @@ def verify(self, cls):
"ManagedVectorStore can only be initialized with a Deep Lake Cloud path."
)

if (
self.kwargs.get("deep_memory", False) is False
and self.kwargs.get("embedding_function", None) is not None
):
raise NotImplementedError(

Check warning on line 473 in deeplake/core/vectorstore/dataset_handlers/managed_dataset_handler.py

View check run for this annotation

Codecov / codecov/patch

deeplake/core/vectorstore/dataset_handlers/managed_dataset_handler.py#L473

Added line #L473 was not covered by tests
"ManagedVectorStore does not support passing embedding_function for now."
)

if self.kwargs.get("other_kwargs", {}) != {}:
other_kwargs = self.kwargs["other_kwargs"]
other_kwargs_names = list(other_kwargs.keys())
Expand All @@ -459,8 +486,6 @@ def verify(self, cls):

class AddArgsVerfier(ArgsVerifierBase):
_not_implemented_args = [
"embedding_function",
"embedding_data",
"embedding_tensor",
]

Expand All @@ -486,11 +511,8 @@ class SearchArgsVerfier(ArgsVerifierBase):
def verify(self):
super().verify()
self._verify_filter_is_dictionary()

if self.kwargs.get("return_view", False) is not False:
raise NotImplementedError(
"return_view is not supported for the ManagedVectorStore."
)
self._verify_kwarg_is_non_default_and_nonsupported("return_view")
self._verify_kwarg_is_non_default_and_nonsupported("return_tql")


class UpdateArgsVerfier(ArgsVerifierBase):
Expand All @@ -516,6 +538,9 @@ def verify(self):

class DeleteByPathArgsVerfier(ArgsVerifierBase):
    """Argument verifier for ``delete_by_path`` on the managed vector store.

    ``force`` and ``creds`` are not implemented for the managed backend.
    """

    _not_implemented_args = [
        "force",
        "creds",
    ]

    def verify(self):
        """Run the base checks and reject a non-default ``force`` kwarg.

        Raises:
            NotImplementedError: if ``force`` was passed with a
                non-default value.
        """
        super().verify()
        # Bug fix: the original called `self._verify_kwarg_is_not_false`,
        # which does not exist on the base verifier (CI fails with
        # AttributeError). The helper is named
        # `_verify_kwarg_is_non_default_and_nonsupported`.
        self._verify_kwarg_is_non_default_and_nonsupported("force")
68 changes: 15 additions & 53 deletions deeplake/core/vectorstore/dataset_handlers/test_managed_dh.py
Expand Up @@ -18,16 +18,16 @@ def test_managed_vectorstore_should_not_accept_dataset_during_init(
)


def test_managed_vectorstore_should_not_accept_embedding_function_during_init(
    hub_cloud_path, hub_cloud_dev_token
):
    """Initializing a managed (tensor_db) VectorStore with an
    ``embedding_function`` must raise ``NotImplementedError``."""
    with pytest.raises(NotImplementedError):
        VectorStore(
            path=hub_cloud_path,
            token=hub_cloud_dev_token,
            runtime={"tensor_db": True},
            embedding_function=lambda x: x,
        )
# def test_managed_vectorstore_should_not_accept_embedding_function_during_init(
# hub_cloud_path, hub_cloud_dev_token
# ):
# with pytest.raises(NotImplementedError):
# VectorStore(
# path=hub_cloud_path,
# token=hub_cloud_dev_token,
# runtime={"tensor_db": True},
# embedding_function=lambda x: x,
# )


def test_managed_vectorstore_should_not_accept_exec_option_during_init(
Expand Down Expand Up @@ -240,10 +240,10 @@ def test_managed_vectorstore_should_not_accept_exec_option_during_update_embeddi
path=hub_cloud_path,
token=hub_cloud_dev_token,
runtime={"tensor_db": True},
embedding_dim=100,
embedding_dim=3,
)

embedding_dict = {"embedding": [np.zeros(100, dtype=np.float32)] * 3}
embedding_dict = {"embedding": [[0]] * 3}

with pytest.raises(NotImplementedError):
db.update_embedding(

Check failure on line 249 in deeplake/core/vectorstore/dataset_handlers/test_managed_dh.py

View workflow job for this annotation

GitHub Actions / JUnit Test Report

test_managed_dh.test_managed_vectorstore_should_not_accept_exec_option_during_update_embedding

ValueError: Item 'None' of type 'NoneType' is not a valid key.
Raw output
hub_cloud_path = 'hub://testingacc2/tmp8cb0_test_managed_dh_test_managed_vectorstore_should_not_accept_exec_option_during_update_embedding'
hub_cloud_dev_token = 'eyJhbGciOiJIUzUxMiIsImlhdCI6MTcwMzc5MjAzOSwiZXhwIjoxNzA3MzkyMDM5fQ.eyJpZCI6InRlc3RpbmdhY2MyIn0.P37ZBtPnnDHh-ybydziEFLWyJ23ky1pfONZbiJ-sUOTcXjCmvUrTFufREagLU2ecR_WSiPhUVTPr9WPAXr0AtQ'

    def test_managed_vectorstore_should_not_accept_exec_option_during_update_embedding(
        hub_cloud_path, hub_cloud_dev_token
    ):
        db = utils.create_and_populate_vs(
            path=hub_cloud_path,
            token=hub_cloud_dev_token,
            runtime={"tensor_db": True},
            embedding_dim=3,
        )
    
        embedding_dict = {"embedding": [[0]] * 3}
    
        with pytest.raises(NotImplementedError):
>           db.update_embedding(
                embedding_dict=embedding_dict,
                embedding_source_tensor="text",
                embedding_tensor="embedding",
            )

deeplake/core/vectorstore/dataset_handlers/test_managed_dh.py:249: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
deeplake/core/vectorstore/deeplake_vectorstore.py:442: in update_embedding
    self.dataset_handler.update_embedding(
deeplake/core/vectorstore/dataset_handlers/managed_dataset_handler.py:346: in update_embedding
    self.client.vectorstore_update_embeddings(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <deeplake.client.managed.managed_client.ManagedServiceClient object at 0x7f0cefefddd0>
path = 'hub://testingacc2/tmp8cb0_test_managed_dh_test_managed_vectorstore_should_not_accept_exec_option_during_update_embedding'
row_ids = None, ids = None, filter = None, query = None
embedding_function = None, embedding_source_tensor = None
embedding_tensor = 'embedding', embedding_dict = {'embedding': [[0], [0], [0]]}

    def vectorstore_update_embeddings(
        self,
        path: str,
        row_ids: List[str],
        ids: List[str],
        filter: Union[Dict, Callable],
        query: str,
        embedding_function: Union[Callable, List[Callable]] = None,
        embedding_source_tensor: Union[str, List[str]] = None,
        embedding_tensor: Union[str, List[str]] = None,
        embedding_dict: Optional[Dict[str, Union[List[float], List[float]]]] = None,
    ):
        response = self.request(
            method="POST",
            relative_url=VECTORSTORE_UPDATE_ROWS_SUFFIX,
            json={
                "dataset": path,
                "row_ids": row_ids,
                "ids": ids,
                "filter": filter,
                "query": query,
                "embedding_dict": embedding_dict,
            },
        )
        data = self._get_result_or_poll(response)
        data = data.get("result", {})
        error = data.get("error", None)
    
        if error is not None:
>           raise ValueError(error)
E           ValueError: Item 'None' of type 'NoneType' is not a valid key.

deeplake/client/managed/managed_client.py:245: ValueError
Expand All @@ -260,10 +260,10 @@ def test_managed_vectorstore_should_not_accept_embedding_function_during_update_
path=hub_cloud_path,
token=hub_cloud_dev_token,
runtime={"tensor_db": True},
embedding_dim=100,
embedding_dim=3,
)

embedding_dict = {"embedding": [np.zeros(100, dtype=np.float32)] * 3}
embedding_dict = {"embedding": [[0, 0, 0]] * 3}

with pytest.raises(NotImplementedError):
db.update_embedding(
Expand All @@ -272,44 +272,6 @@ def test_managed_vectorstore_should_not_accept_embedding_function_during_update_
)


# def test_managed_vectorstore_should_not_accept_embedding_source_tensor_during_update_embedding(
# hub_cloud_path, hub_cloud_dev_token
# ):
# db = utils.create_and_populate_vs(
# path=hub_cloud_path,
# token=hub_cloud_dev_token,
# runtime={"tensor_db": True},
# embedding_dim=100,
# )

# embedding_dict = {"embedding": [np.zeros(100, dtype=np.float32)] * 3}

# with pytest.raises(NotImplementedError):
# db.update_embedding(
# embedding_dict=embedding_dict,
# embedding_source_tensor="text",
# )


def test_managed_vectorstore_should_not_accept_embedding_tensor_during_update_embedding(
    hub_cloud_path, hub_cloud_dev_token
):
    """Passing ``embedding_tensor`` to ``update_embedding`` on a managed
    (tensor_db) vector store must raise ``NotImplementedError``."""
    store = utils.create_and_populate_vs(
        path=hub_cloud_path,
        token=hub_cloud_dev_token,
        runtime={"tensor_db": True},
        embedding_dim=100,
    )

    # Three zero-vectors matching the store's embedding dimensionality.
    new_embeddings = {"embedding": [np.zeros(100, dtype=np.float32)] * 3}

    with pytest.raises(NotImplementedError):
        store.update_embedding(
            embedding_dict=new_embeddings,
            embedding_tensor="text",
        )


def test_managed_vectorstore_should_not_accept_force_during_delete_by_path(
hub_cloud_path, hub_cloud_dev_token
):
Expand All @@ -320,7 +282,7 @@ def test_managed_vectorstore_should_not_accept_force_during_delete_by_path(
)

with pytest.raises(NotImplementedError):
db.delete_by_path(path=hub_cloud_path, force=True)
db.delete_by_path(path=hub_cloud_path, force=True, runtime={"tensor_db": True})

Check failure on line 285 in deeplake/core/vectorstore/dataset_handlers/test_managed_dh.py

View workflow job for this annotation

GitHub Actions / JUnit Test Report

test_managed_dh.test_managed_vectorstore_should_not_accept_force_during_delete_by_path

AttributeError: 'DeleteByPathArgsVerfier' object has no attribute '_verify_kwarg_is_not_false'
Raw output
hub_cloud_path = 'hub://testingacc2/tmp8cb0_test_managed_dh_test_managed_vectorstore_should_not_accept_force_during_delete_by_path'
hub_cloud_dev_token = 'eyJhbGciOiJIUzUxMiIsImlhdCI6MTcwMzc5MjAzOSwiZXhwIjoxNzA3MzkyMDM5fQ.eyJpZCI6InRlc3RpbmdhY2MyIn0.P37ZBtPnnDHh-ybydziEFLWyJ23ky1pfONZbiJ-sUOTcXjCmvUrTFufREagLU2ecR_WSiPhUVTPr9WPAXr0AtQ'

    def test_managed_vectorstore_should_not_accept_force_during_delete_by_path(
        hub_cloud_path, hub_cloud_dev_token
    ):
        db = VectorStore(
            path=hub_cloud_path,
            token=hub_cloud_dev_token,
            runtime={"tensor_db": True},
        )
    
        with pytest.raises(NotImplementedError):
>           db.delete_by_path(path=hub_cloud_path, force=True, runtime={"tensor_db": True})

deeplake/core/vectorstore/dataset_handlers/test_managed_dh.py:285: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
deeplake/core/vectorstore/deeplake_vectorstore.py:478: in delete_by_path
    ManagedDH.delete_by_path(
deeplake/core/vectorstore/dataset_handlers/managed_dataset_handler.py:375: in delete_by_path
    ManagedVectorStoreArgsVerifier.verify_delete_by_path_args(
deeplake/core/vectorstore/dataset_handlers/managed_dataset_handler.py:50: in verify_delete_by_path_args
    args_verifier.verify()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <deeplake.core.vectorstore.dataset_handlers.managed_dataset_handler.DeleteByPathArgsVerfier object at 0x7f0cefc2f8d0>

    def verify(self):
        super().verify()
>       self._verify_kwarg_is_not_false("force")
E       AttributeError: 'DeleteByPathArgsVerfier' object has no attribute '_verify_kwarg_is_not_false'

deeplake/core/vectorstore/dataset_handlers/managed_dataset_handler.py:546: AttributeError


def test_managed_vectorstore_should_not_accept_creds_during_delete_by_path(
Expand Down
24 changes: 21 additions & 3 deletions deeplake/core/vectorstore/deep_memory/deep_memory.py
Expand Up @@ -5,6 +5,7 @@
from pydantic import BaseModel, ValidationError
from typing import Any, Dict, Optional, List, Union, Callable, Tuple
from time import time
from tqdm import tqdm

import numpy as np

Expand Down Expand Up @@ -118,7 +119,7 @@ def __init__(
self.token = token
self.embedding_function = embedding_function
self.client = self._get_dm_client()
self.creds = creds or {}
self.creds = creds
self.logger = logger

@access_control
Expand Down Expand Up @@ -194,13 +195,30 @@ def train(
)

self.logger.info("Preparing training data for deepmemory:")

embedding = []

# TODO: after allowing to send embedding function to managed side, remove the following lines: 203-209
# internal batch size for embedding function
batch_size = 128
query_batches = [
queries[i : i + batch_size] for i in range(0, len(queries), 100)
]

for _, query in tqdm(enumerate(query_batches), total=len(query_batches)):
embedded_docs = embedding_function.embed_documents(query)
for idx, embedded_doc in enumerate(embedded_docs):
if isinstance(embedded_doc, np.ndarray):
embedded_docs[idx] = embedded_doc.tolist()

embedding.extend(embedded_docs)

queries_vs.add(
text=[query for query in queries],
metadata=[
{"relevance": relevance_per_doc} for relevance_per_doc in relevance
],
embedding_data=[query for query in queries],
embedding_function=embedding_function,
embedding=embedding,
)

# do some rest_api calls to train the model
Expand Down
21 changes: 11 additions & 10 deletions deeplake/core/vectorstore/test_deeplake_vectorstore.py
Expand Up @@ -1240,16 +1240,17 @@ def test_update_embedding(
embedding_function=embedding_fn,
)

# case 8-9: single embedding_source_tensor, multiple embedding_tensor, single init_embedding_function
with pytest.raises(ValueError):
# case 8: error out because embedding_function is not specified during init call and update call
vector_store.update_embedding(
ids=vector_store_hash_ids,
row_ids=vector_store_row_ids,
filter=vector_store_filters,
query=vector_store_query,
embedding_source_tensor=embedding_source_tensor,
)
if init_embedding_function is None:
# case 8-9: single embedding_source_tensor, multiple embedding_tensor, single init_embedding_function
with pytest.raises(ValueError):
# case 8: error out because embedding_function is not specified during init call and update call
vector_store.update_embedding(
ids=vector_store_hash_ids,
row_ids=vector_store_row_ids,
filter=vector_store_filters,
query=vector_store_query,
embedding_source_tensor=embedding_source_tensor,
)

# case 10: single embedding_source_tensor, multiple embedding_tensor, multiple embedding_function -> error out?
with pytest.raises(ValueError):
Expand Down
2 changes: 1 addition & 1 deletion deeplake/tests/path_fixtures.py
Expand Up @@ -480,7 +480,7 @@ def corpus_query_relevances_copy(request, hub_cloud_dev_token):

corpus = _get_storage_path(request, HUB_CLOUD)
query_vs = VectorStore(
path=f"hub://{HUB_CLOUD_DEV_USERNAME}/deepmemory_test_queries",
path=f"hub://{HUB_CLOUD_DEV_USERNAME}/deepmemory_test_queries2",
runtime={"tensor_db": True},
token=hub_cloud_dev_token,
)
Expand Down

0 comments on commit c0a76dd

Please sign in to comment.