Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New vectorstore docs #2644

Draft
wants to merge 9 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
12 changes: 4 additions & 8 deletions deeplake/core/vectorstore/deep_memory.py
Expand Up @@ -192,7 +192,6 @@ def status(self, job_id: str):

Examples:
>>> vectorstore.deep_memory.status(job_id)

--------------------------------------------------------------
| 6508464cd80cab681bfcfff3 |
--------------------------------------------------------------
Expand Down Expand Up @@ -281,7 +280,7 @@ def evaluate(
"""Evaluate a model on DeepMemory managed service.

Examples:
# Evaluate a model with embedding function
>>> #1. Evaluate a model with embedding function
>>> relevance: List[List[Tuple[str, int]]] = [[("doc_id_1", 1), ("doc_id_2", 1)], [("doc_id_3", 1)]]
>>> # doc_id_1, doc_id_2, doc_id_3 are the ids of the documents in the corpus dataset that is relevant to the queries. It is stored in the `id` tensor of the corpus dataset.
>>> queries: List[str] = ["What is the capital of India?", "What is the capital of France?"]
Expand All @@ -291,8 +290,7 @@ def evaluate(
... queries=queries,
... embedding_function=embedding_function,
... )

# Evaluate a model with precomputed embeddings
>>> #2. Evaluate a model with precomputed embeddings
>>> relevance: List[List[Tuple[str, int]]] = [[("doc_id_1", 1), ("doc_id_2", 1)], [("doc_id_3", 1)]]
>>> # doc_id_1, doc_id_2, doc_id_3 are the ids of the documents in the corpus dataset that is relevant to the queries. It is stored in the `id` tensor of the corpus dataset.
>>> queries: List[str] = ["What is the capital of India?", "What is the capital of France?"]
Expand All @@ -302,8 +300,7 @@ def evaluate(
... queries=queries,
... embedding=embedding,
... )

# Evaluate a model with precomputed embeddings and log queries
>>> #3. Evaluate a model with precomputed embeddings and log queries
>>> relevance: List[List[Tuple[str, int]]] = [[("doc_id_1", 1), ("doc_id_2", 1)], [("doc_id_3", 1)]]
>>> # doc_id_1, doc_id_2, doc_id_3 are the ids of the documents in the corpus dataset that is relevant to the queries. It is stored in the `id` tensor of the corpus dataset.
>>> queries: List[str] = ["What is the capital of India?", "What is the capital of France?"]
Expand All @@ -316,8 +313,7 @@ def evaluate(
... "log_queries": True,
... }
... )

# Evaluate a model with precomputed embeddings and log queries, and custom branch
>>> #4. Evaluate a model with precomputed embeddings and log queries, and custom branch
>>> relevance: List[List[Tuple[str, int]]] = [[("doc_id_1", 1), ("doc_id_2", 1)], [("doc_id_3", 1)]]
>>> # doc_id_1, doc_id_2, doc_id_3 are the ids of the documents in the corpus dataset that is relevant to the queries. It is stored in the `id` tensor of the corpus dataset.
>>> queries: List[str] = ["What is the capital of India?", "What is the capital of France?"]
Expand Down
15 changes: 0 additions & 15 deletions deeplake/core/vectorstore/deeplake_vectorstore.py
Expand Up @@ -64,13 +64,11 @@ def __init__(
>>> data = VectorStore(
... path = "./my_vector_store",
... )

>>> # Create a vector store in the Deep Lake Managed Tensor Database
>>> data = VectorStore(
... path = "hub://org_id/dataset_name",
... runtime = {"tensor_db": True},
... )

>>> # Create a vector store with custom tensors
>>> data = VectorStore(
... path = "./my_vector_store",
Expand Down Expand Up @@ -233,22 +231,19 @@ def add(
>>> metadatas = [{"timestamp": "01:20"}, {"timestamp": "01:22"}]
>>> embedding_fn = lambda x: [[1, 2, 3]] * len(x)
>>> embedding_fn_2 = lambda x: [[4, 5]] * len(x)

>>> # Directly upload embeddings
>>> deeplake_vector_store.add(
... text = texts,
... embedding = embeddings,
... metadata = metadatas,
... )

>>> # Upload embedding via embedding function
>>> deeplake_vector_store.add(
... text = texts,
... metadata = metadatas,
... embedding_function = embedding_fn,
... embedding_data = texts,
... )

>>> # Upload embedding via embedding function to a user-defined embedding tensor
>>> deeplake_vector_store.add(
... text = texts,
Expand All @@ -257,22 +252,19 @@ def add(
... embedding_data = texts,
... embedding_tensor = "embedding_1",
... )

>>> # Multiple embedding functions (user defined embedding tensors must be specified)
>>> deeplake_vector_store.add(
... embedding_tensor = ["embedding_1", "embedding_2"],
... embedding_function = [embedding_fn, embedding_fn_2],
... embedding_data = [texts, texts],
... )

>>> # Alternative syntax for multiple embedding functions
>>> deeplake_vector_store.add(
... text = texts,
... metadata = metadatas,
... embedding_tensor_1 = (embedding_fn, texts),
... embedding_tensor_2 = (embedding_fn_2, texts),
... )

>>> # Add data to fully custom tensors
>>> deeplake_vector_store.add(
... tensor_A = [1, 2],
Expand Down Expand Up @@ -396,21 +388,18 @@ def search(
... embedding = [1, 2, 3],
... exec_option = "python",
... )

>>> # Search using an embedding function and data for embedding
>>> data = vector_store.search(
... embedding_data = "What does this chatbot do?",
... embedding_function = query_embedding_fn,
... exec_option = "compute_engine",
... )

>>> # Add a filter to your search
>>> data = vector_store.search(
... embedding = np.ones(3),
... exec_option = "python",
... filter = {"json_tensor_name": {"key": "value"}, "json_tensor_name_2": {"key_2": "value_2"}, ...}, # Only valid for exec_option = "python"
... )

>>> # Search using TQL
>>> data = vector_store.search(
... query = "select * where ..... <add TQL syntax>",
Expand Down Expand Up @@ -553,12 +542,10 @@ def delete(
Examples:
>>> # Delete using ids:
>>> data = vector_store.delete(ids)

>>> # Delete data using filter
>>> data = vector_store.delete(
... filter = {"json_tensor_name": {"key": "value"}, "json_tensor_name_2": {"key_2": "value_2"}},
... )

>>> # Delete data using TQL
>>> data = vector_store.delete(
... query = "select * where ..... <add TQL syntax>",
Expand Down Expand Up @@ -649,7 +636,6 @@ def update_embedding(
... embedding_tensor = "embedding",
... embedding_function = embedding_function,
... )

>>> # Update data using filter and several embedding_tensors, several embedding_source_tensors
>>> # and several embedding_functions:
>>> data = vector_store.update(
Expand All @@ -658,7 +644,6 @@ def update_embedding(
... filter = {"json_tensor_name": {"key": "value"}, "json_tensor_name_2": {"key_2": "value_2"}},
... embedding_tensor = ["text_embedding", "metadata_embedding"]
... )

>>> # Update data using TQL, if new embedding function is not specified the embedding_function used
>>> # during initialization will be used
>>> data = vector_store.update(
Expand Down
1 change: 1 addition & 0 deletions deeplake/core/vectorstore/test_deepmemory.py
Expand Up @@ -565,3 +565,4 @@ def test_deepmemory_search_on_local_datasets(
output = corpus.search(embedding=query_embedding, deep_memory=True, k=10)

assert correct_id in output["id"]
assert "score" in output
Expand Up @@ -4,6 +4,7 @@
from deeplake.core.vectorstore.vector_search.indra import query
from deeplake.core.vectorstore.vector_search import utils
from deeplake.core.dataset import Dataset as DeepLakeDataset
from deeplake.core.dataset.deeplake_query_dataset import DeepLakeQueryDataset
from deeplake.enterprise.util import raise_indra_installation_error


Expand Down Expand Up @@ -83,8 +84,9 @@ def search(
api.tql.prepare_deepmemory_metrics(indra_dataset)

indra_view = indra_dataset.query(tql_query)
indexes = indra_view.indexes
view = deeplake_dataset[indexes]

view = DeepLakeQueryDataset(deeplake_ds=deeplake_dataset, indra_ds=indra_view)
view._tql_query = tql_query

return_data = {}

Expand Down
158 changes: 158 additions & 0 deletions docs/source/deeplake.VectorStore.rst
@@ -0,0 +1,158 @@
deeplake.VectorStore
--------------------

.. autoclass:: deeplake.core.vectorstore.deeplake_vectorstore.VectorStore
:members:
:show-inheritance:

.. automethod:: __init__
:noindex:
:template: method

.. rubric:: Signature

.. code-block:: python

__init__(path: Union[str, pathlib.Path],
tensor_params: List[Dict[str, object]] = [
{'name': 'text', 'htype': 'text', ... },
{...},
{...},
{...}],
embedding_function: Optional[Callable] = None,
read_only: Optional[bool] = None,
ingestion_batch_size: int = 1000,
index_params: Optional[Dict[str, Union[int, str]]] = None,
num_workers: int = 0,
exec_option: str = 'auto',
token: Optional[str] = None,
overwrite: bool = False,
verbose: bool = True,
runtime: Optional[Dict] = None,
creds: Optional[Union[str, Dict]] = None,
org_id: Optional[str] = None,
logger: Logger = ...,
branch: str = 'main',
**kwargs: Any)

:param path: Path to the vector store.
:type path: Union[str, pathlib.Path]

:param tensor_params: Parameters for tensors with default configurations.
:type tensor_params: List[Dict[str, object]], optional

:param embedding_function: Function for embeddings. Default is None.
:type embedding_function: Optional[Callable], optional

:param read_only: Flag for read-only mode. Default is None.
:type read_only: Optional[bool], optional

:param ingestion_batch_size: Batch size for ingestion. Default is 1000.
:type ingestion_batch_size: int, optional

:param index_params: Parameters for indexing. Default is None.
:type index_params: Optional[Dict[str, Union[int, str]]], optional

:param num_workers: Number of workers. Default is 0.
:type num_workers: int, optional

:param exec_option: Execution option. Default is 'auto'.
:type exec_option: str, optional

:param token: Token for authentication. Default is None.
:type token: Optional[str], optional

:param overwrite: Flag to overwrite existing data. Default is False.
:type overwrite: bool, optional

:param verbose: Flag for verbose logging. Default is True.
:type verbose: bool, optional

:param runtime: Runtime configurations. Default is None.
:type runtime: Optional[Dict], optional

:param creds: Credentials for authentication. Default is None.
:type creds: Optional[Union[str, Dict]], optional

:param org_id: Organization ID. Default is None.
:type org_id: Optional[str], optional

:param logger: Logger object. Default provided.
:type logger: Logger, optional

:param branch: Branch name. Default is 'main'.
:type branch: str, optional

:param kwargs: Additional keyword arguments.
:type kwargs: Any, optional

.. automethod:: add
:noindex:
:template: method

.. rubric:: Signature

.. code-block:: python

add(embedding_function: Optional[Union[Callable, List[Callable]]] = None,
embedding_data: Optional[Union[List, List[List]]] = None,
embedding_tensor: Optional[Union[str, List[str]]] = None,
return_ids: bool = False,
rate_limiter: Dict = {'bytes_per_minute': 1800000.0, 'enabled': False},
batch_byte_size: int = 10000,
**tensors) → Optional[List[str]]

:param embedding_function: Embedding function(s). Default is None.
:type embedding_function: Optional[Union[Callable, List[Callable]]], optional

:param embedding_data: Data for embeddings. Default is None.
:type embedding_data: Optional[Union[List, List[List]]], optional

:param embedding_tensor: Name of the tensor(s) for embedding. Default is None.
:type embedding_tensor: Optional[Union[str, List[str]]], optional

:param return_ids: Flag to return IDs. Default is False.
:type return_ids: bool, optional

:param rate_limiter: Rate limiting configuration. Default provided.
:type rate_limiter: Dict, optional

:param batch_byte_size: Batch byte size. Default is 10000.
:type batch_byte_size: int, optional

:param tensors: Additional tensors.
:type tensors: Any, optional

.. automethod:: delete
:noindex:
:template: method

.. rubric:: Signature

.. code-block:: python

delete(row_ids: Optional[List[str]] = None,
ids: Optional[List[str]] = None,
filter: Optional[Union[Dict, Callable]] = None,
query: Optional[str] = None,
exec_option: Optional[str] = None,
delete_all: Optional[bool] = None) → bool

:param row_ids: Row IDs to delete. Default is None.
:type row_ids: Optional[List[str]], optional

:param ids: IDs to delete. Default is None.
:type ids: Optional[List[str]], optional

:param filter: Filter for rows to delete. Can be a dictionary or callable. Default is None.
:type filter: Optional[Union[Dict, Callable]], optional

:param query: Query to determine rows to delete. Default is None.
:type query: Optional[str], optional

:param exec_option: Execution option for deletion. Default is None.
:type exec_option: Optional[str], optional

:param delete_all: Flag to delete all entries. Default is None.
:type delete_all: Optional[bool], optional

1 change: 1 addition & 0 deletions docs/source/index.rst
Expand Up @@ -50,6 +50,7 @@ Deep Lake is an open-source database for AI.
:caption: API Reference

deeplake
deeplake.VectorStore
deeplake.core
deeplake.core.dataset
deeplake.core.tensor
Expand Down