
Merge pull request #2832 from activeloopai/docs_fix #1550

GitHub Actions / JUnit Test Report failed Apr 26, 2024 in 0s

22436 tests run, 11800 passed, 10632 skipped, 4 failed.

Annotations

Check failure on line 656 in deeplake/core/vectorstore/test_deeplake_vectorstore.py


github-actions / JUnit Test Report

test_deeplake_vectorstore.test_search_quantitative[COS]

assert False
 +  where False = all([True, True, True, False])
Raw output
distance_metric = 'COS'
hub_cloud_dev_token = 'eyJhbGciOiJub25lIiwidHlwIjoiSldUIn0.eyJpZCI6InRlc3RpbmdhY2MyIiwiYXBpX2tleSI6IjU4Y0tLb1p6UE1BbThPU2RpbTRiZ2tBekhWekt1VUE3MFJpNTNyZUpKRTJuaiJ9.'

    @pytest.mark.slow
    @requires_libdeeplake
    @pytest.mark.parametrize("distance_metric", ["L1", "L2", "COS", "MAX"])
    def test_search_quantitative(distance_metric, hub_cloud_dev_token):
        """Test whether TQL and Python return the same results"""
        # initialize vector store object:
        vector_store = DeepLakeVectorStore(
            path="hub://testingacc2/vectorstore_test",
            read_only=True,
            token=hub_cloud_dev_token,
        )
    
        # use python implementation to search the data
        data_p = vector_store.search(
            embedding=query_embedding, exec_option="python", distance_metric=distance_metric
        )
    
        # use indra implementation to search the data
        data_ce = vector_store.search(
            embedding=query_embedding,
            exec_option="compute_engine",
            distance_metric=distance_metric,
        )
    
        assert len(data_p["score"]) == len(data_ce["score"])
>       assert all(
            [
                isclose(
                    data_p["score"][i],
                    data_ce["score"][i],
                    abs_tol=0.00001
                    * (abs(data_p["score"][i]) + abs(data_ce["score"][i]))
                    / 2,
                )
                for i in range(len(data_p["score"]))
            ]
        )
E       assert False
E        +  where False = all([True, True, True, False])

deeplake/core/vectorstore/test_deeplake_vectorstore.py:656: AssertionError
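For reference, the failing check compares the Python and compute_engine scores pairwise with math.isclose, using an absolute tolerance of 1e-5 times the mean magnitude of each pair, so a single diverging pair is enough to turn the whole all(...) into False. A minimal, standalone sketch of the same comparison (the score values below are hypothetical, not taken from this run):

    from math import isclose

    # Hypothetical stand-ins for data_p["score"] and data_ce["score"].
    scores_python = [0.9812, 0.9457, 0.9021, 0.8744]
    scores_compute_engine = [0.9812, 0.9457, 0.9021, 0.8729]  # last pair diverges

    checks = [
        isclose(
            a,
            b,
            # Same tolerance rule as the test: 1e-5 times the mean magnitude of the pair.
            abs_tol=0.00001 * (abs(a) + abs(b)) / 2,
        )
        for a, b in zip(scores_python, scores_compute_engine)
    ]
    print(checks)       # [True, True, True, False]
    print(all(checks))  # False -- the same shape of failure reported above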

Check failure on line 726 in deeplake/core/vectorstore/test_deeplake_vectorstore.py


github-actions / JUnit Test Report

test_deeplake_vectorstore.test_search_managed

assert False
 +  where False = all([True, True, True, False])
Raw output
hub_cloud_dev_token = 'eyJhbGciOiJub25lIiwidHlwIjoiSldUIn0.eyJpZCI6InRlc3RpbmdhY2MyIiwiYXBpX2tleSI6IjU4Y0tLb1p6UE1BbThPU2RpbTRiZ2tBekhWekt1VUE3MFJpNTNyZUpKRTJuaiJ9.'

    @requires_libdeeplake
    @pytest.mark.slow
    def test_search_managed(hub_cloud_dev_token):
        """Test whether managed TQL and client-side TQL return the same results"""
        # initialize vector store object:
        vector_store = DeepLakeVectorStore(
            path="hub://testingacc2/vectorstore_test_managed",
            read_only=True,
            token=hub_cloud_dev_token,
        )
    
        # use indra implementation to search the data
        data_ce = vector_store.search(
            embedding=query_embedding,
            exec_option="compute_engine",
        )
    
        data_db = vector_store.search(
            embedding=query_embedding,
            exec_option="tensor_db",
        )
    
        assert "vectordb/" in vector_store.dataset_handler.dataset.base_storage.path
    
        assert len(data_ce["score"]) == len(data_db["score"])
>       assert all(
            [
                isclose(
                    data_ce["score"][i],
                    data_db["score"][i],
                    abs_tol=0.00001
                    * (abs(data_ce["score"][i]) + abs(data_db["score"][i]))
                    / 2,
                )
                for i in range(len(data_ce["score"]))
            ]
        )
E       assert False
E        +  where False = all([True, True, True, False])

deeplake/core/vectorstore/test_deeplake_vectorstore.py:726: AssertionError
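The same tolerance rule fails here for one of the four returned scores. To see which pair diverges, rather than only that all(...) is False, a small helper along these lines could be added while debugging (a sketch only; diverging_indices is not part of the test suite):

    from math import isclose

    def diverging_indices(scores_a, scores_b, factor=0.00001):
        """Return (i, a, b) for every score pair that fails the test's tolerance rule."""
        return [
            (i, a, b)
            for i, (a, b) in enumerate(zip(scores_a, scores_b))
            if not isclose(a, b, abs_tol=factor * (abs(a) + abs(b)) / 2)
        ]

    # Usage inside the test body, with data_ce and data_db as above:
    # print(diverging_indices(data_ce["score"], data_db["score"]))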

Check failure on line 222 in deeplake/api/tests/test_json.py


github-actions / JUnit Test Report

test_json.test_json_transform[None-gdrive_ds]

deeplake.util.exceptions.TransformError: Transform failed. See traceback for more details.
Raw output
self = <deeplake.core.transform.transform.Pipeline object at 0x0000022ECBAC2700>
data_in = [{'x': [1, 2, 3], 'y': [4, [5, 6]]}, {'x': [1, 2, 3], 'y': [4, {'z': [0.1, 0.2, []]}]}, ['a', ['b', 'c'], {'d': 1.0}], [1.0, 2.0, 3.0, 4.0], ['a', 'b', 'c', 'd'], 1, ...]
ds_out = Dataset(path='gdrive://hubtest/tmp63f6', tensors=['json'])
num_workers = 2, scheduler = 'threaded', progressbar = True, skip_ok = False
check_lengths = True, pad_data_in = False, read_only_ok = False, cache_size = 16
checkpoint_interval = 0, ignore_errors = False, verbose = True, kwargs = {}
overwrite = False
original_data_in = [{'x': [1, 2, 3], 'y': [4, [5, 6]]}, {'x': [1, 2, 3], 'y': [4, {'z': [0.1, 0.2, []]}]}, ['a', ['b', 'c'], {'d': 1.0}], [1.0, 2.0, 3.0, 4.0], ['a', 'b', 'c', 'd'], 1, ...]
initial_padding_state = None
target_ds = Dataset(path='gdrive://hubtest/tmp63f6', tensors=['json'])
compute_provider = <deeplake.core.compute.thread.ThreadProvider object at 0x0000022ECBAC2880>
compute_id = '3f8d9560b8444a7092d16ab2027d555b', initial_autoflush = True

    def eval(
        self,
        data_in,
        ds_out: Optional[deeplake.Dataset] = None,
        num_workers: int = 0,
        scheduler: str = "threaded",
        progressbar: bool = True,
        skip_ok: bool = False,
        check_lengths: bool = True,
        pad_data_in: bool = False,
        read_only_ok: bool = False,
        cache_size: int = DEFAULT_TRANSFORM_SAMPLE_CACHE_SIZE,
        checkpoint_interval: int = 0,
        ignore_errors: bool = False,
        verbose: bool = True,
        **kwargs,
    ):
        """Evaluates the pipeline on ``data_in`` to produce an output dataset ``ds_out``.
    
        Args:
            data_in: Input passed to the transform to generate output dataset. Should support \__getitem__ and \__len__. Can be a Deep Lake dataset.
            ds_out (Dataset, optional): - The dataset object to which the transform will get written. If this is not provided, ``data_in`` will be overwritten if it is a Deep Lake dataset, otherwise error will be raised.
                - It should have all keys being generated in output already present as tensors. It's initial state should be either:
                - **Empty**, i.e., all tensors have no samples. In this case all samples are added to the dataset.
                - **All tensors are populated and have same length.** In this case new samples are appended to the dataset.
            num_workers (int): The number of workers to use for performing the transform. Defaults to 0. When set to 0, it will always use serial processing, irrespective of the scheduler.
            scheduler (str): The scheduler to be used to compute the transformation. Supported values include: 'serial', 'threaded', 'processed' and 'ray'.
                Defaults to 'threaded'.
            progressbar (bool): Displays a progress bar if ``True`` (default).
            skip_ok (bool): If ``True``, skips the check for output tensors generated. This allows the user to skip certain tensors in the function definition.
                This is especially useful for inplace transformations in which certain tensors are not modified. Defaults to ``False``.
            check_lengths (bool): If ``True``, checks whether ``ds_out`` has tensors of same lengths initially.
            pad_data_in (bool): If ``True``, pads tensors of ``data_in`` to match the length of the largest tensor in ``data_in``.
                Defaults to ``False``.
            read_only_ok (bool): If ``True`` and output dataset is same as input dataset, the read-only check is skipped.
                Defaults to False.
            cache_size (int): Cache size to be used by transform per worker.
            checkpoint_interval (int): If > 0, the transform will be checkpointed with a commit every ``checkpoint_interval`` input samples to avoid restarting full transform due to intermitten failures. If the transform is interrupted, the intermediate data is deleted and the dataset is reset to the last commit.
                If <= 0, no checkpointing is done. Checkpoint interval should be a multiple of num_workers if num_workers > 0. Defaults to 0.
            ignore_errors (bool): If ``True``, input samples that causes transform to fail will be skipped and the errors will be ignored **if possible**.
            verbose (bool): If ``True``, prints additional information about the transform.
            **kwargs: Additional arguments.
    
        Raises:
            InvalidInputDataError: If ``data_in`` passed to transform is invalid. It should support \__getitem__ and \__len__ operations. Using scheduler other than "threaded" with deeplake dataset having base storage as memory as ``data_in`` will also raise this.
            InvalidOutputDatasetError: If all the tensors of ``ds_out`` passed to transform don't have the same length. Using scheduler other than "threaded" with deeplake dataset having base storage as memory as ``ds_out`` will also raise this.
            TensorMismatchError: If one or more of the outputs generated during transform contain different tensors than the ones present in 'ds_out' provided to transform.
            UnsupportedSchedulerError: If the scheduler passed is not recognized. Supported values include: 'serial', 'threaded', 'processed' and 'ray'.
            TransformError: All other exceptions raised if there are problems while running the pipeline.
            ValueError: If ``num_workers`` > 0 and ``checkpoint_interval`` is not a multiple of ``num_workers`` or if ``checkpoint_interval`` > 0 and ds_out is None.
    
    
        # noqa: DAR401
    
        Example::
    
            @deeplake.compute
            def my_fn(sample_in: Any, samples_out, my_arg0, my_arg1=0):
                samples_out.my_tensor.append(my_arg0 * my_arg1)
    
            # This transform can be used using the eval method in one of these 2 ways:-
    
            # Directly evaluating the method
            # here arg0 and arg1 correspond to the 3rd and 4th argument in my_fn
            my_fn(arg0, arg1).eval(data_in, ds_out, scheduler="threaded", num_workers=5)
    
            # As a part of a Transform pipeline containing other functions
            pipeline = deeplake.compose([my_fn(a, b), another_function(x=2)])
            pipeline.eval(data_in, ds_out, scheduler="processed", num_workers=2)
    
        Note:
            ``pad_data_in`` is only applicable if ``data_in`` is a Deep Lake dataset.
    
        """
        num_workers, scheduler = sanitize_workers_scheduler(num_workers, scheduler)
        overwrite = ds_out is None
        deeplake_reporter.feature_report(
            feature_name="eval",
            parameters={"Num_Workers": str(num_workers), "Scheduler": scheduler},
        )
        check_transform_data_in(data_in, scheduler)
    
        data_in, original_data_in, initial_padding_state = prepare_data_in(
            data_in, pad_data_in, overwrite
        )
        target_ds = data_in if overwrite else ds_out
    
        check_transform_ds_out(
            target_ds, scheduler, check_lengths, read_only_ok and overwrite
        )
    
        # if overwrite then we've already flushed and autocheckecked out data_in which is target_ds now
        if not overwrite:
            target_ds.flush()
            auto_checkout(target_ds)
    
        compute_provider = get_compute_provider(scheduler, num_workers)
        compute_id = str(uuid4().hex)
        target_ds._send_compute_progress(compute_id=compute_id, start=True, progress=0)
    
        initial_autoflush = target_ds.storage.autoflush
        target_ds.storage.autoflush = False
    
        if not check_lengths or read_only_ok:
            skip_ok = True
    
        checkpointing_enabled = checkpoint_interval > 0
        total_samples = len_data_in(data_in)
        if checkpointing_enabled:
            check_checkpoint_interval(
                data_in,
                checkpoint_interval,
                num_workers,
                overwrite,
                verbose,
            )
            datas_in = [
                data_in[i : i + checkpoint_interval]
                for i in range(0, len_data_in(data_in), checkpoint_interval)
            ]
    
        else:
            datas_in = [data_in]
    
        samples_processed = 0
        desc = get_pbar_description(self.functions)
        if progressbar:
            pbar = get_progress_bar(len_data_in(data_in), desc)
            pqueue = compute_provider.create_queue()
        else:
            pbar, pqueue = None, None
        try:
            desc = desc.split()[1]
            completed = False
            progress = 0.0
            for data_in in datas_in:
                if checkpointing_enabled:
                    target_ds._commit(
                        f"Auto-commit during deeplake.compute of {desc} after {progress}% progress",
                        None,
                        False,
                        is_checkpoint=True,
                        total_samples_processed=samples_processed,
                    )
                progress = round(
                    (samples_processed + len_data_in(data_in)) / total_samples * 100, 2
                )
                end = progress == 100
                progress_args = {
                    "compute_id": compute_id,
                    "progress": progress,
                    "end": end,
                }
    
                try:
>                   self.run(
                        data_in,
                        target_ds,
                        compute_provider,
                        num_workers,
                        scheduler,
                        progressbar,
                        overwrite,
                        skip_ok,
                        read_only_ok and overwrite,
                        cache_size,
                        pbar,
                        pqueue,
                        ignore_errors,
                        **kwargs,
                    )

deeplake\core\transform\transform.py:288: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
deeplake\core\transform\transform.py:430: in run
    result = compute.map_with_progress_bar(
deeplake\core\compute\provider.py:62: in map_with_progress_bar
    result = self.map(sub_func, iterable)
deeplake\core\compute\thread.py:13: in map
    return self.pool.map(func, iterable)
c:\hostedtoolcache\windows\python\3.8.10\x64\lib\site-packages\pathos\threading.py:144: in map
    return _pool.map(star(f), zip(*args), **kwds)
c:\hostedtoolcache\windows\python\3.8.10\x64\lib\site-packages\multiprocess\pool.py:364: in map
    return self._map_async(func, iterable, mapstar, chunksize).get()
c:\hostedtoolcache\windows\python\3.8.10\x64\lib\site-packages\multiprocess\pool.py:771: in get
    raise self._value
c:\hostedtoolcache\windows\python\3.8.10\x64\lib\site-packages\multiprocess\pool.py:125: in worker
    result = (True, func(*args, **kwds))
c:\hostedtoolcache\windows\python\3.8.10\x64\lib\site-packages\multiprocess\pool.py:48: in mapstar
    return list(map(*args))
c:\hostedtoolcache\windows\python\3.8.10\x64\lib\site-packages\pathos\helpers\mp_helper.py:15: in <lambda>
    func = lambda args: f(*args)
deeplake\core\compute\provider.py:57: in sub_func
    return func(pg_callback, *args, **kwargs)
deeplake\util\transform.py:345: in store_data_slice_with_pbar
    all_chunk_engines = create_worker_chunk_engines(
deeplake\util\transform.py:439: in create_worker_chunk_engines
    storage_chunk_engine = ChunkEngine(tensor, storage_cache, version_state)
deeplake\core\chunk_engine.py:230: in __init__
    tensor_meta = self.tensor_meta
deeplake\core\chunk_engine.py:321: in tensor_meta
    self._tensor_meta = self.meta_cache.get_deeplake_object(key, TensorMeta)
deeplake\core\storage\lru_cache.py:166: in get_deeplake_object
    item = self[path]
deeplake\core\storage\lru_cache.py:217: in __getitem__
    result = self.next_storage[path]
deeplake\core\storage\google_drive.py:287: in __getitem__
    return self.get_object_by_id(id)
deeplake\core\storage\google_drive.py:277: in get_object_by_id
    status, done = downloader.next_chunk()
c:\hostedtoolcache\windows\python\3.8.10\x64\lib\site-packages\googleapiclient\_helpers.py:131: in positional_wrapper
    return wrapped(*args, **kwargs)
c:\hostedtoolcache\windows\python\3.8.10\x64\lib\site-packages\googleapiclient\http.py:740: in next_chunk
    resp, content = _retry_request(
c:\hostedtoolcache\windows\python\3.8.10\x64\lib\site-packages\googleapiclient\http.py:221: in _retry_request
    raise exception
c:\hostedtoolcache\windows\python\3.8.10\x64\lib\site-packages\googleapiclient\http.py:190: in _retry_request
    resp, content = http.request(uri, method, *args, **kwargs)
c:\hostedtoolcache\windows\python\3.8.10\x64\lib\site-packages\google_auth_httplib2.py:218: in request
    response, content = self.http.request(
c:\hostedtoolcache\windows\python\3.8.10\x64\lib\site-packages\httplib2\__init__.py:1724: in request
    (response, content) = self._request(
c:\hostedtoolcache\windows\python\3.8.10\x64\lib\site-packages\httplib2\__init__.py:1444: in _request
    (response, content) = self._conn_request(conn, request_uri, method, body, headers)
c:\hostedtoolcache\windows\python\3.8.10\x64\lib\site-packages\httplib2\__init__.py:1396: in _conn_request
    response = conn.getresponse()
c:\hostedtoolcache\windows\python\3.8.10\x64\lib\http\client.py:1344: in getresponse
    response.begin()
c:\hostedtoolcache\windows\python\3.8.10\x64\lib\http\client.py:307: in begin
    version, status, reason = self._read_status()
c:\hostedtoolcache\windows\python\3.8.10\x64\lib\http\client.py:268: in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
c:\hostedtoolcache\windows\python\3.8.10\x64\lib\socket.py:669: in readinto
    return self._sock.recv_into(b)
c:\hostedtoolcache\windows\python\3.8.10\x64\lib\ssl.py:1241: in recv_into
    return self.read(nbytes, buffer)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <ssl.SSLSocket [closed] fd=-1, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0>
len = 8192, buffer = <memory at 0x0000022EC95E8280>

    def read(self, len=1024, buffer=None):
        """Read up to LEN bytes and return them.
        Return zero-length string on EOF."""
    
        self._checkClosed()
        if self._sslobj is None:
            raise ValueError("Read on closed or unwrapped SSL socket.")
        try:
            if buffer is not None:
>               return self._sslobj.read(len, buffer)
E               socket.timeout: The read operation timed out

c:\hostedtoolcache\windows\python\3.8.10\x64\lib\ssl.py:1099: timeout

The above exception was the direct cause of the following exception:

ds = Dataset(path='gdrive://hubtest/tmp63f6', tensors=['json'])
compression = None, scheduler = 'threaded'

    @enabled_non_gcs_datasets
    @pytest.mark.parametrize("compression", ["lz4", None])
    @pytest.mark.slow
    def test_json_transform(ds, compression, scheduler="threaded"):
        ds.create_tensor("json", htype="json", sample_compression=compression)
    
        items = [
            {"x": [1, 2, 3], "y": [4, [5, 6]]},
            {"x": [1, 2, 3], "y": [4, {"z": [0.1, 0.2, []]}]},
            ["a", ["b", "c"], {"d": 1.0}],
            [1.0, 2.0, 3.0, 4.0],
            ["a", "b", "c", "d"],
            1,
            5.0,
            True,
            False,
            None,
        ] * 5
    
        expected = [*items[:9], {}] * 5
    
        @deeplake.compute
        def upload(stuff, ds):
            ds.json.append(stuff)
            return ds
    
>       upload().eval(items, ds, num_workers=2, scheduler=scheduler)

deeplake\api\tests\test_json.py:222: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
deeplake\core\transform\transform.py:105: in eval
    pipeline.eval(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <deeplake.core.transform.transform.Pipeline object at 0x0000022ECBAC2700>
data_in = [{'x': [1, 2, 3], 'y': [4, [5, 6]]}, {'x': [1, 2, 3], 'y': [4, {'z': [0.1, 0.2, []]}]}, ['a', ['b', 'c'], {'d': 1.0}], [1.0, 2.0, 3.0, 4.0], ['a', 'b', 'c', 'd'], 1, ...]
ds_out = Dataset(path='gdrive://hubtest/tmp63f6', tensors=['json'])
num_workers = 2, scheduler = 'threaded', progressbar = True, skip_ok = False
check_lengths = True, pad_data_in = False, read_only_ok = False, cache_size = 16
checkpoint_interval = 0, ignore_errors = False, verbose = True, kwargs = {}
overwrite = False
original_data_in = [{'x': [1, 2, 3], 'y': [4, [5, 6]]}, {'x': [1, 2, 3], 'y': [4, {'z': [0.1, 0.2, []]}]}, ['a', ['b', 'c'], {'d': 1.0}], [1.0, 2.0, 3.0, 4.0], ['a', 'b', 'c', 'd'], 1, ...]
initial_padding_state = None
target_ds = Dataset(path='gdrive://hubtest/tmp63f6', tensors=['json'])
compute_provider = <deeplake.core.compute.thread.ThreadProvider object at 0x0000022ECBAC2880>
compute_id = '3f8d9560b8444a7092d16ab2027d555b', initial_autoflush = True

    def eval(
        self,
        data_in,
        ds_out: Optional[deeplake.Dataset] = None,
        num_workers: int = 0,
        scheduler: str = "threaded",
        progressbar: bool = True,
        skip_ok: bool = False,
        check_lengths: bool = True,
        pad_data_in: bool = False,
        read_only_ok: bool = False,
        cache_size: int = DEFAULT_TRANSFORM_SAMPLE_CACHE_SIZE,
        checkpoint_interval: int = 0,
        ignore_errors: bool = False,
        verbose: bool = True,
        **kwargs,
    ):
        """Evaluates the pipeline on ``data_in`` to produce an output dataset ``ds_out``.
    
        Args:
            data_in: Input passed to the transform to generate output dataset. Should support \__getitem__ and \__len__. Can be a Deep Lake dataset.
            ds_out (Dataset, optional): - The dataset object to which the transform will get written. If this is not provided, ``data_in`` will be overwritten if it is a Deep Lake dataset, otherwise error will be raised.
                - It should have all keys being generated in output already present as tensors. It's initial state should be either:
                - **Empty**, i.e., all tensors have no samples. In this case all samples are added to the dataset.
                - **All tensors are populated and have same length.** In this case new samples are appended to the dataset.
            num_workers (int): The number of workers to use for performing the transform. Defaults to 0. When set to 0, it will always use serial processing, irrespective of the scheduler.
            scheduler (str): The scheduler to be used to compute the transformation. Supported values include: 'serial', 'threaded', 'processed' and 'ray'.
                Defaults to 'threaded'.
            progressbar (bool): Displays a progress bar if ``True`` (default).
            skip_ok (bool): If ``True``, skips the check for output tensors generated. This allows the user to skip certain tensors in the function definition.
                This is especially useful for inplace transformations in which certain tensors are not modified. Defaults to ``False``.
            check_lengths (bool): If ``True``, checks whether ``ds_out`` has tensors of same lengths initially.
            pad_data_in (bool): If ``True``, pads tensors of ``data_in`` to match the length of the largest tensor in ``data_in``.
                Defaults to ``False``.
            read_only_ok (bool): If ``True`` and output dataset is same as input dataset, the read-only check is skipped.
                Defaults to False.
            cache_size (int): Cache size to be used by transform per worker.
            checkpoint_interval (int): If > 0, the transform will be checkpointed with a commit every ``checkpoint_interval`` input samples to avoid restarting full transform due to intermitten failures. If the transform is interrupted, the intermediate data is deleted and the dataset is reset to the last commit.
                If <= 0, no checkpointing is done. Checkpoint interval should be a multiple of num_workers if num_workers > 0. Defaults to 0.
            ignore_errors (bool): If ``True``, input samples that causes transform to fail will be skipped and the errors will be ignored **if possible**.
            verbose (bool): If ``True``, prints additional information about the transform.
            **kwargs: Additional arguments.
    
        Raises:
            InvalidInputDataError: If ``data_in`` passed to transform is invalid. It should support \__getitem__ and \__len__ operations. Using scheduler other than "threaded" with deeplake dataset having base storage as memory as ``data_in`` will also raise this.
            InvalidOutputDatasetError: If all the tensors of ``ds_out`` passed to transform don't have the same length. Using scheduler other than "threaded" with deeplake dataset having base storage as memory as ``ds_out`` will also raise this.
            TensorMismatchError: If one or more of the outputs generated during transform contain different tensors than the ones present in 'ds_out' provided to transform.
            UnsupportedSchedulerError: If the scheduler passed is not recognized. Supported values include: 'serial', 'threaded', 'processed' and 'ray'.
            TransformError: All other exceptions raised if there are problems while running the pipeline.
            ValueError: If ``num_workers`` > 0 and ``checkpoint_interval`` is not a multiple of ``num_workers`` or if ``checkpoint_interval`` > 0 and ds_out is None.
    
    
        # noqa: DAR401
    
        Example::
    
            @deeplake.compute
            def my_fn(sample_in: Any, samples_out, my_arg0, my_arg1=0):
                samples_out.my_tensor.append(my_arg0 * my_arg1)
    
            # This transform can be used using the eval method in one of these 2 ways:-
    
            # Directly evaluating the method
            # here arg0 and arg1 correspond to the 3rd and 4th argument in my_fn
            my_fn(arg0, arg1).eval(data_in, ds_out, scheduler="threaded", num_workers=5)
    
            # As a part of a Transform pipeline containing other functions
            pipeline = deeplake.compose([my_fn(a, b), another_function(x=2)])
            pipeline.eval(data_in, ds_out, scheduler="processed", num_workers=2)
    
        Note:
            ``pad_data_in`` is only applicable if ``data_in`` is a Deep Lake dataset.
    
        """
        num_workers, scheduler = sanitize_workers_scheduler(num_workers, scheduler)
        overwrite = ds_out is None
        deeplake_reporter.feature_report(
            feature_name="eval",
            parameters={"Num_Workers": str(num_workers), "Scheduler": scheduler},
        )
        check_transform_data_in(data_in, scheduler)
    
        data_in, original_data_in, initial_padding_state = prepare_data_in(
            data_in, pad_data_in, overwrite
        )
        target_ds = data_in if overwrite else ds_out
    
        check_transform_ds_out(
            target_ds, scheduler, check_lengths, read_only_ok and overwrite
        )
    
        # if overwrite then we've already flushed and autocheckecked out data_in which is target_ds now
        if not overwrite:
            target_ds.flush()
            auto_checkout(target_ds)
    
        compute_provider = get_compute_provider(scheduler, num_workers)
        compute_id = str(uuid4().hex)
        target_ds._send_compute_progress(compute_id=compute_id, start=True, progress=0)
    
        initial_autoflush = target_ds.storage.autoflush
        target_ds.storage.autoflush = False
    
        if not check_lengths or read_only_ok:
            skip_ok = True
    
        checkpointing_enabled = checkpoint_interval > 0
        total_samples = len_data_in(data_in)
        if checkpointing_enabled:
            check_checkpoint_interval(
                data_in,
                checkpoint_interval,
                num_workers,
                overwrite,
                verbose,
            )
            datas_in = [
                data_in[i : i + checkpoint_interval]
                for i in range(0, len_data_in(data_in), checkpoint_interval)
            ]
    
        else:
            datas_in = [data_in]
    
        samples_processed = 0
        desc = get_pbar_description(self.functions)
        if progressbar:
            pbar = get_progress_bar(len_data_in(data_in), desc)
            pqueue = compute_provider.create_queue()
        else:
            pbar, pqueue = None, None
        try:
            desc = desc.split()[1]
            completed = False
            progress = 0.0
            for data_in in datas_in:
                if checkpointing_enabled:
                    target_ds._commit(
                        f"Auto-commit during deeplake.compute of {desc} after {progress}% progress",
                        None,
                        False,
                        is_checkpoint=True,
                        total_samples_processed=samples_processed,
                    )
                progress = round(
                    (samples_processed + len_data_in(data_in)) / total_samples * 100, 2
                )
                end = progress == 100
                progress_args = {
                    "compute_id": compute_id,
                    "progress": progress,
                    "end": end,
                }
    
                try:
                    self.run(
                        data_in,
                        target_ds,
                        compute_provider,
                        num_workers,
                        scheduler,
                        progressbar,
                        overwrite,
                        skip_ok,
                        read_only_ok and overwrite,
                        cache_size,
                        pbar,
                        pqueue,
                        ignore_errors,
                        **kwargs,
                    )
                    target_ds._send_compute_progress(**progress_args, status="success")
                    samples_processed += len_data_in(data_in)
                    completed = end
                except Exception as e:
                    if checkpointing_enabled:
                        print(
                            "Transform failed. Resetting back to last committed checkpoint."
                        )
                        target_ds.reset(force=True)
                    target_ds._send_compute_progress(**progress_args, status="failed")
                    index, sample, suggest = None, None, False
                    if isinstance(e, TransformError):
                        index, sample, suggest = e.index, e.sample, e.suggest
                        if checkpointing_enabled and isinstance(index, int):
                            index = samples_processed + index
                        e = e.__cause__  # type: ignore
                    if isinstance(e, AllSamplesSkippedError):
                        raise e
>                   raise TransformError(
                        index=index,
                        sample=sample,
                        samples_processed=samples_processed,
                        suggest=suggest,
                    ) from e
E                   deeplake.util.exceptions.TransformError: Transform failed. See traceback for more details.

deeplake\core\transform\transform.py:322: TransformError
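The root cause in this run is an intermittent socket.timeout while reading tensor metadata from Google Drive, which the pipeline re-raises as a TransformError. Per the eval() docstring captured above, checkpoint_interval exists precisely to avoid rerunning a whole transform after such intermittent failures. A hedged sketch of re-invoking the same upload with checkpointing, reusing the upload, items, and ds names from the test (the interval of 10 is illustrative and, as documented, must be a multiple of num_workers):

    # Sketch only: same transform as in the test, but committing a checkpoint
    # every 10 input samples so an intermittent failure resets the dataset to
    # the last commit instead of discarding all progress.
    upload().eval(
        items,
        ds,
        num_workers=2,
        scheduler="threaded",
        checkpoint_interval=10,
    )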

Check failure on line 726 in deeplake/core/vectorstore/test_deeplake_vectorstore.py


github-actions / JUnit Test Report

test_deeplake_vectorstore.test_search_managed

assert False
 +  where False = all([True, True, True, False])
Raw output
hub_cloud_dev_token = 'eyJhbGciOiJub25lIiwidHlwIjoiSldUIn0.eyJpZCI6InRlc3RpbmdhY2MyIiwiYXBpX2tleSI6IjU4Y0tLb1p6UE1BbThPU2RpbTRiZ2tBekhWekt1VUE3MFJpNTNyZUpKRTJuaiJ9.'

    @requires_libdeeplake
    @pytest.mark.slow
    def test_search_managed(hub_cloud_dev_token):
        """Test whether managed TQL and client-side TQL return the same results"""
        # initialize vector store object:
        vector_store = DeepLakeVectorStore(
            path="hub://testingacc2/vectorstore_test_managed",
            read_only=True,
            token=hub_cloud_dev_token,
        )
    
        # use indra implementation to search the data
        data_ce = vector_store.search(
            embedding=query_embedding,
            exec_option="compute_engine",
        )
    
        data_db = vector_store.search(
            embedding=query_embedding,
            exec_option="tensor_db",
        )
    
        assert "vectordb/" in vector_store.dataset_handler.dataset.base_storage.path
    
        assert len(data_ce["score"]) == len(data_db["score"])
>       assert all(
            [
                isclose(
                    data_ce["score"][i],
                    data_db["score"][i],
                    abs_tol=0.00001
                    * (abs(data_ce["score"][i]) + abs(data_db["score"][i]))
                    / 2,
                )
                for i in range(len(data_ce["score"]))
            ]
        )
E       assert False
E        +  where False = all([True, True, True, False])

deeplake/core/vectorstore/test_deeplake_vectorstore.py:726: AssertionError