Merge pull request #916 from activeloopai/task/2.0/append-api-updates
[2.0] Various API changes
benchislett committed Jun 4, 2021
2 parents 9783e0e + 0e17538 commit 92fe293
Showing 10 changed files with 192 additions and 118 deletions.
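The headline change: implicit tensor creation via attribute assignment (ds.image = arr) is removed in favor of an explicit Dataset.create_tensor call, and the old batched Tensor.append is split into append (one sample) and extend (a batch or iterable of samples). A minimal sketch of the updated usage, assuming a local path and the import path suggested by this diff's file layout (both hypothetical):

    import numpy as np
    from hub.api.dataset import Dataset  # import path assumed from the file layout below

    ds = Dataset("/tmp/example_ds")         # hypothetical local path
    ds.create_tensor("image")               # explicit creation replaces `ds.image = ...`
    ds.image.extend(np.ones((4, 28, 28)))   # add four samples at once
    ds.image.append(np.ones((28, 28)))      # add a single sample
    ds.flush()                              # write dirty cache data to the underlying storage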
99 changes: 56 additions & 43 deletions hub/api/dataset.py
@@ -8,10 +8,12 @@
from hub.core.tensor import tensor_exists
from hub.core.dataset import dataset_exists
from hub.core.meta.dataset_meta import read_dataset_meta, write_dataset_meta
from hub.core.meta.tensor_meta import tensor_meta_from_array
from hub.core.meta.tensor_meta import default_tensor_meta

from hub.core.typing import StorageProvider
from hub.util.index import Index

from hub.constants import DEFAULT_CHUNK_SIZE
from hub.util.exceptions import (
InvalidKeyTypeError,
TensorAlreadyExistsError,
@@ -22,9 +24,6 @@
from hub.util.path import storage_provider_from_path
from hub.constants import DEFAULT_MEMORY_CACHE_SIZE, DEFAULT_LOCAL_CACHE_SIZE, MB

# Used to distinguish between attributes and items (tensors)
DATASET_RESERVED_ATTRIBUTES = ["path", "mode", "index", "storage", "tensors"]


class Dataset:
def __init__(
@@ -36,13 +35,7 @@ def __init__(
local_cache_size: int = DEFAULT_LOCAL_CACHE_SIZE,
storage: Optional[StorageProvider] = None,
):
"""Initialize a new or existing dataset.
Note:
Entries of `DATASET_RESERVED_ATTRIBUTES` cannot be used as tensor names.
This is to distinguish between attributes (like `ds.mode`) and tensors.
Be sure to keep `DATASET_RESERVED_ATTRIBUTES` up-to-date when changing this class.
"""Initializes a new or existing dataset.
Args:
path (str): The location of the dataset. Used to initialize the storage provider.
@@ -76,11 +69,10 @@ def __init__(
self.tensors: Dict[str, Tensor] = {}

if dataset_exists(self.storage):
ds_meta = read_dataset_meta(self.storage)
for tensor_name in ds_meta["tensors"]:
for tensor_name in self.meta["tensors"]:
self.tensors[tensor_name] = Tensor(tensor_name, self.storage)
else:
write_dataset_meta(self.storage, {"tensors": []})
self.meta = {"tensors": []}

def __len__(self):
"""Return the greatest length of tensors"""
@@ -98,40 +90,61 @@ def __getitem__(self, item: Union[str, int, slice, Index]):
else:
raise InvalidKeyTypeError(item)

def __setitem__(self, item: Union[slice, str], value):
if isinstance(item, str):
tensor_key = item

if tensor_exists(tensor_key, self.storage):
raise TensorAlreadyExistsError(tensor_key)

if isinstance(value, np.ndarray):
tensor_meta = tensor_meta_from_array(value, batched=True)
ds_meta = read_dataset_meta(self.storage)
ds_meta["tensors"].append(tensor_key)
write_dataset_meta(self.storage, ds_meta)
tensor = Tensor(tensor_key, self.storage, tensor_meta=tensor_meta)
self.tensors[tensor_key] = tensor
tensor.append(value, batched=True)
return tensor
else:
raise UnsupportedTensorTypeError(item)
else:
raise InvalidKeyTypeError(item)
def create_tensor(
self,
name: str,
htype: Optional[str] = None,
chunk_size: Optional[int] = None,
dtype: Optional[str] = None,
extra_meta: Optional[dict] = None,
):
"""Creates a new tensor in a dataset.
__getattr__ = __getitem__
Args:
name (str): The name of the tensor to be created.
htype (str, optional): The class of data for the tensor, which determines the defaults for the other parameters.
For example, `htype="image"` would have `dtype` default to `uint8`.
These defaults can be overridden by explicitly passing any of the other parameters to this function.
chunk_size (int, optional): The target size for chunks in this tensor.
dtype (str, optional): The data type to use for this tensor.
Will be overwritten when the first sample is added.
extra_meta (dict, optional): Any additional metadata to be added to the tensor.
def __setattr__(self, name: str, value):
"""Set the named attribute on the dataset"""
if name in DATASET_RESERVED_ATTRIBUTES:
return super().__setattr__(name, value)
else:
return self.__setitem__(name, value)
Returns:
The new tensor, which can also be accessed by `self[name]`.
Raises:
TensorAlreadyExistsError: Duplicate tensors are not allowed.
"""
if tensor_exists(name, self.storage):
raise TensorAlreadyExistsError(name)

ds_meta = self.meta
ds_meta["tensors"].append(name)
self.meta = ds_meta

tensor_meta = default_tensor_meta(htype, chunk_size, dtype, extra_meta)
tensor = Tensor(name, self.storage, tensor_meta=tensor_meta)
self.tensors[name] = tensor

return tensor
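A short usage sketch for the new signature, continuing the ds from the sketch above; the uint8 default for htype="image" is taken from the docstring, and the tensor names are illustrative:

    img = ds.create_tensor("render", htype="image")  # dtype defaults to uint8 per the docstring
    lbl = ds.create_tensor("label", dtype="int64", chunk_size=DEFAULT_CHUNK_SIZE)  # constant from hub.constants
    assert ds.tensors["label"] is lbl                # also reachable as ds["label"] or ds.label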

__getattr__ = __getitem__

def __iter__(self):
for i in range(len(self)):
yield self[i]

@property
def meta(self):
return read_dataset_meta(self.storage)

@meta.setter
def meta(self, new_meta: dict):
write_dataset_meta(self.storage, new_meta)
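Since meta is a property, every read goes through read_dataset_meta and every assignment through write_dataset_meta; mutating the returned dict in place is not persisted by itself. The read-modify-write idiom used by create_tensor above is the intended pattern:

    ds_meta = ds.meta                    # read a fresh copy from storage
    ds_meta["tensors"].append("extra")   # "extra" is an illustrative name
    ds.meta = ds_meta                    # assignment writes the updated meta back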

def flush(self):
"""Necessary operation after writes if caches are being used.
Writes all the dirty data from the cache layers (if any) to the underlying storage.
@@ -156,10 +169,10 @@ def delete(self):

@staticmethod
def from_path(path: str):
"""Create a local hub dataset from unstructured data.
"""Creates a hub dataset from unstructured data.
Note:
This copies the data locally in hub format.
This copies the data into hub format.
Be careful when using this with large datasets.
Args:
72 changes: 45 additions & 27 deletions hub/api/tensor.py
@@ -1,4 +1,4 @@
from typing import Union
from typing import Union, Iterable
import warnings

import numpy as np
@@ -8,6 +8,7 @@
add_samples_to_tensor,
read_samples_from_tensor,
read_tensor_meta,
write_tensor_meta,
tensor_exists,
)
from hub.core.typing import StorageProvider
@@ -24,7 +25,7 @@ def __init__(
tensor_meta: dict = None,
index: Union[int, slice, Index] = None,
):
"""Initialize a new tensor.
"""Initializes a new tensor.
Note:
This operation does not create a new tensor in the storage provider,
@@ -56,54 +57,71 @@ def __init__(
raise TensorDoesNotExistError(self.key)
create_tensor(self.key, self.storage, tensor_meta)

def append(self, array: np.ndarray, batched: bool):
# TODO: split into `append`/`extend`
add_samples_to_tensor(
array,
self.key,
storage=self.storage,
batched=batched,
)
def extend(self, array: Union[np.ndarray, Iterable[np.ndarray]]):
"""Extends a tensor by appending multiple elements from an iterable.
Accepts an iterable of numpy arrays or a single batched numpy array.
Example:
>>> len(image)
0
>>> image.extend(np.zeros((100, 28, 28, 1)))
>>> len(image)
100
Args:
array: The data to add to the tensor.
The length should be equal to the number of samples to add.
"""
if isinstance(array, np.ndarray):
add_samples_to_tensor(array, self.key, storage=self.storage, batched=True)
else:
for sample in array:
self.append(sample)

def append(self, array: np.ndarray):
"""Appends a sample to the end of a tensor.
Example:
>>> len(image)
0
>>> image.append(np.zeros((28, 28, 1)))
>>> len(image)
1
Args:
array (np.ndarray): The data to add to the tensor.
"""
add_samples_to_tensor(array, self.key, storage=self.storage, batched=False)
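Beyond the doctests, note the dispatch in extend: a numpy array is treated as a batch, while any other iterable is routed through append one sample at a time, so these two calls add the same two samples (continuing the doctests' image tensor):

    image.extend(np.zeros((2, 28, 28, 1)))                        # one batched array holding 2 samples
    image.extend([np.zeros((28, 28, 1)), np.zeros((28, 28, 1))])  # iterable of 2 single samples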

@property
def meta(self):
return read_tensor_meta(self.key, self.storage)

@meta.setter
def meta(self, new_meta: dict):
write_tensor_meta(self.key, self.storage, new_meta)

@property
def shape(self):
# TODO: when dynamic arrays are supported, handle `min_shape != max_shape` (right now they're always equal)
return self.meta["max_shape"]

def __len__(self):
"""Return the length of the primary axis."""
"""Returns the length of the primary axis of a tensor."""
return self.meta["length"]

def __getitem__(self, item: Union[int, slice, Index]):
return Tensor(self.key, self.storage, index=self.index[item])

def __setitem__(self, item: Union[int, slice], value: np.ndarray):
sliced_self = self[item]
if sliced_self.index.item != slice(None):
raise NotImplementedError(
"Assignment to Tensor subsections not currently supported!"
)
else:
if tensor_exists(self.key, self.storage):
raise TensorAlreadyExistsError(self.key)

add_samples_to_tensor(
array=value,
key=self.key,
storage=self.storage,
batched=True,
)
raise NotImplementedError("Tensor update not currently supported!")

def __iter__(self):
for i in range(len(self)):
yield self[i]

def numpy(self):
"""Compute the contents of this tensor in numpy format.
"""Computes the contents of a tensor in numpy format.
Returns:
A numpy array containing the data represented by this tensor.
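Indexing composes lazily: __getitem__ returns a new Tensor over self.index[item] without reading any data, and numpy() is what materializes the selection (the slice test in the next file exercises exactly this). A small sketch, assuming an image tensor with at least five samples:

    view = ds.image[2:5]       # a narrowed Tensor view; nothing is read yet
    arr = view.numpy()         # materializes samples 2 through 4 as a numpy array
    row = ds.image[0].numpy()  # single-sample read via an integer index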
39 changes: 30 additions & 9 deletions hub/api/tests/test_api.py
@@ -12,8 +12,10 @@ def test_persist_local_flush(local_storage):
pytest.skip()

ds = Dataset(local_storage.root, local_cache_size=512)
ds.image = np.ones((4, 4096, 4096))
ds.create_tensor("image")
ds.image.extend(np.ones((4, 4096, 4096)))
ds.flush()

ds_new = Dataset(local_storage.root)
assert len(ds_new) == 4
assert ds_new.image.shape == (4096, 4096)
@@ -26,7 +28,8 @@ def test_persist_local_clear_cache(local_storage):
pytest.skip()

ds = Dataset(local_storage.root, local_cache_size=512)
ds.image = np.ones((4, 4096, 4096))
ds.create_tensor("image")
ds.image.extend(np.ones((4, 4096, 4096)))
ds.clear_cache()
ds_new = Dataset(local_storage.root)
assert len(ds_new) == 4
@@ -37,21 +40,36 @@ def test_persist_local_clear_cache(local_storage):

@parametrize_all_dataset_storages
def test_populate_dataset(ds):
assert read_dataset_meta(ds.storage) == {"tensors": []}
ds.image = np.ones((4, 28, 28))
assert read_dataset_meta(ds.storage) == {"tensors": ["image"]}
assert ds.meta == {"tensors": []}
ds.create_tensor("image")
assert len(ds) == 0
assert len(ds.image) == 0

ds.image.extend(np.ones((4, 28, 28)))
assert len(ds) == 4
assert len(ds.image) == 4

for _ in range(10):
ds.image.append(np.ones((28, 28)))
assert len(ds.image) == 14

ds.image.extend([np.ones((28, 28)), np.ones((28, 28))])
assert len(ds.image) == 16

assert ds.meta == {"tensors": ["image"]}


@parametrize_all_dataset_storages
def test_compute_tensor(ds):
ds.image = np.ones((32, 28, 28))
ds.create_tensor("image")
ds.image.extend(np.ones((32, 28, 28)))
np.testing.assert_array_equal(ds.image.numpy(), np.ones((32, 28, 28)))


@parametrize_all_dataset_storages
def test_compute_tensor_slice(ds):
ds.image = np.vstack((np.arange(16),) * 8)
ds.create_tensor("image")
ds.image.extend(np.vstack((np.arange(16),) * 8))

sliced_data = ds.image[2:5].numpy()
expected_data = np.vstack((np.arange(16),) * 3)
@@ -61,8 +79,11 @@ def test_compute_tensor_slice(ds):
@parametrize_all_dataset_storages
def test_iterate_dataset(ds):
labels = [1, 9, 7, 4]
ds.image = np.ones((4, 28, 28))
ds.label = np.asarray(labels).reshape((4, 1))
ds.create_tensor("image")
ds.create_tensor("label")

ds.image.extend(np.ones((4, 28, 28)))
ds.label.extend(np.asarray(labels).reshape((4, 1)))

for idx, sub_ds in enumerate(ds):
img = sub_ds.image.numpy()
2 changes: 2 additions & 0 deletions hub/constants.py
@@ -6,6 +6,8 @@
MB = 1000 * KB
GB = 1000 * MB

DEFAULT_DTYPE = "float64"

DEFAULT_CHUNK_SIZE = 16 * MB
MIN_FIRST_CACHE_SIZE = 32 * MB
MIN_SECOND_CACHE_SIZE = 160 * MB
6 changes: 2 additions & 4 deletions hub/core/chunk_engine/tests/test_benchmark_chunk_engine.py
@@ -10,7 +10,7 @@
add_samples_to_tensor,
create_tensor,
)
from hub.core.meta.tensor_meta import tensor_meta_from_array
from hub.core.meta.tensor_meta import default_tensor_meta
from hub.core.tests.common import TENSOR_KEY
from hub.core.typing import StorageProvider
from hub.tests.common_benchmark import (
@@ -24,9 +24,7 @@
def single_benchmark_write(info, key, arrays, chunk_size, storage, batched):
actual_key = "%s_%i" % (key, info["iteration"])

create_tensor(
actual_key, storage, tensor_meta_from_array(arrays[0], batched, chunk_size)
)
create_tensor(actual_key, storage, default_tensor_meta(chunk_size=chunk_size))

for a_in in arrays:
add_samples_to_tensor(
6 changes: 3 additions & 3 deletions hub/core/compression/webp.py
@@ -46,9 +46,9 @@ def encode_single_image(self, image: np.ndarray) -> bytes:
Encoded data.
"""
with BytesIO() as buffer:
image = Image.fromarray(image)
image = image.convert("RGB")
image.save(buffer, format=self.codec_id, quality=self.quality)
img = Image.fromarray(image)
img = img.convert("RGB")
img.save(buffer, format=self.codec_id, quality=self.quality)
return buffer.getvalue()

def decode_single_image(self, buf: bytes, image_shape: tuple) -> np.ndarray:
