Merge pull request #916 from activeloopai/task/2.0/append-api-updates
[2.0] Various API changes
benchislett committed Jun 4, 2021
2 parents 9783e0e + 0e17538 commit 92fe293
Showing 10 changed files with 192 additions and 118 deletions.
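The headline change: implicit tensor creation via attribute assignment (ds.image = arr) is removed in favor of an explicit Dataset.create_tensor call, and the old batched Tensor.append is split into append (one sample) and extend (a batch or iterable of samples). A minimal sketch of the updated usage, assuming a local path and the import path suggested by this diff's file layout (both hypothetical):

    import numpy as np
    from hub.api.dataset import Dataset  # import path assumed from the file layout below

    ds = Dataset("/tmp/example_ds")         # hypothetical local path
    ds.create_tensor("image")               # explicit creation replaces `ds.image = ...`
    ds.image.extend(np.ones((4, 28, 28)))   # add four samples at once
    ds.image.append(np.ones((28, 28)))      # add a single sample
    ds.flush()                              # write dirty cache data to the underlying storage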
99 changes: 56 additions & 43 deletions hub/api/dataset.py
@@ -8,10 +8,12 @@
from hub.core.tensor import tensor_exists
from hub.core.dataset import dataset_exists
from hub.core.meta.dataset_meta import read_dataset_meta, write_dataset_meta
from hub.core.meta.tensor_meta import tensor_meta_from_array
from hub.core.meta.tensor_meta import default_tensor_meta

from hub.core.typing import StorageProvider
from hub.util.index import Index

from hub.constants import DEFAULT_CHUNK_SIZE
from hub.util.exceptions import (
InvalidKeyTypeError,
TensorAlreadyExistsError,
@@ -22,9 +24,6 @@
from hub.util.path import storage_provider_from_path
from hub.constants import DEFAULT_MEMORY_CACHE_SIZE, DEFAULT_LOCAL_CACHE_SIZE, MB

# Used to distinguish between attributes and items (tensors)
DATASET_RESERVED_ATTRIBUTES = ["path", "mode", "index", "storage", "tensors"]


class Dataset:
def __init__(
@@ -36,13 +35,7 @@ def __init__(
local_cache_size: int = DEFAULT_LOCAL_CACHE_SIZE,
storage: Optional[StorageProvider] = None,
):
"""Initialize a new or existing dataset.
Note:
Entries of `DATASET_RESERVED_ATTRIBUTES` cannot be used as tensor names.
This is to distinguish between attributes (like `ds.mode`) and tensors.
Be sure to keep `DATASET_RESERVED_ATTRIBUTES` up-to-date when changing this class.
"""Initializes a new or existing dataset.
Args:
path (str): The location of the dataset. Used to initialize the storage provider.
@@ -76,11 +69,10 @@ def __init__(
self.tensors: Dict[str, Tensor] = {}

if dataset_exists(self.storage):
ds_meta = read_dataset_meta(self.storage)
for tensor_name in ds_meta["tensors"]:
for tensor_name in self.meta["tensors"]:
self.tensors[tensor_name] = Tensor(tensor_name, self.storage)
else:
write_dataset_meta(self.storage, {"tensors": []})
self.meta = {"tensors": []}

def __len__(self):
"""Return the greatest length of tensors"""
@@ -98,40 +90,61 @@ def __getitem__(self, item: Union[str, int, slice, Index]):
else:
raise InvalidKeyTypeError(item)

def __setitem__(self, item: Union[slice, str], value):
if isinstance(item, str):
tensor_key = item

if tensor_exists(tensor_key, self.storage):
raise TensorAlreadyExistsError(tensor_key)

if isinstance(value, np.ndarray):
tensor_meta = tensor_meta_from_array(value, batched=True)
ds_meta = read_dataset_meta(self.storage)
ds_meta["tensors"].append(tensor_key)
write_dataset_meta(self.storage, ds_meta)
tensor = Tensor(tensor_key, self.storage, tensor_meta=tensor_meta)
self.tensors[tensor_key] = tensor
tensor.append(value, batched=True)
return tensor
else:
raise UnsupportedTensorTypeError(item)
else:
raise InvalidKeyTypeError(item)
def create_tensor(
self,
name: str,
htype: Optional[str] = None,
chunk_size: Optional[int] = None,
dtype: Optional[str] = None,
extra_meta: Optional[dict] = None,
):
"""Creates a new tensor in a dataset.
__getattr__ = __getitem__
Args:
name (str): The name of the tensor to be created.
htype (str, optional): The class of data for the tensor, which determines the defaults for the other parameters.
For example, `htype="image"` would have `dtype` default to `uint8`.
These defaults can be overridden by explicitly passing any of the other parameters to this function.
chunk_size (int, optional): The target size for chunks in this tensor.
dtype (str, optional): The data type to use for this tensor.
Will be overwritten when the first sample is added.
extra_meta (dict, optional): Any additional metadata to be added to the tensor.
def __setattr__(self, name: str, value):
"""Set the named attribute on the dataset"""
if name in DATASET_RESERVED_ATTRIBUTES:
return super().__setattr__(name, value)
else:
return self.__setitem__(name, value)
Returns:
The new tensor, which can also be accessed by `self[name]`.
Raises:
TensorAlreadyExistsError: Duplicate tensors are not allowed.
"""
if tensor_exists(name, self.storage):
raise TensorAlreadyExistsError(name)

ds_meta = self.meta
ds_meta["tensors"].append(name)
self.meta = ds_meta

tensor_meta = default_tensor_meta(htype, chunk_size, dtype, extra_meta)
tensor = Tensor(name, self.storage, tensor_meta=tensor_meta)
self.tensors[name] = tensor

return tensor
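A short usage sketch for the new signature, continuing the ds from the sketch above; the uint8 default for htype="image" is taken from the docstring, and the tensor names are illustrative:

    img = ds.create_tensor("render", htype="image")  # dtype defaults to uint8 per the docstring
    lbl = ds.create_tensor("label", dtype="int64", chunk_size=DEFAULT_CHUNK_SIZE)  # constant from hub.constants
    assert ds.tensors["label"] is lbl                # also reachable as ds["label"] or ds.label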

__getattr__ = __getitem__

def __iter__(self):
for i in range(len(self)):
yield self[i]

@property
def meta(self):
return read_dataset_meta(self.storage)

@meta.setter
def meta(self, new_meta: dict):
write_dataset_meta(self.storage, new_meta)
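Since meta is a property, every read goes through read_dataset_meta and every assignment through write_dataset_meta; mutating the returned dict in place is not persisted by itself. The read-modify-write idiom used by create_tensor above is the intended pattern:

    ds_meta = ds.meta                    # read a fresh copy from storage
    ds_meta["tensors"].append("extra")   # "extra" is an illustrative name
    ds.meta = ds_meta                    # assignment writes the updated meta back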

def flush(self):
"""Necessary operation after writes if caches are being used.
Writes all the dirty data from the cache layers (if any) to the underlying storage.
@@ -156,10 +169,10 @@ def delete(self):

@staticmethod
def from_path(path: str):
"""Create a local hub dataset from unstructured data.
"""Creates a hub dataset from unstructured data.
Note:
This copies the data locally in hub format.
This copies the data into hub format.
Be careful when using this with large datasets.
Args:
72 changes: 45 additions & 27 deletions hub/api/tensor.py
@@ -1,4 +1,4 @@
from typing import Union
from typing import Union, Iterable
import warnings

import numpy as np
@@ -8,6 +8,7 @@
add_samples_to_tensor,
read_samples_from_tensor,
read_tensor_meta,
write_tensor_meta,
tensor_exists,
)
from hub.core.typing import StorageProvider
@@ -24,7 +25,7 @@ def __init__(
tensor_meta: dict = None,
index: Union[int, slice, Index] = None,
):
"""Initialize a new tensor.
"""Initializes a new tensor.
Note:
This operation does not create a new tensor in the storage provider,
@@ -56,54 +57,71 @@ def __init__(
raise TensorDoesNotExistError(self.key)
create_tensor(self.key, self.storage, tensor_meta)

def append(self, array: np.ndarray, batched: bool):
# TODO: split into `append`/`extend`
add_samples_to_tensor(
array,
self.key,
storage=self.storage,
batched=batched,
)
def extend(self, array: Union[np.ndarray, Iterable[np.ndarray]]):
"""Extends a tensor by appending multiple elements from an iterable.
Accepts an iterable of numpy arrays or a single batched numpy array.
Example:
>>> len(image)
0
>>> image.extend(np.zeros((100, 28, 28, 1)))
>>> len(image)
100
Args:
array: The data to add to the tensor.
The length should be equal to the number of samples to add.
"""
if isinstance(array, np.ndarray):
add_samples_to_tensor(array, self.key, storage=self.storage, batched=True)
else:
for sample in array:
self.append(sample)

def append(self, array: np.ndarray):
"""Appends a sample to the end of a tensor.
Example:
>>> len(image)
0
>>> image.append(np.zeros((28, 28, 1)))
>>> len(image)
1
Args:
array (np.ndarray): The data to add to the tensor.
"""
add_samples_to_tensor(array, self.key, storage=self.storage, batched=False)
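Beyond the doctests, note the dispatch in extend: a numpy array is treated as a batch, while any other iterable is routed through append one sample at a time, so these two calls add the same two samples (continuing the doctests' image tensor):

    image.extend(np.zeros((2, 28, 28, 1)))                        # one batched array holding 2 samples
    image.extend([np.zeros((28, 28, 1)), np.zeros((28, 28, 1))])  # iterable of 2 single samples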

@property
def meta(self):
return read_tensor_meta(self.key, self.storage)

@meta.setter
def meta(self, new_meta: dict):
write_tensor_meta(self.key, self.storage, new_meta)

@property
def shape(self):
# TODO: when dynamic arrays are supported, handle `min_shape != max_shape` (right now they're always equal)
return self.meta["max_shape"]

def __len__(self):
"""Return the length of the primary axis."""
"""Returns the length of the primary axis of a tensor."""
return self.meta["length"]

def __getitem__(self, item: Union[int, slice, Index]):
return Tensor(self.key, self.storage, index=self.index[item])

def __setitem__(self, item: Union[int, slice], value: np.ndarray):
sliced_self = self[item]
if sliced_self.index.item != slice(None):
raise NotImplementedError(
"Assignment to Tensor subsections not currently supported!"
)
else:
if tensor_exists(self.key, self.storage):
raise TensorAlreadyExistsError(self.key)

add_samples_to_tensor(
array=value,
key=self.key,
storage=self.storage,
batched=True,
)
raise NotImplementedError("Tensor update not currently supported!")

def __iter__(self):
for i in range(len(self)):
yield self[i]

def numpy(self):
"""Compute the contents of this tensor in numpy format.
"""Computes the contents of a tensor in numpy format.
Returns:
A numpy array containing the data represented by this tensor.
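Indexing composes lazily: __getitem__ returns a new Tensor over self.index[item] without reading any data, and numpy() is what materializes the selection (the slice test in the next file exercises exactly this). A small sketch, assuming an image tensor with at least five samples:

    view = ds.image[2:5]       # a narrowed Tensor view; nothing is read yet
    arr = view.numpy()         # materializes samples 2 through 4 as a numpy array
    row = ds.image[0].numpy()  # single-sample read via an integer index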
39 changes: 30 additions & 9 deletions hub/api/tests/test_api.py
@@ -12,8 +12,10 @@ def test_persist_local_flush(local_storage):
pytest.skip()

ds = Dataset(local_storage.root, local_cache_size=512)
ds.image = np.ones((4, 4096, 4096))
ds.create_tensor("image")
ds.image.extend(np.ones((4, 4096, 4096)))
ds.flush()

ds_new = Dataset(local_storage.root)
assert len(ds_new) == 4
assert ds_new.image.shape == (4096, 4096)
@@ -26,7 +28,8 @@ def test_persist_local_clear_cache(local_storage):
pytest.skip()

ds = Dataset(local_storage.root, local_cache_size=512)
ds.image = np.ones((4, 4096, 4096))
ds.create_tensor("image")
ds.image.extend(np.ones((4, 4096, 4096)))
ds.clear_cache()
ds_new = Dataset(local_storage.root)
assert len(ds_new) == 4
@@ -37,21 +40,36 @@ def test_persist_local_clear_cache(local_storage):

@parametrize_all_dataset_storages
def test_populate_dataset(ds):
assert read_dataset_meta(ds.storage) == {"tensors": []}
ds.image = np.ones((4, 28, 28))
assert read_dataset_meta(ds.storage) == {"tensors": ["image"]}
assert ds.meta == {"tensors": []}
ds.create_tensor("image")
assert len(ds) == 0
assert len(ds.image) == 0

ds.image.extend(np.ones((4, 28, 28)))
assert len(ds) == 4
assert len(ds.image) == 4

for _ in range(10):
ds.image.append(np.ones((28, 28)))
assert len(ds.image) == 14

ds.image.extend([np.ones((28, 28)), np.ones((28, 28))])
assert len(ds.image) == 16

assert ds.meta == {"tensors": ["image"]}


@parametrize_all_dataset_storages
def test_compute_tensor(ds):
ds.image = np.ones((32, 28, 28))
ds.create_tensor("image")
ds.image.extend(np.ones((32, 28, 28)))
np.testing.assert_array_equal(ds.image.numpy(), np.ones((32, 28, 28)))


@parametrize_all_dataset_storages
def test_compute_tensor_slice(ds):
ds.image = np.vstack((np.arange(16),) * 8)
ds.create_tensor("image")
ds.image.extend(np.vstack((np.arange(16),) * 8))

sliced_data = ds.image[2:5].numpy()
expected_data = np.vstack((np.arange(16),) * 3)
@@ -61,8 +79,11 @@ def test_compute_tensor_slice(ds):
@parametrize_all_dataset_storages
def test_iterate_dataset(ds):
labels = [1, 9, 7, 4]
ds.image = np.ones((4, 28, 28))
ds.label = np.asarray(labels).reshape((4, 1))
ds.create_tensor("image")
ds.create_tensor("label")

ds.image.extend(np.ones((4, 28, 28)))
ds.label.extend(np.asarray(labels).reshape((4, 1)))

for idx, sub_ds in enumerate(ds):
img = sub_ds.image.numpy()
2 changes: 2 additions & 0 deletions hub/constants.py
@@ -6,6 +6,8 @@
MB = 1000 * KB
GB = 1000 * MB

DEFAULT_DTYPE = "float64"

DEFAULT_CHUNK_SIZE = 16 * MB
MIN_FIRST_CACHE_SIZE = 32 * MB
MIN_SECOND_CACHE_SIZE = 160 * MB
6 changes: 2 additions & 4 deletions hub/core/chunk_engine/tests/test_benchmark_chunk_engine.py
@@ -10,7 +10,7 @@
add_samples_to_tensor,
create_tensor,
)
from hub.core.meta.tensor_meta import tensor_meta_from_array
from hub.core.meta.tensor_meta import default_tensor_meta
from hub.core.tests.common import TENSOR_KEY
from hub.core.typing import StorageProvider
from hub.tests.common_benchmark import (
@@ -24,9 +24,7 @@
def single_benchmark_write(info, key, arrays, chunk_size, storage, batched):
actual_key = "%s_%i" % (key, info["iteration"])

create_tensor(
actual_key, storage, tensor_meta_from_array(arrays[0], batched, chunk_size)
)
create_tensor(actual_key, storage, default_tensor_meta(chunk_size=chunk_size))

for a_in in arrays:
add_samples_to_tensor(
6 changes: 3 additions & 3 deletions hub/core/compression/webp.py
@@ -46,9 +46,9 @@ def encode_single_image(self, image: np.ndarray) -> bytes:
Encoded data.
"""
with BytesIO() as buffer:
image = Image.fromarray(image)
image = image.convert("RGB")
image.save(buffer, format=self.codec_id, quality=self.quality)
img = Image.fromarray(image)
img = img.convert("RGB")
img.save(buffer, format=self.codec_id, quality=self.quality)
return buffer.getvalue()

def decode_single_image(self, buf: bytes, image_shape: tuple) -> np.ndarray:
