Skip to content

Commit

Permalink
Merge pull request #658 from activeloopai/fixes/hub_improvements
Browse files Browse the repository at this point in the history
Improvements to to_tensorflow and BBox, Classlabel schemas
  • Loading branch information
AbhinavTuli committed Mar 8, 2021
2 parents 2a66b25 + 6d1e6c9 commit 853456a
Show file tree
Hide file tree
Showing 9 changed files with 170 additions and 22 deletions.
1 change: 0 additions & 1 deletion hub/api/dataset.py
Expand Up @@ -748,7 +748,6 @@ def to_pytorch(

def to_tensorflow(self, indexes=None, include_shapes=False):
"""| Converts the dataset into a tensorflow compatible format
Parameters
----------
indexes: list or int, optional
Expand Down
6 changes: 5 additions & 1 deletion hub/api/datasetview.py
Expand Up @@ -250,7 +250,7 @@ def __str__(self):
# Debug representation: delegates to __str__, so repr(obj) and str(obj) produce the same text.
def __repr__(self):
return self.__str__()

def to_tensorflow(self, include_shapes):
def to_tensorflow(self, include_shapes=False):
"""|Converts the dataset into a tensorflow compatible format
Parameters
Expand Down Expand Up @@ -300,6 +300,10 @@ def flush(self) -> None:
"""Flush dataset"""
self.dataset.flush()

# Flushes the wrapped dataset by forwarding to self.dataset.flush(); returns nothing.
# NOTE(review): this definition repeats, verbatim, the flush method shown directly
# before it in this view. In a Python class body the later definition silently
# replaces the earlier one, so one of the two copies is redundant and can be removed.
def flush(self) -> None:
"""Flush dataset"""
self.dataset.flush()

def numpy(self, label_name=False):
"""Gets the value from different tensorview objects in the datasetview schema
Expand Down
21 changes: 17 additions & 4 deletions hub/api/integrations.py
Expand Up @@ -185,18 +185,26 @@ def _get_active_item(key, index):
return _active_chunks[key][index % samples_per_chunk]

def tf_gen():
    """Yield one nested sample dict for every entry of ``indexes``.

    Each dataset key is split on "/"; the first component is skipped, the
    middle components become nested dictionaries and the last component is
    the leaf name under which the fetched item is stored. Items whose schema
    dtype is ``Text`` are turned from integer code arrays into ``str`` (1-D
    input) or a list of ``str`` (one per row otherwise).
    """

    def _codes_to_str(codes):
        # Each element is treated as a Unicode code point.
        return "".join(chr(code) for code in codes.tolist())

    # Schema dtype per key, looked up once using the first requested index.
    key_dtype_map = {key: dataset[key, indexes[0]].dtype for key in dataset.keys}

    for index in indexes:
        sample = {}
        for key in dataset.keys:
            parts = key.split("/")
            node = sample
            # Walk (creating on demand) the intermediate dictionaries.
            for name in parts[1:-1]:
                node = node.setdefault(name, {})

            item = _get_active_item(key, index)
            if isinstance(key_dtype_map[key], Text):
                if item.ndim == 1:
                    item = _codes_to_str(item)
                else:
                    item = [_codes_to_str(row) for row in item]
            node[parts[-1]] = item

        yield sample

def dict_to_tf(my_dtype):
Expand All @@ -208,9 +216,14 @@ def dict_to_tf(my_dtype):
# TensorFlow dtype for a Tensor schema entry: resolved by handing its inner .dtype to dtype_to_tf.
def tensor_to_tf(my_dtype):
return dtype_to_tf(my_dtype.dtype)

# TensorFlow dtype for a Text schema entry: always the literal "string"; my_dtype is not inspected.
def text_to_tf(my_dtype):
return "string"

def dtype_to_tf(my_dtype):
if isinstance(my_dtype, SchemaDict):
return dict_to_tf(my_dtype)
elif isinstance(my_dtype, Text):
return text_to_tf(my_dtype)
elif isinstance(my_dtype, Tensor):
return tensor_to_tf(my_dtype)
elif isinstance(my_dtype, Primitive):
Expand All @@ -221,10 +234,10 @@ def dtype_to_tf(my_dtype):
def get_output_shapes(my_dtype):
    """Return the output shape description for a schema entry.

    * ``SchemaDict``  -> result of ``output_shapes_from_dict``
    * ``Text`` / ``Primitive`` -> ``()`` (scalar)
    * ``Tensor``      -> the entry's ``shape`` attribute

    Any other type falls through and yields ``None``.
    """
    if isinstance(my_dtype, SchemaDict):
        return output_shapes_from_dict(my_dtype)
    # Text and Primitive are tested before Tensor on purpose.
    # NOTE(review): presumably Text subclasses Tensor, so the order matters — confirm.
    if isinstance(my_dtype, (Text, Primitive)):
        return ()
    if isinstance(my_dtype, Tensor):
        return my_dtype.shape
    # The former trailing ``elif isinstance(my_dtype, Primitive)`` branch was
    # unreachable (Primitive is already handled above) and has been dropped.
    return None

def output_shapes_from_dict(my_dtype):
d = {}
Expand Down
13 changes: 11 additions & 2 deletions hub/api/tensorview.py
Expand Up @@ -98,9 +98,18 @@ def numpy(self, label_name=False):

if isinstance(self.dtype, hub.schema.class_label.ClassLabel) and label_name:
if isinstance(self.indexes, int):
value = self.dtype.int2str(value)
if value.ndim == 0:
value = self.dtype.int2str(value)
elif value.ndim == 1:
value = [self.dtype.int2str(value[i]) for i in range(value.size)]
else:
value = [self.dtype.int2str(value[i]) for i in range(value.size)]
if value.ndim == 1:
value = [self.dtype.int2str(value[i]) for i in range(value.size)]
elif value.ndim == 2:
value = [
[self.dtype.int2str(item[i]) for i in range(item.size)]
for item in value
]

if isinstance(self.dtype, hub.schema.text.Text):
if self.dataset.tokenizer is not None:
Expand Down
33 changes: 29 additions & 4 deletions hub/schema/bbox.py
Expand Up @@ -4,8 +4,7 @@
If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
"""

# from typing import Tuple

from typing import Tuple
from hub.schema.features import Tensor


Expand All @@ -16,11 +15,26 @@ class BBox(Tensor):
normalized coordinates of the bounding box `[xmin, ymin, xmax, ymax]`
"""

def __init__(self, dtype="float64", chunks=None, compressor="lz4"):
def __init__(
self,
shape: Tuple[int, ...] = (4,),
max_shape: Tuple[int, ...] = None,
dtype="float64",
chunks=None,
compressor="lz4",
):
"""Construct the connector.
Parameters
----------
shape: tuple of ints or None
The shape of bounding box.
Will be (4,) if only one bounding box corresponding to each sample.
If there are N bboxes corresponding to each sample, shape should be (N, 4)
If the number of bboxes for each sample vary from 0 to M. The shape should be set to (None, 4) and max_shape should be set to (M, 4)
Defaults to (4,).
max_shape : Tuple[int], optional
Maximum shape of BBox
dtype : str
dtype of bbox coordinates. Default: 'float64'
chunks : Tuple[int] | True
Expand All @@ -29,8 +43,13 @@ def __init__(self, dtype="float64", chunks=None, compressor="lz4"):
Sample Count is also in the list of tensor's dimensions (first dimension)
If default value is chosen, automatically detects how to split into chunks
"""
self.check_shape(shape)
super(BBox, self).__init__(
shape=(4,), dtype=dtype, chunks=chunks, compressor=compressor
shape=shape,
max_shape=max_shape,
dtype=dtype,
chunks=chunks,
compressor=compressor,
)

def __str__(self):
Expand All @@ -40,3 +59,9 @@ def __str__(self):

# Debug representation: delegates to __str__, so repr(obj) and str(obj) produce the same text.
def __repr__(self):
return self.__str__()

def check_shape(self, shape):
    """Validate a BBox shape.

    A shape is accepted when it has one or two dimensions and its last
    dimension equals 4, e.g. ``(4,)``, ``(None, 4)`` or ``(N, 4)``.

    Raises
    ------
    ValueError
        If the shape has a different rank or its last dimension is not 4.
    """
    has_valid_rank = len(shape) in (1, 2)
    # The last dimension is only inspected once the rank is known to be valid.
    if has_valid_rank and shape[-1] == 4:
        return
    raise ValueError(
        "Wrong BBox shape provided, should be of the format (4,) or (None, 4) or (N, 4)"
    )
28 changes: 22 additions & 6 deletions hub/schema/class_label.py
Expand Up @@ -4,7 +4,7 @@
If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
"""

from typing import List
from typing import List, Tuple
from hub.schema.features import Tensor


Expand All @@ -18,6 +18,8 @@ class ClassLabel(Tensor):

def __init__(
self,
shape: Tuple[int, ...] = (),
max_shape: Tuple[int, ...] = None,
num_classes: int = None,
names: List[str] = None,
names_file: str = None,
Expand All @@ -40,15 +42,21 @@ def __init__(
Parameters
----------
shape: tuple of ints or None
The shape of classlabel.
Will be () if there is only one class label corresponding to each sample.
If N classlabels corresponding to each sample, shape should be (N,)
If the number of classlabels for each sample vary from 0 to M. The shape should be set to (None,) and max_shape should be set to (M,)
Defaults to ().
max_shape : Tuple[int], optional
Maximum shape of ClassLabel
num_classes: `int`
number of classes. All labels must be < num_classes.
names: `list<str>`
string names for the integer classes. The order in which the names are provided is kept.
names_file: `str`
path to a file with names for the integer classes, one per line.
max_shape : Tuple[int]
Maximum shape of tensor shape if tensor is dynamic
chunks : Tuple[int] | True
chunks : Tuple[int] | True, optional
Describes how to split tensor dimensions into chunks (files) to store them efficiently.
It is anticipated that each file should be ~16MB.
Sample Count is also in the list of tensor's dimensions (first dimension)
Expand All @@ -61,9 +69,11 @@ def __init__(
----------
ValueError: If more than one argument is provided
"""
self.check_shape(shape)
super().__init__(
shape=(),
dtype="int64",
shape=shape,
max_shape=max_shape,
dtype="uint16",
chunks=chunks,
compressor=compressor,
)
Expand Down Expand Up @@ -158,3 +168,9 @@ def __str__(self):

# Debug representation: delegates to __str__, so repr(obj) and str(obj) produce the same text.
def __repr__(self):
return self.__str__()

def check_shape(self, shape):
    """Validate a ClassLabel shape.

    Only zero- or one-dimensional shapes are accepted, e.g. ``()``,
    ``(None,)`` or ``(N,)``.

    Raises
    ------
    ValueError
        If the shape has more than one dimension.
    """
    if len(shape) in (0, 1):
        return
    raise ValueError(
        "Wrong ClassLabel shape provided, should be of the format () or (None,) or (N,)"
    )
6 changes: 6 additions & 0 deletions hub/schema/deserialize.py
Expand Up @@ -35,22 +35,28 @@ def deserialize(inp):
)
elif inp["type"] == "BBox":
return BBox(
shape=tuple(inp["shape"]),
dtype=deserialize(inp["dtype"]),
chunks=inp["chunks"],
compressor=_get_compressor(inp),
max_shape=tuple(inp["max_shape"]),
)
elif inp["type"] == "ClassLabel":
if inp["_names"] is not None:
return ClassLabel(
shape=tuple(inp["shape"]),
names=inp["_names"],
chunks=inp["chunks"],
compressor=_get_compressor(inp),
max_shape=tuple(inp["max_shape"]),
)
else:
return ClassLabel(
shape=tuple(inp["shape"]),
num_classes=inp["_num_classes"],
chunks=inp["chunks"],
compressor=_get_compressor(inp),
max_shape=tuple(inp["max_shape"]),
)
elif inp["type"] == "SchemaDict" or inp["type"] == "FeatureDict":
d = {}
Expand Down
4 changes: 2 additions & 2 deletions hub/schema/sequence.py
Expand Up @@ -12,12 +12,12 @@ class Sequence(Tensor):
At generation time, a list for each of the sequence element is given. The output
of `Dataset` will batch all the elements of the sequence together.
If the length of the sequence is static and known in advance, it should be
specified in the constructor using the `length` param.
specified in the constructor using the `shape` param.
| Usage:
----------
>>> sequence = Sequence(Image(), length=NB_FRAME)
>>> sequence = Sequence(shape=(5,), dtype = Image((100, 100, 3)))
"""

def __init__(
Expand Down
80 changes: 78 additions & 2 deletions hub/schema/tests/test_features.py
Expand Up @@ -15,6 +15,8 @@
from hub.schema.class_label import ClassLabel, _load_names_from_file
from hub.schema.features import HubSchema, SchemaDict, Tensor
import pytest
from hub import Dataset
import numpy as np


def test_hub_feature_flatten():
Expand Down Expand Up @@ -94,13 +96,87 @@ def test_class_label():
cl2.names = ["ab", "cd", "ef", "gh"]


def test_class_label_2():
    """Write and read back scalar, variable-length and fixed-length ClassLabels.

    ``compute(True)`` is expected to return label names instead of integers.
    """
    cl1 = ClassLabel(names=["apple", "banana", "cat"])
    cl2 = ClassLabel((None,), (10,), names=["apple", "banana", "cat"])
    cl3 = ClassLabel((3,), names=["apple", "banana", "cat"])
    ds = Dataset(
        "./data/cl_2d_3d",
        schema={"cl1": cl1, "cl2": cl2, "cl3": cl3},
        shape=10,
        mode="w",
    )

    def encode(label, *words):
        # Integer ids for ``words`` according to ``label``, as a numpy array.
        return np.array([label.str2int(word) for word in words])

    # cl1: one label per sample.
    for row, word in enumerate(("cat", "apple", "apple")):
        ds["cl1", row] = cl1.str2int(word)
    ds["cl1", 3:5] = [cl1.str2int("banana"), cl1.str2int("banana")]
    assert ds["cl1", 1].compute(True) == "apple"
    assert ds["cl1", 0:3].compute(True) == ["cat", "apple", "apple"]
    assert ds["cl1", 3:5].compute(True) == ["banana", "banana"]

    # cl2: a variable number of labels per sample.
    cl2_rows = (
        ("cat", "cat", "apple"),
        ("apple", "banana"),
        ("cat", "apple", "banana", "apple", "banana"),
        ("cat",),
    )
    for row, words in enumerate(cl2_rows):
        ds["cl2", row] = encode(cl2, *words)
    for row, words in enumerate(cl2_rows):
        assert ds["cl2", row].compute(True) == list(words)

    # cl3: exactly three labels per sample.
    cl3_words = ("apple", "banana", "cat")
    for row, word in enumerate(cl3_words):
        ds["cl3", row] = encode(cl3, word, word, word)
    for row, word in enumerate(cl3_words):
        assert ds["cl3", row].compute(True) == [word] * 3
    assert ds["cl3", 0:3].compute(True) == [[word] * 3 for word in cl3_words]


def test_polygon():
    """Polygon must raise ValueError for the shapes (11, 3) and (11, 4, 2)."""
    for bad_shape in ((11, 3), (11, 4, 2)):
        with pytest.raises(ValueError):
            Polygon(shape=bad_shape)


def test_bbox_shape():
    """BBox rejects shapes not ending in 4 or with rank > 2, and accepts the rest."""
    for bad_shape in ((11, 3), (11, 4, 2)):
        with pytest.raises(ValueError):
            BBox(shape=bad_shape)

    # These constructions must succeed without raising.
    BBox(shape=(None, 4), max_shape=(10, 4))
    BBox(shape=(4,))
    BBox(shape=(5, 4))


def test_classlabel_shape():
    """ClassLabel rejects shapes with more than one dimension and accepts the rest."""
    for bad_shape in ((11, 3), (11, 4, 2)):
        with pytest.raises(ValueError):
            ClassLabel(shape=bad_shape)

    # These constructions must succeed without raising.
    ClassLabel(shape=(None,), max_shape=(10,))
    ClassLabel()
    ClassLabel(shape=(5,))


test_image_inputs = [
"uint32",
"int16",
Expand Down Expand Up @@ -134,8 +210,8 @@ def test_classlabel_repr():
cl1 = ClassLabel(num_classes=5)
cl2 = ClassLabel(names=["apple", "orange", "banana"])

text1 = "ClassLabel(shape=(), dtype='int64', num_classes=5)"
text2 = "ClassLabel(shape=(), dtype='int64', names=['apple', 'orange', 'banana'], num_classes=3)"
text1 = "ClassLabel(shape=(), dtype='uint16', num_classes=5)"
text2 = "ClassLabel(shape=(), dtype='uint16', names=['apple', 'orange', 'banana'], num_classes=3)"
assert cl1.__repr__() == text1
assert cl2.__repr__() == text2

Expand Down

0 comments on commit 853456a

Please sign in to comment.