Merge pull request #658 from activeloopai/fixes/hub_improvements

Improvements to to_tensorflow and BBox, Classlabel schemas
activeloopai · Mar 8, 2021 · 853456a · 853456a
2 parents 2a66b25 + 6d1e6c9
commit 853456a
Show file tree

Hide file tree

Showing 9 changed files with 170 additions and 22 deletions.
diff --git a/hub/api/dataset.py b/hub/api/dataset.py
@@ -748,7 +748,6 @@ def to_pytorch(
 
     def to_tensorflow(self, indexes=None, include_shapes=False):
         """| Converts the dataset into a tensorflow compatible format
-
         Parameters
         ----------
         indexes: list or int, optional

diff --git a/hub/api/datasetview.py b/hub/api/datasetview.py
@@ -250,7 +250,7 @@ def __str__(self):
     def __repr__(self):
         return self.__str__()
 
-    def to_tensorflow(self, include_shapes):
+    def to_tensorflow(self, include_shapes=False):
         """|Converts the dataset into a tensorflow compatible format
 
         Parameters
@@ -300,6 +300,10 @@ def flush(self) -> None:
         """Flush dataset"""
         self.dataset.flush()
 
+    def flush(self) -> None:
+        """Flush dataset"""
+        self.dataset.flush()
+
     def numpy(self, label_name=False):
         """Gets the value from different tensorview objects in the datasetview schema
 

diff --git a/hub/api/integrations.py b/hub/api/integrations.py
@@ -185,18 +185,26 @@ def _get_active_item(key, index):
         return _active_chunks[key][index % samples_per_chunk]
 
     def tf_gen():
+        key_dtype_map = {key: dataset[key, indexes[0]].dtype for key in dataset.keys}
         for index in indexes:
             d = {}
             for key in dataset.keys:
-                split_key = key.split("/")
-                cur = d
+                split_key, cur = key.split("/"), d
                 for i in range(1, len(split_key) - 1):
                     if split_key[i] in cur.keys():
                         cur = cur[split_key[i]]
                     else:
                         cur[split_key[i]] = {}
                         cur = cur[split_key[i]]
                 cur[split_key[-1]] = _get_active_item(key, index)
+                if isinstance(key_dtype_map[key], Text):
+                    value = cur[split_key[-1]]
+                    cur[split_key[-1]] = (
+                        "".join(chr(it) for it in value.tolist())
+                        if value.ndim == 1
+                        else ["".join(chr(it) for it in val.tolist()) for val in value]
+                    )
+
             yield (d)
 
     def dict_to_tf(my_dtype):
@@ -208,9 +216,14 @@ def dict_to_tf(my_dtype):
     def tensor_to_tf(my_dtype):
         return dtype_to_tf(my_dtype.dtype)
 
+    def text_to_tf(my_dtype):
+        return "string"
+
     def dtype_to_tf(my_dtype):
         if isinstance(my_dtype, SchemaDict):
             return dict_to_tf(my_dtype)
+        elif isinstance(my_dtype, Text):
+            return text_to_tf(my_dtype)
         elif isinstance(my_dtype, Tensor):
             return tensor_to_tf(my_dtype)
         elif isinstance(my_dtype, Primitive):
@@ -221,10 +234,10 @@ def dtype_to_tf(my_dtype):
     def get_output_shapes(my_dtype):
         if isinstance(my_dtype, SchemaDict):
             return output_shapes_from_dict(my_dtype)
+        elif isinstance(my_dtype, (Text, Primitive)):
+            return ()
         elif isinstance(my_dtype, Tensor):
             return my_dtype.shape
-        elif isinstance(my_dtype, Primitive):
-            return ()
 
     def output_shapes_from_dict(my_dtype):
         d = {}

diff --git a/hub/api/tensorview.py b/hub/api/tensorview.py
@@ -98,9 +98,18 @@ def numpy(self, label_name=False):
 
         if isinstance(self.dtype, hub.schema.class_label.ClassLabel) and label_name:
             if isinstance(self.indexes, int):
-                value = self.dtype.int2str(value)
+                if value.ndim == 0:
+                    value = self.dtype.int2str(value)
+                elif value.ndim == 1:
+                    value = [self.dtype.int2str(value[i]) for i in range(value.size)]
             else:
-                value = [self.dtype.int2str(value[i]) for i in range(value.size)]
+                if value.ndim == 1:
+                    value = [self.dtype.int2str(value[i]) for i in range(value.size)]
+                elif value.ndim == 2:
+                    value = [
+                        [self.dtype.int2str(item[i]) for i in range(item.size)]
+                        for item in value
+                    ]
 
         if isinstance(self.dtype, hub.schema.text.Text):
             if self.dataset.tokenizer is not None:

diff --git a/hub/schema/bbox.py b/hub/schema/bbox.py
@@ -4,8 +4,7 @@
 If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
 """
 
-# from typing import Tuple
-
+from typing import Tuple
 from hub.schema.features import Tensor
 
 
@@ -16,11 +15,26 @@ class BBox(Tensor):
     normalized coordinates of the bounding box `[xmin, ymin, xmax, ymax]`
     """
 
-    def __init__(self, dtype="float64", chunks=None, compressor="lz4"):
+    def __init__(
+        self,
+        shape: Tuple[int, ...] = (4,),
+        max_shape: Tuple[int, ...] = None,
+        dtype="float64",
+        chunks=None,
+        compressor="lz4",
+    ):
         """Construct the connector.
 
         Parameters
         ----------
+        shape: tuple of ints or None
+            The shape of the bounding box.
+            Will be (4,) if only one bounding box corresponds to each sample.
+            If N bboxes correspond to each sample, shape should be (N, 4).
+            If the number of bboxes per sample varies from 0 to M, shape should be set to (None, 4) and max_shape to (M, 4).
+            Defaults to (4,).
+        max_shape : Tuple[int], optional
+            Maximum shape of BBox
         dtype : str
                 dtype of bbox coordinates. Default: 'float64'
         chunks : Tuple[int] | True
@@ -29,8 +43,13 @@ def __init__(self, dtype="float64", chunks=None, compressor="lz4"):
             Sample Count is also in the list of tensor's dimensions (first dimension)
             If default value is chosen, automatically detects how to split into chunks
         """
+        self.check_shape(shape)
         super(BBox, self).__init__(
-            shape=(4,), dtype=dtype, chunks=chunks, compressor=compressor
+            shape=shape,
+            max_shape=max_shape,
+            dtype=dtype,
+            chunks=chunks,
+            compressor=compressor,
         )
 
     def __str__(self):
@@ -40,3 +59,9 @@ def __str__(self):
 
     def __repr__(self):
         return self.__str__()
+
+    def check_shape(self, shape):
+        if len(shape) not in [1, 2] or shape[-1] != 4:
+            raise ValueError(
+                "Wrong BBox shape provided, should be of the format (4,) or (None, 4) or (N, 4)"
+            )
diff --git a/hub/schema/class_label.py b/hub/schema/class_label.py
@@ -4,7 +4,7 @@
 If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
 """
 
-from typing import List
+from typing import List, Tuple
 from hub.schema.features import Tensor
 
 
@@ -18,6 +18,8 @@ class ClassLabel(Tensor):
 
     def __init__(
         self,
+        shape: Tuple[int, ...] = (),
+        max_shape: Tuple[int, ...] = None,
         num_classes: int = None,
         names: List[str] = None,
         names_file: str = None,
@@ -40,15 +42,21 @@ def __init__(
 
         Parameters
         ----------
+        shape: tuple of ints or None
+            The shape of the classlabel.
+            Will be () if only one classlabel corresponds to each sample.
+            If N classlabels correspond to each sample, shape should be (N,).
+            If the number of classlabels per sample varies from 0 to M, shape should be set to (None,) and max_shape to (M,).
+            Defaults to ().
+        max_shape : Tuple[int], optional
+            Maximum shape of ClassLabel
         num_classes: `int`
             number of classes. All labels must be < num_classes.
         names: `list<str>`
             string names for the integer classes. The order in which the names are provided is kept.
         names_file: `str`
             path to a file with names for the integer classes, one per line.
-        max_shape : Tuple[int]
-            Maximum shape of tensor shape if tensor is dynamic
-        chunks : Tuple[int] | True
+        chunks : Tuple[int] | True, optional
             Describes how to split tensor dimensions into chunks (files) to store them efficiently.
             It is anticipated that each file should be ~16MB.
             Sample Count is also in the list of tensor's dimensions (first dimension)
@@ -61,9 +69,11 @@ def __init__(
         ----------
         ValueError: If more than one argument is provided
         """
+        self.check_shape(shape)
         super().__init__(
-            shape=(),
-            dtype="int64",
+            shape=shape,
+            max_shape=max_shape,
+            dtype="uint16",
             chunks=chunks,
             compressor=compressor,
         )
@@ -158,3 +168,9 @@ def __str__(self):
 
     def __repr__(self):
         return self.__str__()
+
+    def check_shape(self, shape):
+        if len(shape) not in [0, 1]:
+            raise ValueError(
+                "Wrong ClassLabel shape provided, should be of the format () or (None,) or (N,)"
+            )
diff --git a/hub/schema/deserialize.py b/hub/schema/deserialize.py
@@ -35,22 +35,28 @@ def deserialize(inp):
             )
         elif inp["type"] == "BBox":
             return BBox(
+                shape=tuple(inp["shape"]),
                 dtype=deserialize(inp["dtype"]),
                 chunks=inp["chunks"],
                 compressor=_get_compressor(inp),
+                max_shape=tuple(inp["max_shape"]),
             )
         elif inp["type"] == "ClassLabel":
             if inp["_names"] is not None:
                 return ClassLabel(
+                    shape=tuple(inp["shape"]),
                     names=inp["_names"],
                     chunks=inp["chunks"],
                     compressor=_get_compressor(inp),
+                    max_shape=tuple(inp["max_shape"]),
                 )
             else:
                 return ClassLabel(
+                    shape=tuple(inp["shape"]),
                     num_classes=inp["_num_classes"],
                     chunks=inp["chunks"],
                     compressor=_get_compressor(inp),
+                    max_shape=tuple(inp["max_shape"]),
                 )
         elif inp["type"] == "SchemaDict" or inp["type"] == "FeatureDict":
             d = {}

diff --git a/hub/schema/sequence.py b/hub/schema/sequence.py
@@ -12,12 +12,12 @@ class Sequence(Tensor):
     At generation time, a list for each of the sequence element is given. The output
     of `Dataset` will batch all the elements of the sequence together.
     If the length of the sequence is static and known in advance, it should be
-    specified in the constructor using the `length` param.
+    specified in the constructor using the `shape` param.
 
     | Usage:
     ----------
 
-    >>> sequence = Sequence(Image(), length=NB_FRAME)
+    >>> sequence = Sequence(shape=(5,), dtype = Image((100, 100, 3)))
     """
 
     def __init__(

diff --git a/hub/schema/tests/test_features.py b/hub/schema/tests/test_features.py
@@ -15,6 +15,8 @@
 from hub.schema.class_label import ClassLabel, _load_names_from_file
 from hub.schema.features import HubSchema, SchemaDict, Tensor
 import pytest
+from hub import Dataset
+import numpy as np
 
 
 def test_hub_feature_flatten():
@@ -94,13 +96,87 @@ def test_class_label():
         cl2.names = ["ab", "cd", "ef", "gh"]
 
 
+def test_class_label_2():
+    cl1 = ClassLabel(names=["apple", "banana", "cat"])
+    cl2 = ClassLabel((None,), (10,), names=["apple", "banana", "cat"])
+    cl3 = ClassLabel((3,), names=["apple", "banana", "cat"])
+    my_schema = {"cl1": cl1, "cl2": cl2, "cl3": cl3}
+
+    ds = Dataset("./data/cl_2d_3d", schema=my_schema, shape=(10), mode="w")
+
+    ds["cl1", 0] = cl1.str2int("cat")
+    ds["cl1", 1] = cl1.str2int("apple")
+    ds["cl1", 2] = cl1.str2int("apple")
+    ds["cl1", 3:5] = [cl1.str2int("banana"), cl1.str2int("banana")]
+    assert ds["cl1", 1].compute(True) == "apple"
+    assert ds["cl1", 0:3].compute(True) == ["cat", "apple", "apple"]
+    assert ds["cl1", 3:5].compute(True) == ["banana", "banana"]
+
+    ds["cl2", 0] = np.array(
+        [cl2.str2int("cat"), cl2.str2int("cat"), cl2.str2int("apple")]
+    )
+    ds["cl2", 1] = np.array([cl2.str2int("apple"), cl2.str2int("banana")])
+    ds["cl2", 2] = np.array(
+        [
+            cl2.str2int("cat"),
+            cl2.str2int("apple"),
+            cl2.str2int("banana"),
+            cl2.str2int("apple"),
+            cl2.str2int("banana"),
+        ]
+    )
+    ds["cl2", 3] = np.array([cl2.str2int("cat")])
+    assert ds["cl2", 0].compute(True) == ["cat", "cat", "apple"]
+    assert ds["cl2", 1].compute(True) == ["apple", "banana"]
+    assert ds["cl2", 2].compute(True) == ["cat", "apple", "banana", "apple", "banana"]
+    assert ds["cl2", 3].compute(True) == ["cat"]
+
+    ds["cl3", 0] = np.array(
+        [cl3.str2int("apple"), cl3.str2int("apple"), cl3.str2int("apple")]
+    )
+    ds["cl3", 1] = np.array(
+        [cl3.str2int("banana"), cl3.str2int("banana"), cl3.str2int("banana")]
+    )
+    ds["cl3", 2] = np.array(
+        [cl3.str2int("cat"), cl3.str2int("cat"), cl3.str2int("cat")]
+    )
+    assert ds["cl3", 0].compute(True) == ["apple", "apple", "apple"]
+    assert ds["cl3", 1].compute(True) == ["banana", "banana", "banana"]
+    assert ds["cl3", 2].compute(True) == ["cat", "cat", "cat"]
+    assert ds["cl3", 0:3].compute(True) == [
+        ["apple", "apple", "apple"],
+        ["banana", "banana", "banana"],
+        ["cat", "cat", "cat"],
+    ]
+
+
 def test_polygon():
     with pytest.raises(ValueError):
         poly1 = Polygon(shape=(11, 3))
     with pytest.raises(ValueError):
         poly2 = Polygon(shape=(11, 4, 2))
 
 
+def test_bbox_shape():
+    with pytest.raises(ValueError):
+        bb1 = BBox(shape=(11, 3))
+    with pytest.raises(ValueError):
+        bb2 = BBox(shape=(11, 4, 2))
+    bb3 = BBox(shape=(None, 4), max_shape=(10, 4))
+    bb4 = BBox(shape=(4,))
+    bb4 = BBox(shape=(5, 4))
+
+
+def test_classlabel_shape():
+    with pytest.raises(ValueError):
+        cl1 = ClassLabel(shape=(11, 3))
+    with pytest.raises(ValueError):
+        cl2 = ClassLabel(shape=(11, 4, 2))
+    cl3 = ClassLabel(shape=(None,), max_shape=(10,))
+    cl4 = ClassLabel()
+    cl4 = ClassLabel(shape=(5,))
+
+
 test_image_inputs = [
     "uint32",
     "int16",
@@ -134,8 +210,8 @@ def test_classlabel_repr():
     cl1 = ClassLabel(num_classes=5)
     cl2 = ClassLabel(names=["apple", "orange", "banana"])
 
-    text1 = "ClassLabel(shape=(), dtype='int64', num_classes=5)"
-    text2 = "ClassLabel(shape=(), dtype='int64', names=['apple', 'orange', 'banana'], num_classes=3)"
+    text1 = "ClassLabel(shape=(), dtype='uint16', num_classes=5)"
+    text2 = "ClassLabel(shape=(), dtype='uint16', names=['apple', 'orange', 'banana'], num_classes=3)"
     assert cl1.__repr__() == text1
     assert cl2.__repr__() == text2