Commit
Remove default image dtype (#2659)
FayazRahman committed Oct 17, 2023
1 parent c00b49f commit efbc684
Showing 5 changed files with 61 additions and 15 deletions.
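
For orientation, the user-visible effect of this change (mirrored by the new tests below): an image tensor no longer defaults to uint8; its dtype stays unset until the first sample is appended and is then inferred from that sample. A minimal sketch of that behavior, assuming deeplake.empty and a throwaway local path (both illustrative, not part of this commit):

import numpy as np
import deeplake

# Throwaway local dataset; the path and tensor name are illustrative only.
ds = deeplake.empty("./dtype_demo", overwrite=True)
images = ds.create_tensor("images", htype="image", sample_compression="png")

# Previously the image htype carried a default dtype of "uint8";
# now no dtype is recorded until a sample arrives.
assert images.meta.dtype is None

# The dtype is inferred from the first appended sample rather than forced to uint8.
images.append(np.zeros((40, 40, 1), dtype=np.int16))
assert ds.images.dtype.name == "int16"
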
13 changes: 11 additions & 2 deletions deeplake/api/tests/test_api.py
@@ -2680,12 +2680,12 @@ def test_tensor_dtype_bug(local_path):
ds.abc.append(deeplake.link(f"{data_path}/standard.nii.gz"))

assert ds.abc[0].numpy().shape == (4, 5, 7)
assert ds.abc.dtype == np.dtype("<U1")
assert ds.abc.dtype == np.dtype("uint8")

ds2 = ds.copy(f"{local_path}_2", overwrite=True)

assert ds2.abc[0].numpy().shape == (4, 5, 7)
assert ds2.abc.dtype == np.dtype("<U1")
assert ds2.abc.dtype == np.dtype("uint8")


def test_iterate_with_groups(memory_ds):
@@ -3043,3 +3043,12 @@ def test_change_htype_fail(local_ds_generator):
ds.create_tensor("images3", htype="image", sample_compression="jpg")
with pytest.raises(UnsupportedCompressionError):
ds.images3.htype = "embedding"


def test_append_non_uint8_to_image(local_ds):
with local_ds as ds:
ds.create_tensor("images", htype="image", sample_compression="png")
ds.images.append(np.zeros((40, 40, 1), dtype=np.int16))
ds.images.append(np.zeros((40, 40, 1), dtype=np.uint8))

assert ds.images.dtype.name == "int16"
4 changes: 2 additions & 2 deletions deeplake/api/tests/test_api_with_compression.py
@@ -43,7 +43,7 @@ def test_populate_compressed_samples(local_ds, cat_path, flower_path):
tiling_threshold=1 * MB,
)

assert images.meta.dtype == "uint8"
assert images.meta.dtype == None
assert images.meta.sample_compression == "png"

_populate_compressed_samples(images, cat_path, flower_path)
@@ -73,7 +73,7 @@ def test_populate_compressed_samples(local_ds, cat_path, flower_path):
def test_iterate_compressed_samples(local_ds, cat_path, flower_path):
images = local_ds.create_tensor(TENSOR_KEY, htype="image", sample_compression="png")

assert images.meta.dtype == "uint8"
assert images.meta.dtype == None
assert images.meta.sample_compression == "png"

_populate_compressed_samples(images, cat_path, flower_path)
32 changes: 28 additions & 4 deletions deeplake/core/chunk_engine.py
@@ -15,7 +15,7 @@
)
from deeplake.api.info import Info
from deeplake.core.link_creds import LinkCreds
from deeplake.core.linked_sample import LinkedSample
from deeplake.core.linked_sample import LinkedSample, read_linked_sample
from deeplake.core.meta.encode.base_encoder import LAST_SEEN_INDEX_COLUMN
from deeplake.core.serialize import HEADER_SIZE_BYTES, text_to_bytes
from deeplake.core.tensor_link import (
@@ -692,9 +692,33 @@ def _sanitize_samples(
if tensor_meta.htype is None and not all_empty:
tensor_meta.set_htype(get_htype(samples))
if tensor_meta.dtype is None and not all_empty:
tensor_meta.set_dtype(
get_dtype(next(filter(lambda x: x is not None, samples)))
) # first non empty sample
if tensor_meta.is_link:
try:
# download one sample to get dtype
sample = next(filter(lambda x: x is not None, samples))
assert isinstance(
sample, LinkedSample
), "Sample must be LinkedSample"
dtype = np.dtype(
read_linked_sample(
sample.path, sample.creds_key, self.link_creds, True
)._typestr
)
except:
# assume uint8 if download fails
dtype = np.dtype("uint8")
else:
non_empty_samples = list(filter(lambda x: x is not None, samples))
for sample in non_empty_samples:
try:
dtype = get_dtype(sample)
break
except:
pass
else:
if not ignore_errors:
raise ValueError("Could not determine dtype of samples")
tensor_meta.set_dtype(dtype)
if self._convert_to_list(samples):
samples = list(samples)
if self._is_temp_label_tensor:
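
The _sanitize_samples change above makes linked tensors get their dtype by reading one linked sample via read_linked_sample, assuming uint8 only when that download fails; regular tensors take the dtype of the first sample whose dtype can be determined. A rough sketch of the resulting behavior, assuming the link[image] htype and a locally written PNG standing in for an external file (paths and names are placeholders):

import numpy as np
import deeplake
from PIL import Image

# A small local PNG stands in for any externally linked image.
Image.fromarray(np.zeros((4, 5), dtype=np.uint8)).save("./linked_sample.png")

ds = deeplake.empty("./linked_demo", overwrite=True)
ds.create_tensor("abc", htype="link[image]", sample_compression="png")
ds.abc.append(deeplake.link("./linked_sample.png"))

# The dtype now comes from the linked sample itself (uint8 for this PNG);
# uint8 is only assumed as a fallback when the sample cannot be fetched.
assert ds.abc.dtype == np.dtype("uint8")
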
25 changes: 19 additions & 6 deletions deeplake/core/transform/transform_dataset.py
@@ -161,7 +161,7 @@ def item_added(self, item, tensor):
# First sample in tensor
# Flush to set meta attributes
if dtype is None:
self.flush()
self.flush(clear_on_fail=False)
return
sizeof_item = self._calculate_sample_size(item, dtype, htype)
except:
@@ -201,22 +201,29 @@ def _flush_tensor_to_chunk_engine(
updated_tensors[full_name] = len(items)
tensor.items.clear()

def _rollback(self, updated_tensors):
def _rollback(self, updated_tensors, no_dtype_tensors):
for t in updated_tensors:
chunk_engine = self.all_chunk_engines[t]
num_samples = updated_tensors[t]
for _ in range(num_samples):
chunk_engine.pop(link_callback=chunk_engine._transform_pop_callback)

if t in no_dtype_tensors:
meta = chunk_engine.tensor_meta
meta.dtype = None
meta.typestr = None
meta.is_dirty = True

def _clear(self):
for tensor in self.data.values():
tensor.items.clear()
self.cache_used = 0

def flush(self):
def flush(self, clear_on_fail=True):
all_chunk_engines = self.all_chunk_engines
label_temp_tensors = self.label_temp_tensors
updated_tensors = {}
no_dtype_tensors = []
try:
for name, tensor in self.data.items():
if not tensor.is_group:
@@ -225,6 +232,11 @@ def flush(self):
chunk_engine = all_chunk_engines[name]
callback = chunk_engine._transform_callback

meta = chunk_engine.tensor_meta
if meta.length == 0 and meta.dtype is None:
# for rolling back dtype change
no_dtype_tensors.append(name)

if tensor.numpy_only:
self._flush_numpy_tensor_to_chunk_engine(
name, tensor, chunk_engine, callback, updated_tensors
@@ -234,9 +246,10 @@ def flush(self):
name, tensor, chunk_engine, callback, updated_tensors
)
self.start_input_idx = None
self._clear()
except Exception as e:
self._rollback(updated_tensors)
self._rollback(updated_tensors, no_dtype_tensors)
if clear_on_fail:
self._clear()
e = e.__cause__ if isinstance(e, SampleAppendError) else e # type: ignore
raise SampleAppendError(name) from e
finally:
self._clear()
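
The flush/_rollback changes above matter for the transform path: flush(clear_on_fail=False) lets item_added probe meta attributes without wiping buffered items, and a failed flush now also resets a dtype that was only inferred during that flush. A hedged sketch of the (successful) transform path, assuming deeplake.compute and placeholder names:

import numpy as np
import deeplake

@deeplake.compute
def add_image(i, sample_out):
    # Each call appends one int16 image; the tensor's dtype is inferred from it.
    sample_out.images.append(np.full((8, 8, 1), i, dtype=np.int16))

ds = deeplake.empty("./transform_demo", overwrite=True)
ds.create_tensor("images", htype="image", sample_compression="png")
add_image().eval(list(range(4)), ds, num_workers=0)

assert ds.images.dtype.name == "int16"
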
2 changes: 1 addition & 1 deletion deeplake/htype.py
@@ -50,7 +50,7 @@ class htype:
HTYPE_CONFIGURATIONS: Dict[str, Dict] = {
htype.DEFAULT: {"dtype": None},
htype.IMAGE: {
"dtype": "uint8",
"dtype": None,
"intrinsics": None,
"_info": ["intrinsics"],
},
