Commit
Remove default image dtype (#2659)
FayazRahman committed Oct 17, 2023
1 parent c00b49f commit efbc684
Showing 5 changed files with 61 additions and 15 deletions.
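
For orientation, the user-visible effect of this change (mirrored by the new tests below): an image tensor no longer defaults to uint8; its dtype stays unset until the first sample is appended and is then inferred from that sample. A minimal sketch of that behavior, assuming deeplake.empty and a throwaway local path (both illustrative, not part of this commit):

import numpy as np
import deeplake

# Throwaway local dataset; the path and tensor name are illustrative only.
ds = deeplake.empty("./dtype_demo", overwrite=True)
images = ds.create_tensor("images", htype="image", sample_compression="png")

# Previously the image htype carried a default dtype of "uint8";
# now no dtype is recorded until a sample arrives.
assert images.meta.dtype is None

# The dtype is inferred from the first appended sample rather than forced to uint8.
images.append(np.zeros((40, 40, 1), dtype=np.int16))
assert ds.images.dtype.name == "int16"
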
13 changes: 11 additions & 2 deletions deeplake/api/tests/test_api.py
@@ -2680,12 +2680,12 @@ def test_tensor_dtype_bug(local_path):
ds.abc.append(deeplake.link(f"{data_path}/standard.nii.gz"))

assert ds.abc[0].numpy().shape == (4, 5, 7)
assert ds.abc.dtype == np.dtype("<U1")
assert ds.abc.dtype == np.dtype("uint8")

ds2 = ds.copy(f"{local_path}_2", overwrite=True)

assert ds2.abc[0].numpy().shape == (4, 5, 7)
assert ds2.abc.dtype == np.dtype("<U1")
assert ds2.abc.dtype == np.dtype("uint8")


def test_iterate_with_groups(memory_ds):
@@ -3043,3 +3043,12 @@ def test_change_htype_fail(local_ds_generator):
ds.create_tensor("images3", htype="image", sample_compression="jpg")
with pytest.raises(UnsupportedCompressionError):
ds.images3.htype = "embedding"


def test_append_non_uint8_to_image(local_ds):
with local_ds as ds:
ds.create_tensor("images", htype="image", sample_compression="png")
ds.images.append(np.zeros((40, 40, 1), dtype=np.int16))
ds.images.append(np.zeros((40, 40, 1), dtype=np.uint8))

assert ds.images.dtype.name == "int16"
4 changes: 2 additions & 2 deletions deeplake/api/tests/test_api_with_compression.py
@@ -43,7 +43,7 @@ def test_populate_compressed_samples(local_ds, cat_path, flower_path):
tiling_threshold=1 * MB,
)

assert images.meta.dtype == "uint8"
assert images.meta.dtype == None
assert images.meta.sample_compression == "png"

_populate_compressed_samples(images, cat_path, flower_path)
@@ -73,7 +73,7 @@ def test_populate_compressed_samples(local_ds, cat_path, flower_path):
def test_iterate_compressed_samples(local_ds, cat_path, flower_path):
images = local_ds.create_tensor(TENSOR_KEY, htype="image", sample_compression="png")

assert images.meta.dtype == "uint8"
assert images.meta.dtype == None
assert images.meta.sample_compression == "png"

_populate_compressed_samples(images, cat_path, flower_path)
32 changes: 28 additions & 4 deletions deeplake/core/chunk_engine.py
@@ -15,7 +15,7 @@
)
from deeplake.api.info import Info
from deeplake.core.link_creds import LinkCreds
from deeplake.core.linked_sample import LinkedSample
from deeplake.core.linked_sample import LinkedSample, read_linked_sample
from deeplake.core.meta.encode.base_encoder import LAST_SEEN_INDEX_COLUMN
from deeplake.core.serialize import HEADER_SIZE_BYTES, text_to_bytes
from deeplake.core.tensor_link import (
@@ -692,9 +692,33 @@ def _sanitize_samples(
if tensor_meta.htype is None and not all_empty:
tensor_meta.set_htype(get_htype(samples))
if tensor_meta.dtype is None and not all_empty:
tensor_meta.set_dtype(
get_dtype(next(filter(lambda x: x is not None, samples)))
) # first non empty sample
if tensor_meta.is_link:
try:
# download one sample to get dtype
sample = next(filter(lambda x: x is not None, samples))
assert isinstance(
sample, LinkedSample
), "Sample must be LinkedSample"
dtype = np.dtype(
read_linked_sample(
sample.path, sample.creds_key, self.link_creds, True
)._typestr
)
except:
# assume uint8 if download fails
dtype = np.dtype("uint8")
else:
non_empty_samples = list(filter(lambda x: x is not None, samples))
for sample in non_empty_samples:
try:
dtype = get_dtype(sample)
break
except:
pass
else:
if not ignore_errors:
raise ValueError("Could not determine dtype of samples")
tensor_meta.set_dtype(dtype)
if self._convert_to_list(samples):
samples = list(samples)
if self._is_temp_label_tensor:
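
The _sanitize_samples change above makes linked tensors get their dtype by reading one linked sample via read_linked_sample, assuming uint8 only when that download fails; regular tensors take the dtype of the first sample whose dtype can be determined. A rough sketch of the resulting behavior, assuming the link[image] htype and a locally written PNG standing in for an external file (paths and names are placeholders):

import numpy as np
import deeplake
from PIL import Image

# A small local PNG stands in for any externally linked image.
Image.fromarray(np.zeros((4, 5), dtype=np.uint8)).save("./linked_sample.png")

ds = deeplake.empty("./linked_demo", overwrite=True)
ds.create_tensor("abc", htype="link[image]", sample_compression="png")
ds.abc.append(deeplake.link("./linked_sample.png"))

# The dtype now comes from the linked sample itself (uint8 for this PNG);
# uint8 is only assumed as a fallback when the sample cannot be fetched.
assert ds.abc.dtype == np.dtype("uint8")
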
25 changes: 19 additions & 6 deletions deeplake/core/transform/transform_dataset.py
@@ -161,7 +161,7 @@ def item_added(self, item, tensor):
# First sample in tensor
# Flush to set meta attributes
if dtype is None:
self.flush()
self.flush(clear_on_fail=False)
return
sizeof_item = self._calculate_sample_size(item, dtype, htype)
except:
@@ -201,22 +201,29 @@ def _flush_tensor_to_chunk_engine(
updated_tensors[full_name] = len(items)
tensor.items.clear()

def _rollback(self, updated_tensors):
def _rollback(self, updated_tensors, no_dtype_tensors):
for t in updated_tensors:
chunk_engine = self.all_chunk_engines[t]
num_samples = updated_tensors[t]
for _ in range(num_samples):
chunk_engine.pop(link_callback=chunk_engine._transform_pop_callback)

if t in no_dtype_tensors:
meta = chunk_engine.tensor_meta
meta.dtype = None
meta.typestr = None
meta.is_dirty = True

def _clear(self):
for tensor in self.data.values():
tensor.items.clear()
self.cache_used = 0

def flush(self):
def flush(self, clear_on_fail=True):
all_chunk_engines = self.all_chunk_engines
label_temp_tensors = self.label_temp_tensors
updated_tensors = {}
no_dtype_tensors = []
try:
for name, tensor in self.data.items():
if not tensor.is_group:
@@ -225,6 +232,11 @@ def flush(self):
chunk_engine = all_chunk_engines[name]
callback = chunk_engine._transform_callback

meta = chunk_engine.tensor_meta
if meta.length == 0 and meta.dtype is None:
# for rolling back dtype change
no_dtype_tensors.append(name)

if tensor.numpy_only:
self._flush_numpy_tensor_to_chunk_engine(
name, tensor, chunk_engine, callback, updated_tensors
@@ -234,9 +246,10 @@ def flush(self):
name, tensor, chunk_engine, callback, updated_tensors
)
self.start_input_idx = None
self._clear()
except Exception as e:
self._rollback(updated_tensors)
self._rollback(updated_tensors, no_dtype_tensors)
if clear_on_fail:
self._clear()
e = e.__cause__ if isinstance(e, SampleAppendError) else e # type: ignore
raise SampleAppendError(name) from e
finally:
self._clear()
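
The flush/_rollback changes above matter for the transform path: flush(clear_on_fail=False) lets item_added probe meta attributes without wiping buffered items, and a failed flush now also resets a dtype that was only inferred during that flush. A hedged sketch of the (successful) transform path, assuming deeplake.compute and placeholder names:

import numpy as np
import deeplake

@deeplake.compute
def add_image(i, sample_out):
    # Each call appends one int16 image; the tensor's dtype is inferred from it.
    sample_out.images.append(np.full((8, 8, 1), i, dtype=np.int16))

ds = deeplake.empty("./transform_demo", overwrite=True)
ds.create_tensor("images", htype="image", sample_compression="png")
add_image().eval(list(range(4)), ds, num_workers=0)

assert ds.images.dtype.name == "int16"
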
2 changes: 1 addition & 1 deletion deeplake/htype.py
@@ -50,7 +50,7 @@ class htype:
HTYPE_CONFIGURATIONS: Dict[str, Dict] = {
htype.DEFAULT: {"dtype": None},
htype.IMAGE: {
"dtype": "uint8",
"dtype": None,
"intrinsics": None,
"_info": ["intrinsics"],
},
