Merge pull request #1060 from activeloopai/task/2.0/hub-delete
Add static dataset delete
benchislett committed Aug 2, 2021
2 parents 83cf9ea + c3fdc2e commit a308531
Showing 4 changed files with 92 additions and 7 deletions.
34 changes: 28 additions & 6 deletions hub/api/dataset.py
@@ -1,10 +1,13 @@
from hub.util.exceptions import DatasetHandlerError
from hub.util.storage import get_storage_and_cache_chain
import hub
from typing import Optional, Union
from hub.constants import DEFAULT_LOCAL_CACHE_SIZE, DEFAULT_MEMORY_CACHE_SIZE, MB
from hub.core.dataset import Dataset

from hub.constants import DEFAULT_MEMORY_CACHE_SIZE, DEFAULT_LOCAL_CACHE_SIZE, MB
from hub.client.log import logger
from hub.util.keys import dataset_exists
from hub.util.bugout_reporter import hub_reporter
from hub.util.exceptions import DatasetHandlerError
from hub.util.storage import get_storage_and_cache_chain, storage_provider_from_path
from hub.core.dataset import Dataset


class dataset:
@@ -189,8 +192,27 @@ def load(
@staticmethod
@hub_reporter.record_call
def delete(path: str, force: bool = False, large_ok: bool = False) -> None:
"""Deletes a dataset"""
raise NotImplementedError
"""Deletes a dataset at a given path.
This is an IRREVERSIBLE operation. Data once deleted cannot be recovered.
Args:
path (str): The path to the dataset to be deleted.
force (bool): Delete data regardless of whether
it looks like a hub dataset. All data at the path will be removed.
large_ok (bool): Delete datasets larger than 1GB. Disabled by default.
"""

try:
ds = hub.load(path)
ds.delete(large_ok=large_ok)
except:
if force:
base_storage = storage_provider_from_path(
path, creds={}, read_only=False, token=None
)
base_storage.clear()
else:
raise

@staticmethod
@hub_reporter.record_call
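For context, the new static `hub.dataset.delete` first tries to load the path as a hub dataset and only falls back to clearing the raw storage when `force=True`. A minimal usage sketch, modeled on the test added in this PR (the local paths `my_ds/` and `raw_dir/` are hypothetical):

import os
import hub
from hub.util.exceptions import DatasetHandlerError

# Deleting a proper hub dataset needs no extra flags.
ds = hub.empty("my_ds/")
ds.create_tensor("images")
hub.dataset.delete("my_ds/")
assert not os.path.isfile("my_ds/dataset_meta.json")

# A path holding arbitrary, non-hub data is only cleared when force=True.
os.makedirs("raw_dir", exist_ok=True)
with open("raw_dir/notes.txt", "w") as f:
    f.write("not a hub dataset")

try:
    hub.dataset.delete("raw_dir/")  # refuses: not recognizable as a hub dataset
except DatasetHandlerError:
    hub.dataset.delete("raw_dir/", force=True)  # clears everything at the path

Note that in the implementation above, any failure to load the path falls through to the `force` branch, so with `force=True` whatever lives at the path is removed via `storage_provider_from_path(...).clear()`.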
40 changes: 40 additions & 0 deletions hub/api/tests/test_api.py
@@ -7,8 +7,11 @@
from hub.util.exceptions import (
TensorDtypeMismatchError,
TensorInvalidSampleShapeError,
DatasetHandlerError,
UnsupportedCompressionError,
)
from hub.constants import MB

from click.testing import CliRunner
from hub.tests.dataset_fixtures import (
enabled_datasets,
@@ -555,3 +558,40 @@ def test_tensor_creation_fail_recovery():
assert list(ds.tensors) == ["x", "y"]
ds.create_tensor("z")
assert list(ds.tensors) == ["x", "y", "z"]


def test_dataset_delete():
with CliRunner().isolated_filesystem():
os.mkdir("test")
with open("test/test.txt", "w") as f:
f.write("some data")

with pytest.raises(DatasetHandlerError):
# Can't delete raw data without force
hub.dataset.delete("test/")

hub.dataset.delete("test/", force=True)
assert not os.path.isfile("test/test.txt")

hub.empty("test/").create_tensor("tmp")
assert os.path.isfile("test/dataset_meta.json")

hub.dataset.delete("test/")
assert not os.path.isfile("test/dataset_meta.json")

old_size = hub.constants.DELETE_SAFETY_SIZE
hub.constants.DELETE_SAFETY_SIZE = 1 * MB

ds = hub.empty("test/")
ds.create_tensor("data")
ds.data.extend(np.zeros((100, 2000)))

try:
hub.dataset.delete("test/")
finally:
assert os.path.isfile("test/dataset_meta.json")

hub.dataset.delete("test/", large_ok=True)
assert not os.path.isfile("test/dataset_meta.json")

hub.constants.DELETE_SAFETY_SIZE = old_size
2 changes: 2 additions & 0 deletions hub/constants.py
@@ -35,6 +35,8 @@
DEFAULT_MEMORY_CACHE_SIZE = 256
DEFAULT_LOCAL_CACHE_SIZE = 0

# maximum allowable size before `large_ok` must be passed to dataset delete methods
DELETE_SAFETY_SIZE = 1 * GB

# meta is hub-defined information, necessary for hub Datasets/Tensors to function
DATASET_META_FILENAME = "dataset_meta.json"
23 changes: 22 additions & 1 deletion hub/core/dataset.py
@@ -1,3 +1,4 @@
import hub
from hub.api.info import load_info
from hub.core.storage.provider import StorageProvider
from hub.core.tensor import create_tensor, Tensor
@@ -395,11 +396,31 @@ def clear_cache(self):
if hasattr(self.storage, "clear_cache"):
self.storage.clear_cache()

def size_approx(self):
"""Estimates the size in bytes of the dataset.
Includes only content, so will generally return an under-estimate.
"""
tensors = self.tensors.values()
chunk_engines = [tensor.chunk_engine for tensor in tensors]
size = sum(c.num_chunks * c.min_chunk_size for c in chunk_engines)
return size

@hub_reporter.record_call
def delete(self):
def delete(self, large_ok=False):
"""Deletes the entire dataset from the cache layers (if any) and the underlying storage.
This is an IRREVERSIBLE operation. Data once deleted cannot be recovered.
Args:
large_ok (bool): Delete datasets larger than 1GB. Disabled by default.
"""
if not large_ok:
size = self.size_approx()
if size > hub.constants.DELETE_SAFETY_SIZE:
logger.info(
f"Hub Dataset {self.path} was too large to delete. Try again with large_ok=True."
)
return

self.storage.clear()
if self.path.startswith("hub://"):
self.client.delete_dataset_entry(self.org_id, self.ds_name)
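To illustrate the new instance-level behavior, a short sketch of how `size_approx` gates `delete` (assuming a small local dataset at the hypothetical path `small_ds/`):

import numpy as np
import hub

ds = hub.empty("small_ds/")
ds.create_tensor("data")
ds.data.extend(np.zeros((100, 100)))

# size_approx() sums num_chunks * min_chunk_size over every tensor's
# chunk engine, so it is a content-only under-estimate of storage size.
print(ds.size_approx())

# Below hub.constants.DELETE_SAFETY_SIZE (1 GB by default) this clears the
# underlying storage; above it, delete() only logs a message and returns
# unless large_ok=True is passed.
ds.delete()

For Hub-cloud paths (`hub://...`), the same call also removes the dataset entry through `client.delete_dataset_entry`, as shown in the diff above.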
