diff --git a/darwin/client.py b/darwin/client.py
index 3e04d0a83..e311917d1 100644
--- a/darwin/client.py
+++ b/darwin/client.py
@@ -131,7 +131,7 @@ def list_local_datasets(self, team_slug: Optional[str] = None) -> Iterator[Path]
 
     def list_remote_datasets(
         self, team_slug: Optional[str] = None
-    ) -> Iterator[RemoteDataset]:
+    ) -> Iterator[RemoteDatasetV2]:
         """
         Returns a list of all available datasets with the team currently authenticated against.
 
@@ -162,7 +162,7 @@ def list_remote_datasets(
 
     def get_remote_dataset(
         self, dataset_identifier: Union[str, DatasetIdentifier]
-    ) -> RemoteDataset:
+    ) -> RemoteDatasetV2:
         """
         Get a remote dataset based on its identifier.
 
@@ -189,7 +189,7 @@ def get_remote_dataset(
             parsed_dataset_identifier.team_slug = self.default_team
 
         try:
-            matching_datasets: List[RemoteDataset] = [
+            matching_datasets: List[RemoteDatasetV2] = [
                 dataset
                 for dataset in self.list_remote_datasets(
                     team_slug=parsed_dataset_identifier.team_slug
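Note for reviewers: the client.py change is purely a typing one — both lookup methods already return V2 datasets at runtime, so narrowing the annotations from RemoteDataset to RemoteDatasetV2 lets the new V2-only registration methods type-check without casts. A minimal sketch of the effect (the dataset slug is a placeholder):

    from darwin.client import Client

    client = Client.local()  # assumes a locally configured darwin-py client
    dataset = client.get_remote_dataset("my-team/my-dataset")
    # `dataset` is now annotated as RemoteDatasetV2, so type checkers accept
    # V2-only methods such as dataset.register(...) without an explicit cast.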
""" + try: + StorageKeyListModel(storage_keys=storage_keys) + except ValidationError as e: + print( + f"Error validating storage keys: {e}\n\nPlease make sure your storage keys are a list of strings" + ) + raise e items = [] for storage_key in storage_keys: file_type = get_external_file_type(storage_key) @@ -610,9 +634,115 @@ def register( # Do not register more than 500 items in a single request chunk_size = 500 - chunked_items = ( - items[i : i + chunk_size] for i in range(0, len(items), chunk_size) + chunked_items = chunk_items(items, chunk_size) + print(f"Registering {len(items)} items in chunks of {chunk_size} items...") + results = { + "registered": [], + "blocked": [], + } + + for chunk in chunked_items: + payload = { + "items": chunk, + "dataset_slug": self.slug, + "storage_slug": object_store.name, + } + print(f"Registering {len(chunk)} items...") + response = self.client.api_v2.register_items(payload, team_slug=self.team) + for item in json.loads(response.text)["items"]: + item_info = f"Item {item['name']} registered with item ID {item['id']}" + results["registered"].append(item_info) + for item in json.loads(response.text)["blocked_items"]: + item_info = f"Item {item['name']} was blocked for the reason: {item['slots'][0]['reason']}" + results["blocked"].append(item_info) + print( + f"{len(results['registered'])} of {len(storage_keys)} items registered successfully" ) + if results["blocked"]: + print("The following items were blocked:") + for item in results["blocked"]: + print(f" - {item}") + print(f"Reistration complete. Check your items in the dataset: {self.slug}") + return results + + def register_multi_slotted( + self, + object_store: ObjectStore, + storage_keys: Dict[str, List[str]], + fps: Optional[Union[str, float]] = None, + multi_planar_view: bool = False, + preserve_folders: bool = False, + ) -> Dict[str, List[str]]: + """ + Register files in the dataset in multiple slots. + + Parameters + ---------- + object_store : ObjectStore + Object store to use for the registration. + storage_keys : Dict[str, List[str] + Storage keys to register. The keys are the item names and the values are lists of storage keys. + fps : Optional[str], default: None + When the uploading file is a video, specify its framerate. + multi_planar_view : bool, default: False + Uses multiplanar view when uploading files. + preserve_folders : bool, default: False + Specify whether or not to preserve folder paths when uploading + + Returns + ------- + Dict[str, List[str]] + A dictionary with the list of registered files. + + Raises + ------ + ValueError + If ``storage_keys`` is not a dictionary with keys as item names and values as lists of storage keys. + TypeError + If the file type is not supported. 
+ """ + + try: + StorageKeyDictModel(storage_keys=storage_keys) + except ValidationError as e: + print( + f"Error validating storage keys: {e}\n\nPlease make sure your storage keys are a dictionary with keys as item names and values as lists of storage keys" + ) + raise e + items = [] + for item in storage_keys: + slots = [] + for storage_key in storage_keys[item]: + file_name = get_external_file_name(storage_key) + file_type = get_external_file_type(storage_key) + if not file_type: + raise TypeError( + f"Unsupported file type for the following storage key: {storage_key}.\nPlease make sure your storage key ends with one of the supported extensions:\n{SUPPORTED_EXTENSIONS}" + ) + slot = { + "slot_name": file_name, + "type": file_type, + "storage_key": storage_key, + "file_name": file_name, + } + if fps and file_type == "video": + slot["fps"] = fps + if multi_planar_view and file_type == "dicom": + slot["extract_views"] = "true" + slots.append(slot) + items.append( + { + "slots": slots, + "name": item, + "path": parse_external_file_path( + storage_keys[item][0], preserve_folders + ), + } + ) + + # Do not register more than 500 items in a single request + chunk_size = 500 + chunked_items = chunk_items(items, chunk_size) print(f"Registering {len(items)} items in chunks of {chunk_size} items...") results = { "registered": [], diff --git a/darwin/dataset/utils.py b/darwin/dataset/utils.py index a0d550b53..8992e6571 100644 --- a/darwin/dataset/utils.py +++ b/darwin/dataset/utils.py @@ -893,3 +893,41 @@ def parse_external_file_path(storage_key: str, preserve_folders: bool) -> str: if not preserve_folders: return "/" return "/" + "/".join(storage_key.split("/")[:-1]) + + +def get_external_file_name(storage_key: str) -> str: + """ + Returns the name of the file given a storage key. + + Parameters + ---------- + storage_key : str + The storage key to get the file name from. + + Returns + ------- + str + The name of the file. + """ + if "/" not in storage_key: + return storage_key + return storage_key.split("/")[-1] + + +def chunk_items(items: List[Any], chunk_size: int = 500) -> Iterator[List[Any]]: + """ + Splits the list of items into chunks of specified size. + + Parameters + ---------- + items : List[Any] + The list of items to split. + chunk_size : int, default: 500 + The size of each chunk. + + Returns + ------- + Iterator[List[Any]] + An iterator that yields lists of items, each of length ``chunk_size``. 
+ """ + return (items[i : i + chunk_size] for i in range(0, len(items), chunk_size)) diff --git a/darwin/datatypes.py b/darwin/datatypes.py index b30e17710..e922b17c6 100644 --- a/darwin/datatypes.py +++ b/darwin/datatypes.py @@ -17,6 +17,8 @@ Union, ) +from pydantic import BaseModel + try: from numpy.typing import NDArray except ImportError: @@ -1491,3 +1493,11 @@ def __str__(self) -> str: def __repr__(self) -> str: return f"ObjectStore(name={self.name}, prefix={self.prefix}, readonly={self.readonly}, provider={self.provider})" + + +class StorageKeyDictModel(BaseModel): + storage_keys: Dict[str, List[str]] + + +class StorageKeyListModel(BaseModel): + storage_keys: List[str] diff --git a/tests/darwin/dataset/remote_dataset_test.py b/tests/darwin/dataset/remote_dataset_test.py index dc797dbb0..aa133b17b 100644 --- a/tests/darwin/dataset/remote_dataset_test.py +++ b/tests/darwin/dataset/remote_dataset_test.py @@ -14,6 +14,7 @@ from darwin.dataset.release import Release from darwin.dataset.remote_dataset_v2 import RemoteDatasetV2 from darwin.dataset.upload_manager import LocalFile, UploadHandlerV2 +from darwin.datatypes import ObjectStore from darwin.exceptions import UnsupportedExportFormat, UnsupportedFileType from darwin.item import DatasetItem from tests.fixtures import * @@ -871,3 +872,163 @@ def test_honours_include_authorship(self, remote_dataset: RemoteDatasetV2): include_url_token=False, include_authorship=True, ) + + +@pytest.mark.usefixtures("file_read_write_test") +class TestRegister: + def test_raises_if_storage_keys_not_list_of_strings( + self, remote_dataset: RemoteDatasetV2 + ): + with pytest.raises(ValueError): + remote_dataset.register( + ObjectStore( + name="test", + prefix="test_prefix", + readonly=False, + provider="aws", + default=True, + ), + [1, 2, 3], + ) + + def test_raises_if_unsupported_file_type(self, remote_dataset: RemoteDatasetV2): + with pytest.raises(TypeError): + remote_dataset.register( + ObjectStore( + name="test", + prefix="test_prefix", + readonly=False, + provider="aws", + default=True, + ), + ["unsupported_file.xyz"], + ) + + @responses.activate + def test_register_files(self, remote_dataset: RemoteDatasetV2): + responses.add( + responses.POST, + "http://localhost/api/v2/teams/v7-darwin-json-v2/items/register_existing", + json={ + "items": [{"id": "1", "name": "test.jpg"}], + "blocked_items": [], + }, + status=200, + ) + result = remote_dataset.register( + ObjectStore( + name="test", + prefix="test_prefix", + readonly=False, + provider="aws", + default=True, + ), + ["test.jpg"], + ) + assert len(result["registered"]) == 1 + assert len(result["blocked"]) == 0 + + @responses.activate + def test_register_files_with_blocked_items(self, remote_dataset: RemoteDatasetV2): + responses.add( + responses.POST, + "http://localhost/api/v2/teams/v7-darwin-json-v2/items/register_existing", + json={ + "items": [], + "blocked_items": [ + {"name": "test.jpg", "slots": [{"reason": "test reason"}]} + ], + }, + status=200, + ) + result = remote_dataset.register( + ObjectStore( + name="test", + prefix="test_prefix", + readonly=False, + provider="aws", + default=True, + ), + ["test.jpg"], + ) + assert len(result["registered"]) == 0 + assert len(result["blocked"]) == 1 + + +@pytest.mark.usefixtures("file_read_write_test") +class TestRegisterMultiSlotted: + def test_raises_if_storage_keys_not_dictionary( + self, remote_dataset: RemoteDatasetV2 + ): + with pytest.raises(ValueError): + remote_dataset.register_multi_slotted( + ObjectStore( + name="test", + prefix="test_prefix", + 
diff --git a/darwin/datatypes.py b/darwin/datatypes.py
index b30e17710..e922b17c6 100644
--- a/darwin/datatypes.py
+++ b/darwin/datatypes.py
@@ -17,6 +17,8 @@
     Union,
 )
 
+from pydantic import BaseModel
+
 try:
     from numpy.typing import NDArray
 except ImportError:
@@ -1491,3 +1493,11 @@ def __str__(self) -> str:
 
     def __repr__(self) -> str:
         return f"ObjectStore(name={self.name}, prefix={self.prefix}, readonly={self.readonly}, provider={self.provider})"
+
+
+class StorageKeyDictModel(BaseModel):
+    storage_keys: Dict[str, List[str]]
+
+
+class StorageKeyListModel(BaseModel):
+    storage_keys: List[str]
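These models exist only to fail fast on malformed input before any API call is made. A short sketch of what they accept and reject; pydantic's ValidationError subclasses ValueError, which is what the new tests rely on with pytest.raises(ValueError):

    from pydantic import ValidationError

    from darwin.datatypes import StorageKeyDictModel, StorageKeyListModel

    StorageKeyListModel(storage_keys=["path/to/file.jpg"])             # accepted
    StorageKeyDictModel(storage_keys={"item1": ["path/to/file.jpg"]})  # accepted

    try:
        StorageKeyListModel(storage_keys=[1, 2, 3])  # ints are not coerced to str under pydantic v2
    except ValidationError as err:
        print(err)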
readonly=False, provider=aws)" + ) diff --git a/tests/darwin/objectstore_test.py b/tests/darwin/objectstore_test.py deleted file mode 100644 index 43ede8f9f..000000000 --- a/tests/darwin/objectstore_test.py +++ /dev/null @@ -1,65 +0,0 @@ -from pathlib import Path - -import pytest - -from darwin.client import Client -from darwin.config import Config -from darwin.dataset.remote_dataset_v2 import RemoteDatasetV2 -from darwin.datatypes import ObjectStore - - -class TestObjectStore: - @pytest.fixture - def object_store(self): - return ObjectStore( - name="test", - prefix="test_prefix", - readonly=False, - provider="aws", - default=True, - ) - - @pytest.fixture - def darwin_client( - darwin_config_path: Path, - darwin_datasets_path: Path, - team_slug_darwin_json_v2: str, - ) -> Client: - config = Config(darwin_config_path) - config.put(["global", "api_endpoint"], "http://localhost/api") - config.put(["global", "base_url"], "http://localhost") - config.put(["teams", team_slug_darwin_json_v2, "api_key"], "mock_api_key") - config.put( - ["teams", team_slug_darwin_json_v2, "datasets_dir"], - str(darwin_datasets_path), - ) - return Client(config) - - @pytest.fixture - def remote_dataset_v2(self): - return RemoteDatasetV2( - client=self.darwin_client, - team="test_team", - name="Test dataset", - slug="test-dataset", - dataset_id=1, - ) - - def test_init(self, object_store): - assert object_store.name == "test" - assert object_store.prefix == "test_prefix" - assert object_store.readonly is False - assert object_store.provider == "aws" - assert object_store.default is True - - def test_str(self, object_store): - assert ( - str(object_store) - == "Storage configuration:\n- Name: test\n- Prefix: test_prefix\n- Readonly: False\n- Provider: aws\n- Default: True" - ) - - def test_repr(self, object_store): - assert ( - repr(object_store) - == "ObjectStore(name=test, prefix=test_prefix, readonly=False, provider=aws)" - )