Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PLA-696][external] Allow registration of single-slotted read-write files from external storage #785

Merged
merged 16 commits into from Mar 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
16 changes: 16 additions & 0 deletions darwin/backend_v2.py
Expand Up @@ -237,3 +237,19 @@ def import_annotation(
return self._client._post_raw(
f"v2/teams/{team_slug}/items/{item_id}/import", payload=payload
)

@inject_default_team_slug
def register_items(self, payload: Dict[str, Any], team_slug: str) -> Any:
    """
    Register items from external storage.

    Parameters
    ----------
    payload: Dict[str, Any]
        The payload describing the items to register from external storage.
    team_slug: str
        The team slug.

    Returns
    -------
    Any
        The raw HTTP response from the registration endpoint. Callers read
        ``response.text`` to inspect the registered/blocked items.
    """
    # NOTE(review): other endpoints in this class use paths without a leading
    # slash (e.g. "v2/teams/..." above) — confirm the leading "/" here is
    # intentional and joins correctly against the base URL.
    return self._client._post_raw(
        f"/v2/teams/{team_slug}/items/register_existing", payload
    )
101 changes: 100 additions & 1 deletion darwin/client.py
Expand Up @@ -16,7 +16,13 @@
from darwin.dataset.identifier import DatasetIdentifier
from darwin.dataset.remote_dataset import RemoteDataset
from darwin.dataset.remote_dataset_v2 import RemoteDatasetV2
from darwin.datatypes import DarwinVersionNumber, Feature, Team, UnknownType
from darwin.datatypes import (
DarwinVersionNumber,
Feature,
ObjectStore,
Team,
UnknownType,
)
from darwin.exceptions import (
InsufficientStorage,
InvalidLogin,
Expand Down Expand Up @@ -1054,3 +1060,96 @@ def update_property(
team_slug=team_slug or self.default_team,
params=params,
)

def get_external_storage(
    self, team_slug: Optional[str] = None, name: Optional[str] = None
) -> Optional[ObjectStore]:
    """
    Get a read-write external storage connection by name.

    If no name is provided, the default team's default external storage
    connection is returned.

    Parameters
    ----------
    team_slug: Optional[str]
        The team slug. Defaults to the client's default team.
    name: Optional[str]
        The name of the external storage connection.

    Returns
    -------
    Optional[ObjectStore]
        The external storage connection with the given name, or the team's
        default connection when no name is given.

    Raises
    ------
    ValueError
        If no external storage connection is found in the team.

    ValueError
        If no name is provided and the default external storage connection is read-only.

    ValueError
        If the provided connection name refers to a read-only connection.

    ValueError
        If no connection with the provided name exists in the team.
    """
    if not team_slug:
        team_slug = self.default_team

    connections = self.list_external_storage_connections(team_slug)
    if not connections:
        raise ValueError(
            f"No external storage connections found in the team: {team_slug}. Please configure one.\n\nGuidelines can be found here: https://docs.v7labs.com/docs/external-storage-configuration"
        )

    # If no name is provided, return the default connection
    if name is None:
        for connection in connections:
            if connection.default:
                if connection.readonly:
                    raise ValueError(
                        "The default external storage connection is read-only. darwin-py only supports read-write configuration.\n\nPlease use the REST API to register items from read-only storage: https://docs.v7labs.com/docs/registering-items-from-external-storage#read-only-registration"
                    )
                return connection

    # If a name is provided, return the connection with the given name
    for connection in connections:
        if connection.name == name:
            if connection.readonly:
                # Fixed typo in user-facing message: "configuraiton" -> "configuration"
                raise ValueError(
                    "The selected external storage connection is read-only. darwin-py only supports read-write configuration.\n\nPlease use the REST API to register items from read-only storage: https://docs.v7labs.com/docs/registering-items-from-external-storage#read-only-registration"
                )
            return connection

    raise ValueError(
        f"No external storage connection found with the name: {name} in the team {team_slug}. Please configure one.\n\nGuidelines can be found at https://docs.v7labs.com/docs/external-storage-configuration"
    )

def list_external_storage_connections(self, team_slug: str) -> List[ObjectStore]:
    """
    List every external storage connection configured for a team.

    Parameters
    ----------
    team_slug: str
        The team slug.

    Returns
    -------
    List[ObjectStore]
        All external storage connections available to the team.
    """
    raw_connections: List[Dict[str, UnknownType]] = cast(
        List[Dict[str, UnknownType]],
        self._get(f"/teams/{team_slug}/storage"),
    )

    stores: List[ObjectStore] = []
    for raw in raw_connections:
        stores.append(
            ObjectStore(
                name=raw["name"],
                prefix=raw["prefix"],
                readonly=raw["readonly"],
                provider=raw["provider"],
                default=raw["default"],
            )
        )
    return stores
2 changes: 1 addition & 1 deletion darwin/dataset/local_dataset.py
Expand Up @@ -137,7 +137,7 @@ def _setup_annotations_and_images(
keep_empty_annotations: bool = False,
):
# Find all the annotations and their corresponding images
with_folders = any([item.is_dir() for item in images_dir.iterdir()])
with_folders = any(item.is_dir() for item in images_dir.iterdir())
JBWilkie marked this conversation as resolved.
Show resolved Hide resolved
annotation_filepaths = get_annotation_filepaths(
release_path, annotations_dir, annotation_type, split, partition, split_type
)
Expand Down
96 changes: 93 additions & 3 deletions darwin/dataset/remote_dataset_v2.py
@@ -1,3 +1,4 @@
import json
from typing import (
TYPE_CHECKING,
Any,
Expand All @@ -21,13 +22,17 @@
UploadHandler,
UploadHandlerV2,
)
from darwin.dataset.utils import is_relative_to
from darwin.datatypes import AnnotationFile, ItemId, PathLike
from darwin.dataset.utils import (
get_external_file_type,
is_relative_to,
parse_external_file_path,
)
from darwin.datatypes import AnnotationFile, ItemId, ObjectStore, PathLike
from darwin.exceptions import NotFound, UnknownExportVersion
from darwin.exporter.formats.darwin import build_image_annotation
from darwin.item import DatasetItem
from darwin.item_sorter import ItemSorter
from darwin.utils import find_files, urljoin
from darwin.utils import SUPPORTED_EXTENSIONS, find_files, urljoin

if TYPE_CHECKING:
from darwin.client import Client
Expand Down Expand Up @@ -552,3 +557,88 @@ def _build_image_annotation(
self, annotation_file: AnnotationFile, team_name: str
) -> Dict[str, Any]:
return build_image_annotation(annotation_file, team_name)

def register(
    self,
    object_store: ObjectStore,
    storage_keys: List[str],
    fps: Optional[str] = None,
    multi_planar_view: bool = False,
    preserve_folders: bool = False,
) -> Dict[str, List[str]]:
    """
    Register files from external storage in this dataset.

    Parameters
    ----------
    object_store : ObjectStore
        Object store to use for the registration.
    storage_keys : List[str]
        List of storage keys to register.
    fps : Optional[str], default: None
        When the uploading file is a video, specify its framerate.
    multi_planar_view : bool, default: False
        Uses multiplanar view when uploading files.
    preserve_folders : bool, default: False
        Specify whether or not to preserve folder paths when uploading.

    Returns
    -------
    Dict[str, List[str]]
        A dictionary with the lists of registered and blocked files.

    Raises
    ------
    TypeError
        If any storage key ends with an unsupported file extension.
    """
    items = []
    for storage_key in storage_keys:
        file_type = get_external_file_type(storage_key)
        if not file_type:
            raise TypeError(
                f"Unsupported file type for the following storage key: {storage_key}.\nPlease make sure your storage key ends with one of the supported extensions:\n{SUPPORTED_EXTENSIONS}"
            )
        item = {
            "path": parse_external_file_path(storage_key, preserve_folders),
            "type": file_type,
            "storage_key": storage_key,
            # The item name is the last path segment of the storage key.
            "name": (
                storage_key.split("/")[-1] if "/" in storage_key else storage_key
            ),
        }
        if fps and file_type == "video":
            item["fps"] = fps
        if multi_planar_view and file_type == "dicom":
            item["extract_views"] = "true"
        items.append(item)

    # Do not register more than 500 items in a single request
    chunk_size = 500
    chunked_items = (
        items[i : i + chunk_size] for i in range(0, len(items), chunk_size)
    )
    print(f"Registering {len(items)} items in chunks of {chunk_size} items...")
    results = {
        "registered": [],
        "blocked": [],
    }

    for chunk in chunked_items:
        payload = {
            "items": chunk,
            "dataset_slug": self.slug,
            "storage_slug": object_store.name,
        }
        print(f"Registering {len(chunk)} items...")
        response = self.client.api_v2.register_items(payload, team_slug=self.team)
        # Parse the response body once per chunk instead of twice.
        parsed_response = json.loads(response.text)
        for item in parsed_response["items"]:
            item_info = f"Item {item['name']} registered with item ID {item['id']}"
            results["registered"].append(item_info)
        for item in parsed_response["blocked_items"]:
            item_info = f"Item {item['name']} was blocked for the reason: {item['slots'][0]['reason']}"
            results["blocked"].append(item_info)
    print(
        f"{len(results['registered'])} of {len(storage_keys)} items registered successfully"
    )
    if results["blocked"]:
        print("The following items were blocked:")
        for item in results["blocked"]:
            print(f" - {item}")
    # Fixed typo in user-facing message: "Reistration" -> "Registration"
    print(f"Registration complete. Check your items in the dataset: {self.slug}")
    return results
65 changes: 59 additions & 6 deletions darwin/dataset/utils.py
Expand Up @@ -15,6 +15,7 @@
from darwin.importer.formats.darwin import parse_path
from darwin.utils import (
SUPPORTED_EXTENSIONS,
SUPPORTED_IMAGE_EXTENSIONS,
SUPPORTED_VIDEO_EXTENSIONS,
attempt_decode,
get_image_path_from_stream,
Expand Down Expand Up @@ -366,9 +367,11 @@ def create_polygon_object(obj, box_mode, classes=None):
"segmentation": segmentation,
"bbox": [np.min(all_px), np.min(all_py), np.max(all_px), np.max(all_py)],
"bbox_mode": box_mode,
"category_id": classes.index(obj.annotation_class.name)
if classes
else obj.annotation_class.name,
"category_id": (
classes.index(obj.annotation_class.name)
if classes
else obj.annotation_class.name
),
"iscrowd": 0,
}

Expand All @@ -380,9 +383,11 @@ def create_bbox_object(obj, box_mode, classes=None):
new_obj = {
"bbox": [bbox["x"], bbox["y"], bbox["x"] + bbox["w"], bbox["y"] + bbox["h"]],
"bbox_mode": box_mode,
"category_id": classes.index(obj.annotation_class.name)
if classes
else obj.annotation_class.name,
"category_id": (
classes.index(obj.annotation_class.name)
if classes
else obj.annotation_class.name
),
"iscrowd": 0,
}

Expand Down Expand Up @@ -840,3 +845,51 @@ def sanitize_filename(filename: str) -> str:
filename = filename.replace(char, "_")

return filename


def get_external_file_type(storage_key: str) -> Optional[str]:
    """
    Return the Darwin file type for a given storage key.

    Parameters
    ----------
    storage_key : str
        The storage key whose file type should be determined.

    Returns
    -------
    Optional[str]
        One of ``"image"``, ``"pdf"``, ``"dicom"`` or ``"video"``, or
        ``None`` when the extension is not supported.
    """
    # str.endswith accepts a tuple of suffixes, so each family of
    # extensions is checked with a single call.
    if storage_key.endswith(tuple(SUPPORTED_IMAGE_EXTENSIONS)):
        return "image"
    if storage_key.endswith(".pdf"):
        return "pdf"
    if storage_key.endswith(".dcm"):
        return "dicom"
    if storage_key.endswith(tuple(SUPPORTED_VIDEO_EXTENSIONS)):
        return "video"
    return None


def parse_external_file_path(storage_key: str, preserve_folders: bool) -> str:
    """
    Return the Darwin dataset path for a given storage key.

    Parameters
    ----------
    storage_key : str
        The storage key to parse.
    preserve_folders : bool
        Whether to preserve folders or place the file in the Dataset root.

    Returns
    -------
    str
        The parsed external file path.
    """
    if preserve_folders:
        # Everything before the last "/" is the folder path; for a key
        # with no "/" this yields "" and the file lands at the root.
        folder_part, _, _ = storage_key.rpartition("/")
        return "/" + folder_part
    return "/"
saurbhc marked this conversation as resolved.
Show resolved Hide resolved
32 changes: 32 additions & 0 deletions darwin/datatypes.py
Expand Up @@ -1462,3 +1462,35 @@ class SegmentManifest:
segment: int
total_frames: int
items: List[ManifestItem]


class ObjectStore:
    """
    Object representing a configured connection to an external storage location.

    Attributes:
        name (str): The alias of the storage connection
        prefix (str): The directory that files are written back to in the storage location
        readonly (bool): Whether the storage configuration is read-only or not
        provider (str): The cloud provider (aws, azure, or gcp)
        default (bool): Whether this is the team's default storage connection
    """

    def __init__(
        self,
        name: str,
        prefix: str,
        readonly: bool,
        provider: str,
        default: bool,
    ) -> None:
        self.name = name
        self.prefix = prefix
        self.readonly = readonly
        self.provider = provider
        self.default = default

    def __str__(self) -> str:
        return f"Storage configuration:\n- Name: {self.name}\n- Prefix: {self.prefix}\n- Readonly: {self.readonly}\n- Provider: {self.provider}\n- Default: {self.default}"

    def __repr__(self) -> str:
        # Include `default` so the repr covers every attribute, consistent
        # with __str__ (it was previously omitted).
        return f"ObjectStore(name={self.name}, prefix={self.prefix}, readonly={self.readonly}, provider={self.provider}, default={self.default})"
4 changes: 2 additions & 2 deletions tests/darwin/cli_functions_test.py
Expand Up @@ -47,7 +47,7 @@ def test_default_non_verbose(
remote_dataset: RemoteDataset,
request_upload_endpoint: str,
):
request_upload_response = response = {
request_upload_response = {
JBWilkie marked this conversation as resolved.
Show resolved Hide resolved
"blocked_items": [
{
"id": "3b241101-e2bb-4255-8caf-4136c566a964",
Expand Down Expand Up @@ -150,7 +150,7 @@ def test_with_verbose_flag(
remote_dataset: RemoteDataset,
request_upload_endpoint: str,
):
request_upload_response = response = {
request_upload_response = {
"blocked_items": [
{
"id": "3b241101-e2bb-4255-8caf-4136c566a964",
Expand Down