New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[PLA-699][external] Allow registration of multi-slotted read-write files from external storage #790
Changes from 6 commits
ff67b48
7ce1853
0e592f0
a702495
98e72a9
a5593a6
bc90fcd
aba2aa7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,6 +23,7 @@ | |
UploadHandlerV2, | ||
) | ||
from darwin.dataset.utils import ( | ||
get_external_file_name, | ||
get_external_file_type, | ||
is_relative_to, | ||
parse_external_file_path, | ||
|
@@ -562,12 +563,12 @@ def register( | |
self, | ||
object_store: ObjectStore, | ||
storage_keys: List[str], | ||
fps: Optional[str] = None, | ||
fps: Optional[Union[str, float]] = None, | ||
multi_planar_view: bool = False, | ||
preserve_folders: bool = False, | ||
) -> Dict[str, List[str]]: | ||
""" | ||
Register files in the dataset. | ||
Register files in the dataset in a single slot. | ||
|
||
Parameters | ||
---------- | ||
|
@@ -586,7 +587,18 @@ def register( | |
------- | ||
Dict[str, List[str]] | ||
A dictionary with the list of registered files. | ||
|
||
Raises | ||
------ | ||
ValueError | ||
If ``storage_keys`` is not a list of strings. | ||
TypeError | ||
If the file type is not supported. | ||
""" | ||
if not isinstance(storage_keys, list) or not all( | ||
isinstance(item, str) for item in storage_keys | ||
): | ||
raise ValueError("storage_keys must be a list of strings") | ||
items = [] | ||
for storage_key in storage_keys: | ||
file_type = get_external_file_type(storage_key) | ||
|
@@ -642,3 +654,113 @@ def register( | |
print(f" - {item}") | ||
print(f"Registration complete. Check your items in the dataset: {self.slug}") | ||
return results | ||
|
||
def register_multi_slotted(
    self,
    object_store: ObjectStore,
    storage_keys: Dict[str, List[str]],
    fps: Optional[Union[str, float]] = None,
    multi_planar_view: bool = False,
    preserve_folders: bool = False,
) -> Dict[str, List[str]]:
    """
    Register files in the dataset in multiple slots.

    Parameters
    ----------
    object_store : ObjectStore
        Object store to use for the registration.
    storage_keys : Dict[str, List[str]]
        Storage keys to register. The keys are the item names and the
        values are lists of storage keys (one slot per key).
    fps : Optional[Union[str, float]], default: None
        When the uploading file is a video, specify its framerate.
    multi_planar_view : bool, default: False
        Uses multiplanar view when uploading files.
    preserve_folders : bool, default: False
        Specify whether or not to preserve folder paths when uploading.

    Returns
    -------
    Dict[str, List[str]]
        A dictionary with the list of registered files.

    Raises
    ------
    ValueError
        If ``storage_keys`` is not a dictionary with keys as item names and
        values as lists of storage keys.
    TypeError
        If the file type is not supported.
    """
    # Validate the input shape up front so we fail before any API calls.
    if not isinstance(storage_keys, dict) or not all(
        isinstance(v, list) and all(isinstance(i, str) for i in v)
        for v in storage_keys.values()
    ):
        raise ValueError(
            "storage_keys must be a dictionary with keys as item names and values as lists of storage keys."
        )

    items = []
    for item in storage_keys:
        slots = []
        for storage_key in storage_keys[item]:
            file_name = get_external_file_name(storage_key)
            file_type = get_external_file_type(storage_key)
            if not file_type:
                raise TypeError(
                    f"Unsupported file type for the following storage key: {storage_key}.\nPlease make sure your storage key ends with one of the supported extensions:\n{SUPPORTED_EXTENSIONS}"
                )
            slot = {
                "slot_name": file_name,
                "type": file_type,
                "storage_key": storage_key,
                "file_name": file_name,
            }
            if fps and file_type == "video":
                slot["fps"] = fps
            if multi_planar_view and file_type == "dicom":
                slot["extract_views"] = "true"
            slots.append(slot)
        items.append(
            {
                "slots": slots,
                "name": item,
                # The item path is derived from the first storage key of the item.
                "path": parse_external_file_path(
                    storage_keys[item][0], preserve_folders
                ),
            }
        )

    # Do not register more than 500 items in a single request
    chunk_size = 500
    chunked_items = (
        items[i : i + chunk_size] for i in range(0, len(items), chunk_size)
    )
    print(f"Registering {len(items)} items in chunks of {chunk_size} items...")
    results = {
        "registered": [],
        "blocked": [],
    }

    for chunk in chunked_items:
        payload = {
            "items": chunk,
            "dataset_slug": self.slug,
            "storage_slug": object_store.name,
        }
        print(f"Registering {len(chunk)} items...")
        response = self.client.api_v2.register_items(payload, team_slug=self.team)
        # Parse the response body once and reuse it for both result lists.
        response_data = json.loads(response.text)
        for item in response_data["items"]:
            item_info = f"Item {item['name']} registered with item ID {item['id']}"
            results["registered"].append(item_info)
        for item in response_data["blocked_items"]:
            item_info = f"Item {item['name']} was blocked for the reason: {item['slots'][0]['reason']}"
            results["blocked"].append(item_info)
    print(
        f"{len(results['registered'])} of {len(storage_keys)} items registered successfully"
    )
    if results["blocked"]:
        print("The following items were blocked:")
        for item in results["blocked"]:
            print(f" - {item}")
    print(f"Registration complete. Check your items in the dataset: {self.slug}")
    return results
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These type changes aren't functionally significant, but since we no longer have
RemoteDatasetV1
they make some typechecker errors less likely. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are still some places (docstrings/types) where we use
RemoteDataset
- should we change all of them 🤔 There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Probably, although I don't think it's a high priority change. I made the change here because I noticed Pylance type checking complaining, but didn't comb the rest of the repo
If you agree, I'd like to do that at some point down the line