Skip to content

Commit

Permalink
Adding in some hacky code to maintain backwards compatability
Browse files Browse the repository at this point in the history
  • Loading branch information
BryanFauble committed May 16, 2024
1 parent 0818990 commit 891b7c3
Show file tree
Hide file tree
Showing 4 changed files with 146 additions and 36 deletions.
56 changes: 38 additions & 18 deletions synapseclient/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1400,8 +1400,9 @@ def delete_none_keys(incoming_object: typing.Dict) -> None:

def merge_dataclass_entities(
source: typing.Union["Project", "Folder", "File"],
destination: typing.Union["Project", "Folder"],
) -> typing.Union["Project", "Folder"]:
destination: typing.Union["Project", "Folder", "File"],
fields_to_ignore: typing.List[str] = None,
) -> typing.Union["Project", "Folder", "File"]:
"""
Utility function to merge two dataclass entities together. This is used when we are
upserting an entity from the Synapse service with the requested changes.
Expand All @@ -1420,6 +1421,8 @@ def merge_dataclass_entities(

# Update destination_dict with source_dict, keeping destination's values in case of conflicts
for key, value in source_dict.items():
if fields_to_ignore is not None and key in fields_to_ignore:
continue
if is_dataclass(getattr(source, key)):
if hasattr(destination, key):
setattr(destination, key, getattr(source, key))
Expand All @@ -1442,13 +1445,28 @@ def merge_dataclass_entities(
return destination


def merge_metadata_fields(
source: typing.Union["Project", "Folder", "File"],
destination: typing.Union["Project", "Folder", "File"],
) -> typing.Union["Project", "Folder"]:
def merge_non_modifiable_manifest_fields(
source: "File",
destination: "File",
) -> "File":
"""
Utility function to merge two dataclass entities together. This will only merge the
minimum amount of data required for an update with Synapse.
Utility function to handle special cases around merging file entities together
during a manifest upload. This is a special case due to how data persisting works
with a manifest.
To determine if a field needs to be added to this utility find where the File class
instance is being created or attributes are being placed onto that File.
1) Any columns that are specified should not be pulled forward from synapse.
They should contain to remain as None value.
2) Any columns that are not specified within the manifest file should be pulled
forward.
3) Annotations are an exception to #2, they are always replaced by the manifest.
Also to note, the manifest field names are in camelCase due to backwards
compatability. The fields in the dataclass objects on the other hand are in
snake_case.
Arguments:
source: The source entity to merge from.
Expand All @@ -1458,14 +1476,16 @@ def merge_metadata_fields(
The destination entity with the merged values.
"""
# pylint: disable=protected-access
destination._last_persistent_instance = (
destination._last_persistent_instance or source._last_persistent_instance
)
destination.id = destination.id or source.id
destination.etag = destination.etag or source.etag
destination.version_number = destination.version_number or source.version_number
destination.version_label = destination.version_label or source.version_label
destination.version_comment = destination.version_comment or source.version_comment
destination.modified_on = destination.modified_on or source.modified_on
fields_to_not_merge = ["annotations"]
if "used" or "executed" in destination._present_manifest_fields:
fields_to_not_merge.append("activity")

return destination
if "name" in destination._present_manifest_fields:
fields_to_not_merge.append("name")

if "contentType" in destination._present_manifest_fields:
fields_to_not_merge.append("content_type")

return merge_dataclass_entities(
source=source, destination=destination, fields_to_ignore=fields_to_not_merge
)
40 changes: 25 additions & 15 deletions synapseclient/models/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
delete_none_keys,
guess_file_name,
merge_dataclass_entities,
merge_metadata_fields,
merge_non_modifiable_manifest_fields,
run_and_attach_otel_context,
)
from synapseclient.entity import File as Synapse_File
Expand Down Expand Up @@ -250,13 +250,6 @@ class File(FileSynchronousProtocol, AccessControllable):
being restricted and the requirements of access.
This may be used only by an administrator of the specified file.
merge_with_found_resource: (Store only)
Works in conjunction with `create_or_update` in that this is only evaluated
if `create_or_update` is True. If this is True the metadata will be merged
with the existing metadata in Synapse. If False the existing metadata will
be replaced with the new metadata. When this is False any updates will act
as a destructive update.
synapse_store: (Store only)
Whether the File should be uploaded or if false: only the path should
be stored when [synapseclient.models.File.store][] is called.
Expand Down Expand Up @@ -425,16 +418,31 @@ class File(FileSynchronousProtocol, AccessControllable):
This may be used only by an administrator of the specified file.
"""

merge_with_found_resource: bool = field(default=True, repr=False, compare=False)
_merge_non_modifiable_manifest_fields: bool = field(
default=False, repr=False, compare=False
)
"""
(Store only)
Works in conjunction with `create_or_update` in that this is only evaluated if
`create_or_update` is True. If this is True the metadata will be merged with the
existing metadata in Synapse. If False the existing metadata will be replaced with
the new metadata. When this is False any updates will act as a destructive update.
`create_or_update` is True. If this is True any fields that are not modifiable
through the manifest will be merged with the existing fields from Synapse. If False
all fields will be merged from Synapse.
This is not meant to be used by end-users and is used internally during a manifest
upload. Unfortunate this is mostly a hack to maintain backwards compatability with
how manifest uploads work. The behavior this is emulating is:
1) If a column is passed into the manifest it will replace the field on the File
2) If a column is not present in the manifest it will retrieve the File metadata
from Synapse and perform an 'upsert' for that field.
Due to the behavior of how data classes work, we cannot delete fields like a dict,
this means we need to set a flag to determine if we should merge the fields or not.
"""

_present_manifest_fields: List[str] = field(default=None, repr=False, compare=False)
"""Hidden attribute to pass along what columns were present in a manifest upload."""

synapse_store: bool = field(default=True, repr=False)
"""
(Store only)
Expand Down Expand Up @@ -741,10 +749,12 @@ async def store_async(
client = Synapse.get_client(synapse_client=synapse_client)

if existing_file := await self._find_existing_file(synapse_client=client):
if self.merge_with_found_resource:
merge_dataclass_entities(source=existing_file, destination=self)
if self._merge_non_modifiable_manifest_fields:
merge_non_modifiable_manifest_fields(
source=existing_file, destination=self
)
else:
merge_metadata_fields(source=existing_file, destination=self)
merge_dataclass_entities(source=existing_file, destination=self)

if self.path:
self.path = os.path.expanduser(self.path)
Expand Down
5 changes: 4 additions & 1 deletion synapseutils/sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@

from .monitor import notify_me_async

# When new fields are added to the manifest they will also need to be added to
# utils.py#merge_non_modifiable_manifest_fields
REQUIRED_FIELDS = ["path", "parent"]
FILE_CONSTRUCTOR_FIELDS = ["name", "id", "synapseStore", "contentType"]
STORE_FUNCTION_FIELDS = ["activityName", "activityDescription", "forceVersion"]
Expand Down Expand Up @@ -1345,7 +1347,8 @@ async def _manifest_upload(syn: Synapse, df) -> bool:
synapse_store=row["synapseStore"] if "synapseStore" in row else True,
content_type=row["contentType"] if "contentType" in row else None,
force_version=row["forceVersion"] if "forceVersion" in row else True,
merge_with_found_resource=False,
_merge_non_modifiable_manifest_fields=True,
_present_manifest_fields=row.keys().tolist(),
)

manifest_style_annotations = dict(
Expand Down
81 changes: 79 additions & 2 deletions tests/integration/synapseutils/test_synapseutils_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,7 @@ async def test_sync_to_synapse_activities_added_then_removed(
# AND none of the files have an activity
assert folder.files[0].activity is None

async def test_sync_to_synapse_activities_added_then_removed_with_version_updates(
async def test_sync_to_synapse_field_not_available_in_manifest_persisted(
self, syn: Synapse, schedule_for_cleanup, project_model: Project
) -> None:
# GIVEN a folder to sync to
Expand Down Expand Up @@ -472,18 +472,95 @@ async def test_sync_to_synapse_activities_added_then_removed_with_version_update
assert len(folder.files) == 1

# AND the file is the one we uploaded
assert folder.files[0].version_number == 1
assert folder.files[0].path == temp_file
assert folder.files[0].activity.name == BOGUS_ACTIVITY
assert folder.files[0].activity.description == BOGUS_DESCRIPTION
assert len(folder.files[0].activity.used) == 1
assert folder.files[0].activity.used[0].url == SYNAPSE_URL

# WHEN I update a metadata field on the File not available in the manifest
folder.files[0].description = "new file description"
await folder.files[0].store_async()
assert folder.files[0].version_number == 2

# WHEN I update the manifest file to remove the activities
df = pd.DataFrame(
{
"path": [temp_file],
"parent": folder.id,
"some_new_annotation": "asdf",
"used": "",
"executed": "",
"activityName": "",
"activityDescription": "",
}
)
# Write the df to the file:
file_name = write_df_to_tsv(df, schedule_for_cleanup)

# AND I sync the manifest to Synapse
synapseutils.syncToSynapse(syn, file_name, sendMessages=SEND_MESSAGE, retries=2)

# THEN I expect that the folder has all of the files
await folder.sync_from_synapse_async(download_file=False)
assert len(folder.files) == 1

# AND the file is the one we uploaded
assert folder.files[0].version_number == 2
assert folder.files[0].path == temp_file
# AND none of the files have an activity
assert folder.files[0].activity is None
# AND The metadata field updated is still present
assert folder.files[0].description == "new file description"

async def test_sync_to_synapse_activities_added_then_removed_with_version_updates(
self, syn: Synapse, schedule_for_cleanup, project_model: Project
) -> None:
# GIVEN a folder to sync to
folder = await Folder(
name=str(uuid.uuid4()), parent_id=project_model.id
).store_async()
schedule_for_cleanup(folder.id)

# AND temporary file on disk:
temp_file = utils.make_bogus_uuid_file()
schedule_for_cleanup(temp_file)

# AND A manifest file with the paths to the temp files exists
df = pd.DataFrame(
{
"path": [temp_file],
"parent": folder.id,
"used": SYNAPSE_URL,
"executed": "",
"activityName": BOGUS_ACTIVITY,
"activityDescription": BOGUS_DESCRIPTION,
}
)
# Write the df to the file:
file_name = write_df_to_tsv(df, schedule_for_cleanup)

# WHEN I sync the manifest to Synapse
synapseutils.syncToSynapse(syn, file_name, sendMessages=SEND_MESSAGE, retries=2)

# THEN I expect that the folder has all of the files
await folder.sync_from_synapse_async(download_file=False)
assert len(folder.files) == 1

# AND the file is the one we uploaded
assert folder.files[0].path == temp_file
assert folder.files[0].activity.name == BOGUS_ACTIVITY
assert folder.files[0].activity.description == BOGUS_DESCRIPTION
assert len(folder.files[0].activity.used) == 1
assert folder.files[0].activity.used[0].url == SYNAPSE_URL

# WHEN I update the manifest file to remove the activities and update a metadata
# field
df = pd.DataFrame(
{
"path": [temp_file],
"parent": folder.id,
"contentType": "text/html",
"used": "",
"executed": "",
"activityName": "",
Expand Down

0 comments on commit 891b7c3

Please sign in to comment.