Skip to content

Commit

Permalink
Fix duplicate artifact save
Browse files Browse the repository at this point in the history
  • Loading branch information
kartik4949 authored and fnikolai committed Apr 30, 2024
1 parent f104ca0 commit aea9008
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 2 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added CORS Policy regarding REST server implementation
- Fixed some bugs in multimodal usecase
- Fixed File datatype
- Fixed a bug in artifact store to skip duplicate artifacts


## [0.1.1](https://github.com/SuperDuperDB/superduperdb/compare/0.0.20...0.1.0) (2023-Feb-09)
Expand Down
10 changes: 9 additions & 1 deletion superduperdb/backends/base/artifacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import typing as t
from abc import ABC, abstractmethod

from superduperdb import logging


def _construct_file_id_from_uri(uri):
return str(hashlib.sha1(uri.encode()).hexdigest())
Expand Down Expand Up @@ -122,7 +124,13 @@ def save_artifact(self, r: t.Dict):
file_id = r.get('sha1') or hashlib.sha1(r['bytes']).hexdigest()
if r.get('directory'):
file_id = os.path.join(datatype.directory, file_id)
self._save_bytes(r['bytes'], file_id=file_id)
try:
self._save_bytes(r['bytes'], file_id=file_id)
except FileExistsError:
logging.warn(
f'Artifact with file_id {file_id} already exists, skipping...'
)

del r['bytes']
r['file_id'] = file_id
return r
Expand Down
5 changes: 4 additions & 1 deletion superduperdb/backends/local/artifacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,10 @@ def _save_bytes(
serialized: bytes,
file_id: str,
) -> t.Any:
with open(os.path.join(self.conn, file_id), 'wb') as f:
path = os.path.join(self.conn, file_id)
if os.path.exists(path):
raise FileExistsError
with open(path, 'wb') as f:
f.write(serialized)

def _load_bytes(self, file_id: str) -> bytes:
Expand Down
3 changes: 3 additions & 0 deletions superduperdb/backends/mongodb/artifacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ def _load_file(self, file_id: str) -> str:
return download(file_id, self.filesystem)

def _save_bytes(self, serialized: bytes, file_id: str):
cur = self.filesystem.find_one({'filename': file_id})
if cur is not None:
raise FileExistsError
return self.filesystem.put(
serialized, filename=file_id, metadata={"file_id": file_id}
)
Expand Down
31 changes: 31 additions & 0 deletions test/unittest/backends/local/test_artifacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,16 @@ class TestComponent(Component):
)


@dc.dataclass(kw_only=True)
class TestComponentBytes(Component):
    """Minimal ``Component`` whose only payload is a plain callable.

    Serializing the same callable twice yields the same artifact bytes, so
    adding this component twice exercises the duplicate-artifact skip path.
    """

    # The callable payload that gets serialized into the artifact store.
    function: t.Callable
    type_id: t.ClassVar[str] = "TestComponent"

    # NOTE(review): this maps "path" to ``file_lazy`` but the class declares
    # no ``path`` field — looks copied from ``TestComponent`` above; confirm
    # whether it should reference ``function`` instead.
    _artifacts: t.ClassVar[t.Sequence[t.Tuple[str, "DataType"]]] = (
        ("path", file_lazy),
    )


@pytest.fixture
def random_directory(tmpdir):
tmpdir_path = os.path.join(tmpdir, "test_data")
Expand Down Expand Up @@ -97,3 +107,24 @@ def test_save_and_load_file(db, artifact_store: FileSystemArtifactStore):
)
# assert that the file sizes are the same
assert filecmp.cmp(test_component.path, test_component_loaded.path)


@pytest.mark.parametrize("db", [DBConfig.mongodb_empty], indirect=True)
def test_duplicate_artifact(capfd, db, artifact_store: FileSystemArtifactStore):
    """Adding the same component twice should skip the second artifact save.

    The artifact ``file_id`` is derived from the serialized bytes, so an
    identical component maps to the same stored artifact; the store raises
    ``FileExistsError`` internally and ``save_artifact`` catches it, logging
    a skip message instead of writing a duplicate.
    """
    db.artifact_store = artifact_store

    def my_func(x):
        return x

    test_component = TestComponentBytes(function=my_func, identifier="test")
    db.add(test_component)
    test_component_loaded = db.load("TestComponent", "test")
    # init() forces the lazily-loaded artifacts to actually be materialized.
    test_component_loaded.init()

    # This should skip the artifact since it already exists in the store.
    test_component = TestComponentBytes(function=my_func, identifier="test")
    db.add(test_component)
    # NOTE(review): assumes the project logger writes to stdout — if it logs
    # to stderr, the message would be in the second value from readouterr().
    out, _ = capfd.readouterr()

    assert "Artifact with file_id" in out
    assert "already exists, skipping" in out

0 comments on commit aea9008

Please sign in to comment.