From eb37aedf980b74929b6f98a0bbe9e757672cacc1 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 22 Mar 2024 12:32:14 -0600 Subject: [PATCH 1/3] Optimize writes to existing Zarr stores. We need to read existing variables to make sure we append or write to a region with the right encoding. Currently we request all arrays in a Zarr group. Instead only request those arrays for which we require encoding information. --- xarray/backends/zarr.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index e9465dc0ba0..baf38d2ad16 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -623,7 +623,12 @@ def store( # avoid needing to load index variables into memory. # TODO: consider making loading indexes lazy again? existing_vars, _, _ = conventions.decode_cf_variables( - self.get_variables(), self.get_attrs() + { + k: v + for k, v in self.get_variables().items() + if k in existing_variable_names + }, + self.get_attrs(), ) # Modified variables must use the same encoding as the store. vars_with_encoding = {} From e0a3e10c97763c0b8953d18e30c7cfb57c5929d8 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Thu, 28 Mar 2024 09:39:52 -0600 Subject: [PATCH 2/3] Add test --- xarray/tests/test_backends.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3fb137977e8..77e8dcc665b 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2261,7 +2261,6 @@ def test_write_uneven_dask_chunks(self) -> None: original = create_test_data().chunk({"dim1": 3, "dim2": 4, "dim3": 3}) with self.roundtrip(original, open_kwargs={"chunks": {}}) as actual: for k, v in actual.data_vars.items(): - print(k) assert v.chunks == actual[k].chunks def test_chunk_encoding(self) -> None: @@ -2468,6 +2467,24 @@ def test_group(self) -> None: ) as actual: assert_identical(original, actual) + @requires_zarr + def test_zarr_mode_w_overwrites_encoding(self) -> None: + import zarr + + data = Dataset({"foo": ("x", [1.0, 1.0, 1.0])}) + with self.create_zarr_target() as store: + data.to_zarr( + store, **self.version_kwargs, encoding={"foo": {"add_offset": 1}} + ) + np.testing.assert_equal(zarr.open_group(store)["foo"], data.foo.data - 1) + data.to_zarr( + store, + **self.version_kwargs, + encoding={"foo": {"add_offset": 0}}, + mode="w", + ) + np.testing.assert_equal(zarr.open_group(store)["foo"], data.foo.data) + def test_encoding_kwarg_fixed_width_string(self) -> None: # not relevant for zarr, since we don't use EncodedStringCoder pass From 6d890991e8b4a556ac0f33deb913ebf68f952505 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Thu, 28 Mar 2024 10:37:12 -0600 Subject: [PATCH 3/3] fix test --- xarray/tests/test_backends.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index e503a517947..248c873d50f 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2467,7 +2467,6 @@ def test_group(self) -> None: ) as actual: assert_identical(original, actual) - @requires_zarr def test_zarr_mode_w_overwrites_encoding(self) -> None: import zarr @@ -2476,14 +2475,18 @@ def test_zarr_mode_w_overwrites_encoding(self) -> None: data.to_zarr( store, **self.version_kwargs, encoding={"foo": {"add_offset": 1}} ) - np.testing.assert_equal(zarr.open_group(store)["foo"], data.foo.data - 1) + np.testing.assert_equal( + zarr.open_group(store, **self.version_kwargs)["foo"], data.foo.data - 1 + ) data.to_zarr( store, **self.version_kwargs, encoding={"foo": {"add_offset": 0}}, mode="w", ) - np.testing.assert_equal(zarr.open_group(store)["foo"], data.foo.data) + np.testing.assert_equal( + zarr.open_group(store, **self.version_kwargs)["foo"], data.foo.data + ) def test_encoding_kwarg_fixed_width_string(self) -> None: # not relevant for zarr, since we don't use EncodedStringCoder