From 5bf2cf4283e6ed8d23b146e2379302a7077a0097 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 29 Mar 2024 08:35:28 -0600 Subject: [PATCH] Optimize writes to existing Zarr stores. (#8875) * Optimize writes to existing Zarr stores. We need to read existing variables to make sure we append or write to a region with the right encoding. Currently we request all arrays in a Zarr group. Instead only request those arrays for which we require encoding information. * Add test * fix test --- xarray/backends/zarr.py | 7 ++++++- xarray/tests/test_backends.py | 22 +++++++++++++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 13b1819f206..b4369fa728d 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -623,7 +623,12 @@ def store( # avoid needing to load index variables into memory. # TODO: consider making loading indexes lazy again? existing_vars, _, _ = conventions.decode_cf_variables( - self.get_variables(), self.get_attrs() + { + k: v + for k, v in self.get_variables().items() + if k in existing_variable_names + }, + self.get_attrs(), ) # Modified variables must use the same encoding as the store. vars_with_encoding = {} diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 07573066568..248c873d50f 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2261,7 +2261,6 @@ def test_write_uneven_dask_chunks(self) -> None: original = create_test_data().chunk({"dim1": 3, "dim2": 4, "dim3": 3}) with self.roundtrip(original, open_kwargs={"chunks": {}}) as actual: for k, v in actual.data_vars.items(): - print(k) assert v.chunks == actual[k].chunks def test_chunk_encoding(self) -> None: @@ -2468,6 +2467,27 @@ def test_group(self) -> None: ) as actual: assert_identical(original, actual) + def test_zarr_mode_w_overwrites_encoding(self) -> None: + import zarr + + data = Dataset({"foo": ("x", [1.0, 1.0, 1.0])}) + with self.create_zarr_target() as store: + data.to_zarr( + store, **self.version_kwargs, encoding={"foo": {"add_offset": 1}} + ) + np.testing.assert_equal( + zarr.open_group(store, **self.version_kwargs)["foo"], data.foo.data - 1 + ) + data.to_zarr( + store, + **self.version_kwargs, + encoding={"foo": {"add_offset": 0}}, + mode="w", + ) + np.testing.assert_equal( + zarr.open_group(store, **self.version_kwargs)["foo"], data.foo.data + ) + def test_encoding_kwarg_fixed_width_string(self) -> None: # not relevant for zarr, since we don't use EncodedStringCoder pass