Skip to content

Commit

Permalink
Optimize writes to existing Zarr stores. (#8875)
Browse files Browse the repository at this point in the history
* Optimize writes to existing Zarr stores.

We need to read existing variables to make sure we append or write to a
region with the right encoding. Currently we request all arrays in a
Zarr group. Instead only request those arrays for which we require
encoding information.

* Add test

* fix test
  • Loading branch information
dcherian committed Mar 29, 2024
1 parent 2120808 commit 5bf2cf4
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 2 deletions.
7 changes: 6 additions & 1 deletion xarray/backends/zarr.py
Expand Up @@ -623,7 +623,12 @@ def store(
# avoid needing to load index variables into memory.
# TODO: consider making loading indexes lazy again?
existing_vars, _, _ = conventions.decode_cf_variables(
self.get_variables(), self.get_attrs()
{
k: v
for k, v in self.get_variables().items()
if k in existing_variable_names
},
self.get_attrs(),
)
# Modified variables must use the same encoding as the store.
vars_with_encoding = {}
Expand Down
22 changes: 21 additions & 1 deletion xarray/tests/test_backends.py
Expand Up @@ -2261,7 +2261,6 @@ def test_write_uneven_dask_chunks(self) -> None:
original = create_test_data().chunk({"dim1": 3, "dim2": 4, "dim3": 3})
with self.roundtrip(original, open_kwargs={"chunks": {}}) as actual:
for k, v in actual.data_vars.items():
print(k)
assert v.chunks == actual[k].chunks

def test_chunk_encoding(self) -> None:
Expand Down Expand Up @@ -2468,6 +2467,27 @@ def test_group(self) -> None:
) as actual:
assert_identical(original, actual)

def test_zarr_mode_w_overwrites_encoding(self) -> None:
    """Check that rewriting a store with ``mode="w"`` replaces its encoding.

    The same dataset is written twice to one zarr target: first with
    ``add_offset=1`` for ``foo``, then with ``add_offset=0`` and
    ``mode="w"``. After each write the array is read back directly through
    ``zarr.open_group`` (bypassing xarray decoding) and compared with the
    values expected for the encoding requested in that write.
    """
    import zarr

    ds = Dataset({"foo": ("x", [1.0, 1.0, 1.0])})
    with self.create_zarr_target() as target:
        # First write: with add_offset=1 the array on disk is expected to
        # hold the in-memory values minus one.
        ds.to_zarr(
            target, **self.version_kwargs, encoding={"foo": {"add_offset": 1}}
        )
        on_disk = zarr.open_group(target, **self.version_kwargs)["foo"]
        np.testing.assert_equal(on_disk, ds.foo.data - 1)

        # Second write: mode="w" with add_offset=0. The array on disk must
        # now match the in-memory values exactly, i.e. the offset used by
        # the first write must not be carried over.
        ds.to_zarr(
            target,
            **self.version_kwargs,
            encoding={"foo": {"add_offset": 0}},
            mode="w",
        )
        on_disk = zarr.open_group(target, **self.version_kwargs)["foo"]
        np.testing.assert_equal(on_disk, ds.foo.data)

def test_encoding_kwarg_fixed_width_string(self) -> None:
# Intentionally a no-op: this check is not relevant for zarr, since we
# don't use EncodedStringCoder.
# NOTE(review): presumably this overrides a same-named test inherited from
# a shared base class so that it is skipped for zarr -- confirm.
pass
Expand Down

0 comments on commit 5bf2cf4

Please sign in to comment.