From 386844ac8e7d441961837f6f1b058993921af33b Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 22 Mar 2024 11:08:44 -0400 Subject: [PATCH 01/15] basic structure for writing a zarr store containing manifests --- virtualizarr/manifests/manifest.py | 6 ++- virtualizarr/xarray.py | 5 +- virtualizarr/zarr.py | 86 ++++++++++++++++++++++++++++-- 3 files changed, 89 insertions(+), 8 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 0e54394..e78ad62 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -3,6 +3,7 @@ from typing import Any, Iterable, Iterator, List, Mapping, Tuple, Union, cast import numpy as np +import ujson # type: ignore from pydantic import BaseModel, ConfigDict, field_validator from ..types import ChunkKey @@ -115,7 +116,10 @@ def from_zarr_json(filepath: str) -> "ChunkManifest": def to_zarr_json(self, filepath: str) -> None: """Write a ChunkManifest to a Zarr manifest.json file.""" - raise NotImplementedError() + manifest_dict = self.dict() + + with open(filepath, "w") as json_file: + ujson.dump(manifest_dict, json_file) @classmethod def from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest": diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index b452078..5adf05f 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -8,6 +8,7 @@ import virtualizarr.kerchunk as kerchunk from virtualizarr.kerchunk import KerchunkStoreRefs from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.zarr import dataset_to_zarr class ManifestBackendArray(ManifestArray, BackendArray): @@ -157,9 +158,7 @@ def to_zarr(self, storepath: str) -> None: ---------- filepath : str, default: None """ - raise NotImplementedError( - "No point in writing out these virtual arrays to Zarr until at least one Zarr reader can actually read them." - ) + dataset_to_zarr(self.ds, storepath) @overload def to_kerchunk(self, filepath: None, format: Literal["dict"]) -> KerchunkStoreRefs: diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 1ebb1c5..76988aa 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -1,7 +1,9 @@ +from pathlib import Path from typing import Any, Literal, NewType, Optional, Tuple, Union import numpy as np import ujson # type: ignore +import xarray as xr from pydantic import BaseModel, ConfigDict, field_validator # TODO replace these with classes imported directly from Zarr? (i.e. Zarr Object Models) @@ -82,11 +84,18 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": zarr_format=int(decoded_arr_refs_zarray["zarr_format"]), ) - def to_kerchunk_json(self) -> str: + def dict(self) -> dict[str, Any]: zarray_dict = dict(self) - # TODO not sure if there is a better way to get the ' str: + return ujson.dumps(self.dict()) + + +def encode_dtype(dtype: np.dtype) -> str: + # TODO not sure if there is a better way to get the ' int: @@ -96,3 +105,72 @@ def ceildiv(a: int, b: int) -> int: See https://stackoverflow.com/questions/14822184/is-there-a-ceiling-equivalent-of-operator-in-python """ return -(a // -b) + + +def dataset_to_zarr(ds: xr.Dataset, storepath: str) -> None: + """ + Write an xarray dataset whose variables wrap ManifestArrays to a Zarr store, writing chunk references into manifest.json files. + + Not very useful until some implementation of a Zarr reader can actually read these manifest.json files. + See https://github.com/zarr-developers/zarr-specs/issues/287 + """ + + from virtualizarr.manifests import ManifestArray + + _storepath = Path(storepath) + + # TODO check nothing exists at that path + # TODO do this using pathlib instead + import os + + os.mkdir(_storepath) + + # TODO should techically loop over groups in a tree but a dataset corresponds to only one group + # TODO does this mean we need a group kwarg? + + consolidated_metadata = {"zarr_consolidated_format": 1, "metadata": {}} + + for name, var in ds.variables.items(): + array_dir = _storepath / name + marr = var.data + + # TODO move this check outside the writing loop so we don't write an incomplete store on failure? + if not isinstance(marr, ManifestArray): + raise TypeError( + "Only xarray objects wrapping ManifestArrays can be written to zarr using this method, " + f"but variable {name} wraps an array of type {type(marr)}" + ) + + # TODO do this using pathlib instead + os.mkdir(array_dir) + + # write the chunk references into a manifest.json file + marr.manifest.to_zarr_json(array_dir / "manifest.json") + + # write each .zarray + with open(array_dir / ".zarray", "w") as json_file: + ujson.dump(marr.zarray.dict(), json_file) + + # write each .zattrs + zattrs = var.attrs.copy() + zattrs["_ARRAY_DIMENSIONS"] = list(var.dims) + with open(array_dir / ".zattrs", "w") as json_file: + ujson.dump(zattrs, json_file) + + # record this info to include in the overall .zmetadata + consolidated_metadata["metadata"][name + ".zarray"] = marr.zarray.dict() + consolidated_metadata["metadata"][name + ".zattrs"] = zattrs + + # write top-level .zattrs + with open(_storepath / ".zattrs", "w") as json_file: + ujson.dump(ds.attrs, json_file) + + # write .zgroup + with open(_storepath / ".zgroup", "w") as json_file: + ujson.dump({"zarr_format": 2}, json_file) + + # write store-level .zmetadata + consolidated_metadata[".zgroup"] = {"zarr_format": 2} + consolidated_metadata[".zattrs"] = ds.attrs + with open(_storepath / ".zmetadata", "w") as json_file: + ujson.dump(consolidated_metadata, json_file) From 02e457e1f2b09a3b3618e487953a03b7780917fa Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 22 Mar 2024 11:43:02 -0400 Subject: [PATCH 02/15] write in nicely indented form --- virtualizarr/manifests/manifest.py | 6 ++---- virtualizarr/zarr.py | 34 ++++++++++++++++-------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index e78ad62..e359695 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -1,9 +1,9 @@ import itertools +import json import re from typing import Any, Iterable, Iterator, List, Mapping, Tuple, Union, cast import numpy as np -import ujson # type: ignore from pydantic import BaseModel, ConfigDict, field_validator from ..types import ChunkKey @@ -116,10 +116,8 @@ def from_zarr_json(filepath: str) -> "ChunkManifest": def to_zarr_json(self, filepath: str) -> None: """Write a ChunkManifest to a Zarr manifest.json file.""" - manifest_dict = self.dict() - with open(filepath, "w") as json_file: - ujson.dump(manifest_dict, json_file) + json.dump(self.dict(), json_file, indent=4, separators=(", ", ": ")) @classmethod def from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest": diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 76988aa..e9b96c7 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -1,3 +1,4 @@ +import json from pathlib import Path from typing import Any, Literal, NewType, Optional, Tuple, Union @@ -128,7 +129,17 @@ def dataset_to_zarr(ds: xr.Dataset, storepath: str) -> None: # TODO should techically loop over groups in a tree but a dataset corresponds to only one group # TODO does this mean we need a group kwarg? - consolidated_metadata = {"zarr_consolidated_format": 1, "metadata": {}} + consolidated_metadata: dict = {"metadata": {}} + + # write top-level .zattrs + with open(_storepath / ".zattrs", "w") as json_file: + json.dump(ds.attrs, json_file, indent=4, separators=(", ", ": ")) + consolidated_metadata[".zattrs"] = ds.attrs + + # write .zgroup + with open(_storepath / ".zgroup", "w") as json_file: + json.dump({"zarr_format": 2}, json_file, indent=4, separators=(", ", ": ")) + consolidated_metadata[".zgroup"] = {"zarr_format": 2} for name, var in ds.variables.items(): array_dir = _storepath / name @@ -149,28 +160,19 @@ def dataset_to_zarr(ds: xr.Dataset, storepath: str) -> None: # write each .zarray with open(array_dir / ".zarray", "w") as json_file: - ujson.dump(marr.zarray.dict(), json_file) + json.dump(marr.zarray.dict(), json_file, indent=4, separators=(", ", ": ")) # write each .zattrs zattrs = var.attrs.copy() zattrs["_ARRAY_DIMENSIONS"] = list(var.dims) with open(array_dir / ".zattrs", "w") as json_file: - ujson.dump(zattrs, json_file) + json.dump(zattrs, json_file, indent=4, separators=(", ", ": ")) # record this info to include in the overall .zmetadata - consolidated_metadata["metadata"][name + ".zarray"] = marr.zarray.dict() - consolidated_metadata["metadata"][name + ".zattrs"] = zattrs - - # write top-level .zattrs - with open(_storepath / ".zattrs", "w") as json_file: - ujson.dump(ds.attrs, json_file) - - # write .zgroup - with open(_storepath / ".zgroup", "w") as json_file: - ujson.dump({"zarr_format": 2}, json_file) + consolidated_metadata["metadata"][name + "/.zarray"] = marr.zarray.dict() + consolidated_metadata["metadata"][name + "/.zattrs"] = zattrs # write store-level .zmetadata - consolidated_metadata[".zgroup"] = {"zarr_format": 2} - consolidated_metadata[".zattrs"] = ds.attrs + consolidated_metadata["zarr_consolidated_format"] = 1 with open(_storepath / ".zmetadata", "w") as json_file: - ujson.dump(consolidated_metadata, json_file) + json.dump(consolidated_metadata, json_file, indent=4, separators=(", ", ": ")) From 43872ab06c15a60c14aeb3162724325b4b160510 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 22 Mar 2024 11:45:57 -0400 Subject: [PATCH 03/15] use pathlib for everything --- virtualizarr/zarr.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index e9b96c7..cdae956 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -119,12 +119,7 @@ def dataset_to_zarr(ds: xr.Dataset, storepath: str) -> None: from virtualizarr.manifests import ManifestArray _storepath = Path(storepath) - - # TODO check nothing exists at that path - # TODO do this using pathlib instead - import os - - os.mkdir(_storepath) + Path.mkdir(_storepath, exist_ok=False) # TODO should techically loop over groups in a tree but a dataset corresponds to only one group # TODO does this mean we need a group kwarg? @@ -152,8 +147,7 @@ def dataset_to_zarr(ds: xr.Dataset, storepath: str) -> None: f"but variable {name} wraps an array of type {type(marr)}" ) - # TODO do this using pathlib instead - os.mkdir(array_dir) + Path.mkdir(array_dir, exist_ok=False) # write the chunk references into a manifest.json file marr.manifest.to_zarr_json(array_dir / "manifest.json") From c319d305119f3b2bb40d3fb89c47d39a0797e995 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Wed, 27 Mar 2024 14:29:47 -0400 Subject: [PATCH 04/15] documentation --- docs/usage.md | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 4fcd779..995e0ed 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -295,4 +295,23 @@ combined_ds = xr.open_dataset(mapper, engine="kerchunk") ### Writing as Zarr -TODO: Explanation of how this requires changes in zarr upstream to be able to read it +Alternatively, we can write these references out as an actual Zarr store, at least one that is compliant with the [proposed "Chunk Manifest" ZEP](https://github.com/zarr-developers/zarr-specs/issues/287). To do this we simply use the `ds.virtualize.to_zarr ` accessor method. + +```python +combined_vds.virtualize.to_zarr('combined.zarr') +``` + +The result is a zarr v3 store on disk which contains the chunk manifest information written out as `manifest.json` files, so the store looks like this: + +``` +combined/zarr.json <- group metadata +combined/air/zarr.json <- array metadata +combined/air/manifest.json <- array manifest +... +``` + +The advantage of this format is that any zarr v3 reader that understands the chunk manifest ZEP could read from this store, no matter what language it is written in (e.g. via `zarr-python`, `zarr-js`, or rust). This reading would also not require `fsspec`. + +```{note} +Currently there are not yet any zarr v3 readers which understand the chunk manifest ZEP, so until then this is only of theoretical interest. +``` \ No newline at end of file From 8f0ee51eeb8e5a7016ec8c479a8c63bbdd48ef47 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Thu, 28 Mar 2024 22:07:20 -0400 Subject: [PATCH 05/15] docstrings --- docs/usage.md | 6 +++--- virtualizarr/xarray.py | 7 ++++++- virtualizarr/zarr.py | 9 ++++++++- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 995e0ed..2d86c4b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -306,12 +306,12 @@ The result is a zarr v3 store on disk which contains the chunk manifest informat ``` combined/zarr.json <- group metadata combined/air/zarr.json <- array metadata -combined/air/manifest.json <- array manifest +combined/air/manifest.json <- array manifest ... ``` The advantage of this format is that any zarr v3 reader that understands the chunk manifest ZEP could read from this store, no matter what language it is written in (e.g. via `zarr-python`, `zarr-js`, or rust). This reading would also not require `fsspec`. ```{note} -Currently there are not yet any zarr v3 readers which understand the chunk manifest ZEP, so until then this is only of theoretical interest. -``` \ No newline at end of file +Currently there are not yet any zarr v3 readers which understand the chunk manifest ZEP, so until then this is only of theoretical interest. +``` diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index a7f3283..f10f23d 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -159,9 +159,14 @@ def to_zarr(self, storepath: str) -> None: """ Write out all virtualized arrays as a new Zarr store on disk. + Currently requires all variables to be backed by ManifestArray objects. + + Not very useful until some implementation of a Zarr reader can actually read these manifest.json files. + See https://github.com/zarr-developers/zarr-specs/issues/287 + Parameters ---------- - filepath : str, default: None + storepath: str """ dataset_to_zarr(self.ds, storepath) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index cdae956..ec1d590 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -110,10 +110,17 @@ def ceildiv(a: int, b: int) -> int: def dataset_to_zarr(ds: xr.Dataset, storepath: str) -> None: """ - Write an xarray dataset whose variables wrap ManifestArrays to a Zarr store, writing chunk references into manifest.json files. + Write an xarray dataset whose variables wrap ManifestArrays to a v3 Zarr store, writing chunk references into manifest.json files. + + Currently requires all variables to be backed by ManifestArray objects. Not very useful until some implementation of a Zarr reader can actually read these manifest.json files. See https://github.com/zarr-developers/zarr-specs/issues/287 + + Parameters + ---------- + ds: xr.Dataset + storepath: str """ from virtualizarr.manifests import ManifestArray From c8add611df051af80bd69f69f65a48cb6b7620b5 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 30 Mar 2024 12:16:27 -0400 Subject: [PATCH 06/15] vendor zarr.utils.json_dumps --- virtualizarr/vendor/__init__.py | 0 virtualizarr/vendor/zarr/LICENSE.txt | 21 +++++++++++++++++++++ virtualizarr/vendor/zarr/__init__.py | 0 virtualizarr/vendor/zarr/utils.py | 22 ++++++++++++++++++++++ virtualizarr/zarr.py | 23 ++++++++++++----------- 5 files changed, 55 insertions(+), 11 deletions(-) create mode 100644 virtualizarr/vendor/__init__.py create mode 100644 virtualizarr/vendor/zarr/LICENSE.txt create mode 100644 virtualizarr/vendor/zarr/__init__.py create mode 100644 virtualizarr/vendor/zarr/utils.py diff --git a/virtualizarr/vendor/__init__.py b/virtualizarr/vendor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/virtualizarr/vendor/zarr/LICENSE.txt b/virtualizarr/vendor/zarr/LICENSE.txt new file mode 100644 index 0000000..0b21ebc --- /dev/null +++ b/virtualizarr/vendor/zarr/LICENSE.txt @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015-2024 Zarr Developers + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/virtualizarr/vendor/zarr/__init__.py b/virtualizarr/vendor/zarr/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/virtualizarr/vendor/zarr/utils.py b/virtualizarr/vendor/zarr/utils.py new file mode 100644 index 0000000..918d6b5 --- /dev/null +++ b/virtualizarr/vendor/zarr/utils.py @@ -0,0 +1,22 @@ +import json +import numbers + +from typing import Any + + +class NumberEncoder(json.JSONEncoder): + def default(self, o): + # See json.JSONEncoder.default docstring for explanation + # This is necessary to encode numpy dtype + if isinstance(o, numbers.Integral): + return int(o) + if isinstance(o, numbers.Real): + return float(o) + return json.JSONEncoder.default(self, o) + + +def json_dumps(o: Any) -> bytes: + """Write JSON in a consistent, human-readable way.""" + return json.dumps( + o, indent=4, sort_keys=True, ensure_ascii=True, separators=(",", ": "), cls=NumberEncoder + ).encode("ascii") diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 3659fc9..8a7294c 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -1,5 +1,4 @@ -import json from pathlib import Path from typing import Any, Literal, NewType, Optional, Tuple, Union, List, Dict @@ -125,6 +124,7 @@ def dataset_to_zarr(ds: xr.Dataset, storepath: str) -> None: """ from virtualizarr.manifests import ManifestArray + from virtualizarr.vendor.zarr.utils import json_dumps _storepath = Path(storepath) Path.mkdir(_storepath, exist_ok=False) @@ -135,13 +135,13 @@ def dataset_to_zarr(ds: xr.Dataset, storepath: str) -> None: consolidated_metadata: dict = {"metadata": {}} # write top-level .zattrs - with open(_storepath / ".zattrs", "w") as json_file: - json.dump(ds.attrs, json_file, indent=4, separators=(", ", ": ")) + with open(_storepath / ".zattrs", "wb") as zattrs_file: + zattrs_file.write(json_dumps(ds.attrs)) consolidated_metadata[".zattrs"] = ds.attrs # write .zgroup - with open(_storepath / ".zgroup", "w") as json_file: - json.dump({"zarr_format": 2}, json_file, indent=4, separators=(", ", ": ")) + with open(_storepath / ".zgroup", "wb") as zgroup_file: + zgroup_file.write(json_dumps({"zarr_format": 2})) consolidated_metadata[".zgroup"] = {"zarr_format": 2} for name, var in ds.variables.items(): @@ -149,6 +149,7 @@ def dataset_to_zarr(ds: xr.Dataset, storepath: str) -> None: marr = var.data # TODO move this check outside the writing loop so we don't write an incomplete store on failure? + # TODO at some point this should be generalized to also write in-memory arrays as normal zarr chunks, see GH isse #62. if not isinstance(marr, ManifestArray): raise TypeError( "Only xarray objects wrapping ManifestArrays can be written to zarr using this method, " @@ -161,14 +162,14 @@ def dataset_to_zarr(ds: xr.Dataset, storepath: str) -> None: marr.manifest.to_zarr_json(array_dir / "manifest.json") # write each .zarray - with open(array_dir / ".zarray", "w") as json_file: - json.dump(marr.zarray.dict(), json_file, indent=4, separators=(", ", ": ")) + with open(_storepath / ".zgroup", "wb") as zarray_file: + zarray_file.write(json_dumps(marr.zarray.dict())) # write each .zattrs zattrs = var.attrs.copy() zattrs["_ARRAY_DIMENSIONS"] = list(var.dims) - with open(array_dir / ".zattrs", "w") as json_file: - json.dump(zattrs, json_file, indent=4, separators=(", ", ": ")) + with open(_storepath / ".zattrs", "wb") as zattrs_file: + zattrs_file.write(json_dumps(zattrs)) # record this info to include in the overall .zmetadata consolidated_metadata["metadata"][name + "/.zarray"] = marr.zarray.dict() @@ -176,5 +177,5 @@ def dataset_to_zarr(ds: xr.Dataset, storepath: str) -> None: # write store-level .zmetadata consolidated_metadata["zarr_consolidated_format"] = 1 - with open(_storepath / ".zmetadata", "w") as json_file: - json.dump(consolidated_metadata, json_file, indent=4, separators=(", ", ": ")) + with open(_storepath / ".zmetadata", "wb") as zmetadata_file: + zmetadata_file.write(json_dumps(consolidated_metadata)) From 42e17d1e51e8c1dd7b40f013553e501ce0bfd521 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 30 Mar 2024 12:17:38 -0400 Subject: [PATCH 07/15] remove consolidated metadata, as v3 doesn't have this yet --- virtualizarr/zarr.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 8a7294c..1770be9 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -132,18 +132,14 @@ def dataset_to_zarr(ds: xr.Dataset, storepath: str) -> None: # TODO should techically loop over groups in a tree but a dataset corresponds to only one group # TODO does this mean we need a group kwarg? - consolidated_metadata: dict = {"metadata": {}} - # write top-level .zattrs with open(_storepath / ".zattrs", "wb") as zattrs_file: zattrs_file.write(json_dumps(ds.attrs)) - consolidated_metadata[".zattrs"] = ds.attrs - + # write .zgroup with open(_storepath / ".zgroup", "wb") as zgroup_file: zgroup_file.write(json_dumps({"zarr_format": 2})) - consolidated_metadata[".zgroup"] = {"zarr_format": 2} - + for name, var in ds.variables.items(): array_dir = _storepath / name marr = var.data @@ -170,12 +166,3 @@ def dataset_to_zarr(ds: xr.Dataset, storepath: str) -> None: zattrs["_ARRAY_DIMENSIONS"] = list(var.dims) with open(_storepath / ".zattrs", "wb") as zattrs_file: zattrs_file.write(json_dumps(zattrs)) - - # record this info to include in the overall .zmetadata - consolidated_metadata["metadata"][name + "/.zarray"] = marr.zarray.dict() - consolidated_metadata["metadata"][name + "/.zattrs"] = zattrs - - # write store-level .zmetadata - consolidated_metadata["zarr_consolidated_format"] = 1 - with open(_storepath / ".zmetadata", "wb") as zmetadata_file: - zmetadata_file.write(json_dumps(consolidated_metadata)) From 23772b94bffa07a1e3fa2a8fc634bee23cab9bea Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 30 Mar 2024 14:51:05 -0400 Subject: [PATCH 08/15] license for vendoring part of zarr-python --- virtualizarr/vendor/zarr/LICENSE.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/vendor/zarr/LICENSE.txt b/virtualizarr/vendor/zarr/LICENSE.txt index 0b21ebc..a4de1c3 100644 --- a/virtualizarr/vendor/zarr/LICENSE.txt +++ b/virtualizarr/vendor/zarr/LICENSE.txt @@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. \ No newline at end of file +SOFTWARE. From 4f2655fdb436f2233bddf075ee8083e774b062c5 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 30 Mar 2024 14:51:31 -0400 Subject: [PATCH 09/15] change to write v3 --- virtualizarr/zarr.py | 97 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 75 insertions(+), 22 deletions(-) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 1770be9..fcb6aa6 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -1,11 +1,15 @@ from pathlib import Path -from typing import Any, Literal, NewType, Optional, Tuple, Union, List, Dict +from typing import Any, Literal, NewType, Optional, Tuple, Union, List, Dict, TYPE_CHECKING import numpy as np import ujson # type: ignore import xarray as xr from pydantic import BaseModel, ConfigDict, field_validator +from virtualizarr.vendor.zarr.utils import json_dumps + +if TYPE_CHECKING: + pass # TODO replace these with classes imported directly from Zarr? (i.e. Zarr Object Models) ZAttrs = NewType( @@ -124,22 +128,19 @@ def dataset_to_zarr(ds: xr.Dataset, storepath: str) -> None: """ from virtualizarr.manifests import ManifestArray - from virtualizarr.vendor.zarr.utils import json_dumps _storepath = Path(storepath) Path.mkdir(_storepath, exist_ok=False) - # TODO should techically loop over groups in a tree but a dataset corresponds to only one group - # TODO does this mean we need a group kwarg? - - # write top-level .zattrs - with open(_storepath / ".zattrs", "wb") as zattrs_file: - zattrs_file.write(json_dumps(ds.attrs)) - - # write .zgroup - with open(_storepath / ".zgroup", "wb") as zgroup_file: - zgroup_file.write(json_dumps({"zarr_format": 2})) - + # should techically loop over groups in a tree but a dataset corresponds to only one group + group_metadata = { + "zarr_format": 3, + "node_type": "group", + "attributes": ds.attrs + } + with open(_storepath / 'zarr.json', "wb") as group_metadata_file: + group_metadata_file.write(json_dumps(group_metadata)) + for name, var in ds.variables.items(): array_dir = _storepath / name marr = var.data @@ -155,14 +156,66 @@ def dataset_to_zarr(ds: xr.Dataset, storepath: str) -> None: Path.mkdir(array_dir, exist_ok=False) # write the chunk references into a manifest.json file - marr.manifest.to_zarr_json(array_dir / "manifest.json") + # and the array metadata into a zarr.json file + to_zarr_json(var, array_dir) + + +def to_zarr_json(var: xr.Variable, array_dir: Path) -> None: + """ + Write out both the zarr.json and manifest.json file into the given zarr array directory. - # write each .zarray - with open(_storepath / ".zgroup", "wb") as zarray_file: - zarray_file.write(json_dumps(marr.zarray.dict())) + Follows the Zarr v3 manifest storage transformer ZEP (see https://github.com/zarr-developers/zarr-specs/issues/287). + + Parameters + ---------- + var : xr.Variable + Must be wrapping a ManifestArray + dirpath : str + Zarr store array directory into which to write files. + """ - # write each .zattrs - zattrs = var.attrs.copy() - zattrs["_ARRAY_DIMENSIONS"] = list(var.dims) - with open(_storepath / ".zattrs", "wb") as zattrs_file: - zattrs_file.write(json_dumps(zattrs)) + marr = var.data + + marr.manifest.to_zarr_json(array_dir / 'manifest.json') + + metadata = zarr_v3_array_metadata(marr.zarray, list(var.dims), var.attrs) + with open(array_dir / 'zarr.json', "wb") as metadata_file: + metadata_file.write(json_dumps(metadata)) + + +def zarr_v3_array_metadata(zarray: ZArray, dim_names: List[str], attrs: dict) -> dict: + """Construct a v3-compliant metadata dict from v2 zarray + information stored on the xarray variable.""" + # TODO it would be nice if we could use the zarr-python metadata.ArrayMetadata classes to do this conversion for us + + metadata = zarray.dict() + + # adjust to match v3 spec + metadata["zarr_format"] = 3 + metadata["node_type"] = "array" + metadata["data_type"] = str(np.dtype(metadata.pop("dtype"))) + metadata["chunk_grid"] = {"name": "regular", "configuration": {"chunk_shape": metadata.pop("chunks")}} + metadata["chunk_key_encoding"] = { + "name": "default", + "configuration": { + "separator": "/" + } + } + metadata["codecs"] = metadata.pop("filters") + metadata.pop("compressor") # TODO this should be entered in codecs somehow + metadata.pop("order") # TODO this should be replaced by a transpose codec + + # indicate that we're using the manifest storage transformer ZEP + metadata["storage_transformers"] = [ + { + "name": "chunk-manifest-json", + "configuration": { + "manifest": "./manifest.json" + } + } + ] + + # add information from xarray object + metadata["dimension_names"] = dim_names + metadata["attributes"] = attrs + + return metadata From b4c38fed7dae2ddfb6c23dfb118c9f6b1d3a33ea Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 30 Mar 2024 22:25:32 -0400 Subject: [PATCH 10/15] implement reading from v3-compliant stores --- docs/usage.md | 6 +- virtualizarr/manifests/manifest.py | 10 ++- virtualizarr/xarray.py | 103 +++++++++++++++++++++-------- virtualizarr/zarr.py | 38 +++++++++++ 4 files changed, 126 insertions(+), 31 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index b3d0383..ed029d9 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -304,7 +304,7 @@ combined_ds = xr.open_dataset(mapper, engine="kerchunk") ### Writing as Zarr -Alternatively, we can write these references out as an actual Zarr store, at least one that is compliant with the [proposed "Chunk Manifest" ZEP](https://github.com/zarr-developers/zarr-specs/issues/287). To do this we simply use the `ds.virtualize.to_zarr ` accessor method. +Alternatively, we can write these references out as an actual Zarr store, at least one that is compliant with the [proposed "Chunk Manifest" ZEP](https://github.com/zarr-developers/zarr-specs/issues/287). To do this we simply use the {py:meth}`ds.virtualize.to_zarr ` accessor method. ```python combined_vds.virtualize.to_zarr('combined.zarr') @@ -322,5 +322,7 @@ combined/air/manifest.json <- array manifest The advantage of this format is that any zarr v3 reader that understands the chunk manifest ZEP could read from this store, no matter what language it is written in (e.g. via `zarr-python`, `zarr-js`, or rust). This reading would also not require `fsspec`. ```{note} -Currently there are not yet any zarr v3 readers which understand the chunk manifest ZEP, so until then this is only of theoretical interest. +Currently there are not yet any zarr v3 readers which understand the chunk manifest ZEP, so until then this feature cannot be used for data processing. + +This store can however be read by {py:func}`~virtualizarr.xarray.open_virtual_dataset`, by passing `filetype="zarr_v3"`. ``` diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 53dfc5e..b12813e 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -111,10 +111,14 @@ def dict(self) -> dict[str, dict[str, Union[str, int]]]: """Converts the entire manifest to a nested dictionary.""" return {k: dict(entry) for k, entry in self.entries.items()} - @staticmethod - def from_zarr_json(filepath: str) -> "ChunkManifest": + @classmethod + def from_zarr_json(cls, filepath: str) -> "ChunkManifest": """Create a ChunkManifest from a Zarr manifest.json file.""" - raise NotImplementedError() + with open(filepath, "r") as manifest_file: + entries_dict = json.load(manifest_file) + + entries = {cast(ChunkKey, k): ChunkEntry(**entry) for k, entry in entries_dict.items()} + return cls(entries=entries) def to_zarr_json(self, filepath: str) -> None: """Write a ChunkManifest to a Zarr manifest.json file.""" diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index ce0f982..91f7c61 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -1,4 +1,5 @@ from typing import List, Literal, Mapping, Optional, Union, overload +from pathlib import Path import ujson # type: ignore import xarray as xr @@ -9,8 +10,7 @@ import virtualizarr.kerchunk as kerchunk from virtualizarr.kerchunk import KerchunkStoreRefs from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.zarr import dataset_to_zarr - +from virtualizarr.zarr import dataset_to_zarr, attrs_from_zarr_group_json, metadata_from_zarr_json class ManifestBackendArray(ManifestArray, BackendArray): """Using this prevents xarray from wrapping the KerchunkArray in ExplicitIndexingAdapter etc.""" @@ -38,7 +38,7 @@ def open_virtual_dataset( File path to open as a set of virtualized zarr arrays. filetype : str, default None Type of file to be opened. Used to determine which kerchunk file format backend to use. - Can be one of {'netCDF3', 'netCDF4'}. + Can be one of {'netCDF3', 'netCDF4', 'zarr_v3'}. If not provided will attempt to automatically infer the correct filetype from the the filepath's extension. drop_variables: list[str], default is None Variables in the file to drop before returning. @@ -51,37 +51,88 @@ def open_virtual_dataset( Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that. """ - # this is the only place we actually always need to use kerchunk directly - vds_refs = kerchunk.read_kerchunk_references_from_file( - filepath=filepath, - filetype=filetype, - ) + if drop_variables is None: + drop_variables = [] + + if virtual_array_class is not ManifestArray: + raise NotImplementedError() + + if filetype == "zarr_v3": + # TODO is there a neat way of auto-detecting this? + return open_virtual_dataset_from_v3_store(storepath=filepath, drop_variables=drop_variables, indexes=indexes) + else: + # this is the only place we actually always need to use kerchunk directly + vds_refs = kerchunk.read_kerchunk_references_from_file( + filepath=filepath, + filetype=filetype, + ) + + if indexes is None: + # add default indexes by reading data from file + # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... + # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references + ds = xr.open_dataset(filepath) + indexes = ds.xindexes + ds.close() + + vds = dataset_from_kerchunk_refs( + vds_refs, + drop_variables=drop_variables, + virtual_array_class=virtual_array_class, + indexes=indexes, + ) + + # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened + + return vds + + +def open_virtual_dataset_from_v3_store( + storepath: str, + drop_variables: List[str], + indexes: Optional[Mapping[str, Index]], +) -> xr.Dataset: + """ + Read a Zarr v3 store and return an xarray Dataset containing virtualized arrays. + """ + _storepath = Path(storepath) + + ds_attrs = attrs_from_zarr_group_json(_storepath / "zarr.json") + + # TODO recursive glob to create a datatree + vars = {} + for array_dir in _storepath.glob("*/"): + var_name = array_dir.name + if var_name in drop_variables: + break + + zarray, dim_names, attrs = metadata_from_zarr_json(array_dir / "zarr.json") + manifest = ChunkManifest.from_zarr_json(str(array_dir / "manifest.json")) + + marr = ManifestArray(chunkmanifest=manifest, zarray=zarray) + var = xr.Variable(data=marr, dims=dim_names, attrs=attrs) + vars[var_name] = var if indexes is None: - # add default indexes by reading data from file - # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... - # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references - ds = xr.open_dataset(filepath) - indexes = ds.xindexes - ds.close() - - vds = dataset_from_kerchunk_refs( - vds_refs, - drop_variables=drop_variables, - virtual_array_class=virtual_array_class, - indexes=indexes, - ) + raise NotImplementedError() - # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened + data_vars, coords = separate_coords(vars, indexes) + + ds = xr.Dataset( + data_vars, + coords=coords, + # indexes={}, # TODO should be added in a later version of xarray + attrs=ds_attrs, + ) - return vds + return ds def dataset_from_kerchunk_refs( refs: KerchunkStoreRefs, - drop_variables: Optional[List[str]] = None, - virtual_array_class=ManifestArray, - indexes={}, + drop_variables: List[str] = [], + virtual_array_class: type = ManifestArray, + indexes: Mapping[str, Index] = None, ) -> xr.Dataset: """ Translate a store-level kerchunk reference dict into an xarray Dataset containing virtualized arrays. diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index fcb6aa6..5477413 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -1,6 +1,7 @@ from pathlib import Path from typing import Any, Literal, NewType, Optional, Tuple, Union, List, Dict, TYPE_CHECKING +import json import numpy as np import ujson # type: ignore @@ -219,3 +220,40 @@ def zarr_v3_array_metadata(zarray: ZArray, dim_names: List[str], attrs: dict) -> metadata["attributes"] = attrs return metadata + + +def attrs_from_zarr_group_json(filepath: Path) -> dict: + with open(filepath, "r") as metadata_file: + attrs = json.load(metadata_file) + return attrs["attributes"] + + +def metadata_from_zarr_json(filepath: Path) -> Tuple[ZArray, List[str], dict]: + with open(filepath, "r") as metadata_file: + metadata = json.load(metadata_file) + + if { + "name": "chunk-manifest-json", + "configuration": { + "manifest": "./manifest.json", + } + } not in metadata.get("storage_transformers", []): + raise ValueError("Can only read byte ranges from Zarr v3 stores which implement the manifest storage transformer ZEP.") + + attrs = metadata.pop("attributes") + dim_names = metadata.pop("dimension_names") + + chunk_shape = metadata["chunk_grid"]["configuration"]["chunk_shape"] + + zarray = ZArray( + chunks=metadata["chunk_grid"]["configuration"]["chunk_shape"], + compressor=metadata["codecs"], + dtype=np.dtype(metadata["data_type"]), + fill_value=metadata["fill_value"], + filters=metadata.get("filters", None), + order="C", + shape=chunk_shape, + zarr_format=3, + ) + + return zarray, dim_names, attrs From 79f39e11e59653a587b1d7d1cd1631028812d2b0 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 30 Mar 2024 22:43:23 -0400 Subject: [PATCH 11/15] roundtripping test --- virtualizarr/xarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 91f7c61..7af690d 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -132,7 +132,7 @@ def dataset_from_kerchunk_refs( refs: KerchunkStoreRefs, drop_variables: List[str] = [], virtual_array_class: type = ManifestArray, - indexes: Mapping[str, Index] = None, + indexes: Optional[Mapping[str, Index]] = None, ) -> xr.Dataset: """ Translate a store-level kerchunk reference dict into an xarray Dataset containing virtualized arrays. From 98a259edab8e8e912aa2f6496dd59cddee6b3bf5 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 1 Apr 2024 16:11:33 -0400 Subject: [PATCH 12/15] forgot to add the file with the test --- virtualizarr/tests/test_zarr.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 virtualizarr/tests/test_zarr.py diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py new file mode 100644 index 0000000..2ac31ce --- /dev/null +++ b/virtualizarr/tests/test_zarr.py @@ -0,0 +1,27 @@ +import xarray as xr +import numpy as np +import xarray.testing as xrt +from virtualizarr import open_virtual_dataset, ManifestArray +from virtualizarr.manifests.manifest import ChunkEntry + + +def test_zarr_v3_roundtrip(tmpdir): + arr = ManifestArray( + chunkmanifest={"0.0": ChunkEntry(path="test.nc", offset=6144, length=48)}, + zarray=dict( + shape=(2, 3), + dtype=np.dtype(" Date: Tue, 9 Apr 2024 18:37:27 -0400 Subject: [PATCH 13/15] test dataset-level attributes --- virtualizarr/tests/test_zarr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py index 2ac31ce..b8e3704 100644 --- a/virtualizarr/tests/test_zarr.py +++ b/virtualizarr/tests/test_zarr.py @@ -19,9 +19,11 @@ def test_zarr_v3_roundtrip(tmpdir): zarr_format=3, ), ) - original = xr.Dataset({"a": (["x", "y"], arr)}) + original = xr.Dataset({"a": (["x", "y"], arr)}, attrs={"something": 0}) original.virtualize.to_zarr(tmpdir / "store.zarr") roundtrip = open_virtual_dataset(tmpdir / "store.zarr", filetype="zarr_v3", indexes={}) xrt.assert_identical(roundtrip, original) + + assert False From 0151652f9948e8585cbacc55486ae40a9ee85981 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Tue, 9 Apr 2024 18:37:40 -0400 Subject: [PATCH 14/15] debugging print --- virtualizarr/xarray.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 9703bb0..b3c26cb 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -158,6 +158,8 @@ def open_virtual_dataset_from_v3_store( if var_name in drop_variables: break + print(array_dir) + zarray, dim_names, attrs = metadata_from_zarr_json(array_dir / "zarr.json") manifest = ChunkManifest.from_zarr_json(str(array_dir / "manifest.json")) From 6ba41de5cf4da93befd1b51a91deb7023c65d4b7 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Tue, 9 Apr 2024 18:50:08 -0400 Subject: [PATCH 15/15] try explicitly separating files from directories --- virtualizarr/tests/test_zarr.py | 2 -- virtualizarr/xarray.py | 9 ++++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py index b8e3704..2faf43c 100644 --- a/virtualizarr/tests/test_zarr.py +++ b/virtualizarr/tests/test_zarr.py @@ -25,5 +25,3 @@ def test_zarr_v3_roundtrip(tmpdir): roundtrip = open_virtual_dataset(tmpdir / "store.zarr", filetype="zarr_v3", indexes={}) xrt.assert_identical(roundtrip, original) - - assert False diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index b3c26cb..1fa8357 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -152,14 +152,17 @@ def open_virtual_dataset_from_v3_store( ds_attrs = attrs_from_zarr_group_json(_storepath / "zarr.json") # TODO recursive glob to create a datatree + # Note: this .is_file() check should not be necessary according to the pathlib docs, but tests fail on github CI without it + # see https://github.com/TomNicholas/VirtualiZarr/pull/45#discussion_r1547833166 + all_paths = _storepath.glob("*/") + directory_paths = [p for p in all_paths if not p.is_file()] + vars = {} - for array_dir in _storepath.glob("*/"): + for array_dir in directory_paths: var_name = array_dir.name if var_name in drop_variables: break - print(array_dir) - zarray, dim_names, attrs = metadata_from_zarr_json(array_dir / "zarr.json") manifest = ChunkManifest.from_zarr_json(str(array_dir / "manifest.json"))