Skip to content

Restore dask arrays rather than editing encoding #8439

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 18 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,13 @@ v2023.10.1 (19 Oct, 2023)
This release updates our minimum numpy version in ``pyproject.toml`` to 1.22,
consistent with our documentation below.

Bug fixes
~~~~~~~~~

- Fix bug where :py:meth:`Dataset.to_zarr` would modify chunks of datetime-like variables (:issue:`8230`, :pull:`8253`).
By `Mattia Almansi <https://github.com/malmans2>`_.


.. _whats-new.2023.10.0:

v2023.10.0 (19 Oct, 2023)
Expand Down
55 changes: 35 additions & 20 deletions xarray/backends/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,34 @@ def __getitem__(self, key):
# could possibly have a work-around for 0d data here


def _squeeze_var_chunks(
var_chunks: tuple[tuple[int, ...], ...], name=None
) -> tuple[int, ...]:
"""
Normalize chunks to tuple of integers.

zarr chunks needs to be uniform for each array
http://zarr.readthedocs.io/en/latest/spec/v1.html#chunks
while dask chunks can be variable sized
http://dask.pydata.org/en/latest/array-design.html#chunks
"""
if any(len(set(chunks[:-1])) > 1 for chunks in var_chunks):
raise ValueError(
"Zarr requires uniform chunk sizes except for final chunk. "
f"Variable named {name!r} has incompatible dask chunks: {var_chunks!r}. "
"Consider rechunking using `chunk()`."
)
if any((chunks[0] < chunks[-1]) for chunks in var_chunks):
raise ValueError(
"Final chunk of Zarr array must be the same size or smaller "
f"than the first. Variable named {name!r} has incompatible Dask chunks {var_chunks!r}."
"Consider either rechunking using `chunk()` or instead deleting "
"or modifying `encoding['chunks']`."
)
# return the first chunk for each dimension
return tuple(chunk[0] for chunk in var_chunks)


def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks):
"""
Given encoding chunks (possibly None or []) and variable chunks
Expand All @@ -123,26 +151,8 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks):

# if there are no chunks in encoding but there are dask chunks, we try to
# use the same chunks in zarr
# However, zarr chunks needs to be uniform for each array
# http://zarr.readthedocs.io/en/latest/spec/v1.html#chunks
# while dask chunks can be variable sized
# http://dask.pydata.org/en/latest/array-design.html#chunks
if var_chunks and not enc_chunks:
if any(len(set(chunks[:-1])) > 1 for chunks in var_chunks):
raise ValueError(
"Zarr requires uniform chunk sizes except for final chunk. "
f"Variable named {name!r} has incompatible dask chunks: {var_chunks!r}. "
"Consider rechunking using `chunk()`."
)
if any((chunks[0] < chunks[-1]) for chunks in var_chunks):
raise ValueError(
"Final chunk of Zarr array must be the same size or smaller "
f"than the first. Variable named {name!r} has incompatible Dask chunks {var_chunks!r}."
"Consider either rechunking using `chunk()` or instead deleting "
"or modifying `encoding['chunks']`."
)
# return the first chunk for each dimension
return tuple(chunk[0] for chunk in var_chunks)
return _squeeze_var_chunks(var_chunks, name=name)

# from here on, we are dealing with user-specified chunks in encoding
# zarr allows chunks to be an integer, in which case it uses the same chunk
Expand Down Expand Up @@ -286,7 +296,8 @@ def extract_zarr_variable_encoding(


# Function below is copied from conventions.encode_cf_variable.
# The only change is to raise an error for object dtypes.
# The only change is to raise an error for object dtypes, and
# add chunks to the encoding when dask arrays are converted to np.
def encode_zarr_variable(var, needs_copy=True, name=None):
"""
Converts an Variable into an Variable which follows some
Expand All @@ -307,6 +318,7 @@ def encode_zarr_variable(var, needs_copy=True, name=None):
out : Variable
A variable which has been encoded as described above.
"""
original_chunks = var.chunks

var = conventions.encode_cf_variable(var, name=name)

Expand All @@ -317,6 +329,9 @@ def encode_zarr_variable(var, needs_copy=True, name=None):
var = coder.encode(var, name=name)
var = coding.strings.ensure_fixed_length_bytes(var)

if original_chunks and not var.chunks and "chunks" not in var.encoding:
var = var.chunk(original_chunks)
# var.encoding["chunks"] = _squeeze_var_chunks(original_chunks, name=name)
return var


Expand Down
8 changes: 8 additions & 0 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -2709,6 +2709,14 @@ def test_attributes(self, obj) -> None:
with pytest.raises(TypeError, match=r"Invalid attribute in Dataset.attrs."):
ds.to_zarr(store_target, **self.version_kwargs)

@requires_dask
def test_chunked_datetime64(self) -> None:
    # Chunked datetime64 data must keep its dask chunking through a
    # zarr round trip instead of being silently rechunked (:issue:`8230`).
    original = create_test_data().astype("datetime64[ns]").chunk(1)
    with self.roundtrip(original, open_kwargs={"chunks": {}}) as actual:
        for var_name in actual.variables:
            assert original[var_name].chunks == actual.variables[var_name].chunks
        assert original.chunks == actual.chunks

def test_vectorized_indexing_negative_step(self) -> None:
if not has_dask:
pytest.xfail(
Expand Down