Skip to content

Enable taking the mean of dask-backed cftime arrays #6940

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Sep 9, 2022
Merged
4 changes: 2 additions & 2 deletions ci/requirements/min-all-deps.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ dependencies:
- cfgrib=0.9
- cftime=1.4
- coveralls
- dask-core=2021.04
- distributed=2021.04
- dask-core=2021.08.0
- distributed=2021.08.0
- flox=0.5
- h5netcdf=0.11
- h5py=3.1
Expand Down
8 changes: 7 additions & 1 deletion doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@ v2022.07.0 (unreleased)

New Features
~~~~~~~~~~~~

- Enable taking the mean of dask-backed :py:class:`cftime.datetime` arrays
(:pull:`6556`, :pull:`6940`). By `Deepak Cherian
<https://github.com/dcherian>`_ and `Spencer Clark
<https://github.com/spencerkclark>`_.

Breaking changes
~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -53,6 +56,9 @@ Bug fixes
By `Oliver Lopez <https://github.com/lopezvoliver>`_.
- Fix bug where index variables would be changed inplace (:issue:`6931`, :pull:`6938`)
By `Michael Niklas <https://github.com/headtr1ck>`_.
- Allow taking the mean over non-time dimensions of datasets containing
dask-backed cftime arrays (:issue:`5897`, :pull:`6950`). By `Spencer Clark
<https://github.com/spencerkclark>`_.
- Harmonize returned multi-indexed indexes when applying ``concat`` along new dimension (:issue:`6881`, :pull:`6889`)
By `Fabian Hofmann <https://github.com/FabianHofmann>`_.
- Fix step plots with ``hue`` arg. (:pull:`6944`)
Expand Down
12 changes: 4 additions & 8 deletions xarray/core/duck_array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,6 @@ def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float):
though some calendars would allow for them (e.g. no_leap). This is because there
is no `cftime.timedelta` object.
"""
# TODO: make this function dask-compatible?
# Set offset to minimum if not given
if offset is None:
if array.dtype.kind in "Mm":
Expand Down Expand Up @@ -531,7 +530,10 @@ def pd_timedelta_to_float(value, datetime_unit):


def _timedelta_to_seconds(array):
return np.reshape([a.total_seconds() for a in array.ravel()], array.shape) * 1e6
if isinstance(array, datetime.timedelta):
return array.total_seconds() * 1e6
else:
return np.reshape([a.total_seconds() for a in array.ravel()], array.shape) * 1e6


def py_timedelta_to_float(array, datetime_unit):
Expand Down Expand Up @@ -565,12 +567,6 @@ def mean(array, axis=None, skipna=None, **kwargs):
+ offset
)
elif _contains_cftime_datetimes(array):
if is_duck_dask_array(array):
raise NotImplementedError(
"Computing the mean of an array containing "
"cftime.datetime objects is not yet implemented on "
"dask arrays."
)
offset = min(array)
timedeltas = datetime_to_numeric(array, offset, datetime_unit="us")
mean_timedeltas = _mean(timedeltas, axis=axis, skipna=skipna, **kwargs)
Expand Down
63 changes: 49 additions & 14 deletions xarray/tests/test_duck_array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,18 +323,51 @@ def test_datetime_mean(dask: bool) -> None:


@requires_cftime
def test_cftime_datetime_mean():
@pytest.mark.parametrize("dask", [False, True])
def test_cftime_datetime_mean(dask):
if dask and not has_dask:
pytest.skip("requires dask")

times = cftime_range("2000", periods=4)
da = DataArray(times, dims=["time"])
da_2d = DataArray(times.values.reshape(2, 2))

assert da.isel(time=0).mean() == da.isel(time=0)
if dask:
da = da.chunk({"time": 2})
da_2d = da_2d.chunk({"dim_0": 2})

expected = da.isel(time=0)
# one compute needed to check the array contains cftime datetimes
with raise_if_dask_computes(max_computes=1):
result = da.isel(time=0).mean()
assert_dask_array(result, dask)
assert_equal(result, expected)

expected = DataArray(times.date_type(2000, 1, 2, 12))
result = da.mean()
with raise_if_dask_computes(max_computes=1):
result = da.mean()
assert_dask_array(result, dask)
assert_equal(result, expected)

da_2d = DataArray(times.values.reshape(2, 2))
result = da_2d.mean()
with raise_if_dask_computes(max_computes=1):
result = da_2d.mean()
assert_dask_array(result, dask)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice! Thanks for making extra sure that we didn't compute the whole thing.

assert_equal(result, expected)


@requires_cftime
@requires_dask
def test_mean_over_non_time_dim_of_dataset_with_dask_backed_cftime_data():
# Regression test for part two of GH issue 5897: averaging over a non-time
# dimension still fails if the time variable is dask-backed.
ds = Dataset(
{
"var1": (("time",), cftime_range("2021-10-31", periods=10, freq="D")),
"var2": (("x",), list(range(10))),
}
)
expected = ds.mean("x")
result = ds.chunk({}).mean("x")
assert_equal(result, expected)


Expand Down Expand Up @@ -372,15 +405,6 @@ def test_cftime_datetime_mean_long_time_period():
assert_equal(result, expected)


@requires_cftime
@requires_dask
def test_cftime_datetime_mean_dask_error():
times = cftime_range("2000", periods=4)
da = DataArray(times, dims=["time"]).chunk()
with pytest.raises(NotImplementedError):
da.mean()


def test_empty_axis_dtype():
ds = Dataset()
ds["pos"] = [1, 2, 3]
Expand Down Expand Up @@ -742,6 +766,17 @@ def test_datetime_to_numeric_cftime(dask):
expected = 24 * np.arange(0, 35, 7).astype(dtype)
np.testing.assert_array_equal(result, expected)

with raise_if_dask_computes():
if dask:
time = dask.array.asarray(times[1])
else:
time = np.asarray(times[1])
result = duck_array_ops.datetime_to_numeric(
time, offset=times[0], datetime_unit="h", dtype=int
)
expected = np.array(24 * 7).astype(int)
np.testing.assert_array_equal(result, expected)


@requires_cftime
def test_datetime_to_numeric_potential_overflow():
Expand Down