diff --git a/doc/whats-new.rst b/doc/whats-new.rst index c1bfaba8756..25434f6ad08 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -23,6 +23,8 @@ v2024.03.0 (unreleased) New Features ~~~~~~~~~~~~ +- Grouped and resampling quantile calculations now use the vectorized algorithm in ``flox>=0.9.4`` if present. + By `Deepak Cherian `_. - Do not broadcast in arithmetic operations when global option ``arithmetic_broadcast=False`` (:issue:`6806`, :pull:`8784`). By `Etienne Schalk `_ and `Deepak Cherian `_. diff --git a/xarray/core/_aggregations.py b/xarray/core/_aggregations.py index bee6afd5a19..96f860b3209 100644 --- a/xarray/core/_aggregations.py +++ b/xarray/core/_aggregations.py @@ -2392,8 +2392,6 @@ def count( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Examples @@ -2490,8 +2488,6 @@ def all( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Examples @@ -2588,8 +2584,6 @@ def any( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Examples @@ -2692,8 +2686,6 @@ def max( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Examples @@ -2808,8 +2800,6 @@ def min( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Examples @@ -2924,8 +2914,6 @@ def mean( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -3049,8 +3037,6 @@ def prod( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -3186,8 +3172,6 @@ def sum( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -3320,8 +3304,6 @@ def std( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -3454,8 +3436,6 @@ def var( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -3584,8 +3564,6 @@ def median( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -3687,8 +3665,6 @@ def cumsum( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -3788,8 +3764,6 @@ def cumprod( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -3919,8 +3893,6 @@ def count( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Examples @@ -4017,8 +3989,6 @@ def all( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Examples @@ -4115,8 +4085,6 @@ def any( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Examples @@ -4219,8 +4187,6 @@ def max( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Examples @@ -4335,8 +4301,6 @@ def min( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Examples @@ -4451,8 +4415,6 @@ def mean( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -4576,8 +4538,6 @@ def prod( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -4713,8 +4673,6 @@ def sum( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -4847,8 +4805,6 @@ def std( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -4981,8 +4937,6 @@ def var( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -5111,8 +5065,6 @@ def median( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -5214,8 +5166,6 @@ def cumsum( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -5315,8 +5265,6 @@ def cumprod( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -5446,8 +5394,6 @@ def count( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Examples @@ -5537,8 +5483,6 @@ def all( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Examples @@ -5628,8 +5572,6 @@ def any( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Examples @@ -5725,8 +5667,6 @@ def max( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Examples @@ -5832,8 +5772,6 @@ def min( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Examples @@ -5939,8 +5877,6 @@ def mean( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -6055,8 +5991,6 @@ def prod( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -6181,8 +6115,6 @@ def sum( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -6304,8 +6236,6 @@ def std( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -6427,8 +6357,6 @@ def var( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -6546,8 +6474,6 @@ def median( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -6641,8 +6567,6 @@ def cumsum( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -6738,8 +6662,6 @@ def cumprod( Use the ``flox`` package to significantly speed up groupby computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - other methods might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -6865,8 +6787,6 @@ def count( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Examples @@ -6956,8 +6876,6 @@ def all( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Examples @@ -7047,8 +6965,6 @@ def any( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Examples @@ -7144,8 +7060,6 @@ def max( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Examples @@ -7251,8 +7165,6 @@ def min( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Examples @@ -7358,8 +7270,6 @@ def mean( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -7474,8 +7384,6 @@ def prod( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -7600,8 +7508,6 @@ def sum( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -7723,8 +7629,6 @@ def std( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -7846,8 +7750,6 @@ def var( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -7965,8 +7867,6 @@ def median( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -8060,8 +7960,6 @@ def cumsum( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. @@ -8157,8 +8055,6 @@ def cumprod( Use the ``flox`` package to significantly speed up resampling computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. - The default choice is ``method="cohorts"`` which generalizes the best, - ``method="blockwise"`` might work better for your problem. See the `flox documentation `_ for more. Non-numeric variables will be removed prior to reducing. diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 3fbfb74d985..5966c32df92 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -28,7 +28,7 @@ filter_indexes_from_coords, safe_cast_to_index, ) -from xarray.core.options import _get_keep_attrs +from xarray.core.options import OPTIONS, _get_keep_attrs from xarray.core.types import ( Dims, QuantileMethods, @@ -38,11 +38,13 @@ ) from xarray.core.utils import ( FrozenMappingWarningOnValuesAccess, + contains_only_chunked_or_numpy, either_dict_or_kwargs, emit_user_level_warning, hashable, is_scalar, maybe_wrap_array, + module_available, peek_at, ) from xarray.core.variable import IndexVariable, Variable @@ -1075,6 +1077,9 @@ def _binary_op(self, other, f, reflexive=False): result[var] = result[var].transpose(d, ...) return result + def _restore_dim_order(self, stacked): + raise NotImplementedError + def _maybe_restore_empty_groups(self, combined): """Our index contained empty groups (e.g., from a resampling or binning). If we reduced on that dimension, we want to restore the full index. @@ -1209,13 +1214,9 @@ def _flox_reduce( (result.sizes[grouper.name],) + var.shape, ) - if isbin: - # Fix dimension order when binning a dimension coordinate - # Needed as long as we do a separate code path for pint; - # For some reason Datasets and DataArrays behave differently! - (group_dim,) = grouper.dims - if isinstance(self._obj, Dataset) and group_dim in self._obj.dims: - result = result.transpose(grouper.name, ...) + if not isinstance(result, Dataset): + # only restore dimension order for arrays + result = self._restore_dim_order(result) return result @@ -1376,16 +1377,30 @@ def quantile( (grouper,) = self.groupers dim = grouper.group1d.dims - return self.map( - self._obj.__class__.quantile, - shortcut=False, - q=q, - dim=dim, - method=method, - keep_attrs=keep_attrs, - skipna=skipna, - interpolation=interpolation, - ) + # Dataset.quantile does this, do it for flox to ensure same output. + q = np.asarray(q, dtype=np.float64) + + if ( + method == "linear" + and OPTIONS["use_flox"] + and contains_only_chunked_or_numpy(self._obj) + and module_available("flox", minversion="0.9.4") + ): + result = self._flox_reduce( + func="quantile", q=q, dim=dim, keep_attrs=keep_attrs, skipna=skipna + ) + return result + else: + return self.map( + self._obj.__class__.quantile, + shortcut=False, + q=q, + dim=dim, + method=method, + keep_attrs=keep_attrs, + skipna=skipna, + interpolation=interpolation, + ) def where(self, cond, other=dtypes.NA) -> T_Xarray: """Return elements from `self` or `other` depending on `cond`. diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index d927550e424..045e1223b7d 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -245,6 +245,51 @@ def test_da_groupby_empty() -> None: empty_array.groupby("dim") +@requires_dask +def test_dask_da_groupby_quantile() -> None: + # Only works when the grouped reduction can run blockwise + # Scalar quantile + expected = xr.DataArray( + data=[2, 5], coords={"x": [1, 2], "quantile": 0.5}, dims="x" + ) + array = xr.DataArray( + data=[1, 2, 3, 4, 5, 6], coords={"x": [1, 1, 1, 2, 2, 2]}, dims="x" + ) + with pytest.raises(ValueError): + array.chunk(x=1).groupby("x").quantile(0.5) + + # will work blockwise with flox + actual = array.chunk(x=3).groupby("x").quantile(0.5) + assert_identical(expected, actual) + + # will work blockwise with flox + actual = array.chunk(x=-1).groupby("x").quantile(0.5) + assert_identical(expected, actual) + + +@requires_dask +def test_dask_da_groupby_median() -> None: + expected = xr.DataArray(data=[2, 5], coords={"x": [1, 2]}, dims="x") + array = xr.DataArray( + data=[1, 2, 3, 4, 5, 6], coords={"x": [1, 1, 1, 2, 2, 2]}, dims="x" + ) + with xr.set_options(use_flox=False): + actual = array.chunk(x=1).groupby("x").median() + assert_identical(expected, actual) + + with xr.set_options(use_flox=True): + actual = array.chunk(x=1).groupby("x").median() + assert_identical(expected, actual) + + # will work blockwise with flox + actual = array.chunk(x=3).groupby("x").median() + assert_identical(expected, actual) + + # will work blockwise with flox + actual = array.chunk(x=-1).groupby("x").median() + assert_identical(expected, actual) + + def test_da_groupby_quantile() -> None: array = xr.DataArray( data=[1, 2, 3, 4, 5, 6], coords={"x": [1, 1, 1, 2, 2, 2]}, dims="x" diff --git a/xarray/util/generate_aggregations.py b/xarray/util/generate_aggregations.py index 3462af28663..b59dc36c108 100644 --- a/xarray/util/generate_aggregations.py +++ b/xarray/util/generate_aggregations.py @@ -19,6 +19,7 @@ MODULE_PREAMBLE = '''\ """Mixin classes with reduction operations.""" + # This file was generated using xarray.util.generate_aggregations. Do not edit manually. from __future__ import annotations @@ -245,13 +246,9 @@ def {method}( _FLOX_NOTES_TEMPLATE = """Use the ``flox`` package to significantly speed up {kind} computations, especially with dask arrays. Xarray will use flox by default if installed. Pass flox-specific keyword arguments in ``**kwargs``. -The default choice is ``method="cohorts"`` which generalizes the best, -{recco} might work better for your problem. See the `flox documentation `_ for more.""" -_FLOX_GROUPBY_NOTES = _FLOX_NOTES_TEMPLATE.format(kind="groupby", recco="other methods") -_FLOX_RESAMPLE_NOTES = _FLOX_NOTES_TEMPLATE.format( - kind="resampling", recco='``method="blockwise"``' -) +_FLOX_GROUPBY_NOTES = _FLOX_NOTES_TEMPLATE.format(kind="groupby") +_FLOX_RESAMPLE_NOTES = _FLOX_NOTES_TEMPLATE.format(kind="resampling") ExtraKwarg = collections.namedtuple("ExtraKwarg", "docs kwarg call example") skipna = ExtraKwarg( @@ -300,11 +297,13 @@ def __init__( extra_kwargs=tuple(), numeric_only=False, see_also_modules=("numpy", "dask.array"), + min_flox_version=None, ): self.name = name self.extra_kwargs = extra_kwargs self.numeric_only = numeric_only self.see_also_modules = see_also_modules + self.min_flox_version = min_flox_version if bool_reduce: self.array_method = f"array_{name}" self.np_example_array = """ @@ -443,8 +442,8 @@ def generate_code(self, method, has_keep_attrs): if self.datastructure.numeric_only: extra_kwargs.append(f"numeric_only={method.numeric_only},") - # numpy_groupies & flox do not support median - # https://github.com/ml31415/numpy-groupies/issues/43 + # median isn't enabled yet, because it would break if a single group was present in multiple + # chunks. The non-flox code path will just rechunk every group to a single chunk and execute the median method_is_not_flox_supported = method.name in ("median", "cumsum", "cumprod") if method_is_not_flox_supported: indent = 12 @@ -465,11 +464,16 @@ def generate_code(self, method, has_keep_attrs): **kwargs, )""" - else: - return f"""\ + min_version_check = f""" + and module_available("flox", minversion="{method.min_flox_version}")""" + + return ( + """\ if ( flox_available - and OPTIONS["use_flox"] + and OPTIONS["use_flox"]""" + + (min_version_check if method.min_flox_version is not None else "") + + f""" and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( @@ -486,6 +490,7 @@ def generate_code(self, method, has_keep_attrs): keep_attrs=keep_attrs, **kwargs, )""" + ) class GenericAggregationGenerator(AggregationGenerator): @@ -522,7 +527,9 @@ def generate_code(self, method, has_keep_attrs): Method("sum", extra_kwargs=(skipna, min_count), numeric_only=True), Method("std", extra_kwargs=(skipna, ddof), numeric_only=True), Method("var", extra_kwargs=(skipna, ddof), numeric_only=True), - Method("median", extra_kwargs=(skipna,), numeric_only=True), + Method( + "median", extra_kwargs=(skipna,), numeric_only=True, min_flox_version="0.9.2" + ), # Cumulatives: Method("cumsum", extra_kwargs=(skipna,), numeric_only=True), Method("cumprod", extra_kwargs=(skipna,), numeric_only=True),