Skip to content

combine_nested dataarrays #5835

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
3 changes: 2 additions & 1 deletion doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,10 @@ Bug fixes
- Fixed performance bug where ``cftime`` import attempted within various core operations if ``cftime`` not
installed (:pull:`5640`).
By `Luke Sewell <https://github.com/lusewell>`_

- Numbers are properly formatted in a plot's title (:issue:`5788`, :pull:`5789`).
By `Maxime Liquet <https://github.com/maximlt>`_.
- Fixed bug when combining named DataArrays using :py:func:`combine_by_coords` (:pull:`5834`).
  By `Tom Nicholas <https://github.com/TomNicholas>`_.

Documentation
~~~~~~~~~~~~~
Expand Down
148 changes: 98 additions & 50 deletions xarray/core/combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,14 @@ def _nested_combine(
# Check that the inferred shape is combinable
_check_shape_tile_ids(combined_ids)

# Promote any DataArrays to Datasets
for id, obj in combined_ids.items():
if isinstance(obj, DataArray):
if obj.name is None:
combined_ids[id] = obj._to_temp_dataset()
else:
combined_ids[id] = obj.to_dataset()

# Apply series of concatenate or merge operations along each dimension
combined = _combine_nd(
combined_ids,
Expand All @@ -372,11 +380,11 @@ def _nested_combine(

# Define type for arbitrarily-nested list of lists recursively
# Currently mypy cannot handle this but other linters can (https://stackoverflow.com/a/53845083/3154101)
DATASET_HYPERCUBE = Union[Dataset, Iterable["DATASET_HYPERCUBE"]] # type: ignore
DATA_HYPERCUBE = Union[Dataset, DataArray, Iterable["DATA_HYPERCUBE"]] # type: ignore


def combine_nested(
datasets: DATASET_HYPERCUBE,
datasets: DATA_HYPERCUBE,
concat_dim: Union[
str, DataArray, None, Sequence[Union[str, "DataArray", pd.Index, None]]
],
Expand All @@ -386,9 +394,9 @@ def combine_nested(
fill_value: object = dtypes.NA,
join: str = "outer",
combine_attrs: str = "drop",
) -> Dataset:
) -> Union[Dataset, DataArray]:
"""
Explicitly combine an N-dimensional grid of datasets into one by using a
Explicitly combine an N-dimensional grid of datasets (or dataarrays) into one by using a
succession of concat and merge operations along each dimension of the grid.

Does not sort the supplied datasets under any circumstances, so the
Expand Down Expand Up @@ -474,7 +482,8 @@ def combine_nested(

Returns
-------
combined : xarray.Dataset
combined : xarray.Dataset or xarray.DataArray
Will only return a DataArray in the case that all the inputs are unnamed xarray.DataArrays.

Examples
--------
Expand Down Expand Up @@ -567,31 +576,61 @@ def combine_nested(
--------
concat
merge
combine_by_coords
"""
mixed_datasets_and_arrays = any(
isinstance(obj, Dataset) for obj in iterate_nested(datasets)
) and any(
isinstance(obj, DataArray) and obj.name is None
for obj in iterate_nested(datasets)
)
if mixed_datasets_and_arrays:
raise ValueError("Can't combine datasets with unnamed arrays.")

# TODO deprecation cycle to change the name of this argument...
data_objects = datasets

if isinstance(concat_dim, (str, DataArray)) or concat_dim is None:
concat_dim = [concat_dim]

# The IDs argument tells _nested_combine that datasets aren't yet sorted
return _nested_combine(
datasets,
concat_dims=concat_dim,
compat=compat,
data_vars=data_vars,
coords=coords,
ids=False,
fill_value=fill_value,
join=join,
combine_attrs=combine_attrs,
)
objs_are_unnamed_dataarrays = [
isinstance(data_object, DataArray) and data_object.name is None
for data_object in iterate_nested(data_objects)
]
if any(objs_are_unnamed_dataarrays):
if all(objs_are_unnamed_dataarrays):
# Combine into a single larger DataArray
unnamed_arrays = data_objects

combined_temp_dataset = _nested_combine(
unnamed_arrays,
concat_dims=concat_dim,
compat=compat,
data_vars=data_vars,
coords=coords,
ids=False,
fill_value=fill_value,
join=join,
combine_attrs=combine_attrs,
)
return DataArray()._from_temp_dataset(combined_temp_dataset)
else:
# Must be a mix of unnamed dataarrays with either named dataarrays or with datasets
# Can't combine these as we wouldn't know whether to merge or concatenate the arrays
raise ValueError(
"Can't automatically combine unnamed dataarrays with either named dataarrays or datasets."
)
else:
# Promote any named DataArrays to single-variable Datasets to simplify combining
# data_objects = [
# obj.to_dataset() if isinstance(obj, DataArray) else obj
# for obj in data_objects
# ]

# The IDs argument tells _nested_combine that datasets aren't yet sorted
return _nested_combine(
data_objects,
concat_dims=concat_dim,
compat=compat,
data_vars=data_vars,
coords=coords,
ids=False,
fill_value=fill_value,
join=join,
combine_attrs=combine_attrs,
)


def vars_as_keys(ds):
Expand Down Expand Up @@ -697,7 +736,6 @@ def combine_by_coords(
----------
data_objects : sequence of xarray.Dataset or sequence of xarray.DataArray
Data objects to combine.

compat : {"identical", "equals", "broadcast_equals", "no_conflicts", "override"}, optional
String indicating how to compare variables of the same name for
potential conflicts:
Expand Down Expand Up @@ -765,6 +803,8 @@ def combine_by_coords(
Returns
-------
combined : xarray.Dataset or xarray.DataArray
Will only return a DataArray in the case that all the inputs are unnamed xarray.DataArrays.


See also
--------
Expand Down Expand Up @@ -883,33 +923,41 @@ def combine_by_coords(
if not data_objects:
return Dataset()

mixed_arrays_and_datasets = any(
isinstance(data_object, DataArray) and data_object.name is None
for data_object in data_objects
) and any(isinstance(data_object, Dataset) for data_object in data_objects)
if mixed_arrays_and_datasets:
raise ValueError("Can't automatically combine datasets with unnamed arrays.")

all_unnamed_data_arrays = all(
objs_are_unnamed_dataarrays = [
isinstance(data_object, DataArray) and data_object.name is None
for data_object in data_objects
)
if all_unnamed_data_arrays:
unnamed_arrays = data_objects
temp_datasets = [data_array._to_temp_dataset() for data_array in unnamed_arrays]

combined_temp_dataset = _combine_single_variable_hypercube(
temp_datasets,
fill_value=fill_value,
data_vars=data_vars,
coords=coords,
compat=compat,
join=join,
combine_attrs=combine_attrs,
)
return DataArray()._from_temp_dataset(combined_temp_dataset)

]
if any(objs_are_unnamed_dataarrays):
if all(objs_are_unnamed_dataarrays):
# Combine into a single larger DataArray
unnamed_arrays = data_objects
temp_datasets = [
data_array._to_temp_dataset() for data_array in unnamed_arrays
]

combined_temp_dataset = _combine_single_variable_hypercube(
temp_datasets,
fill_value=fill_value,
data_vars=data_vars,
coords=coords,
compat=compat,
join=join,
combine_attrs=combine_attrs,
)
return DataArray()._from_temp_dataset(combined_temp_dataset)
else:
# Must be a mix of unnamed dataarrays with either named dataarrays or with datasets
# Can't combine these as we wouldn't know whether to merge or concatenate the arrays
raise ValueError(
"Can't automatically combine unnamed dataarrays with either named dataarrays or datasets."
)
else:
# Promote any named DataArrays to single-variable Datasets to simplify combining
data_objects = [
obj.to_dataset() if isinstance(obj, DataArray) else obj
for obj in data_objects
]

# Group by data vars
sorted_datasets = sorted(data_objects, key=vars_as_keys)
grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys)
Expand Down
95 changes: 78 additions & 17 deletions xarray/tests/test_combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -683,12 +683,12 @@ def test_nested_combine_mixed_datasets_arrays(self):
Dataset({"x": [2, 3]}),
]
with pytest.raises(
ValueError, match=r"Can't combine datasets with unnamed arrays."
ValueError, match="Can't automatically combine unnamed dataarrays with"
):
combine_nested(objs, "x")


class TestCombineAuto:
class TestCombineDatasetsbyCoords:
def test_combine_by_coords(self):
objs = [Dataset({"x": [0]}), Dataset({"x": [1]})]
actual = combine_by_coords(objs)
Expand Down Expand Up @@ -730,17 +730,6 @@ def test_combine_by_coords(self):
def test_empty_input(self):
    """Combining an empty sequence yields an empty Dataset."""
    combined = combine_by_coords([])
    assert_identical(Dataset(), combined)

def test_combine_coords_mixed_datasets_arrays(self):
    """Mixing a Dataset with an unnamed DataArray must raise ValueError."""
    mixed_objs = [
        DataArray([0, 1], dims=("x"), coords=({"x": [0, 1]})),
        Dataset({"x": [2, 3]}),
    ]
    expected_msg = r"Can't automatically combine datasets with unnamed arrays."
    with pytest.raises(ValueError, match=expected_msg):
        combine_by_coords(mixed_objs)

@pytest.mark.parametrize(
"join, expected",
[
Expand Down Expand Up @@ -1044,20 +1033,92 @@ def test_combine_by_coords_incomplete_hypercube(self):
with pytest.raises(ValueError):
combine_by_coords([x1, x2, x3], fill_value=None)

def test_combine_by_coords_unnamed_arrays(self):

class TestCombineMixedObjects:
    """Tests for combining mixed collections of DataArrays and Datasets.

    Covers the contract that ``combine_by_coords`` / ``combine_nested``:
    - raise when unnamed DataArrays are mixed with named ones or Datasets,
    - merge named DataArrays with Datasets,
    - return a DataArray when every input is an unnamed DataArray,
    - return a Dataset when every input is a named DataArray.
    """

    def test_combine_unnamed_named_dataarrays(self):
        # Mixing unnamed DataArrays with named ones (or with Datasets) is
        # ambiguous — we can't know whether to merge or concatenate — so
        # both entry points must raise.
        named_da = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
        unnamed_da = DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")

        with pytest.raises(
            ValueError, match="Can't automatically combine unnamed dataarrays with"
        ):
            combine_by_coords([named_da, unnamed_da])
        with pytest.raises(
            ValueError, match="Can't automatically combine unnamed dataarrays with"
        ):
            combine_nested([named_da, unnamed_da], concat_dim="x")

        da = DataArray([0, 1], dims="x", coords=({"x": [0, 1]}))
        ds = Dataset({"x": [2, 3]})
        with pytest.raises(
            ValueError,
            match="Can't automatically combine unnamed dataarrays with",
        ):
            combine_by_coords([da, ds])
        with pytest.raises(
            ValueError,
            match="Can't automatically combine unnamed dataarrays with",
        ):
            combine_nested([da, ds], concat_dim="x")

    def test_combine_mixed_datasets_named_dataarrays(self):
        # A named DataArray is promoted to a single-variable Dataset and
        # merged with the other Dataset inputs.
        da = DataArray(name="a", data=[4, 5], dims="x", coords=({"x": [0, 1]}))
        ds = Dataset({"b": ("x", [2, 3])})
        expected = Dataset(
            {"a": ("x", [4, 5]), "b": ("x", [2, 3])}, coords={"x": ("x", [0, 1])}
        )

        actual = combine_by_coords([da, ds])
        assert_identical(expected, actual)

        actual = combine_nested([da, ds], concat_dim="x")
        assert_identical(expected, actual)

    def test_combine_all_unnamed_dataarrays(self):
        # When every input is an unnamed DataArray, the combined result is
        # returned as a DataArray rather than a Dataset.
        unnamed_array = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
        expected = unnamed_array

        actual = combine_by_coords([unnamed_array])
        assert_identical(expected, actual)

        actual = combine_nested([unnamed_array], concat_dim=None)
        assert_identical(expected, actual)

        unnamed_array1 = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
        unnamed_array2 = DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
        expected = DataArray(
            data=[1.0, 2.0, 3.0, 4.0], coords={"x": [0, 1, 2, 3]}, dims="x"
        )

        actual = combine_by_coords([unnamed_array1, unnamed_array2])
        assert_identical(expected, actual)

        actual = combine_nested([unnamed_array1, unnamed_array2], concat_dim="x")
        assert_identical(expected, actual)

    def test_combine_all_named_dataarrays(self):
        # When every input is a named DataArray, each is promoted to a
        # single-variable Dataset and the merged Dataset is returned.
        named_da = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
        expected = named_da.to_dataset()

        actual = combine_by_coords([named_da])
        assert_identical(expected, actual)

        actual = combine_nested([named_da], concat_dim=None)
        assert_identical(expected, actual)

        named_da1 = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
        named_da2 = DataArray(name="b", data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
        expected = Dataset(
            {
                "a": DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x"),
                "b": DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x"),
            }
        )

        actual = combine_by_coords([named_da1, named_da2])
        assert_identical(expected, actual)

        actual = combine_nested([named_da1, named_da2], concat_dim="x")
        assert_identical(expected, actual)


Expand Down