combine_by_coordinates to handle unnamed data arrays. #4696
@@ -1,4 +1,5 @@
 import itertools
+import warnings
 from collections import Counter

 import pandas as pd

@@ -8,6 +9,7 @@
 from .dataarray import DataArray
 from .dataset import Dataset
 from .merge import merge
+from .utils import iterate_nested


 def _infer_concat_order_from_positions(datasets):

@@ -544,6 +546,15 @@ def combine_nested(
     concat
     merge
     """
+    mixed_datasets_and_arrays = any(
+        isinstance(obj, Dataset) for obj in iterate_nested(datasets)
+    ) and any(
+        isinstance(obj, DataArray) and obj.name is None
+        for obj in iterate_nested(datasets)
+    )
+    if mixed_datasets_and_arrays:
+        raise ValueError("Can't combine datasets with unnamed arrays.")
+
     if isinstance(concat_dim, (str, DataArray)) or concat_dim is None:
         concat_dim = [concat_dim]

@@ -565,18 +576,79 @@ def vars_as_keys(ds):
     return tuple(sorted(ds))


-def combine_by_coords(
+def _combine_single_variable_hypercube(
Review comment: This is a nice way to refactor that makes it easier to reason about the code.
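For orientation, here is a sketch of the kind of "single variable hypercube" the new helper assembles (not code from this PR; the variable name "t" and the coordinate values are illustrative): four tiles of one variable laid out on a 2x2 coordinate grid, stitched together by combine_by_coords.

    import xarray as xr

    # Four tiles of a single variable "t" on a 2x2 grid of coordinates.
    tiles = [
        xr.Dataset({"t": (("x", "y"), [[v]])}, coords={"x": [i], "y": [j]})
        for v, (i, j) in enumerate([(0, 0), (0, 1), (1, 0), (1, 1)])
    ]

    # combine_by_coords infers the tile order from the coordinate values and
    # concatenates along both dimensions to form one 2x2 hypercube.
    combined = xr.combine_by_coords(tiles)
    print(combined.t.values)  # [[0 1]
                              #  [2 3]]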
+    datasets,
+    fill_value=dtypes.NA,
+    data_vars="all",
+    coords="different",
+    compat="no_conflicts",
+    join="outer",
+    combine_attrs="no_conflicts",
+):
+    """
+    Attempt to combine a list of Datasets into a hypercube using their
+    coordinates.
+
+    All provided Datasets must belong to a single variable, ie. must be
+    assigned the same variable name. This precondition is not checked by this
+    function, so the caller is assumed to know what it's doing.
+
+    This function is NOT part of the public API.
+    """
+    if len(datasets) == 0:
+        raise ValueError(
+            "At least one Dataset is required to resolve variable names "
+            "for combined hypercube."
+        )
+598
to
+602
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can this Error ever be reached? Wouldn't the if not data_objects:
return Dataset() stop this from being triggered? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I intended this check to be part of the internal contract for |
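To illustrate the pattern under discussion, here is a hypothetical toy example (not xarray code): the public entry point short-circuits empty input, so the private helper's guard documents a precondition rather than a path reachable through the public API.

    def _helper(items):
        # Internal contract: callers must supply at least one item.
        if len(items) == 0:
            raise ValueError("At least one item is required.")
        return max(items)

    def public_entry(items):
        if not items:
            return None  # empty input never reaches _helper
        return _helper(items)

    print(public_entry([]))      # None
    print(public_entry([2, 5]))  # 5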
+
+    combined_ids, concat_dims = _infer_concat_order_from_coords(list(datasets))
+
+    if fill_value is None:
+        # check that datasets form complete hypercube
+        _check_shape_tile_ids(combined_ids)
+    else:
+        # check only that all datasets have same dimension depth for these
+        # vars
+        _check_dimension_depth_tile_ids(combined_ids)
+
+    # Concatenate along all of concat_dims one by one to create single ds
+    concatenated = _combine_nd(
+        combined_ids,
+        concat_dims=concat_dims,
+        data_vars=data_vars,
+        coords=coords,
+        compat=compat,
+        fill_value=fill_value,
+        join=join,
+        combine_attrs=combine_attrs,
+    )
+
+    # Check the overall coordinates are monotonically increasing
+    for dim in concat_dims:
+        indexes = concatenated.indexes.get(dim)
+        if not (indexes.is_monotonic_increasing or indexes.is_monotonic_decreasing):
+            raise ValueError(
+                "Resulting object does not have monotonic"
+                " global indexes along dimension {}".format(dim)
+            )
+
+    return concatenated
+
+
+# TODO remove empty list default param after version 0.19, see PR4696
+def combine_by_coords(
Review comment: can we also modify …

Reply: I wrote a test …

Reply: I think I know what @dcherian meant - at first glance it looks like the …
+    data_objects=[],
Review comment (suggested change): It's considered bad practice to have mutable default arguments to functions in Python.

Reply: I put this in because if someone calls this method with `datasets` as a named parameter, the `data_objects` argument would be unspecified and their code would break with a missing-argument error. This is part of the deprecation warning below.

Reply: You make a good point, but that means the default argument should be …
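A small sketch of the hazard the reviewers are pointing at (generic Python, not the PR's code): a mutable default is created once at definition time and shared across all calls, which is why the `None` sentinel idiom is usually preferred.

    def bad(collected=[]):  # one shared list for every call
        collected.append(1)
        return collected

    print(bad())  # [1]
    print(bad())  # [1, 1]  <- state leaked between calls

    def good(collected=None):  # sentinel: fresh list per call
        if collected is None:
            collected = []
        collected.append(1)
        return collected

    print(good())  # [1]
    print(good())  # [1]

In this PR `data_objects` is only reassigned, never mutated, so the `[]` default is benign in practice, but the idiom is still worth avoiding.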
     compat="no_conflicts",
     data_vars="all",
     coords="different",
     fill_value=dtypes.NA,
     join="outer",
     combine_attrs="no_conflicts",
+    datasets=None,
 ):
     """
-    Attempt to auto-magically combine the given datasets into one by using
-    dimension coordinates.
+    Attempt to auto-magically combine the given datasets (or data arrays)
+    into one by using dimension coordinates.

     This method attempts to combine a group of datasets along any number of
     dimensions into a single entity by inspecting coords and metadata and using

@@ -600,8 +672,9 @@ def combine_by_coords(

     Parameters
     ----------
-    datasets : sequence of xarray.Dataset
-        Dataset objects to combine.
+    data_objects : sequence of xarray.Dataset or sequence of xarray.DataArray
Review comment: Is renaming a breaking change?

Reply: It technically is a breaking change - if someone was previously passing …

Reply: I think it probably does. But only a few extra lines and easy to copy from elsewhere.

Reply: I have no idea what justifies a deprecation cycle in your project, or how one is performed. Can someone give me some guidance on this, seeing as this change will probably need one according to @max-sixty?

Reply: No worries! (It's very briefly mentioned in our contributing guide, but maybe we should expand that...) xarray has loads of regular users, and we don't want them to find that downloading a new version of xarray breaks their perfectly good code, even in some minor way. Therefore we normally hold their hand through any changes by warning them in the version before, or by making sure that their old way of using the functions still works temporarily, to give them time to switch to our new way. In this case the only people who could be affected are those currently passing …

    import warnings

    def combine_by_coords(data_objects, ..., datasets=None):
        # TODO remove after version 0.19, see PR4696
        if datasets is not None:
            warnings.warn("The datasets argument has been renamed to `data_objects`. In future passing a value for datasets will raise an error.")
            data_objects = datasets

Does that make sense?

Reply: That makes sense. I'm somewhat concerned about users who ignore the warning (many software projects I've seen generate a lot of warnings); however, I don't think there's much that can be done, since the method signature will have to change anyway.
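Under the patched signature, the deprecation shim can be exercised like this (a usage sketch assuming this branch is installed; since no category is passed to warnings.warn, the warning defaults to UserWarning):

    import warnings
    import xarray as xr

    ds = xr.Dataset({"a": ("x", [1, 2])}, coords={"x": [0, 1]})

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        combined = xr.combine_by_coords(datasets=[ds])  # old keyword still accepted

    print(str(caught[0].message))
    # The datasets argument has been renamed to `data_objects`. In future
    # passing a value for datasets will raise an error.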
+        Data objects to combine.

     compat : {"identical", "equals", "broadcast_equals", "no_conflicts", "override"}, optional
         String indicating how to compare variables of the same name for
         potential conflicts:

@@ -776,51 +849,62 @@ def combine_by_coords(
     precipitation  (y, x) float64 0.4376 0.8918 0.9637 ... 0.5684 0.01879 0.6176
     """

-    # Group by data vars
-    sorted_datasets = sorted(datasets, key=vars_as_keys)
-    grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys)
-
-    # Perform the multidimensional combine on each group of data variables
-    # before merging back together
-    concatenated_grouped_by_data_vars = []
-    for vars, datasets_with_same_vars in grouped_by_vars:
-        combined_ids, concat_dims = _infer_concat_order_from_coords(
-            list(datasets_with_same_vars)
-        )
+    # TODO remove after version 0.19, see PR4696
+    if datasets is not None:
+        warnings.warn(
+            "The datasets argument has been renamed to `data_objects`."
+            " In future passing a value for datasets will raise an error."
+        )
+        data_objects = datasets

-        if fill_value is None:
-            # check that datasets form complete hypercube
-            _check_shape_tile_ids(combined_ids)
-        else:
-            # check only that all datasets have same dimension depth for these
-            # vars
-            _check_dimension_depth_tile_ids(combined_ids)
+    if not data_objects:
+        return Dataset()

-        # Concatenate along all of concat_dims one by one to create single ds
-        concatenated = _combine_nd(
-            combined_ids,
-            concat_dims=concat_dims,
-            data_vars=data_vars,
-            coords=coords,
-            compat=compat,
-            fill_value=fill_value,
-            join=join,
-            combine_attrs=combine_attrs,
-        )
+    mixed_arrays_and_datasets = any(
+        isinstance(data_object, DataArray) and data_object.name is None
+        for data_object in data_objects
+    ) and any(isinstance(data_object, Dataset) for data_object in data_objects)
+    if mixed_arrays_and_datasets:
+        raise ValueError("Can't automatically combine datasets with unnamed arrays.")
+
+    all_unnamed_data_arrays = all(
+        isinstance(data_object, DataArray) and data_object.name is None
+        for data_object in data_objects
+    )
+    if all_unnamed_data_arrays:
+        unnamed_arrays = data_objects
+        temp_datasets = [data_array._to_temp_dataset() for data_array in unnamed_arrays]
+
+        combined_temp_dataset = _combine_single_variable_hypercube(
+            temp_datasets,
+            fill_value=fill_value,
+            data_vars=data_vars,
+            coords=coords,
+            compat=compat,
+            join=join,
+            combine_attrs=combine_attrs,
+        )
+        return DataArray()._from_temp_dataset(combined_temp_dataset)

-        # Check the overall coordinates are monotonically increasing
-        # TODO (benbovy - flexible indexes): only with pandas.Index?
-        for dim in concat_dims:
-            indexes = concatenated.xindexes.get(dim)
-            if not (
-                indexes.array.is_monotonic_increasing
-                or indexes.array.is_monotonic_decreasing
-            ):
-                raise ValueError(
-                    "Resulting object does not have monotonic"
-                    " global indexes along dimension {}".format(dim)
-                )
-        concatenated_grouped_by_data_vars.append(concatenated)
+    else:
+        # Group by data vars
+        sorted_datasets = sorted(data_objects, key=vars_as_keys)
+        grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys)
+
+        # Perform the multidimensional combine on each group of data variables
+        # before merging back together
+        concatenated_grouped_by_data_vars = []
+        for vars, datasets_with_same_vars in grouped_by_vars:
+            concatenated = _combine_single_variable_hypercube(
+                list(datasets_with_same_vars),
+                fill_value=fill_value,
+                data_vars=data_vars,
+                coords=coords,
+                compat=compat,
+                join=join,
+                combine_attrs=combine_attrs,
+            )
+            concatenated_grouped_by_data_vars.append(concatenated)

     return merge(
         concatenated_grouped_by_data_vars,
@@ -646,6 +646,47 @@ def test_combine_nested_fill_value(self, fill_value):
         actual = combine_nested(datasets, concat_dim="t", fill_value=fill_value)
         assert_identical(expected, actual)

+    def test_combine_nested_unnamed_data_arrays(self):
+        unnamed_array = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
+
+        actual = combine_nested([unnamed_array], concat_dim="x")
+        expected = unnamed_array
+        assert_identical(expected, actual)
+
+        unnamed_array1 = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
+        unnamed_array2 = DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
+
+        actual = combine_nested([unnamed_array1, unnamed_array2], concat_dim="x")
+        expected = DataArray(
+            data=[1.0, 2.0, 3.0, 4.0], coords={"x": [0, 1, 2, 3]}, dims="x"
+        )
+        assert_identical(expected, actual)
+
+        da1 = DataArray(data=[[0.0]], coords={"x": [0], "y": [0]}, dims=["x", "y"])
+        da2 = DataArray(data=[[1.0]], coords={"x": [0], "y": [1]}, dims=["x", "y"])
+        da3 = DataArray(data=[[2.0]], coords={"x": [1], "y": [0]}, dims=["x", "y"])
+        da4 = DataArray(data=[[3.0]], coords={"x": [1], "y": [1]}, dims=["x", "y"])
+        objs = [[da1, da2], [da3, da4]]
+
+        expected = DataArray(
+            data=[[0.0, 1.0], [2.0, 3.0]],
+            coords={"x": [0, 1], "y": [0, 1]},
+            dims=["x", "y"],
+        )
+        actual = combine_nested(objs, concat_dim=["x", "y"])
+        assert_identical(expected, actual)
+
+    # TODO aijams - Determine if this test is appropriate.
+    def test_nested_combine_mixed_datasets_arrays(self):
+        objs = [
+            DataArray([0, 1], dims=("x"), coords=({"x": [0, 1]})),
+            Dataset({"x": [2, 3]}),
+        ]
+        with pytest.raises(
+            ValueError, match=r"Can't combine datasets with unnamed arrays."
Review comment (suggested change): Tiny clarification that this means datasets with other xarray.DataArrays, not something about the numpy arrays inside the xarray.Dataset objects.

Reply: Good clarification.
+        ):
+            combine_nested(objs, "x")


 class TestCombineAuto:
     def test_combine_by_coords(self):

@@ -689,6 +730,17 @@ def test_combine_by_coords(self):
     def test_empty_input(self):
         assert_identical(Dataset(), combine_by_coords([]))

+    def test_combine_coords_mixed_datasets_arrays(self):
+        objs = [
+            DataArray([0, 1], dims=("x"), coords=({"x": [0, 1]})),
+            Dataset({"x": [2, 3]}),
+        ]
+        with pytest.raises(
+            ValueError,
+            match=r"Can't automatically combine datasets with unnamed arrays.",
+        ):
+            combine_by_coords(objs)
+
     @pytest.mark.parametrize(
         "join, expected",
         [

@@ -992,6 +1044,22 @@ def test_combine_by_coords_incomplete_hypercube(self):
         with pytest.raises(ValueError):
             combine_by_coords([x1, x2, x3], fill_value=None)

+    def test_combine_by_coords_unnamed_arrays(self):
+        unnamed_array = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
+
+        actual = combine_by_coords([unnamed_array])
+        expected = unnamed_array
+        assert_identical(expected, actual)
+
+        unnamed_array1 = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
+        unnamed_array2 = DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
+
+        actual = combine_by_coords([unnamed_array1, unnamed_array2])
+        expected = DataArray(
+            data=[1.0, 2.0, 3.0, 4.0], coords={"x": [0, 1, 2, 3]}, dims="x"
+        )
+        assert_identical(expected, actual)


 @requires_cftime
 def test_combine_by_coords_distant_cftime_dates():