From fdbff97c80cac5053414775f9e7cfd1e85718e9b Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 28 Apr 2021 13:43:13 -0400 Subject: [PATCH 01/10] regression test for raising clearer error --- xarray/tests/test_backends.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 95531ec5062..94ab479db2d 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3401,6 +3401,16 @@ def test_open_mfdataset_auto_combine(self): with open_mfdataset([tmp2, tmp1], combine="by_coords") as actual: assert_identical(original, actual) + def test_open_mfdataset_raise_on_bad_combine_args(self): + # Regression test for unhelpful error shown in #5230 + original = Dataset({"foo": ("x", np.random.randn(10)), "x": np.arange(10)}) + with create_tmp_file() as tmp1: + with create_tmp_file() as tmp2: + original.isel(x=slice(5)).to_netcdf(tmp1) + original.isel(x=slice(5, 10)).to_netcdf(tmp2) + with pytest.raises(ValueError, match="`concat_dim` can only"): + open_mfdataset([tmp1, tmp2], concat_dim="x") + @pytest.mark.xfail(reason="mfdataset loses encoding currently.") def test_encoding_mfdataset(self): original = Dataset( From e4633f90b9f380ab7af3f3127dea36feb3c3bc74 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 28 Apr 2021 13:43:40 -0400 Subject: [PATCH 02/10] raise error on bad args to open_mfdataset --- xarray/backends/api.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 01079025434..046d36a146e 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -738,7 +738,7 @@ def open_mfdataset( see the full documentation for more details [2]_. concat_dim : str, or list of str, DataArray, Index or None, optional Dimensions to concatenate files along. You only need to provide this argument - if ``combine='by_coords'``, and if any of the dimensions along which you want to + if ``combine='nested'``, and if any of the dimensions along which you want to concatenate is not a dimension in the original datasets, e.g., if you want to stack a collection of 2D arrays along a third dimension. Set ``concat_dim=[..., None, ...]`` explicitly to disable concatenation along a @@ -886,6 +886,10 @@ def open_mfdataset( combined_ids_paths = _infer_concat_order_from_positions(paths) ids, paths = (list(combined_ids_paths.keys()), list(combined_ids_paths.values())) + if combine == "by_coords" and concat_dim is not None: + raise ValueError("Passing a value for `concat_dim` can only be used " + "with combine='nested', not combine='by_coords'") + open_kwargs = dict(engine=engine, chunks=chunks or {}, **kwargs) if parallel: From 9fc29ab6156054d46df72d61477d626fbff67737 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 28 Apr 2021 14:20:41 -0400 Subject: [PATCH 03/10] correct other tests to not pass by_coords and concat_dim together --- xarray/tests/test_backends.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 94ab479db2d..4da46966bb7 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3012,15 +3012,12 @@ def gen_datasets_with_common_coord_and_time(self): return ds1, ds2 - @pytest.mark.parametrize("combine", ["nested", "by_coords"]) @pytest.mark.parametrize("opt", ["all", "minimal", "different"]) @pytest.mark.parametrize("join", ["outer", "inner", "left", "right"]) - def test_open_mfdataset_does_same_as_concat(self, combine, opt, join): + def test_open_mfdataset_does_same_as_concat(self, opt, join): with self.setup_files_and_datasets() as (files, [ds1, ds2]): - if combine == "by_coords": - files.reverse() with open_mfdataset( - files, data_vars=opt, combine=combine, concat_dim="t", join=join + files, data_vars=opt, combine="nested", concat_dim="t", join=join ) as ds: ds_expect = xr.concat([ds1, ds2], data_vars=opt, dim="t", join=join) assert_identical(ds, ds_expect) @@ -3066,14 +3063,14 @@ def test_open_mfdataset_dataset_combine_attrs( with pytest.raises(xr.MergeError): xr.open_mfdataset( files, - combine="by_coords", + combine="nested", concat_dim="t", combine_attrs=combine_attrs, ) else: with xr.open_mfdataset( files, - combine="by_coords", + combine="nested", concat_dim="t", combine_attrs=combine_attrs, ) as ds: @@ -3091,7 +3088,7 @@ def test_open_mfdataset_dataset_attr_by_coords(self): ds.close() ds.to_netcdf(f) - with xr.open_mfdataset(files, combine="by_coords", concat_dim="t") as ds: + with xr.open_mfdataset(files, combine="nested", concat_dim="t") as ds: assert ds.test_dataset_attr == 10 def test_open_mfdataset_dataarray_attr_by_coords(self): @@ -3106,18 +3103,15 @@ def test_open_mfdataset_dataarray_attr_by_coords(self): ds.close() ds.to_netcdf(f) - with xr.open_mfdataset(files, combine="by_coords", concat_dim="t") as ds: + with xr.open_mfdataset(files, combine="nested", concat_dim="t") as ds: assert ds["v1"].test_dataarray_attr == 0 - @pytest.mark.parametrize("combine", ["nested", "by_coords"]) @pytest.mark.parametrize("opt", ["all", "minimal", "different"]) - def test_open_mfdataset_exact_join_raises_error(self, combine, opt): + def test_open_mfdataset_exact_join_raises_error(self, opt): with self.setup_files_and_datasets(fuzz=0.1) as (files, [ds1, ds2]): - if combine == "by_coords": - files.reverse() with pytest.raises(ValueError, match=r"indexes along dimension"): open_mfdataset( - files, data_vars=opt, combine=combine, concat_dim="t", join="exact" + files, data_vars=opt, combine="nested", concat_dim="t", join="exact" ) def test_common_coord_when_datavars_all(self): From b94024906862a5b5f0823cdc72bb879d61a59dfb Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 28 Apr 2021 14:36:38 -0400 Subject: [PATCH 04/10] refactored to remove unneeded reordering when using combine=by_coords --- xarray/backends/api.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 046d36a146e..9aec93f5c5f 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -877,18 +877,20 @@ def open_mfdataset( if not paths: raise OSError("no files to open") - # If combine='by_coords' then this is unnecessary, but quick. - # If combine='nested' then this creates a flat list which is easier to - # iterate over, while saving the originally-supplied structure as "ids" if combine == "nested": if isinstance(concat_dim, (str, DataArray)) or concat_dim is None: concat_dim = [concat_dim] - combined_ids_paths = _infer_concat_order_from_positions(paths) - ids, paths = (list(combined_ids_paths.keys()), list(combined_ids_paths.values())) - if combine == "by_coords" and concat_dim is not None: - raise ValueError("Passing a value for `concat_dim` can only be used " - "with combine='nested', not combine='by_coords'") + # This creates a flat list which is easier to iterate over, whilst + # encoding the originally-supplied structure as "ids". + # The "ids" are not used at all if combine='by_coords`. + combined_ids_paths = _infer_concat_order_from_positions(paths) + ids, paths = (list(combined_ids_paths.keys()), + list(combined_ids_paths.values())) + + elif combine == "by_coords" and concat_dim is not None: + raise ValueError("`concat_dim` can only be used with combine='nested'," + " not with combine='by_coords'") open_kwargs = dict(engine=engine, chunks=chunks or {}, **kwargs) From 842588ad74abade0fd18f3393fa74f7604b3c15c Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 28 Apr 2021 14:42:47 -0400 Subject: [PATCH 05/10] Try pre-commit From b76dd283ac1e38ccfaa6c2e87daa7d54dfef72e0 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 28 Apr 2021 14:48:58 -0400 Subject: [PATCH 06/10] black --- xarray/backends/api.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 9aec93f5c5f..55d4631c93a 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -885,12 +885,16 @@ def open_mfdataset( # encoding the originally-supplied structure as "ids". # The "ids" are not used at all if combine='by_coords`. combined_ids_paths = _infer_concat_order_from_positions(paths) - ids, paths = (list(combined_ids_paths.keys()), - list(combined_ids_paths.values())) + ids, paths = ( + list(combined_ids_paths.keys()), + list(combined_ids_paths.values()), + ) elif combine == "by_coords" and concat_dim is not None: - raise ValueError("`concat_dim` can only be used with combine='nested'," - " not with combine='by_coords'") + raise ValueError( + "`concat_dim` can only be used with combine='nested'," + " not with combine='by_coords'" + ) open_kwargs = dict(engine=engine, chunks=chunks or {}, **kwargs) From f1cf9f7d54b98378ce22c6d2354d1a65b4bb7007 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 28 Apr 2021 15:03:41 -0400 Subject: [PATCH 07/10] manual -> nested in docstrings --- xarray/core/combine.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 375931e1f9c..0f459d29d1c 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -372,7 +372,7 @@ def combine_nested( To concatenate along multiple dimensions the datasets must be passed as a nested list-of-lists, with a depth equal to the length of ``concat_dims``. - ``manual_combine`` will concatenate along the top-level list first. + ``combine_nested`` will concatenate along the top-level list first. Useful for combining datasets from a set of nested directories, or for collecting the output of a simulation parallelized along multiple @@ -496,7 +496,7 @@ def combine_nested( temperature (x, y) float64 1.764 0.4002 -0.1032 ... 0.04576 -0.1872 precipitation (x, y) float64 1.868 -0.9773 0.761 ... -0.7422 0.1549 0.3782 - ``manual_combine`` can also be used to explicitly merge datasets with + ``combine_nested`` can also be used to explicitly merge datasets with different variables. For example if we have 4 datasets, which are divided along two times, and contain two different variables, we can pass ``None`` to ``concat_dim`` to specify the dimension of the nested list over which @@ -540,7 +540,7 @@ def combine_nested( if isinstance(concat_dim, (str, DataArray)) or concat_dim is None: concat_dim = [concat_dim] - # The IDs argument tells _manual_combine that datasets aren't yet sorted + # The IDs argument tells _nested_combine that datasets aren't yet sorted return _nested_combine( datasets, concat_dims=concat_dim, @@ -583,7 +583,7 @@ def combine_by_coords( Aligns coordinates, but different variables on datasets can cause it to fail under some scenarios. In complex cases, you may need to clean up - your data and use concat/merge explicitly (also see `manual_combine`). + your data and use concat/merge explicitly (also see `combine_nested`). Works well if, for example, you have N years of data and M data variables, and each combination of a distinct time period and set of data variables is From a95762562b323eda10ff0e0ec5eb15733dfef727 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 28 Apr 2021 15:03:56 -0400 Subject: [PATCH 08/10] what's new --- doc/whats-new.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 6acb231bcd5..b5087893199 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -92,6 +92,13 @@ New Features expand, ``False`` to always collapse, or ``default`` to expand unless over a pre-defined limit (:pull:`5126`). By `Tom White `_. +- Prevent passing `concat_dim` to :py:func:`xarray.open_mfdataset` when + `combine='by_coords'` is specified, which should never have been possible (as + :py:func:`xarray.combine_by_coords` has no `concat_dim` argument to pass to). + Also removes unneeded internal reordering of datasets in + :py:func:`xarray.open_mfdataset` when `combine='by_coords'` is specified. + Fixes (:issue:`5230`). + By `Tom Nicholas `_. Breaking changes ~~~~~~~~~~~~~~~~ From a5e72c9aacbf26936844840b75dd59fe7d13f1e6 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 28 Apr 2021 16:50:36 -0400 Subject: [PATCH 09/10] corrected docstring on join --- xarray/core/combine.py | 3 +-- xarray/tests/test_backends.py | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 0f459d29d1c..e907fc32c07 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -631,8 +631,7 @@ def combine_by_coords( refer to its values. If None, raises a ValueError if the passed Datasets do not create a complete hypercube. join : {"outer", "inner", "left", "right", "exact"}, optional - String indicating how to combine differing indexes - (excluding concat_dim) in objects + String indicating how to combine differing indexes in objects - "outer": use the union of object indexes - "inner": use the intersection of object indexes diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 4da46966bb7..1f6301b26b2 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3012,12 +3012,17 @@ def gen_datasets_with_common_coord_and_time(self): return ds1, ds2 + @pytest.mark.parametrize( + "combine, concat_dim", [("nested", "t"), ("by_coords", None)] + ) @pytest.mark.parametrize("opt", ["all", "minimal", "different"]) @pytest.mark.parametrize("join", ["outer", "inner", "left", "right"]) - def test_open_mfdataset_does_same_as_concat(self, opt, join): + def test_open_mfdataset_does_same_as_concat(self, combine, concat_dim, opt, join): with self.setup_files_and_datasets() as (files, [ds1, ds2]): + if combine == "by_coords": + files.reverse() with open_mfdataset( - files, data_vars=opt, combine="nested", concat_dim="t", join=join + files, data_vars=opt, combine=combine, concat_dim=concat_dim, join=join ) as ds: ds_expect = xr.concat([ds1, ds2], data_vars=opt, dim="t", join=join) assert_identical(ds, ds_expect) @@ -3106,12 +3111,19 @@ def test_open_mfdataset_dataarray_attr_by_coords(self): with xr.open_mfdataset(files, combine="nested", concat_dim="t") as ds: assert ds["v1"].test_dataarray_attr == 0 + @pytest.mark.parametrize( + "combine, concat_dim", [("nested", "t"), ("by_coords", None)] + ) @pytest.mark.parametrize("opt", ["all", "minimal", "different"]) - def test_open_mfdataset_exact_join_raises_error(self, opt): + def test_open_mfdataset_exact_join_raises_error(self, combine, concat_dim, opt): with self.setup_files_and_datasets(fuzz=0.1) as (files, [ds1, ds2]): with pytest.raises(ValueError, match=r"indexes along dimension"): open_mfdataset( - files, data_vars=opt, combine="nested", concat_dim="t", join="exact" + files, + data_vars=opt, + combine=combine, + concat_dim=concat_dim, + join="exact", ) def test_common_coord_when_datavars_all(self): From 07d8ec01ef01b196e3f1d198da99c3d198e60148 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 29 Apr 2021 11:37:57 -0400 Subject: [PATCH 10/10] reinstated file reverser --- xarray/tests/test_backends.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 1f6301b26b2..9ff2b09f71a 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3117,6 +3117,8 @@ def test_open_mfdataset_dataarray_attr_by_coords(self): @pytest.mark.parametrize("opt", ["all", "minimal", "different"]) def test_open_mfdataset_exact_join_raises_error(self, combine, concat_dim, opt): with self.setup_files_and_datasets(fuzz=0.1) as (files, [ds1, ds2]): + if combine == "by_coords": + files.reverse() with pytest.raises(ValueError, match=r"indexes along dimension"): open_mfdataset( files,