support chunks in open_groups and open_datatree #9660

Merged · 23 commits · Oct 24, 2024

Changes from 4 commits

Commits:
fe95b16
support chunking and default values in `open_groups`
keewis Oct 22, 2024
3bfbc3a
same for `open_datatree`
keewis Oct 22, 2024
f4abb01
use `group_subtrees` instead of `map_over_datasets`
keewis Oct 22, 2024
b0458aa
check that `chunks` on `open_datatree` works
keewis Oct 22, 2024
4dbd91e
specify the chunksizes when opening from disk
keewis Oct 23, 2024
11850fd
check that `open_groups` with chunks works, too
keewis Oct 23, 2024
a71f5e2
require dask for `test_open_groups_chunks`
TomNicholas Oct 23, 2024
6d3deed
protect variables from write operations
keewis Oct 23, 2024
7f770cf
copy over `_close` from the backend tree
keewis Oct 23, 2024
05efaf6
copy a lot of the docstring from `open_dataset`
keewis Oct 23, 2024
f9fee40
same for `open_groups`
keewis Oct 23, 2024
2e10bdc
Merge branch 'main' into open_datatree-dask
keewis Oct 23, 2024
a4e99c6
reuse `_protect_dataset_variables_inplace`
keewis Oct 23, 2024
3e8b80c
final missing `requires_dask`
keewis Oct 23, 2024
cf1a6b0
typing for the test utils
keewis Oct 24, 2024
114c4dc
type hints for `_protect_datatree_variables_inplace`
keewis Oct 24, 2024
9eac19d
type hints for `_protect_dataset_variables_inplace`
keewis Oct 24, 2024
446a53d
copy over the name of the backend tree
keewis Oct 24, 2024
5b36701
typo
keewis Oct 24, 2024
66616f7
swap the order of arguments to `assert_identical`
keewis Oct 24, 2024
843b2fc
try explicitly typing `data`
keewis Oct 24, 2024
8950841
typo
keewis Oct 24, 2024
4d93ada
use `Hashable` for variable names
keewis Oct 24, 2024
176 changes: 173 additions & 3 deletions xarray/backends/api.py
@@ -41,7 +41,9 @@
)
from xarray.core.dataarray import DataArray
from xarray.core.dataset import Dataset, _get_chunk, _maybe_chunk
from xarray.core.datatree import DataTree
from xarray.core.indexes import Index
from xarray.core.treenode import group_subtrees
from xarray.core.types import NetcdfWriteModes, ZarrWriteModes
from xarray.core.utils import is_remote_uri
from xarray.namedarray.daskmanager import DaskManager
@@ -74,7 +76,6 @@
T_NetcdfTypes = Literal[
"NETCDF4", "NETCDF4_CLASSIC", "NETCDF3_64BIT", "NETCDF3_CLASSIC"
]
from xarray.core.datatree import DataTree

DATAARRAY_NAME = "__xarray_dataarray_name__"
DATAARRAY_VARIABLE = "__xarray_dataarray_variable__"
@@ -414,6 +415,56 @@ def _dataset_from_backend_dataset(
return ds


def _datatree_from_backend_datatree(
backend_tree,
filename_or_obj,
engine,
chunks,
cache,
overwrite_encoded_chunks,
inline_array,
chunked_array_type,
from_array_kwargs,
**extra_tokens,
):
if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}:
raise ValueError(
f"chunks must be an int, dict, 'auto', or None. Instead found {chunks}."
)

# _protect_datatree_variables_inplace(backend_tree, cache)
if chunks is None:
tree = backend_tree
else:
tree = DataTree.from_dict(
{
path: _chunk_ds(
node.dataset,
filename_or_obj,
engine,
chunks,
overwrite_encoded_chunks,
inline_array,
chunked_array_type,
from_array_kwargs,
**extra_tokens,
)
for path, [node] in group_subtrees(backend_tree)
}
)

# ds.set_close(backend_ds._close)
Member (review comment): backend_tree should have been created using datatree_from_dict_with_io_cleanup, so one way to handle this could be just to copy over the _close attribute from every node of backend_tree?

Collaborator (author): The question is, do we even need that here? I copied this from open_dataset, where it is set explicitly, but since datatree_from_dict_with_io_cleanup already does this, we might be able to just remove it. The only reason I kept the commented-out line is to discuss whether the shift in paradigm (have the backend set _close vs. doing it the same way for all backends) is intentional, and whether we should do the same for open_dataset.

Member: I agree it would be nice to remove this; I'm just worried that mapping over each .dataset might not properly propagate ._close (does it? should it?)

Collaborator (author): It does not (I think), so I'm explicitly copying it over. So far that doesn't appear to break anything.

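A minimal sketch of the "copy _close over" idea from the thread above (illustrative only: the two-tree group_subtrees call matches its use elsewhere in this PR, but assigning _close directly is an assumption about private API, not the merged implementation):

    # Hypothetical: propagate each backend node's closer onto the chunked tree.
    # Assumes both trees share an identical group structure.
    for path, (backend_node, node) in group_subtrees(backend_tree, tree):
        node._close = backend_node._close  # _close is private API (assumption)
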

# Ensure source filename always stored in dataset object
if "source" not in tree.encoding:
path = getattr(filename_or_obj, "path", filename_or_obj)

if isinstance(path, str | os.PathLike):
tree.encoding["source"] = _normalize_path(path)

return tree
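
The _protect_datatree_variables_inplace call commented out near the top of this function is filled in by later commits in this PR ("protect variables from write operations", "reuse _protect_dataset_variables_inplace"). A hedged sketch of what such a helper could look like, by analogy with the existing per-dataset helper (the body is an assumption, not the merged code):

    def _protect_datatree_variables_inplace(tree, cache):
        # Illustrative: apply the existing Dataset-level protection to every
        # node, wrapping backend arrays (optionally with caching) so they
        # cannot be written to through the opened tree.
        for node in tree.subtree:
            _protect_dataset_variables_inplace(node.dataset, cache)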


def open_dataset(
filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
*,
@@ -838,7 +889,22 @@ def open_dataarray(

def open_datatree(
filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
*,
engine: T_Engine = None,
chunks: T_Chunks = None,
cache: bool | None = None,
decode_cf: bool | None = None,
mask_and_scale: bool | Mapping[str, bool] | None = None,
decode_times: bool | Mapping[str, bool] | None = None,
decode_timedelta: bool | Mapping[str, bool] | None = None,
use_cftime: bool | Mapping[str, bool] | None = None,
concat_characters: bool | Mapping[str, bool] | None = None,
decode_coords: Literal["coordinates", "all"] | bool | None = None,
drop_variables: str | Iterable[str] | None = None,
inline_array: bool = False,
chunked_array_type: str | None = None,
from_array_kwargs: dict[str, Any] | None = None,
backend_kwargs: dict[str, Any] | None = None,
**kwargs,
) -> DataTree:
"""
@@ -856,17 +922,75 @@
-------
xarray.DataTree
"""
if cache is None:
cache = chunks is None

if backend_kwargs is not None:
kwargs.update(backend_kwargs)

if engine is None:
engine = plugins.guess_engine(filename_or_obj)

if from_array_kwargs is None:
from_array_kwargs = {}

backend = plugins.get_backend(engine)

return backend.open_datatree(filename_or_obj, **kwargs)
decoders = _resolve_decoders_kwargs(
decode_cf,
open_backend_dataset_parameters=(),
mask_and_scale=mask_and_scale,
decode_times=decode_times,
decode_timedelta=decode_timedelta,
concat_characters=concat_characters,
use_cftime=use_cftime,
decode_coords=decode_coords,
)
overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)

backend_tree = backend.open_datatree(
filename_or_obj,
drop_variables=drop_variables,
**decoders,
**kwargs,
)

tree = _datatree_from_backend_datatree(
backend_tree,
filename_or_obj,
engine,
chunks,
cache,
overwrite_encoded_chunks,
inline_array,
chunked_array_type,
from_array_kwargs,
drop_variables=drop_variables,
**decoders,
**kwargs,
)

return tree


def open_groups(
filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
*,
engine: T_Engine = None,
chunks: T_Chunks = None,
cache: bool | None = None,
decode_cf: bool | None = None,
mask_and_scale: bool | Mapping[str, bool] | None = None,
decode_times: bool | Mapping[str, bool] | None = None,
decode_timedelta: bool | Mapping[str, bool] | None = None,
use_cftime: bool | Mapping[str, bool] | None = None,
concat_characters: bool | Mapping[str, bool] | None = None,
decode_coords: Literal["coordinates", "all"] | bool | None = None,
drop_variables: str | Iterable[str] | None = None,
inline_array: bool = False,
chunked_array_type: str | None = None,
from_array_kwargs: dict[str, Any] | None = None,
backend_kwargs: dict[str, Any] | None = None,
**kwargs,
) -> dict[str, Dataset]:
"""
@@ -893,12 +1017,58 @@
open_datatree()
DataTree.from_dict()
"""
if cache is None:
cache = chunks is None

if backend_kwargs is not None:
kwargs.update(backend_kwargs)

if engine is None:
engine = plugins.guess_engine(filename_or_obj)

if from_array_kwargs is None:
from_array_kwargs = {}

backend = plugins.get_backend(engine)

return backend.open_groups_as_dict(filename_or_obj, **kwargs)
decoders = _resolve_decoders_kwargs(
decode_cf,
open_backend_dataset_parameters=(),
mask_and_scale=mask_and_scale,
decode_times=decode_times,
decode_timedelta=decode_timedelta,
concat_characters=concat_characters,
use_cftime=use_cftime,
decode_coords=decode_coords,
)
overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)

backend_groups = backend.open_groups_as_dict(
filename_or_obj,
drop_variables=drop_variables,
**decoders,
**kwargs,
)

groups = {
name: _dataset_from_backend_dataset(
backend_ds,
filename_or_obj,
engine,
chunks,
cache,
overwrite_encoded_chunks,
inline_array,
chunked_array_type,
from_array_kwargs,
drop_variables=drop_variables,
**decoders,
**kwargs,
)
for name, backend_ds in backend_groups.items()
}

return groups
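
With both functions updated, chunks, cache, and the decoder options now flow through open_datatree and open_groups for every group. A minimal usage sketch (the file name is illustrative; chunks={} requests dask arrays using the engine's preferred chunk sizes):

    import xarray as xr

    # Open the whole hierarchy lazily; every node's variables become dask arrays.
    tree = xr.open_datatree("example.nc", engine="netcdf4", chunks={})

    # open_groups returns a flat dict of Datasets keyed by group path instead.
    groups = xr.open_groups("example.nc", engine="netcdf4", chunks={})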


def open_mfdataset(
78 changes: 78 additions & 0 deletions xarray/tests/test_backends_datatree.py
@@ -11,6 +11,7 @@
from xarray.core.datatree import DataTree
from xarray.testing import assert_equal, assert_identical
from xarray.tests import (
requires_dask,
requires_h5netcdf,
requires_netCDF4,
requires_zarr,
@@ -25,6 +26,43 @@
pass


def diff_chunks(comparison, tree1, tree2):
mismatching_variables = [loc for loc, equals in comparison.items() if not equals]

variable_messages = [
"\n".join(
[
f"L {path}:{name}: {tree1[path].variables[name].chunksizes}",
f"R {path}:{name}: {tree2[path].variables[name].chunksizes}",
]
)
for path, name in mismatching_variables
]
return "\n".join(["Differing chunk sizes:"] + variable_messages)


def assert_chunks_equal(actual, expected, enforce_dask=False):
__tracebackhide__ = True

from xarray.namedarray.pycompat import array_type

dask_array_type = array_type("dask")

comparison = {
(path, name): (
(
not enforce_dask
or isinstance(node1.variables[name].data, dask_array_type)
)
and node1.variables[name].chunksizes == node2.variables[name].chunksizes
)
for path, (node1, node2) in xr.group_subtrees(actual, expected)
for name in node1.variables.keys()
}

assert all(comparison.values()), diff_chunks(comparison, actual, expected)


@pytest.fixture(scope="module")
def unaligned_datatree_nc(tmp_path_factory):
"""Creates a test netCDF4 file with the following unaligned structure, writes it to a /tmp directory
@@ -170,6 +208,26 @@
):
open_datatree(unaligned_datatree_nc)

@requires_dask
def test_open_datatree_chunks(self, tmpdir, simple_datatree) -> None:
filepath = tmpdir / "test.nc"

root_data = xr.Dataset({"a": ("y", [6, 7, 8]), "set0": ("x", [9, 10])})
set1_data = xr.Dataset({"a": ("y", [-1, 0, 1]), "b": ("x", [-10, 6])})
set2_data = xr.Dataset({"a": ("y", [1, 2, 3]), "b": ("x", [0.1, 0.2])})
original_tree = DataTree.from_dict(
{
"/": root_data.chunk({"x": 2, "y": 1}),
"/group1": set1_data.chunk({"x": 1, "y": 2}),
"/group2": set2_data.chunk({"x": 2, "y": 3}),
}
)
original_tree.to_netcdf(filepath, engine="netcdf4")

with open_datatree(filepath, engine="netcdf4", chunks={}) as tree:
xr.testing.assert_identical(tree, original_tree)
assert_chunks_equal(tree, original_tree, enforce_dask=True)

Check failure on line 229 in xarray/tests/test_backends_datatree.py — the same failure across six CI jobs (macos-latest and ubuntu-latest, py3.10–py3.12, including min-all-deps and all-but-numba):

TestNetCDF4DatatreeIO.test_open_datatree_chunks
AssertionError: Differing chunk sizes:
L .:a: {'y': (3,)}       R .:a: {'y': (1, 1, 1)}
L group1:a: {'y': (3,)}  R group1:a: {'y': (2, 1)}
L group1:b: {'x': (2,)}  R group1:b: {'x': (1, 1)}

(The netCDF file evidently did not preserve the in-memory dask chunking, so chunks={} fell back to whole-variable chunks; the later commit 4dbd91e, "specify the chunksizes when opening from disk", appears to address this.)

def test_open_groups(self, unaligned_datatree_nc) -> None:
"""Test `open_groups` with a netCDF4 file with an unaligned group hierarchy."""
unaligned_dict_of_datasets = open_groups(unaligned_datatree_nc)
@@ -348,6 +406,26 @@
):
open_datatree(unaligned_datatree_zarr, engine="zarr")

@requires_dask
def test_open_datatree_chunks(self, tmpdir, simple_datatree) -> None:
filepath = tmpdir / "test.zarr"

root_data = xr.Dataset({"a": ("y", [6, 7, 8]), "set0": ("x", [9, 10])})
set1_data = xr.Dataset({"a": ("y", [-1, 0, 1]), "b": ("x", [-10, 6])})
set2_data = xr.Dataset({"a": ("y", [1, 2, 3]), "b": ("x", [0.1, 0.2])})
original_tree = DataTree.from_dict(
{
"/": root_data.chunk({"x": 2, "y": 1}),
"/group1": set1_data.chunk({"x": 1, "y": 2}),
"/group2": set2_data.chunk({"x": 2, "y": 3}),
}
)
original_tree.to_zarr(filepath)

with open_datatree(filepath, engine="zarr", chunks={}) as tree:
xr.testing.assert_identical(original_tree, tree)
assert_chunks_equal(tree, original_tree, enforce_dask=True)

def test_open_groups(self, unaligned_datatree_zarr) -> None:
"""Test `open_groups` with a zarr store of an unaligned group hierarchy."""
