From c703ebc74f73e3fdc33ea302425bbb689f6204f6 Mon Sep 17 00:00:00 2001
From: Benoit Bovy
Date: Mon, 7 Aug 2023 12:03:39 +0200
Subject: [PATCH 01/11] add set_indexes parameter to open_dataset

---
 xarray/backends/api.py    | 16 ++++++++++++++++
 xarray/backends/common.py |  1 +
 2 files changed, 17 insertions(+)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index e35d85a1e2f..254ed54a3a0 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -403,6 +403,7 @@ def open_dataset(
     concat_characters: bool | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    set_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -492,6 +493,12 @@
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    set_indexes : bool, optional
+        If True (default), create new indexes from coordinates. Both the number and
+        the type(s) of those indexes depend on the backend used to open the dataset.
+        For most common backends this creates a pandas index for each
+        :term:`Dimension coordinate`, which loads the coordinate data fully in memory.
+        Set it to False if you want to avoid loading data into memory.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -570,6 +577,7 @@
     backend_ds = backend.open_dataset(
         filename_or_obj,
         drop_variables=drop_variables,
+        set_indexes=set_indexes,
         **decoders,
         **kwargs,
     )
@@ -604,6 +612,7 @@ def open_dataarray(
     concat_characters: bool | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    set_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -695,6 +704,12 @@
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    set_indexes : bool, optional
+        If True (default), create new indexes from coordinates. Both the number and
+        the type(s) of those indexes depend on the backend used to open the dataset.
+        For most common backends this creates a pandas index for each
+        :term:`Dimension coordinate`, which loads the coordinate data fully in memory.
+        Set it to False if you want to avoid loading data into memory.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -752,6 +767,7 @@ def open_dataarray(
         chunks=chunks,
         cache=cache,
         drop_variables=drop_variables,
+        set_indexes=set_indexes,
         inline_array=inline_array,
         chunked_array_type=chunked_array_type,
         from_array_kwargs=from_array_kwargs,
diff --git a/xarray/backends/common.py b/xarray/backends/common.py
index 1ac988c6b4f..f12057bd4af 100644
--- a/xarray/backends/common.py
+++ b/xarray/backends/common.py
@@ -490,6 +490,7 @@ def open_dataset(
         filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
         *,
         drop_variables: str | Iterable[str] | None = None,
+        set_indexes: bool = True,
         **kwargs: Any,
     ) -> Dataset:
         """

From 6f54cd52e8030cf79eac0c4b1b647c08de901131 Mon Sep 17 00:00:00 2001
From: Benoit Bovy
Date: Mon, 7 Aug 2023 12:04:50 +0200
Subject: [PATCH 02/11] implement set_indexes in (zarr) backend store

---
 xarray/backends/store.py | 20 ++++++++++++++++++--
 xarray/backends/zarr.py  |  8 ++++++++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/xarray/backends/store.py b/xarray/backends/store.py
index a507ee37470..e15e6b08c0f 100644
--- a/xarray/backends/store.py
+++ b/xarray/backends/store.py
@@ -9,6 +9,7 @@
     AbstractDataStore,
     BackendEntrypoint,
 )
+from xarray.core.coordinates import Coordinates
 from xarray.core.dataset import Dataset

 if TYPE_CHECKING:
@@ -35,6 +36,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
         concat_characters=True,
         decode_coords=True,
         drop_variables: str | Iterable[str] | None = None,
+        set_indexes: bool = True,
         use_cftime=None,
         decode_timedelta=None,
     ) -> Dataset:
@@ -55,8 +57,22 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
             decode_timedelta=decode_timedelta,
         )

-        ds = Dataset(vars, attrs=attrs)
-        ds = ds.set_coords(coord_names.intersection(vars))
+        # split data and coordinate variables (promote dimension coordinates)
+        data_vars = {}
+        coord_vars = {}
+        for name, var in vars.items():
+            if name in coord_names or var.dims == (name,):
+                coord_vars[name] = var
+            else:
+                data_vars[name] = var
+
+        if set_indexes:
+            coords = coord_vars
+        else:
+            # explicit Coordinates object with no index passed
+            coords = Coordinates(coord_vars)
+
+        ds = Dataset(data_vars, coords=coords, attrs=attrs)
         ds.set_close(filename_or_obj.close)
         ds.encoding = encoding
diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py
index f88523422bb..2de008b8c72 100644
--- a/xarray/backends/zarr.py
+++ b/xarray/backends/zarr.py
@@ -757,6 +757,7 @@ def open_zarr(
     zarr_version=None,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
+    set_indexes=True,
     **kwargs,
 ):
     """Load and decode a dataset from a Zarr store.
@@ -850,6 +851,10 @@
        chunked arrays, via whichever chunk manager is specified through the
        `chunked_array_type` kwarg. Defaults to {'manager': 'dask'}, meaning
        additional kwargs will be passed eventually to
        :py:func:`dask.array.from_array`. Experimental API that should not be relied upon.
+    set_indexes : bool, optional
+        If True (default), create a default (pandas) index for each
+        :term:`Dimension coordinate`. Set it to False if the dataset contains
+        dimension coordinate arrays that are too large to load fully in memory.

     Returns
     -------
@@ -906,6 +911,7 @@
         engine="zarr",
         chunks=chunks,
         drop_variables=drop_variables,
+        set_indexes=set_indexes,
         chunked_array_type=chunked_array_type,
         from_array_kwargs=from_array_kwargs,
         backend_kwargs=backend_kwargs,
@@ -950,6 +956,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
         concat_characters=True,
         decode_coords=True,
         drop_variables: str | Iterable[str] | None = None,
+        set_indexes: bool = True,
         use_cftime=None,
         decode_timedelta=None,
         group=None,
@@ -986,6 +993,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
             drop_variables=drop_variables,
             use_cftime=use_cftime,
             decode_timedelta=decode_timedelta,
+            set_indexes=set_indexes,
         )
         return ds
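
The two patches above thread ``set_indexes`` from the user-facing functions down
to the (zarr) store entrypoint. A minimal usage sketch of the behaviour they
target (the store path is hypothetical)::

    import xarray as xr

    # Ask the backend to skip building in-memory pandas indexes for the
    # dimension coordinates, so their data can stay lazy.
    ds = xr.open_zarr("store.zarr", set_indexes=False)
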
From 145ae1c44c4d2bdbcf74c9107172e4119a969d30 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Mon, 30 Jun 2025 23:03:21 +0200
Subject: [PATCH 03/11] replace `set_indexes` with `create_default_indexes`

---
 xarray/backends/api.py    | 51 +++++++++++++++++++++++++--------------
 xarray/backends/common.py |  2 --
 xarray/backends/store.py  |  7 ++----
 xarray/backends/zarr.py   | 17 +++++++------
 4 files changed, 44 insertions(+), 33 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index e4fcfe5eeb6..1b5d8e1304b 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -36,6 +36,7 @@
 from xarray.backends.locks import _get_scheduler
 from xarray.coders import CFDatetimeCoder, CFTimedeltaCoder
 from xarray.core import indexing
+from xarray.core.coordinates import Coordinates
 from xarray.core.dataarray import DataArray
 from xarray.core.dataset import Dataset
 from xarray.core.datatree import DataTree
@@ -389,6 +390,7 @@ def _dataset_from_backend_dataset(
     inline_array,
     chunked_array_type,
     from_array_kwargs,
+    create_default_indexes,
     **extra_tokens,
 ):
     if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}:
@@ -397,11 +399,22 @@ def _dataset_from_backend_dataset(
     )

     _protect_dataset_variables_inplace(backend_ds, cache)
+
+    if create_default_indexes:
+        to_index = {
+            name: coord.variable
+            for name, coord in backend_ds.coords.items()
+            if coord.dims == (name,) and name not in backend_ds.xindexes
+        }
+        indexed = backend_ds.assign_coords(Coordinates(to_index))
+    else:
+        indexed = backend_ds
+
     if chunks is None:
-        ds = backend_ds
+        ds = indexed
     else:
         ds = _chunk_ds(
-            backend_ds,
+            indexed,
             filename_or_obj,
             engine,
             chunks,
@@ -497,7 +510,7 @@ def open_dataset(
     concat_characters: bool | Mapping[str, bool] | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
-    set_indexes: bool = True,
+    create_default_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -611,12 +624,13 @@ def open_dataset(
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
-    set_indexes : bool, optional
-        If True (default), create new indexes from coordinates. Both the number and
-        the type(s) of those indexes depend on the backend used to open the dataset.
-        For most common backends this creates a pandas index for each
-        :term:`Dimension coordinate`, which loads the coordinate data fully in memory.
-        Set it to False if you want to avoid loading data into memory.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -695,7 +709,6 @@ def open_dataset(
     backend_ds = backend.open_dataset(
         filename_or_obj,
         drop_variables=drop_variables,
-        set_indexes=set_indexes,
         **decoders,
         **kwargs,
     )
@@ -710,6 +723,7 @@ def open_dataset(
         chunked_array_type,
         from_array_kwargs,
         drop_variables=drop_variables,
+        create_default_indexes=create_default_indexes,
         **decoders,
         **kwargs,
     )
@@ -733,7 +747,7 @@ def open_dataarray(
     concat_characters: bool | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
-    set_indexes: bool = True,
+    create_default_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -842,12 +856,13 @@ def open_dataarray(
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
-    set_indexes : bool, optional
-        If True (default), create new indexes from coordinates. Both the number and
-        the type(s) of those indexes depend on the backend used to open the dataset.
-        For most common backends this creates a pandas index for each
-        :term:`Dimension coordinate`, which loads the coordinate data fully in memory.
-        Set it to False if you want to avoid loading data into memory.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -905,7 +920,7 @@ def open_dataarray(
         chunks=chunks,
         cache=cache,
         drop_variables=drop_variables,
-        set_indexes=set_indexes,
+        create_default_indexes=create_default_indexes,
         inline_array=inline_array,
         chunked_array_type=chunked_array_type,
         from_array_kwargs=from_array_kwargs,
diff --git a/xarray/backends/common.py b/xarray/backends/common.py
index 9ef0da81659..e1f8dc5cecd 100644
--- a/xarray/backends/common.py
+++ b/xarray/backends/common.py
@@ -697,8 +697,6 @@ def open_dataset(
         filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
         *,
         drop_variables: str | Iterable[str] | None = None,
-        set_indexes: bool = True,
-        **kwargs: Any,
     ) -> Dataset:
         """
         Backend open_dataset method used by Xarray in :py:func:`~xarray.open_dataset`.
diff --git a/xarray/backends/store.py b/xarray/backends/store.py
index 0b34ea7ed8c..7edfbd1c4e0 100644
--- a/xarray/backends/store.py
+++ b/xarray/backends/store.py
@@ -67,11 +67,8 @@ def open_dataset(
             else:
                 data_vars[name] = var

-        if set_indexes:
-            coords = coord_vars
-        else:
-            # explicit Coordinates object with no index passed
-            coords = Coordinates(coord_vars)
+        # explicit Coordinates object with no index passed
+        coords = Coordinates(coord_vars)

         ds = Dataset(data_vars, coords=coords, attrs=attrs)
         ds.set_close(filename_or_obj.close)
diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py
index f8ff1fa6b7f..dc78194dcd2 100644
--- a/xarray/backends/zarr.py
+++ b/xarray/backends/zarr.py
@@ -1347,7 +1347,7 @@ def open_zarr(
     use_zarr_fill_value_as_mask=None,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
-    set_indexes=True,
+    create_default_indexes=True,
     **kwargs,
 ):
     """Load and decode a dataset from a Zarr store.
@@ -1458,10 +1458,13 @@ def open_zarr(
        chunked arrays, via whichever chunk manager is specified through the
        ``chunked_array_type`` kwarg. Defaults to ``{'manager': 'dask'}``, meaning
        additional kwargs will be passed eventually to
        :py:func:`dask.array.from_array`. Experimental API that should not be relied upon.
-    set_indexes : bool, optional
-        If True (default), create a default (pandas) index for each
-        :term:`Dimension coordinate`. Set it to False if the dataset contains
-        dimension coordinate arrays that are too large to load fully in memory.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.

     Returns
     -------
@@ -1518,7 +1521,7 @@ def open_zarr(
         engine="zarr",
         chunks=chunks,
         drop_variables=drop_variables,
-        set_indexes=set_indexes,
+        create_default_indexes=create_default_indexes,
         chunked_array_type=chunked_array_type,
         from_array_kwargs=from_array_kwargs,
         backend_kwargs=backend_kwargs,
@@ -1564,7 +1567,6 @@ def open_dataset(
         concat_characters=True,
         decode_coords=True,
         drop_variables: str | Iterable[str] | None = None,
-        set_indexes: bool = True,
         use_cftime=None,
         decode_timedelta=None,
         group=None,
@@ -1608,7 +1610,6 @@ def open_dataset(
             drop_variables=drop_variables,
             use_cftime=use_cftime,
             decode_timedelta=decode_timedelta,
-            set_indexes=set_indexes,
         )
         return ds
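
Patch 03 moves default index creation out of the individual backends and into
``_dataset_from_backend_dataset``. The selection logic it adds is easy to
reproduce standalone; a self-contained sketch of the same idea (names are
illustrative)::

    import xarray as xr

    # a dataset with a dimension coordinate "x" but no indexes at all
    coords = xr.Coordinates({"x": ("x", [10, 20, 30])}, indexes={})
    ds = xr.Dataset({"a": ("x", [1, 2, 3])}, coords=coords)
    assert "x" not in ds.xindexes

    # pick the dimension coordinates that are not indexed yet ...
    to_index = {
        name: coord.variable
        for name, coord in ds.coords.items()
        if coord.dims == (name,) and name not in ds.xindexes
    }
    # ... and build default (pandas) indexes for exactly those
    indexed = ds.assign_coords(xr.Coordinates(to_index))
    assert "x" in indexed.xindexes
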
From 192c367f47c14a1d4e1837e61702eedda07c4a8a Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Mon, 30 Jun 2025 23:20:55 +0200
Subject: [PATCH 04/11] make sure indexes set by the backend survive

---
 xarray/tests/test_backends_api.py | 36 +++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/xarray/tests/test_backends_api.py b/xarray/tests/test_backends_api.py
index 9342423b727..885b8cb8a46 100644
--- a/xarray/tests/test_backends_api.py
+++ b/xarray/tests/test_backends_api.py
@@ -201,3 +201,39 @@ def test_join_chunks(self, shape, pref_chunks, req_chunks):
             chunks=dict(zip(initial[self.var_name].dims, req_chunks, strict=True)),
         )
         self.check_dataset(initial, final, explicit_chunks(req_chunks, shape))
+
+    @pytest.mark.parametrize("create_default_indexes", [True, False])
+    def test_default_indexes(self, create_default_indexes):
+        """Create default indexes if the backend does not create them."""
+        coords = xr.Coordinates({"x": ("x", [0, 1]), "y": list("abc")}, indexes={})
+        initial = xr.Dataset({"a": ("x", [1, 2])}, coords=coords)
+
+        with assert_no_warnings():
+            final = xr.open_dataset(
+                initial,
+                engine=PassThroughBackendEntrypoint,
+                create_default_indexes=create_default_indexes,
+            )
+
+        if create_default_indexes:
+            assert all(name in final.xindexes for name in ["x", "y"])
+        else:
+            assert not final.xindexes
+
+    @pytest.mark.parametrize("create_default_indexes", [True, False])
+    def test_default_indexes_passthrough(self, create_default_indexes):
+        """Allow creating indexes in the backend."""
+
+        initial = xr.Dataset(
+            {"a": (["x", "y"], [[1, 2, 3], [4, 5, 6]])},
+            coords={"x": ("x", [0, 1]), "y": ("y", list("abc"))},
+        ).stack(z=["x", "y"])
+
+        with assert_no_warnings():
+            final = xr.open_dataset(
+                initial,
+                engine=PassThroughBackendEntrypoint,
+                create_default_indexes=create_default_indexes,
+            )
+
+        assert initial.coords.equals(final.coords)
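
Both tests rely on the ``PassThroughBackendEntrypoint`` defined earlier in
``test_backends_api.py``, which hands the object given as ``filename_or_obj``
straight back to ``open_dataset``. A rough sketch of such an entrypoint (not
the verbatim test helper)::

    import xarray as xr

    class PassThroughBackendEntrypoint(xr.backends.BackendEntrypoint):
        # Return the Dataset unchanged, so the test controls exactly
        # which indexes the "backend" did or did not create.
        def open_dataset(self, dataset, *, drop_variables=None):
            return dataset
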
From f5823a73107643ee1206023cbdb2086f1c375437 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Mon, 30 Jun 2025 23:27:01 +0200
Subject: [PATCH 05/11] also add the parameter to `open_datatree`

---
 xarray/backends/api.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 1b5d8e1304b..e3582370cb4 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -1206,6 +1206,7 @@ def open_groups(
     concat_characters: bool | Mapping[str, bool] | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    create_default_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -1317,6 +1318,13 @@ def open_groups(
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -1412,6 +1420,7 @@ def open_groups(
         chunked_array_type,
         from_array_kwargs,
         drop_variables=drop_variables,
+        create_default_indexes=create_default_indexes,
         **decoders,
         **kwargs,
     )
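
``open_groups`` returns a mapping from group paths to datasets, so the keyword
applies uniformly to every group. A hypothetical sketch (the file name is
illustrative, and it assumes the backend creates no indexes of its own)::

    import xarray as xr

    groups = xr.open_groups("nested-groups.nc", create_default_indexes=False)
    for path, group_ds in groups.items():
        assert len(group_ds.xindexes) == 0
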
From 2ff8402192a3afd443e155e5a3c10d592475dce7 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Mon, 30 Jun 2025 23:37:23 +0200
Subject: [PATCH 06/11] share the implementation of the default indexes creation

---
 xarray/backends/api.py | 34 ++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index e3582370cb4..0ac1a4d9503 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -380,6 +380,18 @@ def _chunk_ds(
     return backend_ds._replace(variables)


+def _create_default_indexes(ds, create_default_indexes):
+    if not create_default_indexes:
+        return ds
+
+    to_index = {
+        name: coord.variable
+        for name, coord in ds.coords.items()
+        if coord.dims == (name,) and name not in ds.xindexes
+    }
+    return ds.assign_coords(Coordinates(to_index))
+
+
 def _dataset_from_backend_dataset(
     backend_ds,
     filename_or_obj,
@@ -400,15 +412,7 @@ def _dataset_from_backend_dataset(

     _protect_dataset_variables_inplace(backend_ds, cache)

-    if create_default_indexes:
-        to_index = {
-            name: coord.variable
-            for name, coord in backend_ds.coords.items()
-            if coord.dims == (name,) and name not in backend_ds.xindexes
-        }
-        indexed = backend_ds.assign_coords(Coordinates(to_index))
-    else:
-        indexed = backend_ds
+    indexed = _create_default_indexes(backend_ds, create_default_indexes)

     if chunks is None:
         ds = indexed
@@ -447,6 +451,7 @@ def _datatree_from_backend_datatree(
     inline_array,
     chunked_array_type,
     from_array_kwargs,
+    create_default_indexes,
     **extra_tokens,
 ):
     if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}:
@@ -461,7 +466,7 @@ def _datatree_from_backend_datatree(
     tree = DataTree.from_dict(
         {
             path: _chunk_ds(
-                node.dataset,
+                node.dataset.pipe(_create_default_indexes, create_default_indexes),
                 filename_or_obj,
                 engine,
                 chunks,
@@ -977,6 +982,7 @@ def open_datatree(
     concat_characters: bool | Mapping[str, bool] | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    create_default_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -1086,6 +1092,13 @@ def open_datatree(
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -1179,6 +1192,7 @@ def open_datatree(
         chunked_array_type,
         from_array_kwargs,
         drop_variables=drop_variables,
+        create_default_indexes=create_default_indexes,
         **decoders,
         **kwargs,
     )
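
For datatree nodes the shared helper is applied through ``Dataset.pipe``, which
simply forwards the dataset as the first positional argument; the two
expressions below are equivalent::

    node.dataset.pipe(_create_default_indexes, create_default_indexes)
    _create_default_indexes(node.dataset, create_default_indexes)
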
From 294b2f712425ca9424a4d6ad62912d653087beb5 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Wed, 2 Jul 2025 00:00:11 +0200
Subject: [PATCH 07/11] check that the store backend entrypoint does not
 create default indexes

---
 xarray/tests/test_backends.py     | 61 +++++++++++++++++++++++++++++++
 xarray/tests/test_backends_api.py |  2 +-
 2 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 733188dde1e..92617b5d7b7 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -55,6 +55,7 @@
 from xarray.coding.variables import SerializationWarning
 from xarray.conventions import encode_dataset_coordinates
 from xarray.core import indexing
+from xarray.core.indexes import PandasIndex
 from xarray.core.options import set_options
 from xarray.core.utils import module_available
 from xarray.namedarray.pycompat import array_type
@@ -2050,6 +2051,26 @@ def test_encoding_enum__error_multiple_variable_with_changing_enum(self):
             with self.roundtrip(original):
                 pass

+    @pytest.mark.parametrize("create_default_indexes", [True, False])
+    def test_create_default_indexes(self, tmp_path, create_default_indexes) -> None:
+        store_path = tmp_path / "tmp.nc"
+        original_ds = xr.Dataset(
+            {"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]}
+        )
+        original_ds.to_netcdf(store_path, engine=self.engine, mode="w")
+
+        with open_dataset(
+            store_path,
+            engine=self.engine,
+            create_default_indexes=create_default_indexes,
+        ) as loaded_ds:
+            if create_default_indexes:
+                assert list(loaded_ds.xindexes) == ["x"] and isinstance(
+                    loaded_ds.xindexes["x"], PandasIndex
+                )
+            else:
+                assert len(loaded_ds.xindexes) == 0
+

 @requires_netCDF4
 class TestNetCDF4Data(NetCDF4Base):
@@ -4009,6 +4030,26 @@ def test_pickle(self) -> None:
     def test_pickle_dataarray(self) -> None:
         pass

+    @pytest.mark.parametrize("create_default_indexes", [True, False])
+    def test_create_default_indexes(self, tmp_path, create_default_indexes) -> None:
+        store_path = tmp_path / "tmp.nc"
+        original_ds = xr.Dataset(
+            {"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]}
+        )
+        original_ds.to_netcdf(store_path, engine=self.engine, mode="w")
+
+        with open_dataset(
+            store_path,
+            engine=self.engine,
+            create_default_indexes=create_default_indexes,
+        ) as loaded_ds:
+            if create_default_indexes:
+                assert list(loaded_ds.xindexes) == ["x"] and isinstance(
+                    loaded_ds.xindexes["x"], PandasIndex
+                )
+            else:
+                assert len(loaded_ds.xindexes) == 0
+

 @requires_scipy
 class TestScipyFilePath(CFEncodedBase, NetCDF3Only):
@@ -6378,6 +6419,26 @@ def test_zarr_closing_internal_zip_store():
     assert_identical(original_da, loaded_da)


+@requires_zarr
+@pytest.mark.parametrize("create_default_indexes", [True, False])
+def test_zarr_create_default_indexes(tmp_path, create_default_indexes) -> None:
+    from xarray.core.indexes import PandasIndex
+
+    store_path = tmp_path / "tmp.zarr"
+    original_ds = xr.Dataset({"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]})
+    original_ds.to_zarr(store_path, mode="w")
+
+    with open_dataset(
+        store_path, engine="zarr", create_default_indexes=create_default_indexes
+    ) as loaded_ds:
+        if create_default_indexes:
+            assert list(loaded_ds.xindexes) == ["x"] and isinstance(
+                loaded_ds.xindexes["x"], PandasIndex
+            )
+        else:
+            assert len(loaded_ds.xindexes) == 0
+
+
 @requires_zarr
 @pytest.mark.usefixtures("default_zarr_format")
 def test_raises_key_error_on_invalid_zarr_store(tmp_path):
diff --git a/xarray/tests/test_backends_api.py b/xarray/tests/test_backends_api.py
index 885b8cb8a46..778e800ec67 100644
--- a/xarray/tests/test_backends_api.py
+++ b/xarray/tests/test_backends_api.py
@@ -218,7 +218,7 @@ def test_default_indexes(self, create_default_indexes):
         if create_default_indexes:
             assert all(name in final.xindexes for name in ["x", "y"])
         else:
-            assert not final.xindexes
+            assert len(final.xindexes) == 0

     @pytest.mark.parametrize("create_default_indexes", [True, False])
     def test_default_indexes_passthrough(self, create_default_indexes):
From 5c3a8437839ef22d2cdd9c907b00e7bfb1c775bd Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Wed, 2 Jul 2025 00:01:50 +0200
Subject: [PATCH 08/11] actually do not create default indexes in the backends

---
 xarray/backends/store.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xarray/backends/store.py b/xarray/backends/store.py
index 7edfbd1c4e0..de52aa193ed 100644
--- a/xarray/backends/store.py
+++ b/xarray/backends/store.py
@@ -68,7 +68,7 @@ def open_dataset(
                 data_vars[name] = var

         # explicit Coordinates object with no index passed
-        coords = Coordinates(coord_vars)
+        coords = Coordinates(coord_vars, indexes={})

         ds = Dataset(data_vars, coords=coords, attrs=attrs)
         ds.set_close(filename_or_obj.close)
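
This one-line fix matters because ``Coordinates`` builds default indexes
whenever its ``indexes`` argument is omitted; only an explicit empty mapping
suppresses them. A quick demonstration of the difference::

    import xarray as xr

    coord_vars = {"x": ("x", [0, 1, 2])}
    # `indexes` omitted: a default pandas index is created for "x"
    assert list(xr.Coordinates(coord_vars).xindexes) == ["x"]
    # explicit empty mapping: no indexes at all
    assert len(xr.Coordinates(coord_vars, indexes={}).xindexes) == 0
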
From 08939dee3857913711098f1ddce52fc93ad43e47 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Wed, 2 Jul 2025 00:02:56 +0200
Subject: [PATCH 09/11] rename the helper

---
 xarray/backends/api.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 0ac1a4d9503..38f96270c99 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -380,7 +380,7 @@ def _chunk_ds(
     return backend_ds._replace(variables)


-def _create_default_indexes(ds, create_default_indexes):
+def _maybe_create_default_indexes(ds, create_default_indexes):
     if not create_default_indexes:
         return ds

@@ -412,7 +412,7 @@ def _dataset_from_backend_dataset(

     _protect_dataset_variables_inplace(backend_ds, cache)

-    indexed = _create_default_indexes(backend_ds, create_default_indexes)
+    indexed = _maybe_create_default_indexes(backend_ds, create_default_indexes)

     if chunks is None:
         ds = indexed
@@ -466,7 +466,9 @@ def _datatree_from_backend_datatree(
     tree = DataTree.from_dict(
         {
             path: _chunk_ds(
-                node.dataset.pipe(_create_default_indexes, create_default_indexes),
+                node.dataset.pipe(
+                    _maybe_create_default_indexes, create_default_indexes
+                ),
                 filename_or_obj,
                 engine,
                 chunks,
From 95dbf8e6e787d15f1a649eafa492e9e7e6c6c4e8 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Thu, 3 Jul 2025 19:08:37 +0200
Subject: [PATCH 10/11] move the handling of `create_default_indexes` up the
 call stack

---
 xarray/backends/api.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 38f96270c99..cb4ef3fa813 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -380,10 +380,7 @@ def _chunk_ds(
     return backend_ds._replace(variables)


-def _maybe_create_default_indexes(ds, create_default_indexes):
-    if not create_default_indexes:
-        return ds
-
+def _maybe_create_default_indexes(ds):
     to_index = {
         name: coord.variable
         for name, coord in ds.coords.items()
@@ -412,13 +409,14 @@ def _dataset_from_backend_dataset(

     _protect_dataset_variables_inplace(backend_ds, cache)

-    indexed = _maybe_create_default_indexes(backend_ds, create_default_indexes)
+    if create_default_indexes:
+        backend_ds = _maybe_create_default_indexes(backend_ds)

     if chunks is None:
-        ds = indexed
+        ds = backend_ds
     else:
         ds = _chunk_ds(
-            indexed,
+            backend_ds,
             filename_or_obj,
             engine,
             chunks,
@@ -466,8 +464,10 @@ def _datatree_from_backend_datatree(
     tree = DataTree.from_dict(
         {
             path: _chunk_ds(
-                node.dataset.pipe(
-                    _maybe_create_default_indexes, create_default_indexes
+                (
+                    _maybe_create_default_indexes(node.dataset)
+                    if create_default_indexes
+                    else node.dataset
                 ),
                 filename_or_obj,
                 engine,
From d7e6daa7f706234133a4bf160583668622cfec1c Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Thu, 3 Jul 2025 19:12:06 +0200
Subject: [PATCH 11/11] what's new

---
 doc/whats-new.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index ad83cfac531..8c0f5cf635c 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -12,7 +12,8 @@ v2025.07.1 (unreleased)

 New Features
 ~~~~~~~~~~~~
-
+- Allow skipping the creation of default indexes when opening datasets (:pull:`8051`).
+  By `Benoit Bovy <https://github.com/benbovy>`_ and `Justus Magin <https://github.com/keewis>`_.

 Breaking changes
 ~~~~~~~~~~~~~~~~
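
Taken together, the series lets every opening entry point skip default index
creation. An end-to-end sketch of the resulting behaviour (the file path is
hypothetical)::

    import xarray as xr

    # default: each dimension coordinate gets a pandas index, which loads
    # the coordinate data into memory
    ds = xr.open_dataset("data.nc")

    # opt out: no default indexes, so coordinate data can stay lazy;
    # label-based selection then requires setting an index explicitly,
    # e.g. with Dataset.set_xindex
    ds_lazy = xr.open_dataset("data.nc", create_default_indexes=False)
    assert len(ds_lazy.xindexes) == 0
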