From f4c2b3f07ead8e2d635611e61843f96534bd0878 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Wed, 11 Nov 2020 05:29:35 +0100 Subject: [PATCH 01/21] refactor apiv2.open_dataset --- xarray/backends/apiv2.py | 86 +++++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 32 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 7e4605c42ce..ee967cbb007 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -10,37 +10,33 @@ ) -def dataset_from_backend_dataset( - ds, +def _get_mtime(filename_or_obj): + # if passed an actual file path, augment the token with + # the file modification time + if isinstance(filename_or_obj, str) and not is_remote_uri(filename_or_obj): + mtime = os.path.getmtime(filename_or_obj) + else: + mtime = None + return mtime + + +def _chunk_ds( + backend_ds, filename_or_obj, engine, chunks, - cache, overwrite_encoded_chunks, **extra_tokens, ): - if not (isinstance(chunks, (int, dict)) or chunks is None): - if chunks != "auto": - raise ValueError( - "chunks must be an int, dict, 'auto', or None. " - "Instead found %s. " % chunks - ) - - _protect_dataset_variables_inplace(ds, cache) - if chunks is not None and engine != "zarr": + if engine != "zarr": from dask.base import tokenize - # if passed an actual file path, augment the token with - # the file modification time - if isinstance(filename_or_obj, str) and not is_remote_uri(filename_or_obj): - mtime = os.path.getmtime(filename_or_obj) - else: - mtime = None + mtime = _get_mtime(filename_or_obj) token = tokenize(filename_or_obj, mtime, engine, chunks, **extra_tokens) name_prefix = "open_dataset-%s" % token - ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token) + ds = backend_ds.chunk(chunks, name_prefix=name_prefix, token=token) - elif engine == "zarr": + else: if chunks == "auto": try: @@ -48,28 +44,53 @@ def dataset_from_backend_dataset( except ImportError: chunks = None - if chunks is None: - return ds - if isinstance(chunks, int): - chunks = dict.fromkeys(ds.dims, chunks) + chunks = dict.fromkeys(backend_ds.dims, chunks) variables = { k: zarr.ZarrStore.maybe_chunk(k, v, chunks, overwrite_encoded_chunks) - for k, v in ds.variables.items() + for k, v in backend_ds.variables.items() } - ds2 = ds._replace(variables) + ds = backend_ds._replace(variables) + ds._file_obj = backend_ds._file_obj + return ds + +def dataset_from_backend_dataset( + backend_ds, + filename_or_obj, + engine, + chunks, + cache, + overwrite_encoded_chunks, + **extra_tokens, +): + if not (isinstance(chunks, (int, dict)) or chunks is None): + if chunks != "auto": + raise ValueError( + "chunks must be an int, dict, 'auto', or None. " + "Instead found %s. 
" % chunks + ) + + _protect_dataset_variables_inplace(backend_ds, cache) + if chunks is None: + ds = backend_ds else: - ds2 = ds - ds2._file_obj = ds._file_obj + ds = _chunk_ds( + backend_ds, + filename_or_obj, + engine, + chunks, + overwrite_encoded_chunks, + **extra_tokens, + ) # Ensure source filename always stored in dataset object (GH issue #2550) if "source" not in ds.encoding: if isinstance(filename_or_obj, str): - ds2.encoding["source"] = filename_or_obj + ds.encoding["source"] = filename_or_obj - return ds2 + return ds def resolve_decoders_kwargs(decode_cf, engine, **decoders): @@ -236,12 +257,13 @@ def open_dataset( open_backend_dataset = _get_backend_cls(engine, engines=plugins.ENGINES)[ "open_dataset" ] + filtered_kwargs = {k: v for k, v in kwargs.items() if v is not None} backend_ds = open_backend_dataset( filename_or_obj, drop_variables=drop_variables, **decoders, **backend_kwargs, - **{k: v for k, v in kwargs.items() if v is not None}, + **filtered_kwargs ) ds = dataset_from_backend_dataset( backend_ds, @@ -253,7 +275,7 @@ def open_dataset( drop_variables=drop_variables, **decoders, **backend_kwargs, - **kwargs, + **filtered_kwargs, ) return ds From 35663927dbf692b102d1c9797a53657211bafcff Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Wed, 11 Nov 2020 06:14:32 +0100 Subject: [PATCH 02/21] move maybe_chunks and get_chunks in dataset.py --- xarray/backends/api.py | 4 ++-- xarray/backends/apiv2.py | 45 ++++++++++++++++++-------------------- xarray/backends/zarr.py | 47 ---------------------------------------- xarray/core/dataset.py | 34 +++++++++++++++++++++++++++++ 4 files changed, 57 insertions(+), 73 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 0b9b5046cb9..f1d58813958 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -26,7 +26,7 @@ combine_by_coords, ) from ..core.dataarray import DataArray -from ..core.dataset import Dataset, _maybe_chunk +from ..core.dataset import Dataset, _get_chunk, _maybe_chunk from ..core.utils import close_on_error, is_grib_path, is_remote_uri from .common import AbstractDataStore, ArrayWriter from .locks import _get_scheduler @@ -536,7 +536,7 @@ def maybe_decode_store(store, chunks): k: _maybe_chunk( k, v, - store.get_chunk(k, v, chunks), + _get_chunk(k, v, chunks), overwrite_encoded_chunks=overwrite_encoded_chunks, ) for k, v in ds.variables.items() diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index ee967cbb007..3f43684b892 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -1,7 +1,8 @@ import os +from ..core.dataset import _get_chunk, _maybe_chunk from ..core.utils import is_remote_uri -from . import plugins, zarr +from . 
import plugins from .api import ( _autodetect_engine, _get_backend_cls, @@ -28,29 +29,25 @@ def _chunk_ds( overwrite_encoded_chunks, **extra_tokens, ): - if engine != "zarr": - from dask.base import tokenize - - mtime = _get_mtime(filename_or_obj) - token = tokenize(filename_or_obj, mtime, engine, chunks, **extra_tokens) - name_prefix = "open_dataset-%s" % token - ds = backend_ds.chunk(chunks, name_prefix=name_prefix, token=token) - - else: - - if chunks == "auto": - try: - import dask.array # noqa - except ImportError: - chunks = None - - if isinstance(chunks, int): - chunks = dict.fromkeys(backend_ds.dims, chunks) - - variables = { - k: zarr.ZarrStore.maybe_chunk(k, v, chunks, overwrite_encoded_chunks) - for k, v in backend_ds.variables.items() - } + from dask.base import tokenize + + mtime = _get_mtime(filename_or_obj) + token = tokenize(filename_or_obj, mtime, engine, chunks, **extra_tokens) + name_prefix = "open_dataset-%s" % token + if isinstance(chunks, int): + chunks = dict.fromkeys(backend_ds.dims, chunks) + + variables = {} + for name, var in backend_ds.variables.items(): + var_chunks = _get_chunk(name, var, chunks) + variables[name] = _maybe_chunk( + name, + var, + var_chunks, + overwrite_encoded_chunks=overwrite_encoded_chunks, + name_prefix=name_prefix, + token=token, + ) ds = backend_ds._replace(variables) ds._file_obj = backend_ds._file_obj return ds diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 9827c345239..22cf2b1fa7e 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -368,53 +368,6 @@ def encode_variable(self, variable): def encode_attribute(self, a): return encode_zarr_attr_value(a) - @staticmethod - def get_chunk(name, var, chunks): - chunk_spec = dict(zip(var.dims, var.encoding.get("chunks"))) - - # Coordinate labels aren't chunked - if var.ndim == 1 and var.dims[0] == name: - return chunk_spec - - if chunks == "auto": - return chunk_spec - - for dim in var.dims: - if dim in chunks: - spec = chunks[dim] - if isinstance(spec, int): - spec = (spec,) - if isinstance(spec, (tuple, list)) and chunk_spec[dim]: - if any(s % chunk_spec[dim] for s in spec): - warnings.warn( - "Specified Dask chunks %r would " - "separate Zarr chunk shape %r for " - "dimension %r. This significantly " - "degrades performance. Consider " - "rechunking after loading instead." - % (chunks[dim], chunk_spec[dim], dim), - stacklevel=2, - ) - chunk_spec[dim] = chunks[dim] - return chunk_spec - - @classmethod - def maybe_chunk(cls, name, var, chunks, overwrite_encoded_chunks): - chunk_spec = cls.get_chunk(name, var, chunks) - - if (var.ndim > 0) and (chunk_spec is not None): - from dask.base import tokenize - - # does this cause any data to be read? 
- token2 = tokenize(name, var._data, chunks) - name2 = f"xarray-{name}-{token2}" - var = var.chunk(chunk_spec, name=name2, lock=None) - if overwrite_encoded_chunks and var.chunks is not None: - var.encoding["chunks"] = tuple(x[0] for x in var.chunks) - return var - else: - return var - def store( self, variables, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 04974c58113..86aa531e2a7 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -359,6 +359,40 @@ def _assert_empty(args: tuple, msg: str = "%s") -> None: raise ValueError(msg % args) +def _check_chunks_compatibility(dim, chunks, chunk_spec): + spec = chunks[dim] + if isinstance(spec, int): + spec = (spec,) + if any(s % chunk_spec[dim] for s in spec): + warnings.warn( + "Specified Dask chunks %r would " + "separate on disks chunk shape %r for " + "dimension %r. This could " + "degrades performance. Consider " + "rechunking after loading instead." % (chunks[dim], chunk_spec[dim], dim), + stacklevel=2, + ) + + +def _get_chunk(name, var, chunks): + if chunks == "auto": + chunks = {} + + preferred_chunks = dict(zip(var.dims, var.encoding.get("chunks", {}))) + if var.ndim == 1 and var.dims[0] == name: + return preferred_chunks + + output_chunks = {} + if chunks is not None: + for dim in preferred_chunks: + if dim in chunks: + _check_chunks_compatibility(dim, chunks, preferred_chunks) + output_chunks[dim] = chunks[dim] + else: + output_chunks[dim] = preferred_chunks[dim] + return output_chunks + + def _maybe_chunk( name, var, From 7264648d1075cbfca162df9ac332f5bcc6f3d251 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Wed, 11 Nov 2020 09:30:11 +0100 Subject: [PATCH 03/21] - modify key for zarr autochunking from chunks='auto' to chunks={} --- xarray/backends/zarr.py | 2 +- xarray/core/dataset.py | 4 ++-- xarray/tests/test_backends.py | 14 +++++++------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 22cf2b1fa7e..97fe2b9b1fd 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -509,7 +509,7 @@ def open_zarr( store, group=None, synchronizer=None, - chunks="auto", + chunks={}, decode_cf=True, mask_and_scale=True, decode_times=True, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 86aa531e2a7..cd127e1206d 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -376,7 +376,7 @@ def _check_chunks_compatibility(dim, chunks, chunk_spec): def _get_chunk(name, var, chunks): if chunks == "auto": - chunks = {} + return chunks preferred_chunks = dict(zip(var.dims, var.encoding.get("chunks", {}))) if var.ndim == 1 and var.dims[0] == name: @@ -396,7 +396,7 @@ def _get_chunk(name, var, chunks): def _maybe_chunk( name, var, - chunks=None, + chunks, token=None, lock=None, name_prefix="xarray-", diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 43bf2de245b..c4b7cea8926 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1567,7 +1567,7 @@ def roundtrip( if save_kwargs is None: save_kwargs = {} if open_kwargs is None: - open_kwargs = {"chunks": "auto"} + open_kwargs = {"chunks": {}} with self.create_zarr_target() as store_target: self.save(data, store_target, **save_kwargs) with self.open(store_target, **open_kwargs) as ds: @@ -1604,7 +1604,7 @@ def test_auto_chunk(self): # there should be no chunks assert v.chunks is None - with self.roundtrip(original, open_kwargs={"chunks": "auto"}) as actual: + with self.roundtrip(original, 
open_kwargs={"chunks": {}}) as actual: for k, v in actual.variables.items(): # only index variables should be in memory assert v._in_memory == (k in actual.dims) @@ -1701,7 +1701,7 @@ def test_deprecate_auto_chunk(self): def test_write_uneven_dask_chunks(self): # regression for GH#2225 original = create_test_data().chunk({"dim1": 3, "dim2": 4, "dim3": 3}) - with self.roundtrip(original, open_kwargs={"chunks": "auto"}) as actual: + with self.roundtrip(original, open_kwargs={"chunks": {}}) as actual: for k, v in actual.data_vars.items(): print(k) assert v.chunks == actual[k].chunks @@ -1851,7 +1851,7 @@ def test_write_persistence_modes(self, group): ds_to_append.to_zarr(store_target, append_dim="time", group=group) original = xr.concat([ds, ds_to_append], dim="time") actual = xr.open_dataset( - store_target, group=group, chunks="auto", engine="zarr" + store_target, group=group, chunks={}, engine="zarr" ) assert_identical(original, actual) @@ -1941,11 +1941,11 @@ def test_check_encoding_is_consistent_after_append(self): encoding = {"da": {"compressor": compressor}} ds.to_zarr(store_target, mode="w", encoding=encoding) ds_to_append.to_zarr(store_target, append_dim="time") - actual_ds = xr.open_dataset(store_target, chunks="auto", engine="zarr") + actual_ds = xr.open_dataset(store_target, chunks={}, engine="zarr") actual_encoding = actual_ds["da"].encoding["compressor"] assert actual_encoding.get_config() == compressor.get_config() assert_identical( - xr.open_dataset(store_target, chunks="auto", engine="zarr").compute(), + xr.open_dataset(store_target, chunks={}, engine="zarr").compute(), xr.concat([ds, ds_to_append], dim="time"), ) @@ -1961,7 +1961,7 @@ def test_append_with_new_variable(self): combined = xr.concat([ds, ds_to_append], dim="time") combined["new_var"] = ds_with_new_var["new_var"] assert_identical( - combined, xr.open_dataset(store_target, chunks="auto", engine="zarr") + combined, xr.open_dataset(store_target, chunks={}, engine="zarr") ) @requires_dask From 1d927140df5d47179e58b8c9bb69ce2304bab50b Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Fri, 13 Nov 2020 15:57:56 +0100 Subject: [PATCH 04/21] deprecated None for chunks value in ds.chunks and var.chunks --- xarray/core/dataarray.py | 4 ++-- xarray/core/dataset.py | 24 +++++++++++++++--------- xarray/core/variable.py | 15 ++++++++++----- 3 files changed, 27 insertions(+), 16 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index b95f681bc79..f65acca7f56 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1010,12 +1010,11 @@ def chunks(self) -> Optional[Tuple[Tuple[int, ...], ...]]: def chunk( self, chunks: Union[ - None, Number, Tuple[Number, ...], Tuple[Tuple[Number, ...], ...], Mapping[Hashable, Union[None, Number, Tuple[Number, ...]]], - ] = None, + ] = {}, name_prefix: str = "xarray-", token: str = None, lock: bool = False, @@ -1047,6 +1046,7 @@ def chunk( ------- chunked : xarray.DataArray """ + if isinstance(chunks, (tuple, list)): chunks = dict(zip(self.dims, chunks)) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index cd127e1206d..5581df6fea1 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1853,11 +1853,10 @@ def chunks(self) -> Mapping[Hashable, Tuple[int, ...]]: def chunk( self, chunks: Union[ - None, Number, str, Mapping[Hashable, Union[None, Number, str, Tuple[Number, ...]]], - ] = None, + ] = {}, name_prefix: str = "xarray-", token: str = None, lock: bool = False, @@ -1889,17 +1888,24 @@ def chunk( ------- chunked : 
xarray.Dataset """ + if chunks is None: + warnings.warn( + "None value for 'chunks' is deprecated. " + "It will raise an error in the future. Use instead '{}'", + category=FutureWarning, + stacklevel=2, + ) + chunks = {} if isinstance(chunks, (Number, str)): chunks = dict.fromkeys(self.dims, chunks) - if chunks is not None: - bad_dims = chunks.keys() - self.dims.keys() - if bad_dims: - raise ValueError( - "some chunks keys are not dimensions on this " - "object: %s" % bad_dims - ) + bad_dims = chunks.keys() - self.dims.keys() + if bad_dims: + raise ValueError( + "some chunks keys are not dimensions on this " + "object: %s" % bad_dims + ) variables = { k: _maybe_chunk(k, v, chunks, token, lock, name_prefix) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index a3876cb0077..5bf83b830e7 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -986,7 +986,7 @@ def chunks(self): _array_counter = itertools.count() - def chunk(self, chunks=None, name=None, lock=False): + def chunk(self, chunks={}, name=None, lock=False): """Coerce this array's data into a dask arrays with the given chunks. If this variable is a non-dask array, it will be converted to dask @@ -1015,13 +1015,18 @@ def chunk(self, chunks=None, name=None, lock=False): """ import dask import dask.array as da + if chunks is None: + warnings.warn( + "None value for 'chunks' is deprecated. " + "It will raise an error in the future. Use instead '{}'", + category=FutureWarning, + stacklevel=1 + ) + chunks = {} if utils.is_dict_like(chunks): chunks = {self.get_axis_num(dim): chunk for dim, chunk in chunks.items()} - if chunks is None: - chunks = self.chunks or self.shape - data = self._data if is_duck_dask_array(data): data = data.rechunk(chunks) @@ -2368,7 +2373,7 @@ def values(self, values): f"Please use DataArray.assign_coords, Dataset.assign_coords or Dataset.assign as appropriate." ) - def chunk(self, chunks=None, name=None, lock=False): + def chunk(self, chunks={}, name=None, lock=False): # Dummy - do not chunk. This method is invoked e.g. by Dataset.chunk() return self.copy(deep=False) From e4180ba73ef114a8cbdae6ec923c998966adc10b Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Wed, 18 Nov 2020 19:13:55 +0100 Subject: [PATCH 05/21] bugfix in get_chunks (wrong chunking when on-disk chunks not available) --- xarray/core/dataset.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 5581df6fea1..24157592a50 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -360,17 +360,22 @@ def _assert_empty(args: tuple, msg: str = "%s") -> None: def _check_chunks_compatibility(dim, chunks, chunk_spec): - spec = chunks[dim] - if isinstance(spec, int): - spec = (spec,) - if any(s % chunk_spec[dim] for s in spec): + if dim not in chunks or (dim not in chunk_spec): + return + + chunk_spec_dim = chunk_spec.get(dim) + chunks_dim = chunks.get(dim) + + if isinstance(chunks_dim, int): + chunks_dim = (chunks_dim,) + if any(s % chunk_spec_dim for s in chunks_dim): warnings.warn( "Specified Dask chunks %r would " "separate on disks chunk shape %r for " "dimension %r. This could " "degrades performance. Consider " - "rechunking after loading instead." % (chunks[dim], chunk_spec[dim], dim), - stacklevel=2, + "rechunking after loading instead." 
% (chunks_dim, chunk_spec_dim, dim), + stacklevel=3, ) @@ -384,12 +389,11 @@ def _get_chunk(name, var, chunks): output_chunks = {} if chunks is not None: - for dim in preferred_chunks: - if dim in chunks: - _check_chunks_compatibility(dim, chunks, preferred_chunks) - output_chunks[dim] = chunks[dim] - else: - output_chunks[dim] = preferred_chunks[dim] + for dim in var.dims: + _check_chunks_compatibility(dim, chunks, preferred_chunks) + preferred_chunks_dim = preferred_chunks.get(dim, None) + output_chunks[dim] = chunks.get(dim, preferred_chunks_dim) + return output_chunks From f4e485369504df70ac677391025bfc705c4b9c71 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Wed, 18 Nov 2020 20:49:12 +0100 Subject: [PATCH 06/21] bugfix - it uses backend preferred_chunking for inspiration in "auto" dask chunking (to align with ds.chunk that use the previuos chunking) - it uses check on IndexVariables to skip warnings instead of check on variable name and number of dimension --- xarray/backends/api.py | 2 +- xarray/backends/apiv2.py | 4 +-- xarray/core/dataset.py | 66 ++++++++++++++++++++++++---------------- 3 files changed, 42 insertions(+), 30 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index f1d58813958..5f9fc248e84 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -536,7 +536,7 @@ def maybe_decode_store(store, chunks): k: _maybe_chunk( k, v, - _get_chunk(k, v, chunks), + _get_chunk(v, chunks), overwrite_encoded_chunks=overwrite_encoded_chunks, ) for k, v in ds.variables.items() diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 3f43684b892..17dc648496e 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -34,12 +34,12 @@ def _chunk_ds( mtime = _get_mtime(filename_or_obj) token = tokenize(filename_or_obj, mtime, engine, chunks, **extra_tokens) name_prefix = "open_dataset-%s" % token - if isinstance(chunks, int): + if isinstance(chunks, int) or (chunks == 'auto'): chunks = dict.fromkeys(backend_ds.dims, chunks) variables = {} for name, var in backend_ds.variables.items(): - var_chunks = _get_chunk(name, var, chunks) + var_chunks = _get_chunk(var, chunks) variables[name] = _maybe_chunk( name, var, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 24157592a50..097df45e0d1 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -359,40 +359,52 @@ def _assert_empty(args: tuple, msg: str = "%s") -> None: raise ValueError(msg % args) -def _check_chunks_compatibility(dim, chunks, chunk_spec): - if dim not in chunks or (dim not in chunk_spec): - return +def _check_chunks_compatibility(var, chunks, chunk_spec): + for dim in var.dims: + if dim not in chunks or (dim not in chunk_spec): + return - chunk_spec_dim = chunk_spec.get(dim) - chunks_dim = chunks.get(dim) + chunk_spec_dim = chunk_spec.get(dim) + chunks_dim = chunks.get(dim) - if isinstance(chunks_dim, int): - chunks_dim = (chunks_dim,) - if any(s % chunk_spec_dim for s in chunks_dim): - warnings.warn( - "Specified Dask chunks %r would " - "separate on disks chunk shape %r for " - "dimension %r. This could " - "degrades performance. Consider " - "rechunking after loading instead." % (chunks_dim, chunk_spec_dim, dim), - stacklevel=3, - ) + if isinstance(chunks_dim, int): + chunks_dim = (chunks_dim,) + if any(s % chunk_spec_dim for s in chunks_dim): + warnings.warn( + "Specified Dask chunks %r would " + "separate on disks chunk shape %r for " + "dimension %r. This could " + "degrades performance. 
Consider " + "rechunking after loading instead." % (chunks_dim, chunk_spec_dim, dim), + stacklevel=3, + ) -def _get_chunk(name, var, chunks): - if chunks == "auto": - return chunks +def _get_chunk(var, chunks): + # chunks need to be explicity computed to take correctly into accout + # backend preferred chunking + import dask.array as da + preferred_chunks_list = var.encoding.get("chunks", {}) preferred_chunks = dict(zip(var.dims, var.encoding.get("chunks", {}))) - if var.ndim == 1 and var.dims[0] == name: - return preferred_chunks + if isinstance(var, IndexVariable): + return {} + + chunks_list = [] + for dim in var.dims: + chunks_dim = chunks.get(dim, None) + preferred_chunks_dim = preferred_chunks.get(dim, None) + chunks_list.append(chunks_dim or preferred_chunks_dim) + + output_chunks_list = da.core.normalize_chunks( + chunks_list, + shape=var.shape, + dtype=var.dtype, + previous_chunks=preferred_chunks_list + ) - output_chunks = {} - if chunks is not None: - for dim in var.dims: - _check_chunks_compatibility(dim, chunks, preferred_chunks) - preferred_chunks_dim = preferred_chunks.get(dim, None) - output_chunks[dim] = chunks.get(dim, preferred_chunks_dim) + output_chunks = dict(zip(var.dims, output_chunks_list)) + _check_chunks_compatibility(var, output_chunks, preferred_chunks) return output_chunks From 60d692396cc52c5e16b4204b358f70dad7a642ec Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Wed, 18 Nov 2020 23:45:55 +0100 Subject: [PATCH 07/21] add open_dataset zarr chunking tests --- xarray/core/dataset.py | 2 +- xarray/tests/test_backends.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 097df45e0d1..313d61f13f2 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -376,7 +376,7 @@ def _check_chunks_compatibility(var, chunks, chunk_spec): "dimension %r. This could " "degrades performance. Consider " "rechunking after loading instead." 
% (chunks_dim, chunk_spec_dim, dim), - stacklevel=3, + stacklevel=2, ) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index c4b7cea8926..469e62cbdd2 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -4803,3 +4803,33 @@ def test_load_single_value_h5netcdf(tmp_path): ds.to_netcdf(tmp_path / "test.nc") with xr.open_dataset(tmp_path / "test.nc", engine="h5netcdf") as ds2: ds2["test"][0].load() + + +@requires_zarr +@requires_dask +@pytest.mark.parametrize("chunks", ["auto", -1, {}]) +def test_open_dataset_chunking_zarr(chunks, tmp_path): + encoded_chunks = 100 + ds = xr.Dataset( + { + 'test': xr.DataArray( + np.ones((500, 500), dtype="float64"), + dims=("x", "y"), + ) + } + ) + ds['test'].encoding['chunks'] = encoded_chunks + ds.to_zarr(tmp_path / "test.zarr") + + ds = ds.chunk(encoded_chunks) + dask_arr = ds['test'].data + + with dask.config.set({"array.chunk-size": "1MiB"}): + dask_arr_chunked = dask_arr.rechunk(chunks) + expected = dask_arr_chunked.chunks + + dataset_chunked = xr.open_dataset(tmp_path / "test.zarr", engine="zarr", chunks=chunks) + actual = dataset_chunked['test'].chunks + + assert actual == expected + From 74a52897a5c5a15cffa266c3dc5fa2ef034dd128 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Thu, 19 Nov 2020 10:17:25 +0100 Subject: [PATCH 08/21] add few tests in test_open_dataset_chunking_zarr --- xarray/tests/test_backends.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 469e62cbdd2..88bc5aa403b 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -4807,7 +4807,10 @@ def test_load_single_value_h5netcdf(tmp_path): @requires_zarr @requires_dask -@pytest.mark.parametrize("chunks", ["auto", -1, {}]) +@pytest.mark.parametrize( + "chunks", + ["auto", -1, {}, {"x": "auto"}, {"x": -1}, {"x": "auto", "y": -1}] +) def test_open_dataset_chunking_zarr(chunks, tmp_path): encoded_chunks = 100 ds = xr.Dataset( @@ -4825,7 +4828,13 @@ def test_open_dataset_chunking_zarr(chunks, tmp_path): dask_arr = ds['test'].data with dask.config.set({"array.chunk-size": "1MiB"}): - dask_arr_chunked = dask_arr.rechunk(chunks) + xr2dask_key = {'x': 0, 'y': 1} + if isinstance(chunks, dict): + dask_chunks = {xr2dask_key[k]: chunks[k] for k in chunks} + else: + dask_chunks = chunks + + dask_arr_chunked = dask_arr.rechunk(dask_chunks) expected = dask_arr_chunked.chunks dataset_chunked = xr.open_dataset(tmp_path / "test.zarr", engine="zarr", chunks=chunks) From df4bf3b5aeec16036f630c003a4a64692e6ab21b Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Thu, 19 Nov 2020 10:57:33 +0100 Subject: [PATCH 09/21] black isort --- xarray/backends/apiv2.py | 18 +++++++++--------- xarray/backends/zarr.py | 2 -- xarray/core/dataset.py | 5 ++--- xarray/core/variable.py | 1 + xarray/tests/test_backends.py | 18 +++++++++--------- 5 files changed, 21 insertions(+), 23 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 17dc648496e..586e151f02c 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -34,7 +34,7 @@ def _chunk_ds( mtime = _get_mtime(filename_or_obj) token = tokenize(filename_or_obj, mtime, engine, chunks, **extra_tokens) name_prefix = "open_dataset-%s" % token - if isinstance(chunks, int) or (chunks == 'auto'): + if isinstance(chunks, int) or (chunks == "auto"): chunks = dict.fromkeys(backend_ds.dims, chunks) variables = {} @@ -48,7 +48,7 @@ def _chunk_ds( 
name_prefix=name_prefix, token=token, ) - ds = backend_ds._replace(variables) + ds = backend_ds._replace(variables) ds._file_obj = backend_ds._file_obj return ds @@ -74,12 +74,12 @@ def dataset_from_backend_dataset( ds = backend_ds else: ds = _chunk_ds( - backend_ds, - filename_or_obj, - engine, - chunks, - overwrite_encoded_chunks, - **extra_tokens, + backend_ds, + filename_or_obj, + engine, + chunks, + overwrite_encoded_chunks, + **extra_tokens, ) # Ensure source filename always stored in dataset object (GH issue #2550) @@ -260,7 +260,7 @@ def open_dataset( drop_variables=drop_variables, **decoders, **backend_kwargs, - **filtered_kwargs + **filtered_kwargs, ) ds = dataset_from_backend_dataset( backend_ds, diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 97fe2b9b1fd..5cfe2d1b0bf 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1,5 +1,3 @@ -import warnings - import numpy as np from .. import coding, conventions diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 313d61f13f2..90b94436b40 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -400,7 +400,7 @@ def _get_chunk(var, chunks): chunks_list, shape=var.shape, dtype=var.dtype, - previous_chunks=preferred_chunks_list + previous_chunks=preferred_chunks_list, ) output_chunks = dict(zip(var.dims, output_chunks_list)) @@ -1919,8 +1919,7 @@ def chunk( bad_dims = chunks.keys() - self.dims.keys() if bad_dims: raise ValueError( - "some chunks keys are not dimensions on this " - "object: %s" % bad_dims + "some chunks keys are not dimensions on this " "object: %s" % bad_dims ) variables = { diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 5bf83b830e7..ee993963843 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1015,6 +1015,7 @@ def chunk(self, chunks={}, name=None, lock=False): """ import dask import dask.array as da + if chunks is None: warnings.warn( "None value for 'chunks' is deprecated. 
" diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 88bc5aa403b..61b94c0d421 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -4808,27 +4808,26 @@ def test_load_single_value_h5netcdf(tmp_path): @requires_zarr @requires_dask @pytest.mark.parametrize( - "chunks", - ["auto", -1, {}, {"x": "auto"}, {"x": -1}, {"x": "auto", "y": -1}] + "chunks", ["auto", -1, {}, {"x": "auto"}, {"x": -1}, {"x": "auto", "y": -1}] ) def test_open_dataset_chunking_zarr(chunks, tmp_path): encoded_chunks = 100 ds = xr.Dataset( { - 'test': xr.DataArray( + "test": xr.DataArray( np.ones((500, 500), dtype="float64"), dims=("x", "y"), ) } ) - ds['test'].encoding['chunks'] = encoded_chunks + ds["test"].encoding["chunks"] = encoded_chunks ds.to_zarr(tmp_path / "test.zarr") ds = ds.chunk(encoded_chunks) - dask_arr = ds['test'].data + dask_arr = ds["test"].data with dask.config.set({"array.chunk-size": "1MiB"}): - xr2dask_key = {'x': 0, 'y': 1} + xr2dask_key = {"x": 0, "y": 1} if isinstance(chunks, dict): dask_chunks = {xr2dask_key[k]: chunks[k] for k in chunks} else: @@ -4837,8 +4836,9 @@ def test_open_dataset_chunking_zarr(chunks, tmp_path): dask_arr_chunked = dask_arr.rechunk(dask_chunks) expected = dask_arr_chunked.chunks - dataset_chunked = xr.open_dataset(tmp_path / "test.zarr", engine="zarr", chunks=chunks) - actual = dataset_chunked['test'].chunks + dataset_chunked = xr.open_dataset( + tmp_path / "test.zarr", engine="zarr", chunks=chunks + ) + actual = dataset_chunked["test"].chunks assert actual == expected - From 736ad78827ea4296bf25061cc69ab32dc774cdb0 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Thu, 19 Nov 2020 11:25:32 +0100 Subject: [PATCH 10/21] fix stacklevel in warnings --- xarray/core/dataarray.py | 1 - xarray/core/dataset.py | 1 - xarray/core/variable.py | 1 - 3 files changed, 3 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index f65acca7f56..4fcef4beb7f 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1046,7 +1046,6 @@ def chunk( ------- chunked : xarray.DataArray """ - if isinstance(chunks, (tuple, list)): chunks = dict(zip(self.dims, chunks)) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 90b94436b40..4d027378794 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1909,7 +1909,6 @@ def chunk( "None value for 'chunks' is deprecated. " "It will raise an error in the future. Use instead '{}'", category=FutureWarning, - stacklevel=2, ) chunks = {} diff --git a/xarray/core/variable.py b/xarray/core/variable.py index ee993963843..a604a81cfa1 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1021,7 +1021,6 @@ def chunk(self, chunks={}, name=None, lock=False): "None value for 'chunks' is deprecated. " "It will raise an error in the future. 
Use instead '{}'", category=FutureWarning, - stacklevel=1 ) chunks = {} From b3ba50c40d805acadec74a82bc6c618bcae264c1 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Thu, 19 Nov 2020 15:07:21 +0100 Subject: [PATCH 11/21] fix: move piece of chunks computing from open_dataset in get_chunks --- xarray/backends/apiv2.py | 2 -- xarray/core/dataset.py | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 586e151f02c..3f162d899b4 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -34,8 +34,6 @@ def _chunk_ds( mtime = _get_mtime(filename_or_obj) token = tokenize(filename_or_obj, mtime, engine, chunks, **extra_tokens) name_prefix = "open_dataset-%s" % token - if isinstance(chunks, int) or (chunks == "auto"): - chunks = dict.fromkeys(backend_ds.dims, chunks) variables = {} for name, var in backend_ds.variables.items(): diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 4d027378794..0f4102e5d89 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -384,6 +384,8 @@ def _get_chunk(var, chunks): # chunks need to be explicity computed to take correctly into accout # backend preferred chunking import dask.array as da + if isinstance(chunks, int) or (chunks == "auto"): + chunks = dict.fromkeys(var.dims, chunks) preferred_chunks_list = var.encoding.get("chunks", {}) preferred_chunks = dict(zip(var.dims, var.encoding.get("chunks", {}))) From 97a7bc135d27bcdc8fa01bb1e951153fc2f94259 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Thu, 19 Nov 2020 15:08:10 +0100 Subject: [PATCH 12/21] update tests chunking in test_backends.py --- xarray/tests/test_backends.py | 49 +++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 61b94c0d421..c8f8da7163c 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -4812,10 +4812,11 @@ def test_load_single_value_h5netcdf(tmp_path): ) def test_open_dataset_chunking_zarr(chunks, tmp_path): encoded_chunks = 100 + dask_arr = da.from_array(np.ones((500, 500), dtype="float64"), chunks=encoded_chunks) ds = xr.Dataset( { "test": xr.DataArray( - np.ones((500, 500), dtype="float64"), + dask_arr, dims=("x", "y"), ) } @@ -4823,22 +4824,42 @@ def test_open_dataset_chunking_zarr(chunks, tmp_path): ds["test"].encoding["chunks"] = encoded_chunks ds.to_zarr(tmp_path / "test.zarr") - ds = ds.chunk(encoded_chunks) - dask_arr = ds["test"].data - with dask.config.set({"array.chunk-size": "1MiB"}): - xr2dask_key = {"x": 0, "y": 1} - if isinstance(chunks, dict): - dask_chunks = {xr2dask_key[k]: chunks[k] for k in chunks} - else: - dask_chunks = chunks + expected = ds.chunk(chunks) + actual = xr.open_dataset( + tmp_path / "test.zarr", engine="zarr", chunks=chunks + ) + assert actual == expected + - dask_arr_chunked = dask_arr.rechunk(dask_chunks) - expected = dask_arr_chunked.chunks +@requires_zarr +@requires_dask +@pytest.mark.parametrize( + "chunks", ["auto", -1, {}, {"x": "auto"}, {"x": -1}, {"x": "auto", "y": -1}] +) +def test_chunking_consintency(chunks, tmp_path): + encoded_chunks = {} + dask_arr = da.from_array(np.ones((500, 500), dtype="float64"), chunks=encoded_chunks) + ds = xr.Dataset( + { + "test": xr.DataArray( + dask_arr, + dims=("x", "y"), + ) + } + ) + ds["test"].encoding["chunks"] = encoded_chunks + ds.to_zarr(tmp_path / "test.zarr") + ds.to_netcdf(tmp_path / "test.nc") - dataset_chunked = xr.open_dataset( + 
with dask.config.set({"array.chunk-size": "1MiB"}): + expected = ds.chunk(chunks) + actual = xr.open_dataset( tmp_path / "test.zarr", engine="zarr", chunks=chunks ) - actual = dataset_chunked["test"].chunks + xr.testing.assert_chunks_equal(actual, expected) - assert actual == expected + actual = xr.open_dataset( + tmp_path / "test.nc", chunks=chunks + ) + xr.testing.assert_chunks_equal(actual, expected) From 48a7131b8631ece5857d051ae30c708f7c6681fa Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Thu, 19 Nov 2020 15:29:54 +0100 Subject: [PATCH 13/21] black --- xarray/core/dataset.py | 1 + xarray/tests/test_backends.py | 20 +++++++++----------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 0f4102e5d89..39ee5325247 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -384,6 +384,7 @@ def _get_chunk(var, chunks): # chunks need to be explicity computed to take correctly into accout # backend preferred chunking import dask.array as da + if isinstance(chunks, int) or (chunks == "auto"): chunks = dict.fromkeys(var.dims, chunks) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index c8f8da7163c..31df99b79c6 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -4812,7 +4812,9 @@ def test_load_single_value_h5netcdf(tmp_path): ) def test_open_dataset_chunking_zarr(chunks, tmp_path): encoded_chunks = 100 - dask_arr = da.from_array(np.ones((500, 500), dtype="float64"), chunks=encoded_chunks) + dask_arr = da.from_array( + np.ones((500, 500), dtype="float64"), chunks=encoded_chunks + ) ds = xr.Dataset( { "test": xr.DataArray( @@ -4826,9 +4828,7 @@ def test_open_dataset_chunking_zarr(chunks, tmp_path): with dask.config.set({"array.chunk-size": "1MiB"}): expected = ds.chunk(chunks) - actual = xr.open_dataset( - tmp_path / "test.zarr", engine="zarr", chunks=chunks - ) + actual = xr.open_dataset(tmp_path / "test.zarr", engine="zarr", chunks=chunks) assert actual == expected @@ -4839,7 +4839,9 @@ def test_open_dataset_chunking_zarr(chunks, tmp_path): ) def test_chunking_consintency(chunks, tmp_path): encoded_chunks = {} - dask_arr = da.from_array(np.ones((500, 500), dtype="float64"), chunks=encoded_chunks) + dask_arr = da.from_array( + np.ones((500, 500), dtype="float64"), chunks=encoded_chunks + ) ds = xr.Dataset( { "test": xr.DataArray( @@ -4854,12 +4856,8 @@ def test_chunking_consintency(chunks, tmp_path): with dask.config.set({"array.chunk-size": "1MiB"}): expected = ds.chunk(chunks) - actual = xr.open_dataset( - tmp_path / "test.zarr", engine="zarr", chunks=chunks - ) + actual = xr.open_dataset(tmp_path / "test.zarr", engine="zarr", chunks=chunks) xr.testing.assert_chunks_equal(actual, expected) - actual = xr.open_dataset( - tmp_path / "test.nc", chunks=chunks - ) + actual = xr.open_dataset(tmp_path / "test.nc", chunks=chunks) xr.testing.assert_chunks_equal(actual, expected) From 300b4e7ba4d26b243226917d1ebccaab621b1c00 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Wed, 25 Nov 2020 14:43:44 +0100 Subject: [PATCH 14/21] open_zarr behaviour is now unchanged --- xarray/backends/zarr.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 5cfe2d1b0bf..c4c4d467a0b 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -507,7 +507,7 @@ def open_zarr( store, group=None, synchronizer=None, - chunks={}, + chunks="auto", decode_cf=True, mask_and_scale=True, 
decode_times=True, @@ -611,6 +611,13 @@ def open_zarr( """ from .api import open_dataset + if chunks == "auto": + try: + import dask.array # noqa + chunks = {} + except ImportError: + chunks = None + if kwargs: raise TypeError( "open_zarr() got unexpected keyword arguments " + ",".join(kwargs.keys()) From 1bba7cf9e35b845bde01a430f2601b83eac0134f Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Wed, 25 Nov 2020 14:45:24 +0100 Subject: [PATCH 15/21] fix zarr test: remove "auto" replaced with None or {} --- xarray/tests/test_backends.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 31df99b79c6..6049272a266 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1567,7 +1567,7 @@ def roundtrip( if save_kwargs is None: save_kwargs = {} if open_kwargs is None: - open_kwargs = {"chunks": {}} + open_kwargs = {} with self.create_zarr_target() as store_target: self.save(data, store_target, **save_kwargs) with self.open(store_target, **open_kwargs) as ds: @@ -1851,7 +1851,7 @@ def test_write_persistence_modes(self, group): ds_to_append.to_zarr(store_target, append_dim="time", group=group) original = xr.concat([ds, ds_to_append], dim="time") actual = xr.open_dataset( - store_target, group=group, chunks={}, engine="zarr" + store_target, group=group, engine="zarr" ) assert_identical(original, actual) @@ -1941,11 +1941,11 @@ def test_check_encoding_is_consistent_after_append(self): encoding = {"da": {"compressor": compressor}} ds.to_zarr(store_target, mode="w", encoding=encoding) ds_to_append.to_zarr(store_target, append_dim="time") - actual_ds = xr.open_dataset(store_target, chunks={}, engine="zarr") + actual_ds = xr.open_dataset(store_target, engine="zarr") actual_encoding = actual_ds["da"].encoding["compressor"] assert actual_encoding.get_config() == compressor.get_config() assert_identical( - xr.open_dataset(store_target, chunks={}, engine="zarr").compute(), + xr.open_dataset(store_target, engine="zarr").compute(), xr.concat([ds, ds_to_append], dim="time"), ) @@ -1961,7 +1961,7 @@ def test_append_with_new_variable(self): combined = xr.concat([ds, ds_to_append], dim="time") combined["new_var"] = ds_with_new_var["new_var"] assert_identical( - combined, xr.open_dataset(store_target, chunks={}, engine="zarr") + combined, xr.open_dataset(store_target, engine="zarr") ) @requires_dask From df6fa7f9248312a5113a4809caccfb3037cf0d3b Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Wed, 25 Nov 2020 14:46:50 +0100 Subject: [PATCH 16/21] black --- xarray/backends/zarr.py | 1 + xarray/tests/test_backends.py | 8 ++------ 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index c4c4d467a0b..abf0c7372eb 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -614,6 +614,7 @@ def open_zarr( if chunks == "auto": try: import dask.array # noqa + chunks = {} except ImportError: chunks = None diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 6049272a266..a556cea67f3 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1850,9 +1850,7 @@ def test_write_persistence_modes(self, group): ds.to_zarr(store_target, mode="w", group=group) ds_to_append.to_zarr(store_target, append_dim="time", group=group) original = xr.concat([ds, ds_to_append], dim="time") - actual = xr.open_dataset( - store_target, group=group, engine="zarr" - ) + actual = 
xr.open_dataset(store_target, group=group, engine="zarr") assert_identical(original, actual) def test_compressor_encoding(self): @@ -1960,9 +1958,7 @@ def test_append_with_new_variable(self): ds_with_new_var.to_zarr(store_target, mode="a") combined = xr.concat([ds, ds_to_append], dim="time") combined["new_var"] = ds_with_new_var["new_var"] - assert_identical( - combined, xr.open_dataset(store_target, engine="zarr") - ) + assert_identical(combined, xr.open_dataset(store_target, engine="zarr")) @requires_dask def test_to_zarr_compute_false_roundtrip(self): From 44d38d603f2dd9873ed5a0719af5f0488d3393a6 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Mon, 30 Nov 2020 11:39:46 +0100 Subject: [PATCH 17/21] fix test_vectorized_indexing in test_backends.py --- xarray/tests/test_backends.py | 51 ++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index a556cea67f3..1d10e0720d8 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -298,6 +298,18 @@ def open(self, path, **kwargs): with open_dataset(path, engine=self.engine, **kwargs) as ds: yield ds + def check_multiple_indexing(self, indexers, in_memory): + # make sure a sequence of lazy indexings certainly works. + with self.roundtrip(in_memory) as on_disk: + actual = on_disk["var3"] + expected = in_memory["var3"] + for ind in indexers: + actual = actual.isel(**ind) + expected = expected.isel(**ind) + # make sure the array is not yet loaded into memory + assert not actual.variable._in_memory + assert_identical(expected, actual.load()) + def test_zero_dimensional_variable(self): expected = create_test_data() expected["float_var"] = ([], 1.0e9, {"units": "units of awesome"}) @@ -608,10 +620,6 @@ def test_orthogonal_indexing(self): actual = on_disk.isel(**indexers) assert_identical(expected, actual) - @pytest.mark.xfail( - not has_dask, - reason="the code for indexing without dask handles negative steps in slices incorrectly", - ) def test_vectorized_indexing(self): in_memory = create_test_data() with self.roundtrip(in_memory) as on_disk: @@ -629,18 +637,6 @@ def test_vectorized_indexing(self): actual = on_disk.isel(**indexers) assert_identical(expected, actual) - def multiple_indexing(indexers): - # make sure a sequence of lazy indexings certainly works. 
- with self.roundtrip(in_memory) as on_disk: - actual = on_disk["var3"] - expected = in_memory["var3"] - for ind in indexers: - actual = actual.isel(**ind) - expected = expected.isel(**ind) - # make sure the array is not yet loaded into memory - assert not actual.variable._in_memory - assert_identical(expected, actual.load()) - # two-staged vectorized-indexing indexers = [ { @@ -649,7 +645,7 @@ def multiple_indexing(indexers): }, {"a": DataArray([0, 1], dims=["c"]), "b": DataArray([0, 1], dims=["c"])}, ] - multiple_indexing(indexers) + self.check_multiple_indexing(indexers, in_memory) # vectorized-slice mixed indexers = [ @@ -658,7 +654,7 @@ def multiple_indexing(indexers): "dim3": slice(None, 10), } ] - multiple_indexing(indexers) + self.check_multiple_indexing(indexers, in_memory) # vectorized-integer mixed indexers = [ @@ -666,7 +662,7 @@ def multiple_indexing(indexers): {"dim1": DataArray([[0, 7], [2, 6], [3, 5]], dims=["a", "b"])}, {"a": slice(None, None, 2)}, ] - multiple_indexing(indexers) + self.check_multiple_indexing(indexers, in_memory) # vectorized-integer mixed indexers = [ @@ -674,7 +670,14 @@ def multiple_indexing(indexers): {"dim1": DataArray([[0, 7], [2, 6], [3, 5]], dims=["a", "b"])}, {"a": 1, "b": 0}, ] - multiple_indexing(indexers) + self.check_multiple_indexing(indexers, in_memory) + + @pytest.mark.xfail( + not has_dask, + reason="the code for indexing without dask handles negative steps in slices incorrectly", + ) + def test_vectorized_indexing_negative_step_slice(self): + in_memory = create_test_data() # with negative step slice. indexers = [ @@ -683,7 +686,7 @@ def multiple_indexing(indexers): "dim3": slice(-1, 1, -1), } ] - multiple_indexing(indexers) + self.check_multiple_indexing(indexers, in_memory) # with negative step slice. indexers = [ @@ -692,7 +695,7 @@ def multiple_indexing(indexers): "dim3": slice(-1, 1, -2), } ] - multiple_indexing(indexers) + self.check_multiple_indexing(indexers, in_memory) def test_isel_dataarray(self): # Make sure isel works lazily. 
GH:issue:1688 @@ -2168,6 +2171,10 @@ def test_open_zarr_use_cftime(self): ds_b = xr.open_zarr(store_target, consolidated=True, use_cftime=True) assert xr.coding.times.contains_cftime_datetimes(ds_b.time) + @requires_dask + def test_vectorized_indexing_negative_step_slice(self): + super().test_vectorized_indexing_negative_step_slice + @requires_zarr class TestZarrDictStore(ZarrBase): From 6457b7477f11192cc497ec6805cdc21876095a06 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Mon, 30 Nov 2020 11:43:01 +0100 Subject: [PATCH 18/21] removed not used xfails --- xarray/tests/test_backends.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 1d10e0720d8..7fba8489cf7 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -672,10 +672,6 @@ def test_vectorized_indexing(self): ] self.check_multiple_indexing(indexers, in_memory) - @pytest.mark.xfail( - not has_dask, - reason="the code for indexing without dask handles negative steps in slices incorrectly", - ) def test_vectorized_indexing_negative_step_slice(self): in_memory = create_test_data() From ce6f08fe54ee4f93a2b61ed51ad6ae0ee67414b0 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Mon, 30 Nov 2020 15:44:47 +0100 Subject: [PATCH 19/21] reverted not necessary change --- xarray/tests/test_backends.py | 48 ++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 7fba8489cf7..d9802458f21 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -298,18 +298,6 @@ def open(self, path, **kwargs): with open_dataset(path, engine=self.engine, **kwargs) as ds: yield ds - def check_multiple_indexing(self, indexers, in_memory): - # make sure a sequence of lazy indexings certainly works. - with self.roundtrip(in_memory) as on_disk: - actual = on_disk["var3"] - expected = in_memory["var3"] - for ind in indexers: - actual = actual.isel(**ind) - expected = expected.isel(**ind) - # make sure the array is not yet loaded into memory - assert not actual.variable._in_memory - assert_identical(expected, actual.load()) - def test_zero_dimensional_variable(self): expected = create_test_data() expected["float_var"] = ([], 1.0e9, {"units": "units of awesome"}) @@ -637,6 +625,18 @@ def test_vectorized_indexing(self): actual = on_disk.isel(**indexers) assert_identical(expected, actual) + def multiple_indexing(indexers): + # make sure a sequence of lazy indexings certainly works. 
+ with self.roundtrip(in_memory) as on_disk: + actual = on_disk["var3"] + expected = in_memory["var3"] + for ind in indexers: + actual = actual.isel(**ind) + expected = expected.isel(**ind) + # make sure the array is not yet loaded into memory + assert not actual.variable._in_memory + assert_identical(expected, actual.load()) + # two-staged vectorized-indexing indexers = [ { @@ -645,7 +645,7 @@ def test_vectorized_indexing(self): }, {"a": DataArray([0, 1], dims=["c"]), "b": DataArray([0, 1], dims=["c"])}, ] - self.check_multiple_indexing(indexers, in_memory) + multiple_indexing(indexers) # vectorized-slice mixed indexers = [ @@ -654,7 +654,7 @@ def test_vectorized_indexing(self): "dim3": slice(None, 10), } ] - self.check_multiple_indexing(indexers, in_memory) + multiple_indexing(indexers) # vectorized-integer mixed indexers = [ @@ -662,7 +662,7 @@ def test_vectorized_indexing(self): {"dim1": DataArray([[0, 7], [2, 6], [3, 5]], dims=["a", "b"])}, {"a": slice(None, None, 2)}, ] - self.check_multiple_indexing(indexers, in_memory) + multiple_indexing(indexers) # vectorized-integer mixed indexers = [ @@ -670,11 +670,23 @@ def test_vectorized_indexing(self): {"dim1": DataArray([[0, 7], [2, 6], [3, 5]], dims=["a", "b"])}, {"a": 1, "b": 0}, ] - self.check_multiple_indexing(indexers, in_memory) + multiple_indexing(indexers) def test_vectorized_indexing_negative_step_slice(self): in_memory = create_test_data() + def multiple_indexing(indexers): + # make sure a sequence of lazy indexings certainly works. + with self.roundtrip(in_memory) as on_disk: + actual = on_disk["var3"] + expected = in_memory["var3"] + for ind in indexers: + actual = actual.isel(**ind) + expected = expected.isel(**ind) + # make sure the array is not yet loaded into memory + assert not actual.variable._in_memory + assert_identical(expected, actual.load()) + # with negative step slice. indexers = [ { @@ -682,7 +694,7 @@ def test_vectorized_indexing_negative_step_slice(self): "dim3": slice(-1, 1, -1), } ] - self.check_multiple_indexing(indexers, in_memory) + multiple_indexing(indexers) # with negative step slice. indexers = [ @@ -691,7 +703,7 @@ def test_vectorized_indexing_negative_step_slice(self): "dim3": slice(-1, 1, -2), } ] - self.check_multiple_indexing(indexers, in_memory) + multiple_indexing(indexers) def test_isel_dataarray(self): # Make sure isel works lazily. GH:issue:1688 From dda2010f3244ae53e94841bf9f4cfbbec8f35e65 Mon Sep 17 00:00:00 2001 From: Alessandro Amici Date: Mon, 30 Nov 2020 19:21:04 +0100 Subject: [PATCH 20/21] Fix method override and enable dask for zarr --- xarray/tests/test_backends.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index d9802458f21..8d8ead147d9 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -672,12 +672,12 @@ def multiple_indexing(indexers): ] multiple_indexing(indexers) - def test_vectorized_indexing_negative_step_slice(self): + def test_vectorized_indexing_negative_step_slice(self, open_kwargs=None): in_memory = create_test_data() def multiple_indexing(indexers): # make sure a sequence of lazy indexings certainly works. 
- with self.roundtrip(in_memory) as on_disk: + with self.roundtrip(in_memory, open_kwargs=open_kwargs) as on_disk: actual = on_disk["var3"] expected = in_memory["var3"] for ind in indexers: @@ -2181,7 +2181,7 @@ def test_open_zarr_use_cftime(self): @requires_dask def test_vectorized_indexing_negative_step_slice(self): - super().test_vectorized_indexing_negative_step_slice + super().test_vectorized_indexing_negative_step_slice(open_kwargs={"chunks": {}}) @requires_zarr From e21820cabef71804c9335d0b54412051b627ce4e Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Tue, 1 Dec 2020 10:33:08 +0100 Subject: [PATCH 21/21] revert to chunks=None ds.chunk default. revert style fix --- xarray/core/dataarray.py | 3 ++- xarray/core/dataset.py | 24 ++++++++++-------------- xarray/core/variable.py | 15 +++++---------- 3 files changed, 17 insertions(+), 25 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 4fcef4beb7f..b95f681bc79 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1010,11 +1010,12 @@ def chunks(self) -> Optional[Tuple[Tuple[int, ...], ...]]: def chunk( self, chunks: Union[ + None, Number, Tuple[Number, ...], Tuple[Tuple[Number, ...], ...], Mapping[Hashable, Union[None, Number, Tuple[Number, ...]]], - ] = {}, + ] = None, name_prefix: str = "xarray-", token: str = None, lock: bool = False, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 39ee5325247..269f58aca92 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -415,7 +415,7 @@ def _get_chunk(var, chunks): def _maybe_chunk( name, var, - chunks, + chunks=None, token=None, lock=None, name_prefix="xarray-", @@ -1872,10 +1872,11 @@ def chunks(self) -> Mapping[Hashable, Tuple[int, ...]]: def chunk( self, chunks: Union[ + None, Number, str, Mapping[Hashable, Union[None, Number, str, Tuple[Number, ...]]], - ] = {}, + ] = None, name_prefix: str = "xarray-", token: str = None, lock: bool = False, @@ -1907,22 +1908,17 @@ def chunk( ------- chunked : xarray.Dataset """ - if chunks is None: - warnings.warn( - "None value for 'chunks' is deprecated. " - "It will raise an error in the future. Use instead '{}'", - category=FutureWarning, - ) - chunks = {} if isinstance(chunks, (Number, str)): chunks = dict.fromkeys(self.dims, chunks) - bad_dims = chunks.keys() - self.dims.keys() - if bad_dims: - raise ValueError( - "some chunks keys are not dimensions on this " "object: %s" % bad_dims - ) + if chunks is not None: + bad_dims = chunks.keys() - self.dims.keys() + if bad_dims: + raise ValueError( + "some chunks keys are not dimensions on this " + "object: %s" % bad_dims + ) variables = { k: _maybe_chunk(k, v, chunks, token, lock, name_prefix) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index a604a81cfa1..a3876cb0077 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -986,7 +986,7 @@ def chunks(self): _array_counter = itertools.count() - def chunk(self, chunks={}, name=None, lock=False): + def chunk(self, chunks=None, name=None, lock=False): """Coerce this array's data into a dask arrays with the given chunks. If this variable is a non-dask array, it will be converted to dask @@ -1016,17 +1016,12 @@ def chunk(self, chunks={}, name=None, lock=False): import dask import dask.array as da - if chunks is None: - warnings.warn( - "None value for 'chunks' is deprecated. " - "It will raise an error in the future. 
Use instead '{}'", - category=FutureWarning, - ) - chunks = {} - if utils.is_dict_like(chunks): chunks = {self.get_axis_num(dim): chunk for dim, chunk in chunks.items()} + if chunks is None: + chunks = self.chunks or self.shape + data = self._data if is_duck_dask_array(data): data = data.rechunk(chunks) @@ -2373,7 +2368,7 @@ def values(self, values): f"Please use DataArray.assign_coords, Dataset.assign_coords or Dataset.assign as appropriate." ) - def chunk(self, chunks={}, name=None, lock=False): + def chunk(self, chunks=None, name=None, lock=False): # Dummy - do not chunk. This method is invoked e.g. by Dataset.chunk() return self.copy(deep=False)