From 3cfd1af0de1ea55f7dabbb0f5e397d7d10116139 Mon Sep 17 00:00:00 2001
From: Gerrit Holl
Date: Mon, 5 Oct 2020 15:24:32 +0200
Subject: [PATCH 1/3] Handle scale_factor and add_offset as scalar

The h5netcdf engine exposes single-valued attributes as arrays of shape
(1,), which is correct according to the NetCDF standard, but may cause a
problem when reading a value of shape () before the scale_factor and
add_offset have been applied.

This PR adds a check for the dimensionality of add_offset and
scale_factor that ensures they are scalar before they are used for
further processing, adds a unit test to verify that this works
correctly, and adds a note to the documentation to warn users of this
difference between the h5netcdf and netcdf4 engines.

Fixes #4471.
---
 doc/io.rst                    |  6 ++++++
 xarray/coding/variables.py    |  4 ++++
 xarray/tests/test_backends.py | 21 +++++++++++++++++++++
 3 files changed, 31 insertions(+)

diff --git a/doc/io.rst b/doc/io.rst
index 956d9394653..b3f5f759f33 100644
--- a/doc/io.rst
+++ b/doc/io.rst
@@ -105,6 +105,12 @@ Dataset and DataArray objects, and no array values are loaded into memory
 until you try to perform some sort of actual computation. For an example of
 how these lazy arrays work, see the OPeNDAP section below.
 
+There may be minor differences in the :py:class:`Dataset` object returned
+when reading a NetCDF file with different engines. For example, 
+single-valued attributes are returned as scalars by the default
+``engine=netcdf4``, but as arrays of size ``(1,)`` when reading with
+``engine=h5netcdf``.
+
 It is important to note that when you modify values of a Dataset, even one
 linked to files on disk, only the in-memory copy you are manipulating in
 xarray is modified: the original file on disk is never touched.
diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index 92e454846f4..80277e9cd4e 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -269,6 +269,10 @@ def decode(self, variable, name=None):
         scale_factor = pop_to(attrs, encoding, "scale_factor", name=name)
         add_offset = pop_to(attrs, encoding, "add_offset", name=name)
         dtype = _choose_float_dtype(data.dtype, "add_offset" in attrs)
+        if np.ndim(scale_factor) > 0:
+            scale_factor = scale_factor.item()
+        if np.ndim(add_offset) > 0:
+            add_offset = add_offset.item()
         transform = partial(
             _scale_offset_decoding,
             scale_factor=scale_factor,
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 81664737330..31ebcd35a1b 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -4668,3 +4668,24 @@ def test_extract_zarr_variable_encoding():
     actual = backends.zarr.extract_zarr_variable_encoding(
         var, raise_on_invalid=True
     )
+
+
+@requires_h5netcdf
+def test_load_single_value_h5netcdf(tmp_path):
+    """Test that numeric single-element vector attributes are handled correctly.
+
+    At present (h5netcdf v0.8.1), the h5netcdf engine exposes single-valued
+    numeric variable attributes as arrays of length 1, as opposed to scalars
+    for the NetCDF4 backend. This was leading to a ValueError upon loading a
+    single value from a file, see #4471. Test that loading causes no failure.
+    """
+    ds = xr.Dataset(
+        {
+            "test": xr.DataArray(
+                np.array([0]), dims=("x",), attrs={"scale_factor": 1, "add_offset": 0}
+            )
+        }
+    )
+    ds.to_netcdf(tmp_path / "test.nc")
+    with xr.open_dataset(tmp_path / "test.nc", engine="h5netcdf") as ds2:
+        ds2["test"][0].load()

From 75e9adc3ba72e172847c97f34a8fcc4a92acf726 Mon Sep 17 00:00:00 2001
From: Gerrit Holl
Date: Tue, 6 Oct 2020 11:22:03 +0200
Subject: [PATCH 2/3] DOC: Add whats-new entry for fixing 4471

Add a whats-new entry for the fix to issue #4471, corresponding to
PR #4485.
---
 doc/whats-new.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 6b944fd1b30..448ab1355b0 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -34,6 +34,8 @@ New Features
 
 Bug fixes
 ~~~~~~~~~
+- Fix bug where reading a scalar value from a NetCDF file opened with the ``h5netcdf`` backend would raise a ``ValueError`` when ``decode_cf=True`` (:issue:`4471`, :pull:`4485`).
+  By `Gerrit Holl `_.
 - Fix bug where datetime64 times are silently changed to incorrect values if they are outside the valid date range for ns
   precision when provided in some other units (:issue:`4427`, :pull:`4454`). By `Andrew Pauling `_
 - Fix silently overwriting the `engine` key when passing :py:func:`open_dataset` a file object

From 49d03d246ce657e0cd3be0582334a1a023c4b374 Mon Sep 17 00:00:00 2001
From: Deepak Cherian
Date: Sun, 11 Oct 2020 12:26:52 -0600
Subject: [PATCH 3/3] Update doc/io.rst

Co-authored-by: Mathias Hauser
---
 doc/io.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/io.rst b/doc/io.rst
index b3f5f759f33..db6ee1ba3d0 100644
--- a/doc/io.rst
+++ b/doc/io.rst
@@ -106,7 +106,7 @@ you try to perform some sort of actual computation. For an example of
 how these lazy arrays work, see the OPeNDAP section below.
 
 There may be minor differences in the :py:class:`Dataset` object returned
-when reading a NetCDF file with different engines. For example, 
+when reading a NetCDF file with different engines. For example,
 single-valued attributes are returned as scalars by the default
 ``engine=netcdf4``, but as arrays of size ``(1,)`` when reading with
 ``engine=h5netcdf``.
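
Note (not part of the patch series above): a minimal standalone sketch of the
normalization applied in xarray/coding/variables.py, to show why the fix works.
The helper name _ensure_scalar and the attribute values are hypothetical,
introduced here only for illustration.

import numpy as np

def _ensure_scalar(value):
    # np.ndim() accepts scalars and arrays alike: it returns 0 for plain
    # Python/NumPy scalars and >0 for arrays, so only an array-valued
    # attribute (as exposed by h5netcdf) is unwrapped via .item().
    if np.ndim(value) > 0:
        value = value.item()
    return value

# netcdf4 engine returns the attribute as a scalar: passes through unchanged
assert _ensure_scalar(np.float64(0.5)) == 0.5
# h5netcdf engine returns a shape-(1,) array: unwrapped to a scalar
assert _ensure_scalar(np.array([0.5])) == 0.5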