diff --git a/doc/api.rst b/doc/api.rst
index 7fc18ae3bcc..9ab8617f0d6 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -46,6 +46,7 @@ Attributes
    Dataset.data_vars
    Dataset.coords
    Dataset.attrs
+   Dataset.encoding
    Dataset.indexes
    Dataset.get_index
 
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index c8a046add97..992e7eb9fee 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -202,6 +202,10 @@ Enhancements
   and attributes. The method prints to a buffer (e.g. ``stdout``) with output
   similar to what the command line utility ``ncdump -h`` produces (:issue:`1150`).
   By `Joe Hamman <https://github.com/jhamman>`_.
+- Added the ability to write unlimited netCDF dimensions with the ``scipy`` and
+  ``netcdf4`` backends via the new :py:attr:`~xarray.Dataset.encoding` attribute
+  or via the ``unlimited_dims`` argument to :py:meth:`~xarray.Dataset.to_netcdf`.
+  By `Joe Hamman <https://github.com/jhamman>`_.
 - New :py:meth:`~DataArray.quantile` method to calculate quantiles from
   DataArray objects (:issue:`1187`).
   By `Joe Hamman <https://github.com/jhamman>`_.
diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index c69fc63acec..441bf08bb8e 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -522,7 +522,7 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
 
 
 def to_netcdf(dataset, path=None, mode='w', format=None, group=None,
-              engine=None, writer=None, encoding=None):
+              engine=None, writer=None, encoding=None, unlimited_dims=None):
     """This function creates an appropriate datastore for writing a dataset to
     disk as a netCDF file
 
@@ -561,8 +561,12 @@ def to_netcdf(dataset, path=None, mode='w', format=None, group=None,
     sync = writer is None
 
     store = store_cls(path, mode, format, group, writer)
+
+    if unlimited_dims is None:
+        unlimited_dims = dataset.encoding.get('unlimited_dims', None)
     try:
-        dataset.dump_to_store(store, sync=sync, encoding=encoding)
+        dataset.dump_to_store(store, sync=sync, encoding=encoding,
+                              unlimited_dims=unlimited_dims)
         if isinstance(path, BytesIO):
             return path.getvalue()
     finally:
diff --git a/xarray/backends/common.py b/xarray/backends/common.py
index 3d71aff4407..e7cbd0bd9ae 100644
--- a/xarray/backends/common.py
+++ b/xarray/backends/common.py
@@ -5,7 +5,6 @@
 import logging
 import time
 import traceback
-import threading
 from collections import Mapping
 from distutils.version import StrictVersion
 
@@ -84,6 +83,9 @@ def get_attrs(self):  # pragma: no cover
     def get_variables(self):  # pragma: no cover
         raise NotImplementedError
 
+    def get_encoding(self):
+        return {}
+
     def load(self):
         """
         This loads the variables and attributes simultaneously.
@@ -105,8 +107,8 @@ def load(self):
         This function will be called anytime variables or attributes
         are requested, so care should be taken to make sure its fast.
         """
-        variables = FrozenOrderedDict((_decode_variable_name(k), v)
-                                      for k, v in iteritems(self.get_variables()))
+        variables = FrozenOrderedDict((_decode_variable_name(k), v) for k, v in
+                                      iteritems(self.get_variables()))
         attributes = FrozenOrderedDict(self.get_attrs())
         return variables, attributes
 
@@ -152,7 +154,11 @@ def add(self, source, target):
             self.sources.append(source)
             self.targets.append(target)
         else:
-            target[...] = source
+            try:
+                target[...] = source
+            except TypeError:
+                # workaround for GH: scipy/scipy#6880
+                target[:] = source
 
     def sync(self):
         if self.sources:
@@ -191,34 +197,42 @@ def store_dataset(self, dataset):
         # dataset.variables
         self.store(dataset, dataset.attrs)
 
-    def store(self, variables, attributes, check_encoding_set=frozenset()):
+    def store(self, variables, attributes, check_encoding_set=frozenset(),
+              unlimited_dims=None):
         self.set_attributes(attributes)
-        self.set_variables(variables, check_encoding_set)
+        self.set_variables(variables, check_encoding_set,
+                           unlimited_dims=unlimited_dims)
 
     def set_attributes(self, attributes):
         for k, v in iteritems(attributes):
             self.set_attribute(k, v)
 
-    def set_variables(self, variables, check_encoding_set):
+    def set_variables(self, variables, check_encoding_set,
+                      unlimited_dims=None):
         for vn, v in iteritems(variables):
             name = _encode_variable_name(vn)
             check = vn in check_encoding_set
-            target, source = self.prepare_variable(name, v, check)
+            target, source = self.prepare_variable(
+                name, v, check, unlimited_dims=unlimited_dims)
             self.writer.add(source, target)
 
-    def set_necessary_dimensions(self, variable):
+    def set_necessary_dimensions(self, variable, unlimited_dims=None):
+        if unlimited_dims is None:
+            unlimited_dims = set()
         for d, l in zip(variable.dims, variable.shape):
             if d not in self.dimensions:
+                if d in unlimited_dims:
+                    l = None
                 self.set_dimension(d, l)
 
 
 class WritableCFDataStore(AbstractWritableDataStore):
-    def store(self, variables, attributes, check_encoding_set=frozenset()):
+    def store(self, variables, attributes, *args, **kwargs):
         # All NetCDF files get CF encoded by default, without this attempting
         # to write times, for example, would fail.
         cf_variables, cf_attrs = cf_encoder(variables, attributes)
         AbstractWritableDataStore.store(self, cf_variables, cf_attrs,
-                                        check_encoding_set)
+                                        *args, **kwargs)
 
 
 class DataStorePickleMixin(object):
diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py
index 76582cfd72e..acb46bee14c 100644
--- a/xarray/backends/h5netcdf_.py
+++ b/xarray/backends/h5netcdf_.py
@@ -2,6 +2,7 @@
 from __future__ import division
 from __future__ import print_function
 import functools
+import warnings
 
 from .. import Variable
 from ..core import indexing
@@ -9,8 +10,8 @@
 from ..core.pycompat import iteritems, bytes_type, unicode_type, OrderedDict
 
 from .common import WritableCFDataStore, DataStorePickleMixin
-from .netCDF4_ import (_nc4_group, _nc4_values_and_dtype, _extract_nc4_encoding,
-                       BaseNetCDF4Array)
+from .netCDF4_ import (_nc4_group, _nc4_values_and_dtype,
+                       _extract_nc4_variable_encoding, BaseNetCDF4Array)
 
 
 def maybe_decode_bytes(txt):
@@ -33,7 +34,7 @@ def _read_attributes(h5netcdf_var):
     return attrs
 
 
-_extract_h5nc_encoding = functools.partial(_extract_nc4_encoding,
+_extract_h5nc_encoding = functools.partial(_extract_nc4_variable_encoding,
                                            lsd_okay=False,
                                            backend='h5netcdf')
 
@@ -92,7 +93,8 @@ def set_dimension(self, name, length):
     def set_attribute(self, key, value):
         self.ds.setncattr(key, value)
 
-    def prepare_variable(self, name, variable, check_encoding=False):
+    def prepare_variable(self, name, variable, check_encoding=False,
+                         unlimited_dims=None):
         import h5py
 
         attrs = variable.attrs.copy()
@@ -100,7 +102,11 @@ def prepare_variable(self, name, variable, check_encoding=False,
         if dtype is str:
             dtype = h5py.special_dtype(vlen=unicode_type)
 
-        self.set_necessary_dimensions(variable)
+        if unlimited_dims is not None:
+            warnings.warn('h5netcdf does not support unlimited dimensions, '
+                          'got: %s.' % unlimited_dims)
+            unlimited_dims = None
+        self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims)
 
         fill_value = attrs.pop('_FillValue', None)
         if fill_value in ['\x00']:
diff --git a/xarray/backends/memory.py b/xarray/backends/memory.py
index bc5ab826672..f79e92439fe 100644
--- a/xarray/backends/memory.py
+++ b/xarray/backends/memory.py
@@ -29,7 +29,7 @@ def get_attrs(self):
     def get_variables(self):
         return self._variables
 
-    def prepare_variable(self, k, v, check_encoding=False):
+    def prepare_variable(self, k, v, *args, **kwargs):
         new_var = Variable(v.dims, np.empty_like(v), v.attrs)
         # we copy the variable and stuff all encodings in the
         # attributes to imitate what happens when writing to disk.
diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py
index 9b05a41925d..f72a26affaf 100644
--- a/xarray/backends/netCDF4_.py
+++ b/xarray/backends/netCDF4_.py
@@ -7,7 +7,7 @@
 import numpy as np
 
 from .. import Variable
-from ..conventions import pop_to, cf_encoder
+from ..conventions import pop_to
 from ..core import indexing
 from ..core.utils import (FrozenOrderedDict, NDArrayMixin,
                           close_on_error, is_remote_uri)
@@ -138,13 +138,13 @@ def _force_native_endianness(var):
     # check to see if encoding has a value for endian its 'native'
     if not var.encoding.get('endian', 'native') is 'native':
         raise NotImplementedError("Attempt to write non-native endian type, "
-                                  "this is not supported by the netCDF4 python "
-                                  "library.")
+                                  "this is not supported by the netCDF4 "
+                                  "python library.")
     return var
 
 
-def _extract_nc4_encoding(variable, raise_on_invalid=False, lsd_okay=True,
-                          backend='netCDF4'):
+def _extract_nc4_variable_encoding(variable, raise_on_invalid=False,
+                                   lsd_okay=True, backend='netCDF4'):
     encoding = variable.encoding.copy()
 
     safe_to_drop = set(['source', 'original_shape'])
@@ -154,9 +154,8 @@
     valid_encodings.add('least_significant_digit')
 
     if (encoding.get('chunksizes') is not None and
-        (encoding.get('original_shape', variable.shape)
-         != variable.shape) and
-        not raise_on_invalid):
+            (encoding.get('original_shape', variable.shape) !=
+             variable.shape) and not raise_on_invalid):
         del encoding['chunksizes']
 
     for k in safe_to_drop:
@@ -251,6 +250,12 @@ def get_dimensions(self):
         return FrozenOrderedDict((k, len(v))
                                  for k, v in iteritems(self.ds.dimensions))
 
+    def get_encoding(self):
+        encoding = {}
+        encoding['unlimited_dims'] = {
+            k for k, v in self.ds.dimensions.items() if v.isunlimited()}
+        return encoding
+
     def set_dimension(self, name, length):
         self.ds.createDimension(name, size=length)
 
@@ -259,7 +264,8 @@ def set_attribute(self, key, value):
             value = encode_nc3_attr_value(value)
         self.ds.setncattr(key, value)
 
-    def prepare_variable(self, name, variable, check_encoding=False):
+    def prepare_variable(self, name, variable, check_encoding=False,
+                         unlimited_dims=None):
         attrs = variable.attrs.copy()
 
         variable = _force_native_endianness(variable)
@@ -270,7 +276,7 @@ def prepare_variable(self, name, variable, check_encoding=False,
             variable = encode_nc3_variable(variable)
             datatype = variable.dtype
 
-        self.set_necessary_dimensions(variable)
+        self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims)
 
         fill_value = attrs.pop('_FillValue', None)
         if fill_value in ['', '\x00']:
@@ -278,8 +284,8 @@ def prepare_variable(self, name, variable, check_encoding=False,
             # doesn't like setting fill_value to an empty string
             fill_value = None
 
-        encoding = _extract_nc4_encoding(variable,
-                                         raise_on_invalid=check_encoding)
+        encoding = _extract_nc4_variable_encoding(
+            variable, raise_on_invalid=check_encoding)
         nc4_var = self.ds.createVariable(
             varname=name, datatype=datatype,
diff --git a/xarray/backends/pynio_.py b/xarray/backends/pynio_.py
index 075db5d4ccb..8bb759503b9 100644
--- a/xarray/backends/pynio_.py
+++ b/xarray/backends/pynio_.py
@@ -57,5 +57,11 @@ def get_attrs(self):
     def get_dimensions(self):
         return Frozen(self.ds.dimensions)
 
+    def get_encoding(self):
+        encoding = {}
+        encoding['unlimited_dims'] = set(
+            [k for k in self.ds.dimensions if self.ds.unlimited(k)])
+        return encoding
+
     def close(self):
         self.ds.close()
diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py
index 0113728f81c..281d23fa40a 100644
--- a/xarray/backends/scipy_.py
+++ b/xarray/backends/scipy_.py
@@ -8,7 +8,7 @@
 import warnings
 
 from .. import Variable
-from ..core.pycompat import iteritems, basestring, OrderedDict
+from ..core.pycompat import iteritems, OrderedDict
 from ..core.utils import Frozen, FrozenOrderedDict
 from ..core.indexing import NumpyIndexingAdapter
 
@@ -119,6 +119,12 @@ def get_attrs(self):
     def get_dimensions(self):
         return Frozen(self.ds.dimensions)
 
+    def get_encoding(self):
+        encoding = {}
+        encoding['unlimited_dims'] = {
+            k for k, v in self.ds.dimensions.items() if v is None}
+        return encoding
+
     def set_dimension(self, name, length):
         if name in self.dimensions:
             raise ValueError('%s does not support modifying dimensions'
@@ -134,13 +140,17 @@ def set_attribute(self, key, value):
         value = encode_nc3_attr_value(value)
         setattr(self.ds, key, value)
 
-    def prepare_variable(self, name, variable, check_encoding=False):
+    def prepare_variable(self, name, variable, check_encoding=False,
+                         unlimited_dims=None):
         variable = encode_nc3_variable(variable)
         if check_encoding and variable.encoding:
             raise ValueError('unexpected encoding for scipy backend: %r'
                              % list(variable.encoding))
 
-        self.set_necessary_dimensions(variable)
+        if unlimited_dims is not None and len(unlimited_dims) > 1:
+            raise ValueError('NETCDF3 only supports one unlimited dimension')
+        self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims)
+
         data = variable.data
         # nb. this still creates a numpy array in all memory, even though we
         # don't write the data yet; scipy.io.netcdf does not not support
diff --git a/xarray/conventions.py b/xarray/conventions.py
index c10857041bd..178ee8442db 100644
--- a/xarray/conventions.py
+++ b/xarray/conventions.py
@@ -937,10 +937,12 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True,
         attrs = obj.attrs
         extra_coords = set(obj.coords)
         file_obj = obj._file_obj
+        encoding = obj.encoding
     elif isinstance(obj, AbstractDataStore):
         vars, attrs = obj.load()
         extra_coords = set()
         file_obj = obj
+        encoding = obj.get_encoding()
     else:
         raise TypeError('can only decode Dataset or DataStore objects')
 
@@ -950,6 +952,8 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True,
     ds = Dataset(vars, attrs=attrs)
     ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars))
     ds._file_obj = file_obj
+    ds.encoding = encoding
+
     return ds
diff --git a/xarray/core/common.py b/xarray/core/common.py
index a18a8711e7f..f81a8bde460 100644
--- a/xarray/core/common.py
+++ b/xarray/core/common.py
@@ -4,8 +4,7 @@
 import numpy as np
 import pandas as pd
 
-from .pycompat import (basestring, iteritems, suppress, dask_array_type,
-                       OrderedDict)
+from .pycompat import (basestring, suppress, dask_array_type, OrderedDict)
 from . import formatting
 from .utils import SortedKeysDict, not_implemented, Frozen
 
@@ -751,7 +750,8 @@ def full_like(other, fill_value, dtype=None):
     elif isinstance(other, DataArray):
         return DataArray(
             _full_like_variable(other.variable, fill_value, dtype),
-            dims=other.dims, coords=other.coords, attrs=other.attrs, name=other.name)
+            dims=other.dims, coords=other.coords, attrs=other.attrs,
+            name=other.name)
     elif isinstance(other, Variable):
         return _full_like_variable(other, fill_value, dtype)
     else:
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 9555537c3e8..12d8f6c3565 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -357,6 +357,7 @@ def __init__(self, data_vars=None, coords=None, attrs=None,
         self._set_init_vars_and_dims(data_vars, coords, compat)
         if attrs is not None:
             self.attrs = attrs
+        self._encoding = None
         self._initialized = True
 
     def _set_init_vars_and_dims(self, data_vars, coords, compat):
@@ -408,6 +409,18 @@ def attrs(self):
     def attrs(self, value):
         self._attrs = OrderedDict(value)
 
+    @property
+    def encoding(self):
+        """Dictionary of global encoding attributes on this dataset
+        """
+        if self._encoding is None:
+            self._encoding = {}
+        return self._encoding
+
+    @encoding.setter
+    def encoding(self, value):
+        self._encoding = dict(value)
+
     @property
     def dims(self):
         """Mapping from dimension names to lengths.
@@ -478,7 +491,7 @@ def compute(self):
 
     @classmethod
     def _construct_direct(cls, variables, coord_names, dims=None, attrs=None,
-                          file_obj=None):
+                          file_obj=None, encoding=None):
         """Shortcut around __init__ for internal use when we want to skip
         costly validation
         """
@@ -488,6 +501,7 @@ def _construct_direct(cls, variables, coord_names, dims=None, attrs=None,
         obj._dims = dims
         obj._attrs = attrs
         obj._file_obj = file_obj
+        obj._encoding = encoding
         obj._initialized = True
         return obj
@@ -855,7 +869,8 @@ def reset_coords(self, names=None, drop=False, inplace=False):
                 del obj._variables[name]
         return obj
 
-    def dump_to_store(self, store, encoder=None, sync=True, encoding=None):
+    def dump_to_store(self, store, encoder=None, sync=True, encoding=None,
+                      unlimited_dims=None):
         """Store dataset contents to a backends.*DataStore object."""
         if encoding is None:
             encoding = {}
@@ -871,12 +886,13 @@ def dump_to_store(self, store, encoder=None, sync=True, encoding=None,
         if encoder:
             variables, attrs = encoder(variables, attrs)
 
-        store.store(variables, attrs, check_encoding)
+        store.store(variables, attrs, check_encoding,
+                    unlimited_dims=unlimited_dims)
         if sync:
             store.sync()
 
     def to_netcdf(self, path=None, mode='w', format=None, group=None,
-                  engine=None, encoding=None):
+                  engine=None, encoding=None, unlimited_dims=None):
         """Write dataset contents to a netCDF file.
 
         Parameters
@@ -920,12 +936,18 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None,
             Nested dictionary with variable names as keys and dictionaries of
             variable specific encodings as values, e.g.,
             ``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1,
                                'zlib': True}, ...}``
+        unlimited_dims : sequence of str, optional
+            Dimension(s) that should be serialized as unlimited dimensions.
+            By default, no dimensions are treated as unlimited dimensions.
+            Note that unlimited_dims may also be set via
+            ``dataset.encoding['unlimited_dims']``.
""" if encoding is None: encoding = {} from ..backends.api import to_netcdf return to_netcdf(self, path, mode, format=format, group=group, - engine=engine, encoding=encoding) + engine=engine, encoding=encoding, + unlimited_dims=unlimited_dims) def __unicode__(self): return formatting.dataset_repr(self) @@ -1351,7 +1373,8 @@ def reindex_like(self, other, method=None, tolerance=None, copy=True): return self.reindex(method=method, copy=copy, tolerance=tolerance, **indexers) - def reindex(self, indexers=None, method=None, tolerance=None, copy=True, **kw_indexers): + def reindex(self, indexers=None, method=None, tolerance=None, copy=True, + **kw_indexers): """Conform this object onto a new set of indexes, filling in missing values with NaN. diff --git a/xarray/test/test_backends.py b/xarray/test/test_backends.py index c7867dc6d3a..897a283c847 100644 --- a/xarray/test/test_backends.py +++ b/xarray/test/test_backends.py @@ -20,7 +20,7 @@ from xarray import (Dataset, DataArray, open_dataset, open_dataarray, open_mfdataset, backends, save_mfdataset) from xarray.backends.common import robust_getitem -from xarray.backends.netCDF4_ import _extract_nc4_encoding +from xarray.backends.netCDF4_ import _extract_nc4_variable_encoding from xarray.core import indexing from xarray.core.pycompat import iteritems, PY2, PY3 @@ -920,6 +920,17 @@ def test_cross_engine_read_write_netcdf3(self): [assert_xarray_allclose(data[k].variable, actual[k].variable) for k in data] + def test_encoding_unlimited_dims(self): + ds = Dataset({'x': ('y', np.arange(10.0))}) + with self.roundtrip(ds, + save_kwargs=dict(unlimited_dims=['y'])) as actual: + self.assertEqual(actual.encoding['unlimited_dims'], set('y')) + self.assertDatasetEqual(ds, actual) + ds.encoding = {'unlimited_dims': ['y']} + with self.roundtrip(ds) as actual: + self.assertEqual(actual.encoding['unlimited_dims'], set('y')) + self.assertDatasetEqual(ds, actual) + @requires_h5netcdf @requires_netCDF4 @@ -972,6 +983,15 @@ def test_read_byte_attrs_as_unicode(self): expected = Dataset(attrs={'foo': 'bar'}) self.assertDatasetIdentical(expected, actual) + def test_encoding_unlimited_dims(self): + ds = Dataset({'x': ('y', np.arange(10.0))}) + ds.encoding = {'unlimited_dims': ['y']} + with create_tmp_file() as tmp_file: + with pytest.warns(UserWarning): + ds.to_netcdf(tmp_file, engine='h5netcdf') + with pytest.warns(UserWarning): + ds.to_netcdf(tmp_file, engine='h5netcdf', unlimited_dims=['y']) + @requires_dask @requires_scipy @@ -1194,13 +1214,13 @@ def test_weakrefs(self): class TestEncodingInvalid(TestCase): - def test_extract_nc4_encoding(self): + def test_extract_nc4_variable_encoding(self): var = xr.Variable(('x',), [1, 2, 3], {}, {'foo': 'bar'}) with self.assertRaisesRegexp(ValueError, 'unexpected encoding'): - _extract_nc4_encoding(var, raise_on_invalid=True) + _extract_nc4_variable_encoding(var, raise_on_invalid=True) var = xr.Variable(('x',), [1, 2, 3], {}, {'chunking': (2, 1)}) - encoding = _extract_nc4_encoding(var) + encoding = _extract_nc4_variable_encoding(var) self.assertEqual({}, encoding) def test_extract_h5nc_encoding(self): @@ -1208,11 +1228,13 @@ def test_extract_h5nc_encoding(self): var = xr.Variable(('x',), [1, 2, 3], {}, {'least_sigificant_digit': 2}) with self.assertRaisesRegexp(ValueError, 'unexpected encoding'): - _extract_nc4_encoding(var, raise_on_invalid=True) + _extract_nc4_variable_encoding(var, raise_on_invalid=True) + class MiscObject: pass + @requires_netCDF4 class TestValidateAttrs(TestCase): def test_validating_attrs(self): @@ -1312,6 
+1334,7 @@ def new_dataset_and_coord_attrs(): with create_tmp_file() as tmp_file: ds.to_netcdf(tmp_file) + @requires_netCDF4 class TestDataArrayToNetCDF(TestCase): @@ -1324,7 +1347,6 @@ def test_dataarray_to_netcdf_no_name(self): with open_dataarray(tmp) as loaded_da: self.assertDataArrayIdentical(original_da, loaded_da) - def test_dataarray_to_netcdf_with_name(self): original_da = DataArray(np.arange(12).reshape((3, 4)), name='test') @@ -1335,7 +1357,6 @@ def test_dataarray_to_netcdf_with_name(self): with open_dataarray(tmp) as loaded_da: self.assertDataArrayIdentical(original_da, loaded_da) - def test_dataarray_to_netcdf_coord_name_clash(self): original_da = DataArray(np.arange(12).reshape((3, 4)), dims=['x', 'y'], diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index 9a6e1c874b8..cc4e6abafb4 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -61,9 +61,9 @@ def __init__(self, writer=None): super(InaccessibleVariableDataStore, self).__init__(writer) self._indexvars = set() - def store(self, variables, attributes, check_encoding_set=frozenset()): + def store(self, variables, *args, **kwargs): super(InaccessibleVariableDataStore, self).store( - variables, attributes, check_encoding_set) + variables, *args, **kwargs) for k, v in variables.items(): if isinstance(v, IndexVariable): self._indexvars.add(k)
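
A minimal usage sketch of the feature this patch adds, following the new
tests in test_backends.py; it is not part of the patch itself, and the file
names are illustrative only:

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({'x': ('y', np.arange(10.0))})

    # Option 1: request the unlimited dimension for a single write.
    ds.to_netcdf('out1.nc', unlimited_dims=['y'])

    # Option 2: record it on the dataset; to_netcdf falls back to
    # dataset.encoding['unlimited_dims'] when unlimited_dims is not given.
    ds.encoding = {'unlimited_dims': ['y']}
    ds.to_netcdf('out2.nc')

    # On read, backends that implement get_encoding (netCDF4, scipy, pynio)
    # report unlimited dimensions through the new Dataset.encoding attribute.
    with xr.open_dataset('out1.nc') as actual:
        print(actual.encoding['unlimited_dims'])  # {'y'}

Note that the h5netcdf backend only warns and writes fixed-size dimensions,
and the netCDF3-based scipy backend raises if more than one unlimited
dimension is requested.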