From 271a75115d20df6cb06c576255d7bb263102d4c6 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Fri, 16 Dec 2016 17:03:34 -0800 Subject: [PATCH 01/13] initial hack at enabling unlimited dims in to_netcdf --- doc/whats-new.rst | 4 ++++ xarray/backends/common.py | 11 +++++----- xarray/backends/h5netcdf_.py | 6 +++--- xarray/backends/netCDF4_.py | 41 ++++++++++++++++++++++++++---------- xarray/backends/scipy_.py | 13 ++++++++++-- xarray/conventions.py | 1 + xarray/core/common.py | 6 +++--- xarray/core/dataset.py | 25 ++++++++++++++++++---- xarray/test/test_backends.py | 25 ++++++++++++++------ 9 files changed, 97 insertions(+), 35 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index e4c7520e8e4..ca08de04a76 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -144,6 +144,10 @@ Enhancements sharing can be disabled for polar plots. By `Bas Hoonhout `_. +- Added the ability to write unlimited netCDF dimensions with the ``netcdf4`` + backend. + By `Joe Hamman `_. + Bug fixes ~~~~~~~~~ - ``groupby_bins`` now restores empty bins by default (:issue:`1019`). diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 27291e65e3a..d8eda48b943 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -2,7 +2,6 @@ from __future__ import division from __future__ import print_function import numpy as np -import itertools import logging import time import traceback @@ -12,7 +11,7 @@ from ..conventions import cf_encoder from ..core.utils import FrozenOrderedDict -from ..core.pycompat import iteritems, dask_array_type, OrderedDict +from ..core.pycompat import iteritems, dask_array_type # Create a logger object, but don't add any handlers. Leave that to user code. logger = logging.getLogger(__name__) @@ -96,8 +95,8 @@ def load(self): This function will be called anytime variables or attributes are requested, so care should be taken to make sure its fast. 
""" - variables = FrozenOrderedDict((_decode_variable_name(k), v) - for k, v in iteritems(self.get_variables())) + variables = FrozenOrderedDict((_decode_variable_name(k), v) for k, v in + iteritems(self.get_variables())) attributes = FrozenOrderedDict(self.get_attrs()) return variables, attributes @@ -197,9 +196,11 @@ def set_variables(self, variables, check_encoding_set): target, source = self.prepare_variable(name, v, check) self.writer.add(source, target) - def set_necessary_dimensions(self, variable): + def set_necessary_dimensions(self, variable, unlimited_dims=set()): for d, l in zip(variable.dims, variable.shape): if d not in self.dimensions: + if d in unlimited_dims: + l = None self.set_dimension(d, l) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 76582cfd72e..0787240c85b 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -9,8 +9,8 @@ from ..core.pycompat import iteritems, bytes_type, unicode_type, OrderedDict from .common import WritableCFDataStore, DataStorePickleMixin -from .netCDF4_ import (_nc4_group, _nc4_values_and_dtype, _extract_nc4_encoding, - BaseNetCDF4Array) +from .netCDF4_ import (_nc4_group, _nc4_values_and_dtype, + _extract_nc4_variable_encoding, BaseNetCDF4Array) def maybe_decode_bytes(txt): @@ -33,7 +33,7 @@ def _read_attributes(h5netcdf_var): return attrs -_extract_h5nc_encoding = functools.partial(_extract_nc4_encoding, +_extract_h5nc_encoding = functools.partial(_extract_nc4_variable_encoding, lsd_okay=False, backend='h5netcdf') diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 9b05a41925d..346573d5f5e 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -7,7 +7,7 @@ import numpy as np from .. import Variable -from ..conventions import pop_to, cf_encoder +from ..conventions import pop_to from ..core import indexing from ..core.utils import (FrozenOrderedDict, NDArrayMixin, close_on_error, is_remote_uri) @@ -138,13 +138,22 @@ def _force_native_endianness(var): # check to see if encoding has a value for endian its 'native' if not var.encoding.get('endian', 'native') is 'native': raise NotImplementedError("Attempt to write non-native endian type, " - "this is not supported by the netCDF4 python " - "library.") + "this is not supported by the netCDF4 " + "python library.") return var -def _extract_nc4_encoding(variable, raise_on_invalid=False, lsd_okay=True, - backend='netCDF4'): +def _extract_nc4_dataset_encoding(ds): + import netCDF4 as nc4 + + encoding = {} + encoding['unlimited_dims'] = set( + [k for k, v in ds.dimensions if nc4.isunlimited(v)]) + return encoding + + +def _extract_nc4_variable_encoding(variable, raise_on_invalid=False, + lsd_okay=True, backend='netCDF4'): encoding = variable.encoding.copy() safe_to_drop = set(['source', 'original_shape']) @@ -154,9 +163,8 @@ def _extract_nc4_encoding(variable, raise_on_invalid=False, lsd_okay=True, valid_encodings.add('least_significant_digit') if (encoding.get('chunksizes') is not None and - (encoding.get('original_shape', variable.shape) - != variable.shape) and - not raise_on_invalid): + (encoding.get('original_shape', variable.shape) != + variable.shape) and not raise_on_invalid): del encoding['chunksizes'] for k in safe_to_drop: @@ -209,6 +217,8 @@ def __init__(self, filename, mode='r', format='NETCDF4', group=None, self._opener = opener self._filename = filename self._mode = 'a' if mode == 'w' else mode + self._unlimited_dimensions = set() + self.encoding = None super(NetCDF4DataStore, 
self).__init__(writer) def open_store_variable(self, name, var): @@ -248,9 +258,17 @@ def get_attrs(self): for k in self.ds.ncattrs()) def get_dimensions(self): + # TODO: these can be combined somehow to avoid iterating through + # dimensions twice + self._unlimited_dimensions = self._get_unlimited_dimensions() return FrozenOrderedDict((k, len(v)) for k, v in iteritems(self.ds.dimensions)) + def _get_unlimited_dimensions(self): + import netCDF4 as nc4 + return set( + k for k, v in iteritems(self.ds.dimensions) if nc4.isunlimited(v)) + def set_dimension(self, name, length): self.ds.createDimension(name, size=length) @@ -270,7 +288,8 @@ def prepare_variable(self, name, variable, check_encoding=False): variable = encode_nc3_variable(variable) datatype = variable.dtype - self.set_necessary_dimensions(variable) + self.set_necessary_dimensions( + variable, unlimited_dims=self._unlimited_dimensions) fill_value = attrs.pop('_FillValue', None) if fill_value in ['', '\x00']: @@ -278,8 +297,8 @@ # doesn't like setting fill_value to an empty string fill_value = None - encoding = _extract_nc4_encoding(variable, - raise_on_invalid=check_encoding) + encoding = _extract_nc4_variable_encoding( + variable, raise_on_invalid=check_encoding) nc4_var = self.ds.createVariable( varname=name, datatype=datatype, diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index 0113728f81c..24233299d82 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -8,7 +8,7 @@ import warnings from .. import Variable -from ..core.pycompat import iteritems, basestring, OrderedDict +from ..core.pycompat import iteritems, OrderedDict from ..core.utils import Frozen, FrozenOrderedDict from ..core.indexing import NumpyIndexingAdapter @@ -105,6 +105,9 @@ def __init__(self, filename_or_obj, mode='r', format=None, group=None, super(ScipyDataStore, self).__init__(writer) + self._unlimited_dimensions = set() + self.encoding = {} + def open_store_variable(self, name, var): return Variable(var.dimensions, ScipyArrayWrapper(name, self), _decode_attrs(var._attributes)) @@ -116,7 +119,11 @@ def get_variables(self): def get_attrs(self): return Frozen(_decode_attrs(self.ds._attributes)) + def _get_unlimited_dimensions(self): + return set(k for k, v in iteritems(self.ds.dimensions) if v is None) + def get_dimensions(self): + self._unlimited_dimensions = self._get_unlimited_dimensions() return Frozen(self.ds.dimensions) def set_dimension(self, name, length): @@ -140,7 +147,9 @@ def prepare_variable(self, name, variable, check_encoding=False): raise ValueError('unexpected encoding for scipy backend: %r' % list(variable.encoding)) - self.set_necessary_dimensions(variable) + self.set_necessary_dimensions( + variable, unlimited_dims=self._unlimited_dimensions) + data = variable.data # nb. 
this still creates a numpy array in all memory, even though we # don't write the data yet; scipy.io.netcdf does not not support diff --git a/xarray/conventions.py b/xarray/conventions.py index c10857041bd..c93073bf359 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -950,6 +950,7 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True, ds = Dataset(vars, attrs=attrs) ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars)) ds._file_obj = file_obj + ds.encoding = obj.encoding return ds diff --git a/xarray/core/common.py b/xarray/core/common.py index a18a8711e7f..f81a8bde460 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -4,8 +4,7 @@ import numpy as np import pandas as pd -from .pycompat import (basestring, iteritems, suppress, dask_array_type, - OrderedDict) +from .pycompat import (basestring, suppress, dask_array_type, OrderedDict) from . import formatting from .utils import SortedKeysDict, not_implemented, Frozen @@ -751,7 +750,8 @@ def full_like(other, fill_value, dtype=None): elif isinstance(other, DataArray): return DataArray( _full_like_variable(other.variable, fill_value, dtype), - dims=other.dims, coords=other.coords, attrs=other.attrs, name=other.name) + dims=other.dims, coords=other.coords, attrs=other.attrs, + name=other.name) elif isinstance(other, Variable): return _full_like_variable(other, fill_value, dtype) else: diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 9a3e2117d80..2f367d65dc1 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -22,7 +22,8 @@ merge_data_and_coords) from .utils import (Frozen, SortedKeysDict, maybe_wrap_array, hashable, decode_numpy_dict_values, ensure_us_time_resolution) -from .variable import (Variable, as_variable, IndexVariable, broadcast_variables) +from .variable import (Variable, as_variable, IndexVariable, + broadcast_variables) from .pycompat import (iteritems, basestring, OrderedDict, dask_array_type, range) from .combine import concat @@ -184,7 +185,7 @@ class Dataset(Mapping, ImplementsDatasetReduce, BaseDataObject, groupby_cls = groupby.DatasetGroupBy def __init__(self, data_vars=None, coords=None, attrs=None, - compat='broadcast_equals'): + compat='broadcast_equals', encoding=None): """To load data from a file or file-like object, use the `open_dataset` function. @@ -222,6 +223,7 @@ def __init__(self, data_vars=None, coords=None, attrs=None, self._coord_names = set() self._dims = {} self._attrs = None + self._encoding = None self._file_obj = None if data_vars is None: data_vars = {} @@ -232,6 +234,8 @@ def __init__(self, data_vars=None, coords=None, attrs=None, if attrs is not None: self.attrs = attrs self._initialized = True + if encoding is not None: + self.encoding = encoding def _set_init_vars_and_dims(self, data_vars, coords, compat): """Set the initial value of Dataset variables and dimensions @@ -282,6 +286,18 @@ def attrs(self): def attrs(self, value): self._attrs = OrderedDict(value) + @property + def encoding(self): + """Dictionary of global encoding attributes on this dataset + """ + if self._encoding is None: + self._encoding = OrderedDict() + return self._encoding + + @encoding.setter + def encoding(self, value): + self._encoding = OrderedDict(value) + @property def dims(self): """Mapping from dimension names to lengths. 
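For reference, the backend changes in this first patch encode an unlimited dimension by passing ``None`` as its length when the dimension is created (see ``set_necessary_dimensions`` above); both netCDF4-python and ``scipy.io.netcdf`` share this convention. A minimal sketch of the underlying library behaviour the patch relies on; the file name is illustrative and not part of the patch:

```python
# Sketch of the netCDF4-python convention used by set_necessary_dimensions:
# a dimension created with size=None is unlimited and can grow on append.
import netCDF4

with netCDF4.Dataset('example.nc', 'w') as nc:  # illustrative file name
    nc.createDimension('time', None)  # size=None -> UNLIMITED dimension
    nc.createDimension('x', 10)       # fixed-size dimension
    assert nc.dimensions['time'].isunlimited()
    assert not nc.dimensions['x'].isunlimited()
```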
@@ -519,7 +535,7 @@ def __deepcopy__(self, memo=None): @property def _attr_sources(self): """List of places to look-up items for attribute-style access""" - return [self, LevelCoordinatesSource(self), self.attrs] + return [self, LevelCoordinatesSource(self), self.attrs, self.encoding] def __contains__(self, key): """The 'in' operator will return true or false depending on whether @@ -1186,7 +1202,8 @@ def reindex_like(self, other, method=None, tolerance=None, copy=True): return self.reindex(method=method, copy=copy, tolerance=tolerance, **indexers) - def reindex(self, indexers=None, method=None, tolerance=None, copy=True, **kw_indexers): + def reindex(self, indexers=None, method=None, tolerance=None, copy=True, + **kw_indexers): """Conform this object onto a new set of indexes, filling in missing values with NaN. diff --git a/xarray/test/test_backends.py b/xarray/test/test_backends.py index 82001fffb83..a123e9a28c4 100644 --- a/xarray/test/test_backends.py +++ b/xarray/test/test_backends.py @@ -20,7 +20,7 @@ from xarray import (Dataset, DataArray, open_dataset, open_dataarray, open_mfdataset, backends, save_mfdataset) from xarray.backends.common import robust_getitem -from xarray.backends.netCDF4_ import _extract_nc4_encoding +from xarray.backends.netCDF4_ import _extract_nc4_variable_encoding from xarray.core import indexing from xarray.core.pycompat import iteritems, PY2, PY3 @@ -736,6 +736,16 @@ def test_unsorted_index_raises(self): except IndexError as err: self.assertIn('first by calling .load', str(err)) + def test_encoding_unlimited_dims(self): + ds = Dataset({'x': ('y', np.arange(10.0))}) + # also test for netcdf3 + kwargs = dict(format='NETCDF4', + encoding={'y': {'unlimited_dims': ['y']}}) + ds.to_netcdf('test.nc', **kwargs) + with self.roundtrip(ds, save_kwargs=kwargs) as actual: + self.assertEqual(actual.x.encoding['unlimited_dims'], set('y')) + self.assertEqual(ds.x.encoding, {}) + @requires_netCDF4 @requires_dask @@ -1202,13 +1212,13 @@ def test_weakrefs(self): class TestEncodingInvalid(TestCase): - def test_extract_nc4_encoding(self): + def test_extract_nc4_variable_encoding(self): var = xr.Variable(('x',), [1, 2, 3], {}, {'foo': 'bar'}) with self.assertRaisesRegexp(ValueError, 'unexpected encoding'): - _extract_nc4_encoding(var, raise_on_invalid=True) + _extract_nc4_variable_encoding(var, raise_on_invalid=True) var = xr.Variable(('x',), [1, 2, 3], {}, {'chunking': (2, 1)}) - encoding = _extract_nc4_encoding(var) + encoding = _extract_nc4_variable_encoding(var) self.assertEqual({}, encoding) def test_extract_h5nc_encoding(self): @@ -1216,11 +1226,13 @@ def test_extract_h5nc_encoding(self): var = xr.Variable(('x',), [1, 2, 3], {}, {'least_sigificant_digit': 2}) with self.assertRaisesRegexp(ValueError, 'unexpected encoding'): - _extract_nc4_encoding(var, raise_on_invalid=True) + _extract_nc4_variable_encoding(var, raise_on_invalid=True) + class MiscObject: pass + @requires_netCDF4 class TestValidateAttrs(TestCase): def test_validating_attrs(self): @@ -1320,6 +1332,7 @@ def new_dataset_and_coord_attrs(): with create_tmp_file() as tmp_file: ds.to_netcdf(tmp_file) + @requires_netCDF4 class TestDataArrayToNetCDF(TestCase): @@ -1332,7 +1345,6 @@ def test_dataarray_to_netcdf_no_name(self): with open_dataarray(tmp) as loaded_da: self.assertDataArrayIdentical(original_da, loaded_da) - def test_dataarray_to_netcdf_with_name(self): original_da = DataArray(np.arange(12).reshape((3, 4)), name='test') @@ -1343,7 +1355,6 @@ def test_dataarray_to_netcdf_with_name(self): with 
open_dataarray(tmp) as loaded_da: self.assertDataArrayIdentical(original_da, loaded_da) - def test_dataarray_to_netcdf_coord_name_clash(self): original_da = DataArray(np.arange(12).reshape((3, 4)), dims=['x', 'y'], From bedad43d09800b0df694faf0316fa67524cc0043 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Tue, 20 Dec 2016 11:37:50 -0800 Subject: [PATCH 02/13] unlimited dims for netcdf4, still working on scipy --- xarray/backends/api.py | 2 ++ xarray/backends/common.py | 4 ++++ xarray/backends/h5netcdf_.py | 5 ++++- xarray/backends/memory.py | 1 + xarray/backends/netCDF4_.py | 28 ++++++++-------------------- xarray/backends/pydap_.py | 1 + xarray/backends/pynio_.py | 7 +++++++ xarray/backends/scipy_.py | 17 ++++++++++++----- xarray/conventions.py | 6 ++++++ xarray/core/dataset.py | 13 +++++++------ xarray/test/test_backends.py | 17 +++++++---------- 11 files changed, 59 insertions(+), 42 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index bc2afa4b373..665f491fc33 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -565,6 +565,8 @@ def to_netcdf(dataset, path=None, mode='w', format=None, group=None, sync = writer is None store = store_cls(path, mode, format, group, writer) + # Copy dataset encoding to datastore + store.encoding = dataset.encoding try: dataset.dump_to_store(store, sync=sync, encoding=encoding) if isinstance(path, BytesIO): diff --git a/xarray/backends/common.py b/xarray/backends/common.py index d8eda48b943..95d8bcd525d 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -74,6 +74,9 @@ def get_attrs(self): # pragma: no cover def get_variables(self): # pragma: no cover raise NotImplementedError + def get_encoding(self): + return {} + def load(self): """ This loads the variables and attributes simultaneously. @@ -95,6 +98,7 @@ def load(self): This function will be called anytime variables or attributes are requested, so care should be taken to make sure its fast. 
""" + self.encoding = self.get_encoding() variables = FrozenOrderedDict((_decode_variable_name(k), v) for k, v in iteritems(self.get_variables())) attributes = FrozenOrderedDict(self.get_attrs()) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 0787240c85b..969d6ff012e 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -100,7 +100,10 @@ def prepare_variable(self, name, variable, check_encoding=False): if dtype is str: dtype = h5py.special_dtype(vlen=unicode_type) - self.set_necessary_dimensions(variable) + unlimited_dims = self.encoding.get('unlimited_dims', set()) + if len(unlimited_dims) > 0: + raise ValueError('h5netcdf does not support unlimited dimensions') + self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims) fill_value = attrs.pop('_FillValue', None) if fill_value in ['\x00']: diff --git a/xarray/backends/memory.py b/xarray/backends/memory.py index bc5ab826672..29e2456d54c 100644 --- a/xarray/backends/memory.py +++ b/xarray/backends/memory.py @@ -21,6 +21,7 @@ class InMemoryDataStore(AbstractWritableDataStore): def __init__(self, variables=None, attributes=None, writer=None): self._variables = OrderedDict() if variables is None else variables self._attributes = OrderedDict() if attributes is None else attributes + self.encoding = {} super(InMemoryDataStore, self).__init__(writer) def get_attrs(self): diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 346573d5f5e..0e970ca1ac5 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -143,15 +143,6 @@ def _force_native_endianness(var): return var -def _extract_nc4_dataset_encoding(ds): - import netCDF4 as nc4 - - encoding = {} - encoding['unlimited_dims'] = set( - [k for k, v in ds.dimensions if nc4.isunlimited(v)]) - return encoding - - def _extract_nc4_variable_encoding(variable, raise_on_invalid=False, lsd_okay=True, backend='netCDF4'): encoding = variable.encoding.copy() @@ -217,8 +208,7 @@ def __init__(self, filename, mode='r', format='NETCDF4', group=None, self._opener = opener self._filename = filename self._mode = 'a' if mode == 'w' else mode - self._unlimited_dimensions = set() - self.encoding = None + self.encoding = {} super(NetCDF4DataStore, self).__init__(writer) def open_store_variable(self, name, var): @@ -258,16 +248,14 @@ def get_attrs(self): for k in self.ds.ncattrs()) def get_dimensions(self): - # TODO: these can be combined somehow to avoid iterating throw - # dimensions twice - self._unlimited_dimensions = self._get_unlimited_dimensions() return FrozenOrderedDict((k, len(v)) for k, v in iteritems(self.ds.dimensions)) - def _get_unlimited_dimensions(self): - import netCDF4 as nc4 - return set( - k for k, v in iteritems(self.ds.dimensions) if nc4.isunlimited(v)) + def get_encoding(self): + encoding = {} + encoding['unlimited_dims'] = set( + [k for k, v in self.ds.dimensions.items() if v.isunlimited()]) + return encoding def set_dimension(self, name, length): self.ds.createDimension(name, size=length) @@ -288,8 +276,8 @@ def prepare_variable(self, name, variable, check_encoding=False): variable = encode_nc3_variable(variable) datatype = variable.dtype - self.set_necessary_dimensions( - variable, unlimited_dims=self._unlimited_dimensions) + unlimited_dims = self.encoding.get('unlimited_dims', set()) + self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims) fill_value = attrs.pop('_FillValue', None) if fill_value in ['', '\x00']: diff --git a/xarray/backends/pydap_.py 
b/xarray/backends/pydap_.py index e3f5cefbb03..0cebe2c8b5a 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -62,6 +62,7 @@ class PydapDataStore(AbstractDataStore): def __init__(self, url): import pydap.client self.ds = pydap.client.open_url(url) + self.encoding = {} def open_store_variable(self, var): data = indexing.LazilyIndexedArray(PydapArrayWrapper(var)) diff --git a/xarray/backends/pynio_.py b/xarray/backends/pynio_.py index 075db5d4ccb..0affcad4562 100644 --- a/xarray/backends/pynio_.py +++ b/xarray/backends/pynio_.py @@ -42,6 +42,7 @@ def __init__(self, filename, mode='r'): self.ds = opener() self._opener = opener self._mode = mode + self.encoding = {} def open_store_variable(self, name, var): data = indexing.LazilyIndexedArray(NioArrayWrapper(name, self)) @@ -57,5 +58,11 @@ def get_attrs(self): def get_dimensions(self): return Frozen(self.ds.dimensions) + def get_encoding(self): + encoding = {} + encoding['unlimited_dims'] = set( + [k for k in self.ds.dimensions if self.ds.unlimited(k)]) + return encoding + def close(self): self.ds.close() diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index 24233299d82..984f062afc6 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -102,12 +102,10 @@ def __init__(self, filename_or_obj, mode='r', format=None, group=None, self.ds = opener() self._opener = opener self._mode = mode + self.encoding = {} super(ScipyDataStore, self).__init__(writer) - self._unlimited_dimensions = set() - self.encoding = {} - def open_store_variable(self, name, var): return Variable(var.dimensions, ScipyArrayWrapper(name, self), _decode_attrs(var._attributes)) @@ -126,6 +124,12 @@ def get_dimensions(self): self._unlimited_dimensions = self._get_unlimited_dimensions() return Frozen(self.ds.dimensions) + def get_encoding(self): + encoding = {} + encoding['unlimited_dims'] = set( + [k for k, v in self.ds.dimensions.items() if v is None]) + return encoding + def set_dimension(self, name, length): if name in self.dimensions: raise ValueError('%s does not support modifying dimensions' @@ -147,8 +151,11 @@ def prepare_variable(self, name, variable, check_encoding=False): raise ValueError('unexpected encoding for scipy backend: %r' % list(variable.encoding)) - self.set_necessary_dimensions( - variable, unlimited_dims=self._unlimited_dimensions) + unlimited_dims = self.encoding.get('unlimited_dims', set()) + + if len(unlimited_dims) > 1: + raise ValueError('NETCDF3 only supports one unlimited dimension') + self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims) data = variable.data # nb. 
this still creates a numpy array in all memory, even though we diff --git a/xarray/conventions.py b/xarray/conventions.py index c93073bf359..8e9b91d0c81 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -941,6 +941,11 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True, vars, attrs = obj.load() extra_coords = set() file_obj = obj + # if obj._unlimited_dimensions: + # unlimited_dims = obj._unlimited_dimensions + # print("unlimited_dims", unlimited_dims) + # else: + # unlimited_dims = set() else: raise TypeError('can only decode Dataset or DataStore objects') @@ -951,6 +956,7 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True, ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars)) ds._file_obj = file_obj ds.encoding = obj.encoding + return ds diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 2f367d65dc1..160ea2cad6d 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -223,7 +223,6 @@ def __init__(self, data_vars=None, coords=None, attrs=None, self._coord_names = set() self._dims = {} self._attrs = None - self._encoding = None self._file_obj = None if data_vars is None: data_vars = {} @@ -233,9 +232,10 @@ def __init__(self, data_vars=None, coords=None, attrs=None, self._set_init_vars_and_dims(data_vars, coords, compat) if attrs is not None: self.attrs = attrs - self._initialized = True + self._encoding = None if encoding is not None: self.encoding = encoding + self._initialized = True def _set_init_vars_and_dims(self, data_vars, coords, compat): """Set the initial value of Dataset variables and dimensions @@ -291,12 +291,12 @@ def encoding(self): """Dictionary of global encoding attributes on this dataset """ if self._encoding is None: - self._encoding = OrderedDict() + self._encoding = dict() return self._encoding @encoding.setter def encoding(self, value): - self._encoding = OrderedDict(value) + self._encoding = dict(value) @property def dims(self): @@ -368,7 +368,7 @@ def compute(self): @classmethod def _construct_direct(cls, variables, coord_names, dims=None, attrs=None, - file_obj=None): + file_obj=None, encoding=None): """Shortcut around __init__ for internal use when we want to skip costly validation """ @@ -378,6 +378,7 @@ def _construct_direct(cls, variables, coord_names, dims=None, attrs=None, obj._dims = dims obj._attrs = attrs obj._file_obj = file_obj + obj._encoding = encoding obj._initialized = True return obj @@ -535,7 +536,7 @@ def __deepcopy__(self, memo=None): @property def _attr_sources(self): """List of places to look-up items for attribute-style access""" - return [self, LevelCoordinatesSource(self), self.attrs, self.encoding] + return [self, LevelCoordinatesSource(self), self.attrs] def __contains__(self, key): """The 'in' operator will return true or false depending on whether diff --git a/xarray/test/test_backends.py b/xarray/test/test_backends.py index a123e9a28c4..9c9050ea895 100644 --- a/xarray/test/test_backends.py +++ b/xarray/test/test_backends.py @@ -736,16 +736,6 @@ def test_unsorted_index_raises(self): except IndexError as err: self.assertIn('first by calling .load', str(err)) - def test_encoding_unlimited_dims(self): - ds = Dataset({'x': ('y', np.arange(10.0))}) - # also test for netcdf3 - kwargs = dict(format='NETCDF4', - encoding={'y': {'unlimited_dims': ['y']}}) - ds.to_netcdf('test.nc', **kwargs) - with self.roundtrip(ds, save_kwargs=kwargs) as actual: - self.assertEqual(actual.x.encoding['unlimited_dims'], set('y')) - self.assertEqual(ds.x.encoding, {}) - 
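The removed test above keyed unlimited dimensions off per-variable ``encoding``; the hunk that follows re-adds it next to ``test_cross_engine_read_write_netcdf3``, using the new dataset-level ``Dataset.encoding`` attribute instead. A hedged usage sketch of the behaviour this test exercises as it stands by the end of the series (the file name is illustrative; assumes the netCDF4 backend is installed):

```python
# Usage sketch: unlimited_dims declared in Dataset.encoding survive a
# write/read round trip. File name is illustrative only.
import numpy as np
import xarray as xr

ds = xr.Dataset({'x': ('y', np.arange(10.0))})
ds.encoding = {'unlimited_dims': ['y']}  # declare 'y' unlimited on write
ds.to_netcdf('unlimited.nc')

with xr.open_dataset('unlimited.nc') as actual:
    # the store's get_encoding() surfaces unlimited dims as a set of names
    assert actual.encoding['unlimited_dims'] == {'y'}
```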
@requires_netCDF4 @requires_dask @@ -924,6 +914,13 @@ def test_cross_engine_read_write_netcdf3(self): engine=read_engine) as actual: self.assertDatasetAllClose(data, actual) + def test_encoding_unlimited_dims(self): + ds = Dataset({'x': ('y', np.arange(10.0))}) + ds.encoding = {'unlimited_dims': ['y']} + ds.to_netcdf('test.nc') + with self.roundtrip(ds) as actual: + self.assertEqual(actual.x.encoding['unlimited_dims'], set('y')) + @requires_h5netcdf @requires_netCDF4 From c797511d9a7a68ddbebedf5137268e95e222508f Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Tue, 20 Dec 2016 12:03:58 -0800 Subject: [PATCH 03/13] fix two bugs in h5netcdf tests --- xarray/backends/h5netcdf_.py | 1 + xarray/test/test_backends.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 969d6ff012e..85d5bb7743d 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -58,6 +58,7 @@ def __init__(self, filename, mode='r', format=None, group=None, self._opener = opener self._filename = filename self._mode = mode + self.encoding = {} super(H5NetCDFStore, self).__init__(writer) def open_store_variable(self, name, var): diff --git a/xarray/test/test_backends.py b/xarray/test/test_backends.py index 9c9050ea895..eadd1b20ecf 100644 --- a/xarray/test/test_backends.py +++ b/xarray/test/test_backends.py @@ -917,7 +917,6 @@ def test_cross_engine_read_write_netcdf3(self): def test_encoding_unlimited_dims(self): ds = Dataset({'x': ('y', np.arange(10.0))}) ds.encoding = {'unlimited_dims': ['y']} - ds.to_netcdf('test.nc') with self.roundtrip(ds) as actual: self.assertEqual(actual.x.encoding['unlimited_dims'], set('y')) From e794165307fac86df073fd1106ed460f86432e33 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Tue, 27 Dec 2016 10:46:41 -0800 Subject: [PATCH 04/13] fix failing tests, try workaround for scipy/scipy#6880 --- xarray/backends/common.py | 6 +++++- xarray/backends/h5netcdf_.py | 5 ++++- xarray/test/test_backends.py | 7 ++++++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 95d8bcd525d..198c0c5ef81 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -146,7 +146,11 @@ def add(self, source, target): self.sources.append(source) self.targets.append(target) else: - target[...] = source + try: + target[...] = source + except TypeError: + # workaround for GH: scipy/scipy#6880 + target[slice(None, None, None)] = source def sync(self): if self.sources: diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 85d5bb7743d..18c294e0296 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -2,6 +2,7 @@ from __future__ import division from __future__ import print_function import functools +import warnings from .. 
import Variable from ..core import indexing @@ -103,7 +104,9 @@ def prepare_variable(self, name, variable, check_encoding=False): unlimited_dims = self.encoding.get('unlimited_dims', set()) if len(unlimited_dims) > 0: - raise ValueError('h5netcdf does not support unlimited dimensions') + warnings.warn('h5netcdf does not support unlimited dimensions', + UserWarning) + unlimited_dims = set() self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims) fill_value = attrs.pop('_FillValue', None) diff --git a/xarray/test/test_backends.py b/xarray/test/test_backends.py index 45445f7098f..80cf9c13b79 100644 --- a/xarray/test/test_backends.py +++ b/xarray/test/test_backends.py @@ -924,7 +924,7 @@ def test_encoding_unlimited_dims(self): ds = Dataset({'x': ('y', np.arange(10.0))}) ds.encoding = {'unlimited_dims': ['y']} with self.roundtrip(ds) as actual: - self.assertEqual(actual.x.encoding['unlimited_dims'], set('y')) + self.assertEqual(actual.encoding['unlimited_dims'], set('y')) @requires_h5netcdf @@ -978,6 +978,11 @@ def test_read_byte_attrs_as_unicode(self): expected = Dataset(attrs={'foo': 'bar'}) self.assertDatasetIdentical(expected, actual) + def test_encoding_unlimited_dims(self): + ds = Dataset({'x': ('y', np.arange(10.0))}) + ds.encoding = {'unlimited_dims': ['y']} + with pytest.warns(UserWarning): + ds.to_netcdf('foo-bar.nc', engine='h5netcdf') @requires_dask @requires_scipy From 2ba66887554435ddc267b7278edbcdbf0ca01ca3 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Tue, 27 Dec 2016 10:55:53 -0800 Subject: [PATCH 05/13] cleanup --- xarray/conventions.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index 8e9b91d0c81..7265a54f595 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -941,11 +941,6 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True, vars, attrs = obj.load() extra_coords = set() file_obj = obj - # if obj._unlimited_dimensions: - # unlimited_dims = obj._unlimited_dimensions - # print("unlimited_dims", unlimited_dims) - # else: - # unlimited_dims = set() else: raise TypeError('can only decode Dataset or DataStore objects') From b7bd0b8028b35de25c18f722adb5b72e1aeaabca Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Tue, 27 Dec 2016 11:01:50 -0800 Subject: [PATCH 06/13] simple slice in scipy workaround --- doc/api.rst | 1 + xarray/backends/common.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/api.rst b/doc/api.rst index fd25a09de23..0e3fd71eeb6 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -46,6 +46,7 @@ Attributes Dataset.data_vars Dataset.coords Dataset.attrs + Dataset.encoding Dataset.indexes Dataset.get_index diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 198c0c5ef81..6abf3f5f6c0 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -150,7 +150,7 @@ def add(self, source, target): target[...] 
= source except TypeError: # workaround for GH: scipy/scipy#6880 - target[slice(None, None, None)] = source + target[:] = source def sync(self): if self.sources: From fdbd55d15da77770ebbd50f6a99ba1d8dc81cdde Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Wed, 28 Dec 2016 13:36:27 -0800 Subject: [PATCH 07/13] initial fixes after @shoyer's review --- xarray/backends/api.py | 5 +++-- xarray/backends/common.py | 19 +++++++++++++------ xarray/backends/h5netcdf_.py | 8 +++++--- xarray/backends/netCDF4_.py | 10 ++++++---- xarray/backends/scipy_.py | 7 ++++--- xarray/core/dataset.py | 23 ++++++++++++++++++----- 6 files changed, 49 insertions(+), 23 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 665f491fc33..74bf6a7e139 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -526,7 +526,7 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, def to_netcdf(dataset, path=None, mode='w', format=None, group=None, - engine=None, writer=None, encoding=None): + engine=None, writer=None, encoding=None, unlimited_dims=None): """This function creates an appropriate datastore for writing a dataset to disk as a netCDF file @@ -568,7 +568,8 @@ def to_netcdf(dataset, path=None, mode='w', format=None, group=None, # Copy dataset encoding to datastore store.encoding = dataset.encoding try: - dataset.dump_to_store(store, sync=sync, encoding=encoding) + dataset.dump_to_store(store, sync=sync, encoding=encoding, + unlimited_dims=unlimited_dims) if isinstance(path, BytesIO): return path.getvalue() finally: diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 6abf3f5f6c0..e653d6ef44f 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -189,22 +189,28 @@ def store_dataset(self, dataset): # dataset.variables self.store(dataset, dataset.attrs) - def store(self, variables, attributes, check_encoding_set=frozenset()): + def store(self, variables, attributes, check_encoding_set=frozenset(), + unlimited_dims=None): self.set_attributes(attributes) - self.set_variables(variables, check_encoding_set) + self.set_variables(variables, check_encoding_set, + unlimited_dims=unlimited_dims) def set_attributes(self, attributes): for k, v in iteritems(attributes): self.set_attribute(k, v) - def set_variables(self, variables, check_encoding_set): + def set_variables(self, variables, check_encoding_set, + unlimited_dims=None): for vn, v in iteritems(variables): name = _encode_variable_name(vn) check = vn in check_encoding_set - target, source = self.prepare_variable(name, v, check) + target, source = self.prepare_variable( + name, v, check, unlimited_dims=unlimited_dims) self.writer.add(source, target) - def set_necessary_dimensions(self, variable, unlimited_dims=set()): + def set_necessary_dimensions(self, variable, unlimited_dims=None): + if unlimited_dims is None: + unlimited_dims = set() for d, l in zip(variable.dims, variable.shape): if d not in self.dimensions: if d in unlimited_dims: @@ -213,7 +219,8 @@ def set_necessary_dimensions(self, variable, unlimited_dims=set()): class WritableCFDataStore(AbstractWritableDataStore): - def store(self, variables, attributes, check_encoding_set=frozenset()): + def store(self, variables, attributes, check_encoding_set=frozenset(), + unlimited_dims=None): # All NetCDF files get CF encoded by default, without this attempting # to write times, for example, would fail. 
cf_variables, cf_attrs = cf_encoder(variables, attributes) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 18c294e0296..8fc6e0b3d78 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -94,7 +94,8 @@ def set_dimension(self, name, length): def set_attribute(self, key, value): self.ds.setncattr(key, value) - def prepare_variable(self, name, variable, check_encoding=False): + def prepare_variable(self, name, variable, check_encoding=False, + unlimited_dims=None): import h5py attrs = variable.attrs.copy() @@ -102,8 +103,9 @@ def prepare_variable(self, name, variable, check_encoding=False): if dtype is str: dtype = h5py.special_dtype(vlen=unicode_type) - unlimited_dims = self.encoding.get('unlimited_dims', set()) - if len(unlimited_dims) > 0: + if unlimited_dims is None: + unlimited_dims = self.encoding.get('unlimited_dims', set()) + if unlimited_dims is not None or len(unlimited_dims) > 0: warnings.warn('h5netcdf does not support unlimited dimensions', UserWarning) unlimited_dims = set() diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 0e970ca1ac5..7e7fe8f5a85 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -253,8 +253,8 @@ def get_dimensions(self): def get_encoding(self): encoding = {} - encoding['unlimited_dims'] = set( - [k for k, v in self.ds.dimensions.items() if v.isunlimited()]) + encoding['unlimited_dims'] = { + k for k, v in self.ds.dimensions.items() if v.isunlimited()} return encoding def set_dimension(self, name, length): @@ -265,7 +265,8 @@ def set_attribute(self, key, value): value = encode_nc3_attr_value(value) self.ds.setncattr(key, value) - def prepare_variable(self, name, variable, check_encoding=False): + def prepare_variable(self, name, variable, check_encoding=False, + unlimited_dims=None): attrs = variable.attrs.copy() variable = _force_native_endianness(variable) @@ -276,7 +277,8 @@ def prepare_variable(self, name, variable, check_encoding=False): variable = encode_nc3_variable(variable) datatype = variable.dtype - unlimited_dims = self.encoding.get('unlimited_dims', set()) + if unlimited_dims is None: + unlimited_dims = self.encoding.get('unlimited_dims', set()) self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims) fill_value = attrs.pop('_FillValue', None) diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index 984f062afc6..237662dedf1 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -145,13 +145,14 @@ def set_attribute(self, key, value): value = encode_nc3_attr_value(value) setattr(self.ds, key, value) - def prepare_variable(self, name, variable, check_encoding=False): + def prepare_variable(self, name, variable, check_encoding=False, + unlimited_dims=None): variable = encode_nc3_variable(variable) if check_encoding and variable.encoding: raise ValueError('unexpected encoding for scipy backend: %r' % list(variable.encoding)) - - unlimited_dims = self.encoding.get('unlimited_dims', set()) + if unlimited_dims is None: + unlimited_dims = self.encoding.get('unlimited_dims', set()) if len(unlimited_dims) > 1: raise ValueError('NETCDF3 only supports one unlimited dimension') diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 6bf19b334dc..7358f1a3814 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -339,6 +339,11 @@ def __init__(self, data_vars=None, coords=None, attrs=None, - 'equals': all values and dimensions must be the same. 
- 'identical': all values, dimensions and attributes must be the same. + encoding : dict_like or None, optional + Dictionary specifying how to encode this Dataset's data into a + serialized format like netCDF. Currently used keys (for netCDF) + include 'unlimited_dims'. + Unrecognized keys are ignored. """ self._variables = OrderedDict() self._coord_names = set() @@ -412,7 +417,7 @@ def encoding(self): """Dictionary of global encoding attributes on this dataset """ if self._encoding is None: - self._encoding = dict() + self._encoding = {} return self._encoding @encoding.setter @@ -865,7 +870,8 @@ def reset_coords(self, names=None, drop=False, inplace=False): del obj._variables[name] return obj - def dump_to_store(self, store, encoder=None, sync=True, encoding=None): + def dump_to_store(self, store, encoder=None, sync=True, encoding=None, + unlimited_dims=None): """Store dataset contents to a backends.*DataStore object.""" if encoding is None: encoding = {} @@ -881,12 +887,13 @@ def dump_to_store(self, store, encoder=None, sync=True, encoding=None): if encoder: variables, attrs = encoder(variables, attrs) - store.store(variables, attrs, check_encoding) + store.store(variables, attrs, check_encoding, + unlimited_dims=unlimited_dims) if sync: store.sync() def to_netcdf(self, path=None, mode='w', format=None, group=None, - engine=None, encoding=None): + engine=None, encoding=None, unlimited_dims=None): """Write dataset contents to a netCDF file. Parameters @@ -930,12 +937,18 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None, Nested dictionary with variable names as keys and dictionaries of variable specific encodings as values, e.g., ``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1, 'zlib': True}, ...}`` + unlimited_dims : str or sequence of str, optional + Dimension(s) that should be serialized as unlimited dimensions. + By default, no dimensions are treated as unlimited dimensions. + Note that unlimited_dims may also be set via + ``dataset.encoding['unlimited_dims']``. """ if encoding is None: encoding = {} from ..backends.api import to_netcdf return to_netcdf(self, path, mode, format=format, group=group, - engine=engine, encoding=encoding) + engine=engine, encoding=encoding, + unlimited_dims=unlimited_dims) def __unicode__(self): return formatting.dataset_repr(self) From 47442e6f895109758e41e8c6df144461102ee827 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Wed, 28 Dec 2016 14:19:00 -0800 Subject: [PATCH 08/13] fix failing test by passing unlimited_dims through to in memory store --- xarray/backends/common.py | 5 ++--- xarray/backends/memory.py | 2 +- xarray/test/test_dataset.py | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index e653d6ef44f..2f37530c22a 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -219,13 +219,12 @@ def set_necessary_dimensions(self, variable, unlimited_dims=None): class WritableCFDataStore(AbstractWritableDataStore): - def store(self, variables, attributes, check_encoding_set=frozenset(), - unlimited_dims=None): + def store(self, variables, attributes, *args, **kwargs): # All NetCDF files get CF encoded by default, without this attempting # to write times, for example, would fail. 
cf_variables, cf_attrs = cf_encoder(variables, attributes) AbstractWritableDataStore.store(self, cf_variables, cf_attrs, - check_encoding_set) + *args, **kwargs) class DataStorePickleMixin(object): diff --git a/xarray/backends/memory.py b/xarray/backends/memory.py index 29e2456d54c..3d43ba6ca61 100644 --- a/xarray/backends/memory.py +++ b/xarray/backends/memory.py @@ -30,7 +30,7 @@ def get_attrs(self): def get_variables(self): return self._variables - def prepare_variable(self, k, v, check_encoding=False): + def prepare_variable(self, k, v, *args, **kwargs): new_var = Variable(v.dims, np.empty_like(v), v.attrs) # we copy the variable and stuff all encodings in the # attributes to imitate what happens when writing to disk. diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index 54f6cb273a9..f3b3b475531 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -61,9 +61,9 @@ def __init__(self, writer=None): super(InaccessibleVariableDataStore, self).__init__(writer) self._indexvars = set() - def store(self, variables, attributes, check_encoding_set=frozenset()): + def store(self, variables, *args, **kwargs): super(InaccessibleVariableDataStore, self).store( - variables, attributes, check_encoding_set) + variables, *args, **kwargs) for k, v in variables.items(): if isinstance(v, IndexVariable): self._indexvars.add(k) From 2df224c8a7836495b5f5afcf5b59317d6289800a Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Wed, 28 Dec 2016 14:21:04 -0800 Subject: [PATCH 09/13] remove encoding from dataset constructor --- xarray/core/dataset.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 7358f1a3814..72148d11c82 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -28,7 +28,6 @@ broadcast_variables) from .pycompat import (iteritems, basestring, OrderedDict, dask_array_type, range) -from .formatting import ensure_valid_repr from .combine import concat from .options import OPTIONS @@ -306,7 +305,7 @@ class Dataset(Mapping, ImplementsDatasetReduce, BaseDataObject, groupby_cls = groupby.DatasetGroupBy def __init__(self, data_vars=None, coords=None, attrs=None, - compat='broadcast_equals', encoding=None): + compat='broadcast_equals'): """To load data from a file or file-like object, use the `open_dataset` function. @@ -339,11 +338,6 @@ def __init__(self, data_vars=None, coords=None, attrs=None, - 'equals': all values and dimensions must be the same. - 'identical': all values, dimensions and attributes must be the same. - encoding : dict_like or None, optional - Dictionary specifying how to encode this Dataset's data into a - serialized format like netCDF. Currently used keys (for netCDF) - include 'unlimited_dims'. - Unrecognized keys are ignored. 
""" self._variables = OrderedDict() self._coord_names = set() @@ -359,8 +353,6 @@ def __init__(self, data_vars=None, coords=None, attrs=None, if attrs is not None: self.attrs = attrs self._encoding = None - if encoding is not None: - self.encoding = encoding self._initialized = True def _set_init_vars_and_dims(self, data_vars, coords, compat): From eead3e4d3e94b1af7da782bb66c2e24b23901500 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Wed, 28 Dec 2016 15:32:34 -0800 Subject: [PATCH 10/13] more tests for unlimited_dims and update whats-new --- doc/whats-new.rst | 6 +++--- xarray/backends/h5netcdf_.py | 3 +++ xarray/test/test_backends.py | 8 ++++++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 15149abbbc9..aeb52603ec4 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -165,9 +165,9 @@ Enhancements and attributes. The method prints to a buffer (e.g. ``stdout``) with output similar to what the command line utility ``ncdump -h`` produces (:issue:`1150`). By `Joe Hamman `_. - -- Added the ability write unlimited netCDF dimensions with the ``netcdf4`` - backend. +- Added the ability write unlimited netCDF dimensions with the ``scipy`` and + ``netcdf4`` backends via the new :py:attr:`~xray.Dataset.encoding` attribute + or via the ``unlimited_dims`` argument to :py:meth:`~xray.Dataset.to_netcdf`. By `Joe Hamman `_. Bug fixes diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 8fc6e0b3d78..1a711049c0e 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -105,6 +105,9 @@ def prepare_variable(self, name, variable, check_encoding=False, if unlimited_dims is None: unlimited_dims = self.encoding.get('unlimited_dims', set()) + else: + raise NotImplementedError('h5netcdf does not support unlimited' + 'dimensions') if unlimited_dims is not None or len(unlimited_dims) > 0: warnings.warn('h5netcdf does not support unlimited dimensions', UserWarning) diff --git a/xarray/test/test_backends.py b/xarray/test/test_backends.py index 80cf9c13b79..477072268d3 100644 --- a/xarray/test/test_backends.py +++ b/xarray/test/test_backends.py @@ -922,9 +922,14 @@ def test_cross_engine_read_write_netcdf3(self): def test_encoding_unlimited_dims(self): ds = Dataset({'x': ('y', np.arange(10.0))}) + with self.roundtrip(ds, + save_kwargs=dict(unlimited_dims=['y'])) as actual: + self.assertEqual(actual.encoding['unlimited_dims'], set('y')) + self.assertDatasetEqual(ds, actual) ds.encoding = {'unlimited_dims': ['y']} with self.roundtrip(ds) as actual: self.assertEqual(actual.encoding['unlimited_dims'], set('y')) + self.assertDatasetEqual(ds, actual) @requires_h5netcdf @@ -983,6 +988,9 @@ def test_encoding_unlimited_dims(self): ds.encoding = {'unlimited_dims': ['y']} with pytest.warns(UserWarning): ds.to_netcdf('foo-bar.nc', engine='h5netcdf') + with pytest.raises(NotImplementedError): + ds.to_netcdf('foo-bar.nc', engine='h5netcdf', unlimited_dims=['y']) + @requires_dask @requires_scipy From 33dd0626227d6018a3e971a456d2993e66c2aaf1 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Wed, 18 Jan 2017 23:43:59 -0700 Subject: [PATCH 11/13] refactor unlimited dimensions / dataset encoding to avoid using DataStore statespace, respond to a few of @shoyer's comments --- xarray/backends/api.py | 5 +++-- xarray/backends/common.py | 2 -- xarray/backends/h5netcdf_.py | 9 +-------- xarray/backends/netCDF4_.py | 3 --- xarray/backends/scipy_.py | 9 +++------ xarray/conventions.py | 4 +++- xarray/core/dataset.py | 2 +- 7 files 
changed, 11 insertions(+), 23 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index b057e8bff4d..441bf08bb8e 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -561,8 +561,9 @@ def to_netcdf(dataset, path=None, mode='w', format=None, group=None, sync = writer is None store = store_cls(path, mode, format, group, writer) - # Copy dataset encoding to datastore - store.encoding = dataset.encoding + + if unlimited_dims is None: + unlimited_dims = dataset.encoding.get('unlimited_dims', None) try: dataset.dump_to_store(store, sync=sync, encoding=encoding, unlimited_dims=unlimited_dims) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index e1c0f7a6169..e7cbd0bd9ae 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -5,7 +5,6 @@ import logging import time import traceback -import threading from collections import Mapping from distutils.version import StrictVersion @@ -108,7 +107,6 @@ def load(self): This function will be called anytime variables or attributes are requested, so care should be taken to make sure its fast. """ - self.encoding = self.get_encoding() variables = FrozenOrderedDict((_decode_variable_name(k), v) for k, v in iteritems(self.get_variables())) attributes = FrozenOrderedDict(self.get_attrs()) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 1a711049c0e..b53973507a4 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -2,7 +2,6 @@ from __future__ import division from __future__ import print_function import functools -import warnings from .. import Variable from ..core import indexing @@ -103,15 +102,9 @@ def prepare_variable(self, name, variable, check_encoding=False, if dtype is str: dtype = h5py.special_dtype(vlen=unicode_type) - if unlimited_dims is None: - unlimited_dims = self.encoding.get('unlimited_dims', set()) - else: + if unlimited_dims is not None: raise NotImplementedError('h5netcdf does not support unlimited' 'dimensions') - if unlimited_dims is not None or len(unlimited_dims) > 0: - warnings.warn('h5netcdf does not support unlimited dimensions', - UserWarning) - unlimited_dims = set() self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims) fill_value = attrs.pop('_FillValue', None) diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 7e7fe8f5a85..f72a26affaf 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -208,7 +208,6 @@ def __init__(self, filename, mode='r', format='NETCDF4', group=None, self._opener = opener self._filename = filename self._mode = 'a' if mode == 'w' else mode - self.encoding = {} super(NetCDF4DataStore, self).__init__(writer) def open_store_variable(self, name, var): @@ -277,8 +276,6 @@ def prepare_variable(self, name, variable, check_encoding=False, variable = encode_nc3_variable(variable) datatype = variable.dtype - if unlimited_dims is None: - unlimited_dims = self.encoding.get('unlimited_dims', set()) self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims) fill_value = attrs.pop('_FillValue', None) diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index 237662dedf1..d419735d744 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -121,13 +121,12 @@ def _get_unlimited_dimensions(self): return set(k for k, v in iteritems(self.ds.dimensions) if v is None) def get_dimensions(self): - self._unlimited_dimensions = self._get_unlimited_dimensions() return Frozen(self.ds.dimensions) def 
get_encoding(self): encoding = {} - encoding['unlimited_dims'] = set( - [k for k, v in self.ds.dimensions.items() if v is None]) + encoding['unlimited_dims'] = { + k for k, v in self.ds.dimensions.items() if v is None} return encoding def set_dimension(self, name, length): @@ -151,10 +150,8 @@ def prepare_variable(self, name, variable, check_encoding=False, if check_encoding and variable.encoding: raise ValueError('unexpected encoding for scipy backend: %r' % list(variable.encoding)) - if unlimited_dims is None: - unlimited_dims = self.encoding.get('unlimited_dims', set()) - if len(unlimited_dims) > 1: + if unlimited_dims is not None and len(unlimited_dims) > 1: raise ValueError('NETCDF3 only supports one unlimited dimension') self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims) diff --git a/xarray/conventions.py b/xarray/conventions.py index 7265a54f595..178ee8442db 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -937,10 +937,12 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True, attrs = obj.attrs extra_coords = set(obj.coords) file_obj = obj._file_obj + encoding = obj.encoding elif isinstance(obj, AbstractDataStore): vars, attrs = obj.load() extra_coords = set() file_obj = obj + encoding = obj.get_encoding() else: raise TypeError('can only decode Dataset or DataStore objects') @@ -950,7 +952,7 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True, ds = Dataset(vars, attrs=attrs) ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars)) ds._file_obj = file_obj - ds.encoding = obj.encoding + ds.encoding = encoding return ds diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 1f275bf5e57..3288bca9514 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -936,7 +936,7 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None, Nested dictionary with variable names as keys and dictionaries of variable specific encodings as values, e.g., ``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1, 'zlib': True}, ...}`` - unlimited_dims : str or sequence of str, optional + unlimited_dims : sequence of str, optional Dimension(s) that should be serialized as unlimited dimensions. By default, no dimensions are treated as unlimited dimensions. Note that unlimited_dims may also be set via From 65df346fcd34fef4cdb8ccab257224cc7ebe4cce Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Thu, 19 Jan 2017 09:00:20 -0700 Subject: [PATCH 12/13] raise user warning if unlimited dims is used with h5netcdf --- xarray/backends/h5netcdf_.py | 6 ++++-- xarray/test/test_backends.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index b53973507a4..b2cf60f15d0 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -2,6 +2,7 @@ from __future__ import division from __future__ import print_function import functools +import warnings from .. import Variable from ..core import indexing @@ -103,8 +104,9 @@ def prepare_variable(self, name, variable, check_encoding=False, dtype = h5py.special_dtype(vlen=unicode_type) if unlimited_dims is not None: - raise NotImplementedError('h5netcdf does not support unlimited' - 'dimensions') + warnings.warn('h5netcdf does not support unlimited dimensions, ' + 'got: %s.' 
% unlimited_dims) + unlimited_dims = None self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims) fill_value = attrs.pop('_FillValue', None) diff --git a/xarray/test/test_backends.py b/xarray/test/test_backends.py index 241be37aa33..6a204bb5ad6 100644 --- a/xarray/test/test_backends.py +++ b/xarray/test/test_backends.py @@ -988,7 +988,7 @@ def test_encoding_unlimited_dims(self): ds.encoding = {'unlimited_dims': ['y']} with pytest.warns(UserWarning): ds.to_netcdf('foo-bar.nc', engine='h5netcdf') - with pytest.raises(NotImplementedError): + with pytest.warns(UserWarning): ds.to_netcdf('foo-bar.nc', engine='h5netcdf', unlimited_dims=['y']) From db964a1a34cc42f85608a1430e6f06d4d45d8e67 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Mon, 23 Jan 2017 10:21:33 -0800 Subject: [PATCH 13/13] cleanup backends after unlimited_dims changes --- xarray/backends/memory.py | 1 - xarray/backends/pydap_.py | 1 - xarray/backends/pynio_.py | 1 - xarray/backends/scipy_.py | 4 ---- 4 files changed, 7 deletions(-) diff --git a/xarray/backends/memory.py b/xarray/backends/memory.py index 3d43ba6ca61..f79e92439fe 100644 --- a/xarray/backends/memory.py +++ b/xarray/backends/memory.py @@ -21,7 +21,6 @@ class InMemoryDataStore(AbstractWritableDataStore): def __init__(self, variables=None, attributes=None, writer=None): self._variables = OrderedDict() if variables is None else variables self._attributes = OrderedDict() if attributes is None else attributes - self.encoding = {} super(InMemoryDataStore, self).__init__(writer) def get_attrs(self): diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 0cebe2c8b5a..e3f5cefbb03 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -62,7 +62,6 @@ class PydapDataStore(AbstractDataStore): def __init__(self, url): import pydap.client self.ds = pydap.client.open_url(url) - self.encoding = {} def open_store_variable(self, var): data = indexing.LazilyIndexedArray(PydapArrayWrapper(var)) diff --git a/xarray/backends/pynio_.py b/xarray/backends/pynio_.py index 0affcad4562..8bb759503b9 100644 --- a/xarray/backends/pynio_.py +++ b/xarray/backends/pynio_.py @@ -42,7 +42,6 @@ def __init__(self, filename, mode='r'): self.ds = opener() self._opener = opener self._mode = mode - self.encoding = {} def open_store_variable(self, name, var): data = indexing.LazilyIndexedArray(NioArrayWrapper(name, self)) diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index d419735d744..281d23fa40a 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -102,7 +102,6 @@ def __init__(self, filename_or_obj, mode='r', format=None, group=None, self.ds = opener() self._opener = opener self._mode = mode - self.encoding = {} super(ScipyDataStore, self).__init__(writer) @@ -117,9 +116,6 @@ def get_variables(self): def get_attrs(self): return Frozen(_decode_attrs(self.ds._attributes)) - def _get_unlimited_dimensions(self): - return set(k for k, v in iteritems(self.ds.dimensions) if v is None) - def get_dimensions(self): return Frozen(self.ds.dimensions)
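Taken together, the series leaves two ways to request unlimited dimensions when writing. A summary sketch of the final API, with illustrative file names (note that the scipy/NETCDF3 backend allows at most one unlimited dimension, and the h5netcdf backend warns and ignores the request):

```python
# Summary sketch of the two entry points this series adds.
# File names are illustrative only.
import numpy as np
import xarray as xr

ds = xr.Dataset({'x': ('y', np.arange(10.0))})

# 1. The new unlimited_dims keyword on to_netcdf (added in PATCH 07):
ds.to_netcdf('via_kwarg.nc', unlimited_dims=['y'])

# 2. The new Dataset.encoding attribute; to_netcdf falls back to
#    dataset.encoding.get('unlimited_dims') when the keyword is omitted
#    (see PATCH 11's change to xarray/backends/api.py):
ds.encoding = {'unlimited_dims': ['y']}
ds.to_netcdf('via_encoding.nc')
```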