-
-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Dataset.encoding and unlimited dimensions for to_netcdf #1170
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
271a751
bedad43
c797511
affac00
ca60729
3d24610
e794165
2ba6688
b7bd0b8
fdbd55d
47442e6
2df224c
eead3e4
fac2f89
33dd062
65df346
b076c15
db964a1
cb22ba1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,6 @@ | |
from __future__ import division | ||
from __future__ import print_function | ||
import numpy as np | ||
import itertools | ||
import logging | ||
import time | ||
import traceback | ||
|
@@ -12,7 +11,7 @@ | |
|
||
from ..conventions import cf_encoder | ||
from ..core.utils import FrozenOrderedDict | ||
from ..core.pycompat import iteritems, dask_array_type, OrderedDict | ||
from ..core.pycompat import iteritems, dask_array_type | ||
|
||
# Create a logger object, but don't add any handlers. Leave that to user code. | ||
logger = logging.getLogger(__name__) | ||
|
@@ -75,6 +74,9 @@ def get_attrs(self): # pragma: no cover | |
def get_variables(self): # pragma: no cover | ||
raise NotImplementedError | ||
|
||
def get_encoding(self): | ||
return {} | ||
|
||
def load(self): | ||
""" | ||
This loads the variables and attributes simultaneously. | ||
|
@@ -96,8 +98,9 @@ def load(self): | |
This function will be called anytime variables or attributes | ||
are requested, so care should be taken to make sure its fast. | ||
""" | ||
variables = FrozenOrderedDict((_decode_variable_name(k), v) | ||
for k, v in iteritems(self.get_variables())) | ||
self.encoding = self.get_encoding() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a little dangerous -- There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fair point, I've removed the |
||
variables = FrozenOrderedDict((_decode_variable_name(k), v) for k, v in | ||
iteritems(self.get_variables())) | ||
attributes = FrozenOrderedDict(self.get_attrs()) | ||
return variables, attributes | ||
|
||
|
@@ -143,7 +146,11 @@ def add(self, source, target): | |
self.sources.append(source) | ||
self.targets.append(target) | ||
else: | ||
target[...] = source | ||
try: | ||
target[...] = source | ||
except TypeError: | ||
# workaround for GH: scipy/scipy#6880 | ||
target[:] = source | ||
|
||
def sync(self): | ||
if self.sources: | ||
|
@@ -197,9 +204,11 @@ def set_variables(self, variables, check_encoding_set): | |
target, source = self.prepare_variable(name, v, check) | ||
self.writer.add(source, target) | ||
|
||
def set_necessary_dimensions(self, variable): | ||
def set_necessary_dimensions(self, variable, unlimited_dims=set()): | ||
for d, l in zip(variable.dims, variable.shape): | ||
if d not in self.dimensions: | ||
if d in unlimited_dims: | ||
l = None | ||
self.set_dimension(d, l) | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,15 +2,16 @@ | |
from __future__ import division | ||
from __future__ import print_function | ||
import functools | ||
import warnings | ||
|
||
from .. import Variable | ||
from ..core import indexing | ||
from ..core.utils import FrozenOrderedDict, close_on_error, Frozen | ||
from ..core.pycompat import iteritems, bytes_type, unicode_type, OrderedDict | ||
|
||
from .common import WritableCFDataStore, DataStorePickleMixin | ||
from .netCDF4_ import (_nc4_group, _nc4_values_and_dtype, _extract_nc4_encoding, | ||
BaseNetCDF4Array) | ||
from .netCDF4_ import (_nc4_group, _nc4_values_and_dtype, | ||
_extract_nc4_variable_encoding, BaseNetCDF4Array) | ||
|
||
|
||
def maybe_decode_bytes(txt): | ||
|
@@ -33,7 +34,7 @@ def _read_attributes(h5netcdf_var): | |
return attrs | ||
|
||
|
||
_extract_h5nc_encoding = functools.partial(_extract_nc4_encoding, | ||
_extract_h5nc_encoding = functools.partial(_extract_nc4_variable_encoding, | ||
lsd_okay=False, backend='h5netcdf') | ||
|
||
|
||
|
@@ -58,6 +59,7 @@ def __init__(self, filename, mode='r', format=None, group=None, | |
self._opener = opener | ||
self._filename = filename | ||
self._mode = mode | ||
self.encoding = {} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This still should go away :) |
||
super(H5NetCDFStore, self).__init__(writer) | ||
|
||
def open_store_variable(self, name, var): | ||
|
@@ -100,7 +102,12 @@ def prepare_variable(self, name, variable, check_encoding=False): | |
if dtype is str: | ||
dtype = h5py.special_dtype(vlen=unicode_type) | ||
|
||
self.set_necessary_dimensions(variable) | ||
unlimited_dims = self.encoding.get('unlimited_dims', set()) | ||
if len(unlimited_dims) > 0: | ||
warnings.warn('h5netcdf does not support unlimited dimensions', | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Actually, |
||
UserWarning) | ||
unlimited_dims = set() | ||
self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims) | ||
|
||
fill_value = attrs.pop('_FillValue', None) | ||
if fill_value in ['\x00']: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,6 +21,7 @@ class InMemoryDataStore(AbstractWritableDataStore): | |
def __init__(self, variables=None, attributes=None, writer=None): | ||
self._variables = OrderedDict() if variables is None else variables | ||
self._attributes = OrderedDict() if attributes is None else attributes | ||
self.encoding = {} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. delete |
||
super(InMemoryDataStore, self).__init__(writer) | ||
|
||
def get_attrs(self): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,7 +7,7 @@ | |
import numpy as np | ||
|
||
from .. import Variable | ||
from ..conventions import pop_to, cf_encoder | ||
from ..conventions import pop_to | ||
from ..core import indexing | ||
from ..core.utils import (FrozenOrderedDict, NDArrayMixin, | ||
close_on_error, is_remote_uri) | ||
|
@@ -138,13 +138,13 @@ def _force_native_endianness(var): | |
# check to see if encoding has a value for endian its 'native' | ||
if not var.encoding.get('endian', 'native') is 'native': | ||
raise NotImplementedError("Attempt to write non-native endian type, " | ||
"this is not supported by the netCDF4 python " | ||
"library.") | ||
"this is not supported by the netCDF4 " | ||
"python library.") | ||
return var | ||
|
||
|
||
def _extract_nc4_encoding(variable, raise_on_invalid=False, lsd_okay=True, | ||
backend='netCDF4'): | ||
def _extract_nc4_variable_encoding(variable, raise_on_invalid=False, | ||
lsd_okay=True, backend='netCDF4'): | ||
encoding = variable.encoding.copy() | ||
|
||
safe_to_drop = set(['source', 'original_shape']) | ||
|
@@ -154,9 +154,8 @@ def _extract_nc4_encoding(variable, raise_on_invalid=False, lsd_okay=True, | |
valid_encodings.add('least_significant_digit') | ||
|
||
if (encoding.get('chunksizes') is not None and | ||
(encoding.get('original_shape', variable.shape) | ||
!= variable.shape) and | ||
not raise_on_invalid): | ||
(encoding.get('original_shape', variable.shape) != | ||
variable.shape) and not raise_on_invalid): | ||
del encoding['chunksizes'] | ||
|
||
for k in safe_to_drop: | ||
|
@@ -209,6 +208,7 @@ def __init__(self, filename, mode='r', format='NETCDF4', group=None, | |
self._opener = opener | ||
self._filename = filename | ||
self._mode = 'a' if mode == 'w' else mode | ||
self.encoding = {} | ||
super(NetCDF4DataStore, self).__init__(writer) | ||
|
||
def open_store_variable(self, name, var): | ||
|
@@ -251,6 +251,12 @@ def get_dimensions(self): | |
return FrozenOrderedDict((k, len(v)) | ||
for k, v in iteritems(self.ds.dimensions)) | ||
|
||
def get_encoding(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would lean slightly toward just creating a There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The other encoding value that comes to mind is the dataset format (e.g. NETCDF4 vs. NETCDF3). Maybe there are others as well but nothing comes to mind. |
||
encoding = {} | ||
encoding['unlimited_dims'] = set( | ||
[k for k, v in self.ds.dimensions.items() if v.isunlimited()]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can use a set comprehension here, e.g.,
|
||
return encoding | ||
|
||
def set_dimension(self, name, length): | ||
self.ds.createDimension(name, size=length) | ||
|
||
|
@@ -270,16 +276,17 @@ def prepare_variable(self, name, variable, check_encoding=False): | |
variable = encode_nc3_variable(variable) | ||
datatype = variable.dtype | ||
|
||
self.set_necessary_dimensions(variable) | ||
unlimited_dims = self.encoding.get('unlimited_dims', set()) | ||
self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims) | ||
|
||
fill_value = attrs.pop('_FillValue', None) | ||
if fill_value in ['', '\x00']: | ||
# these are equivalent to the default FillValue, but netCDF4 | ||
# doesn't like setting fill_value to an empty string | ||
fill_value = None | ||
|
||
encoding = _extract_nc4_encoding(variable, | ||
raise_on_invalid=check_encoding) | ||
encoding = _extract_nc4_variable_encoding( | ||
variable, raise_on_invalid=check_encoding) | ||
nc4_var = self.ds.createVariable( | ||
varname=name, | ||
datatype=datatype, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -62,6 +62,7 @@ class PydapDataStore(AbstractDataStore): | |
def __init__(self, url): | ||
import pydap.client | ||
self.ds = pydap.client.open_url(url) | ||
self.encoding = {} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. delete |
||
|
||
def open_store_variable(self, var): | ||
data = indexing.LazilyIndexedArray(PydapArrayWrapper(var)) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -42,6 +42,7 @@ def __init__(self, filename, mode='r'): | |
self.ds = opener() | ||
self._opener = opener | ||
self._mode = mode | ||
self.encoding = {} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. delete |
||
|
||
def open_store_variable(self, name, var): | ||
data = indexing.LazilyIndexedArray(NioArrayWrapper(name, self)) | ||
|
@@ -57,5 +58,11 @@ def get_attrs(self): | |
def get_dimensions(self): | ||
return Frozen(self.ds.dimensions) | ||
|
||
def get_encoding(self): | ||
encoding = {} | ||
encoding['unlimited_dims'] = set( | ||
[k for k in self.ds.dimensions if self.ds.unlimited(k)]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think dap can represent unlimited dimensions: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed, but this is pynio which does: https://www.pyngl.ucar.edu/whatsnew.shtml#Version1.4.1 |
||
return encoding | ||
|
||
def close(self): | ||
self.ds.close() |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,7 +8,7 @@ | |
import warnings | ||
|
||
from .. import Variable | ||
from ..core.pycompat import iteritems, basestring, OrderedDict | ||
from ..core.pycompat import iteritems, OrderedDict | ||
from ..core.utils import Frozen, FrozenOrderedDict | ||
from ..core.indexing import NumpyIndexingAdapter | ||
|
||
|
@@ -102,6 +102,7 @@ def __init__(self, filename_or_obj, mode='r', format=None, group=None, | |
self.ds = opener() | ||
self._opener = opener | ||
self._mode = mode | ||
self.encoding = {} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. delete |
||
|
||
super(ScipyDataStore, self).__init__(writer) | ||
|
||
|
@@ -116,9 +117,19 @@ def get_variables(self): | |
def get_attrs(self): | ||
return Frozen(_decode_attrs(self.ds._attributes)) | ||
|
||
def _get_unlimited_dimensions(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think you use this method anymore |
||
return set(k for k, v in iteritems(self.ds.dimensions) if v is None) | ||
|
||
def get_dimensions(self): | ||
self._unlimited_dimensions = self._get_unlimited_dimensions() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you don't use this currently |
||
return Frozen(self.ds.dimensions) | ||
|
||
def get_encoding(self): | ||
encoding = {} | ||
encoding['unlimited_dims'] = set( | ||
[k for k, v in self.ds.dimensions.items() if v is None]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can use the same set comprehension you switched to in |
||
return encoding | ||
|
||
def set_dimension(self, name, length): | ||
if name in self.dimensions: | ||
raise ValueError('%s does not support modifying dimensions' | ||
|
@@ -140,7 +151,12 @@ def prepare_variable(self, name, variable, check_encoding=False): | |
raise ValueError('unexpected encoding for scipy backend: %r' | ||
% list(variable.encoding)) | ||
|
||
self.set_necessary_dimensions(variable) | ||
unlimited_dims = self.encoding.get('unlimited_dims', set()) | ||
|
||
if len(unlimited_dims) > 1: | ||
raise ValueError('NETCDF3 only supports one unlimited dimension') | ||
self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims) | ||
|
||
data = variable.data | ||
# nb. this still creates a numpy array in all memory, even though we | ||
# don't write the data yet; scipy.io.netcdf does not not support | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we ever actually use this
encoding
state on the datastore? If not, let's not bother setting it. I think everything necessary ends up being passed on via set_variables.
Note that as much as possible, I've tried to make
DataStore
itself stateless, only storing state in the file-like object it points to. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We were using this but I've refactored to avoid it.