From 893c7fa65a8e467cbaf224235511bd6710c331a1 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 29 Jan 2014 15:07:35 -0800 Subject: [PATCH] Mutable variables! With this patch, the Variable object has been refactored and is now mutable. Some of its behavior may have changed in other subtle ways. For example, getting an item from a variable now returns another variable instead of an ndarray. --- src/polyglot/backends.py | 55 ++------- src/polyglot/data.py | 6 - src/polyglot/variable.py | 233 ++++++++++++++++++--------------------- test/test_data.py | 7 +- test/test_variable.py | 49 ++++++++ 5 files changed, 170 insertions(+), 180 deletions(-) create mode 100644 test/test_variable.py diff --git a/src/polyglot/backends.py b/src/polyglot/backends.py index 1098e294ec6..117ea7b34d7 100644 --- a/src/polyglot/backends.py +++ b/src/polyglot/backends.py @@ -57,30 +57,10 @@ def sync(self): class ScipyVariable(variable.Variable): - def __init__(self, scipy_var): - object.__setattr__(self, 'v', scipy_var) - - def _allocate(self): - return variable.Variable(dims=(), data=0) - - @property - def attributes(self): - return self.v._attributes - - def __getattribute__(self, key): - """ - Here we give some of the attributes of self.data preference over - attributes in the object itself. - """ - if key == 'v': - return object.__getattribute__(self, 'v') - elif hasattr(self.v, key): - return object.__getattribute__(self.v, key) - elif not hasattr(self, key) and hasattr(self.v.data, key): - return getattr(self.v.data, key) - else: - return object.__getattribute__(self, key) + self._dimensions = scipy_var.dimensions + self._data = scipy_var.data + self._attributes = scipy_var._attributes class ScipyDataStore(object): @@ -145,12 +125,10 @@ def sync(self): class NetCDF4Variable(variable.Variable): def __init__(self, nc4_variable): - object.__setattr__(self, 'data', - variable.LazyVariableData(nc4_variable)) - object.__setattr__(self, '_attributes', None) - - def _allocate(self): - return variable.Variable(dims=(), data=0) + self._nc4_variable = nc4_variable + self._dimensions = nc4_variable.dimensions + self._data = nc4_variable + self._attributes = None @property def attributes(self): @@ -166,22 +144,13 @@ def attributes(self): # you would find that any packed variables in the original # netcdf file would now have been scaled twice! packing_attributes = ['scale_factor', 'add_offset'] - keys = [k for k in self.ncattrs() if not k in packing_attributes] - attr_dict = variable.AttributesDict((k, self.data.getncattr(k)) - for k in keys) - object.__setattr__(self, '_attributes', attr_dict) + keys = [k for k in self._nc4_variable.ncattrs() + if not k in packing_attributes] + attr_dict = variable.AttributesDict( + (k, self._nc4_variable.getncattr(k)) for k in keys) + self._attributes = attr_dict return self._attributes - def __getattr__(self, attr): - """__getattr__ is overloaded to selectively expose some of the - attributes of the underlying nc4 variable""" - if attr == 'data': - return object.__getattribute__(self, 'data') - elif hasattr(self.data, attr): - return getattr(self.data, attr) - else: - return object.__getattribute__(self, attr) - class NetCDF4DataStore(object): diff --git a/src/polyglot/data.py b/src/polyglot/data.py index bcadf952922..d08257333a8 100644 --- a/src/polyglot/data.py +++ b/src/polyglot/data.py @@ -604,12 +604,6 @@ def update(self, other): # if a dimension is a new one it gets added, if the dimension already # exists we confirm that they are identical (or throw an exception) for (name, length) in other.dimensions.iteritems(): - if (name == other.record_dimension and - name != self.record_dimension): - raise ValueError( - ("record dimensions do not match: " - "self: %s, other: %s") % - (self.record_dimension, other.record_dimension)) if not name in self.dimensions: self.create_dimension(name, length) else: diff --git a/src/polyglot/variable.py b/src/polyglot/variable.py index 2bababbfae2..d92d1aee157 100644 --- a/src/polyglot/variable.py +++ b/src/polyglot/variable.py @@ -90,6 +90,18 @@ def __eq__(self, other): return True +def _expand_key(key, ndim): + """Given a key for getting an item from an ndarray, expand the key to an + equivalent key which is a tuple with length equal to the number of + dimensions + """ + if not isinstance(key, tuple): + key = (key,) + new_key = [slice(None)] * ndim + new_key[:len(key)] = key + return tuple(new_key) + + class Variable(object): """ A netcdf-like variable consisting of dimensions, data and attributes @@ -97,66 +109,101 @@ class Variable(object): fully described outside the context of its parent Dataset. """ def __init__(self, dims, data, attributes=None): - object.__setattr__(self, 'dimensions', dims) - object.__setattr__(self, 'data', data) + if len(dims) != data.ndim: + raise ValueError('data must have same shape as the number of ' + 'dimensions') + self._dimensions = tuple(dims) + self._data = data if attributes is None: attributes = {} - object.__setattr__(self, 'attributes', AttributesDict(attributes)) + self._attributes = AttributesDict(attributes) - def _allocate(self): - return self.__class__(dims=(), data=0) + @property + def dimensions(self): + return self._dimensions - def __getattribute__(self, key): + @property + def data(self): """ - Here we give some of the attributes of self.data preference over - attributes in the object instelf. + The variable's data as a numpy.ndarray """ - if key in ['dtype', 'shape', 'size', 'ndim', 'nbytes', - 'flat', '__iter__', 'view']: - return getattr(self.data, key) - else: - return object.__getattribute__(self, key) - - def __setattr__(self, attr, value): - """"__setattr__ is overloaded to prevent operations that could - cause loss of data consistency. If you really intend to update - dir(self), use the self.__dict__.update method or the - super(type(a), self).__setattr__ method to bypass.""" - raise AttributeError, "Object is tamper-proof" + if not isinstance(self._data, np.ndarray): + self._data = np.asarray(self._data[...]) + return self._data + + @data.setter + def data(self, value): + value = np.asarray(value) + if value.shape != self.shape: + raise ValueError("replacement data must match the Variable's " + "shape") + self._data = value + + @property + def dtype(self): + return self._data.dtype + + @property + def shape(self): + return self._data.shape + + @property + def size(self): + return self._data.size + + @property + def ndim(self): + return self._data.ndim - def __delattr__(self, attr): - raise AttributeError, "Object is tamper-proof" + def __len__(self): + return len(self._data) - def __getitem__(self, index): - """__getitem__ is overloaded to access the underlying numpy data""" - return self.data[index] + def __getitem__(self, key): + """ + Return a new Variable object whose contents are consistent with getting + the provided key from the underlying data + """ + key = _expand_key(key, self.ndim) + dimensions = [dim for k, dim in zip(key, self.dimensions) + if not isinstance(k, int)] + return Variable(dimensions, self._data[key], self.attributes) - def __setitem__(self, index, data): + def __setitem__(self, key, value): """__setitem__ is overloaded to access the underlying numpy data""" - self.data[index] = data + self.data[key] = value + + def __iter__(self): + """ + Iterate over the contents of this Variable + """ + for n in range(len(self)): + yield self[n] - def __hash__(self): - """__hash__ is overloaded to guarantee that two variables with the same - attributes and np.data values have the same hash (the converse is not true)""" - return hash((self.dimensions, - frozenset((k,v.tostring()) if isinstance(v,np.ndarray) else (k,v) - for (k,v) in self.attributes.items()), - self.data.tostring())) + @property + def attributes(self): + return self._attributes - def __len__(self): - """__len__ is overloaded to access the underlying numpy data""" - return self.data.__len__() + def copy(self): + """ + Returns a shallow copy of the current object. + """ + return self.__copy__() + + def _copy(self, deepcopy=False): + # deepcopies should always be of a numpy view of the data, not the data + # itself, because non-memory backends don't necessarily have deepcopy + # defined sensibly (this is a problem for netCDF4 variables) + data = copy.deepcopy(self.data) if deepcopy else self._data + # note: + # dimensions is already an immutable tuple + # attributes will be copied when the new Variable is created + return Variable(self.dimensions, data, self.attributes) def __copy__(self): """ Returns a shallow copy of the current object. """ - # Create the simplest possible dummy object and then overwrite it - obj = self._allocate() - object.__setattr__(obj, 'dimensions', self.dimensions) - object.__setattr__(obj, 'data', self.data) - object.__setattr__(obj, 'attributes', self.attributes) - return obj + return self._copy(deepcopy=False) def __deepcopy__(self, memo=None): """ @@ -164,24 +211,21 @@ def __deepcopy__(self, memo=None): memo does nothing but is required for compatability with copy.deepcopy """ - # Create the simplest possible dummy object and then overwrite it - obj = self._allocate() - # tuples are immutable - object.__setattr__(obj, 'dimensions', self.dimensions) - object.__setattr__(obj, 'data', self.data[:].copy()) - object.__setattr__(obj, 'attributes', self.attributes.copy()) - return obj + return self._copy(deepcopy=True) + + # mutable objects should not be hashable + __hash__ = None def __eq__(self, other): - if self.dimensions != other.dimensions or \ - (self.data.tostring() != other.data.tostring()): - return False - if not self.attributes == other.attributes: + try: + return (self.dimensions == other.dimensions + and np.all(self.data == other.data) + and self.attributes == other.attributes) + except AttributeError: return False - return True def __ne__(self, other): - return not self.__eq__(other) + return not self == other def __str__(self): """Create a ncdump-like summary of the object""" @@ -230,10 +274,7 @@ def views(self, slicers): for i, dim in enumerate(self.dimensions): if dim in slicers: slices[i] = slicers[dim] - # Shallow copy - obj = copy.copy(self) - object.__setattr__(obj, 'data', self.data[slices]) - return obj + return self[tuple(slices)] def view(self, s, dim): """Return a new Variable object whose contents are a view of the object @@ -244,9 +285,7 @@ def view(self, s, dim): s : slice The slice representing the range of the values to extract. dim : string - The dimension to slice along. If multiple dimensions equal - dim (e.g. a correlation matrix), then the slicing is done - only along the first matching dimension. + The dimension to slice along. Returns ------- @@ -261,7 +300,7 @@ def view(self, s, dim): -------- take """ - return self.views({dim : s}) + return self.views({dim: s}) def take(self, indices, dim): """Return a new Variable object whose contents are sliced from @@ -293,65 +332,7 @@ def take(self, indices, dim): raise ValueError('indices should have a single dimension') # When dim appears repeatedly in self.dimensions, using the index() # method gives us only the first one, which is the desired behavior - axis = list(self.dimensions).index(dim) - # Deep copy - obj = copy.deepcopy(self) - # In case data is lazy we need to slice out all the data before taking. - object.__setattr__(obj, 'data', self.data[:].take(indices, axis=axis)) - return obj - -class LazyVariableData(object): - """ - This object wraps around a Variable object (though - it only really makes sense to use it with a class that - extends variable.Variable). The result mascarades as - variable data, but doesn't actually try accessing the - data until indexing is attempted. - - For example, imagine you have some variable that was - derived from an opendap dataset, 'nc'. - - var = nc['massive_variable'] - - if you wanted to check the data type of var: - - var.data.dtype - - you would find that it might involve downloading all - of the actual data, then inspecting the resulting - numpy array. But with this wrapper calling: - - nc['large_variable'].data.someattribute - - will first inspect the Variable object to see if it has - the desired attribute and only then will it suck down the - actual numpy array and request 'someattribute'. - """ - def __init__(self, lazy_variable): - self.lazyvar = lazy_variable - - def __eq__(self, other): - return self.lazyvar[:] == other - - def __ne__(self, other): - return self.lazyvar[:] != other - - def __getitem__(self, key): - return self.lazyvar[key] - - def __setitem__(self, key, value): - if not isinstance(self.lazyvar, Variable): - self.lazyvar = Variable(self.lazyvar.dimensions, - data = self.lazyvar[:], - dtype = self.lazyvar.dtype, - shape = self.lazyvar.shape, - attributes = self.lazyvar.attributes) - self.lazyvar.__setitem__(key, value) - - def __getattr__(self, attr): - """__getattr__ is overloaded to selectively expose some of the - attributes of the underlying lazy variable""" - if hasattr(self.lazyvar, attr): - return getattr(self.lazyvar, attr) - else: - return getattr(self.lazyvar[:], attr) \ No newline at end of file + axis = self.dimensions.index(dim) + # take only works on actual numpy arrays + data = self.data.take(indices, axis=axis) + return Variable(self.dimensions, data, self.attributes) diff --git a/test/test_data.py b/test/test_data.py index fc9023ab954..83eeea1a0d9 100644 --- a/test/test_data.py +++ b/test/test_data.py @@ -1,7 +1,6 @@ import unittest import os.path import numpy as np -import scipy.interpolate from copy import deepcopy from cStringIO import StringIO @@ -93,8 +92,8 @@ def test_variable(self): a.create_variable(name='bar', dims=('time', 'x',), data=d) # order of creation is preserved self.assertTrue(a.variables.keys() == ['foo', 'bar']) - self.assertTrue(all([a['foo'][i] == d[i] - for i in np.ndindex(*d.shape)])) + self.assertTrue(all([a['foo'][i].data == d[i] + for i in np.ndindex(*d.shape)])) # prevent duplicate creation self.assertRaises(ValueError, a.create_variable, name='foo', dims=('time', 'x',), data=d) @@ -122,8 +121,6 @@ def test_variable(self): self.assertFalse(v1 == v3) self.assertFalse(v1 == v4) self.assertFalse(v1 == v5) - # Variable hash - self.assertEquals(hash(v1), hash(v2)) def test_coordinate(self): a = Dataset() diff --git a/test/test_variable.py b/test/test_variable.py new file mode 100644 index 00000000000..0af5990fa4c --- /dev/null +++ b/test/test_variable.py @@ -0,0 +1,49 @@ +import unittest +import numpy as np + +import polyglot + + +class TestVariable(unittest.TestCase): + def setUp(self): + self.d = np.random.random((10, 3)) + + def test_data(self): + v = polyglot.Variable(['time', 'x'], self.d, {'foo': 'bar'}) + self.assertIs(v.data, self.d) + with self.assertRaises(ValueError): + # wrong size + v.data = np.random.random(5) + d2 = np.random.random((10, 3)) + v.data = d2 + self.assertIs(v.data, d2) + + def test_properties(self): + v = polyglot.Variable(['time', 'x'], self.d, {'foo': 'bar'}) + self.assertEqual(v.dimensions, ('time', 'x')) + self.assertEqual(v.dtype, float) + self.assertEqual(v.shape, (10, 3)) + self.assertEqual(v.size, 30) + self.assertEqual(v.ndim, 2) + self.assertEqual(len(v), 10) + self.assertEqual(v.attributes, {'foo': u'bar'}) + + def test_items(self): + v = polyglot.Variable(['time', 'x'], self.d) + self.assertEqual(v, v[:]) + self.assertEqual(v, v[...]) + self.assertEqual(polyglot.Variable(['x'], self.d[0]), v[0]) + self.assertEqual(polyglot.Variable(['time'], self.d[:, 0]), v[:, 0]) + self.assertEqual(polyglot.Variable(['time', 'x'], self.d[:3, :2]), + v[:3, :2]) + self.assertItemsEqual( + [polyglot.Variable(['x'], self.d[i]) for i in range(10)], v) + v.data[:] = 0 + self.assertTrue(np.all(v.data == 0)) + + def test_views(self): + v = polyglot.Variable(['time', 'x'], self.d) + self.assertEqual(v.views({'time': slice(None)}), v) + self.assertEqual(v.views({'time': 0}), v[0]) + self.assertEqual(v.views({'time': slice(0, 3)}), v[:3]) + self.assertEqual(v.views({'x': 0}), v[:, 0])