-
-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Vectorized lazy indexing #1899
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Vectorized lazy indexing #1899
Changes from 6 commits
dceb298
c1b4b60
d989a15
218763c
541fca3
3e05a16
030a2c4
b9f97b4
850f29c
943ec78
91aae64
991c1da
936954a
9144965
d1cb976
95e1f1c
180c4f5
dbbe531
c2e61ad
b545c3e
872de73
bb5d1f6
cfe29bb
17a7dac
2dff278
ead6327
a90ac05
73f4958
fd04966
b3c3d80
259f36c
0e7eb2e
0c2e31b
d8421a5
7e0959c
4fccdee
8e96710
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -485,13 +485,6 @@ def __init__(self, array, key=None): | |
self.key = key | ||
|
||
def _updated_key(self, new_key): | ||
# TODO should suport VectorizedIndexer | ||
if isinstance(new_key, VectorizedIndexer): | ||
raise NotImplementedError( | ||
'Vectorized indexing for {} is not implemented. Load your ' | ||
'data first with .load() or .compute(), or disable caching by ' | ||
'setting cache=False in open_dataset.'.format(type(self))) | ||
|
||
iter_new_key = iter(expanded_indexer(new_key.tuple, self.ndim)) | ||
full_key = [] | ||
for size, k in zip(self.array.shape, self.key.tuple): | ||
|
@@ -520,9 +513,16 @@ def __array__(self, dtype=None): | |
return np.asarray(array[self.key], dtype=None) | ||
|
||
def __getitem__(self, indexer): | ||
if isinstance(indexer, VectorizedIndexer): | ||
array = LazilyVectorizedIndexedArray(self.array, self.key) | ||
return array[indexer] | ||
return type(self)(self.array, self._updated_key(indexer)) | ||
|
||
def __setitem__(self, key, value): | ||
if isinstance(key, VectorizedIndexer): | ||
raise NotImplementedError( | ||
'Lazy item assignment with the vectorized indexer is not yet ' | ||
'implemented. Load your data first by .load() or compute().') | ||
full_key = self._updated_key(key) | ||
self.array[full_key] = value | ||
|
||
|
@@ -531,6 +531,60 @@ def __repr__(self): | |
(type(self).__name__, self.array, self.key)) | ||
|
||
|
||
class LazilyVectorizedIndexedArray(ExplicitlyIndexedNDArrayMixin): | ||
"""Wrap an array to make vectorized indexing lazy. | ||
""" | ||
|
||
def __init__(self, array, key): | ||
""" | ||
Parameters | ||
---------- | ||
array : array_like | ||
Array like object to index. | ||
key : VectorizedIndexer | ||
""" | ||
if isinstance(key, (BasicIndexer, OuterIndexer)): | ||
self.key = VectorizedIndexer( | ||
_outer_to_vectorized_indexer(key.tuple, array.shape)) | ||
else: | ||
self.key = _arrayize_vectorized_indexer(key, array.shape) | ||
self.array = as_indexable(array) | ||
self.order = np.arange(self.ndim) | ||
|
||
@property | ||
def shape(self): | ||
return np.broadcast(*self.key.tuple).shape | ||
|
||
def __array__(self, dtype=None): | ||
try: | ||
array = np.asarray(self.array[self.key], dtype=None) | ||
except NotImplementedError: | ||
# if vectorized indexing is not supported | ||
oind, vind = _decompose_vectorized_indexer(self.key) | ||
array = NumpyIndexingAdapter(np.asarray(self.array[oind], | ||
dtype=None))[vind] | ||
return array | ||
|
||
def _updated_key(self, new_key): | ||
return VectorizedIndexer(tuple(o[new_key.tuple] for o in | ||
np.broadcast_arrays(*self.key.tuple))) | ||
|
||
def __getitem__(self, indexer): | ||
return type(self)(self.array, self._updated_key(indexer)) | ||
|
||
def transpose(self, order): | ||
key = VectorizedIndexer(tuple( | ||
k.transpose(order) for k in self.key.tuple)) | ||
return type(self)(self.array, key) | ||
|
||
def __setitem__(self, key, value): | ||
raise NotImplementedError | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's add a more informative error message here. Even |
||
|
||
def __repr__(self): | ||
return ('%s(array=%r, key=%r)' % | ||
(type(self).__name__, self.array, self.key)) | ||
|
||
|
||
def _wrap_numpy_scalars(array): | ||
"""Wrap NumPy scalars in 0d arrays.""" | ||
if np.isscalar(array): | ||
|
@@ -602,23 +656,23 @@ def _outer_to_vectorized_indexer(key, shape): | |
Parameters | ||
---------- | ||
key : tuple | ||
Tuple from an OuterIndexer to convert. | ||
Tuple from an Basic/OuterIndexer to convert. | ||
shape : tuple | ||
Shape of the array subject to the indexing. | ||
|
||
Returns | ||
------- | ||
tuple | ||
Tuple suitable for use to index a NumPy array with vectorized indexing. | ||
Each element is an integer or array: broadcasting them together gives | ||
the shape of the result. | ||
Each element is array: broadcasting them together gives the shape of | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Each element is an array |
||
the result. | ||
""" | ||
n_dim = len([k for k in key if not isinstance(k, integer_types)]) | ||
i_dim = 0 | ||
new_key = [] | ||
for k, size in zip(key, shape): | ||
if isinstance(k, integer_types): | ||
new_key.append(k) | ||
new_key.append(np.array(k).reshape((1,) * n_dim)) | ||
else: # np.ndarray or slice | ||
if isinstance(k, slice): | ||
k = np.arange(*k.indices(size)) | ||
|
@@ -654,6 +708,45 @@ def _outer_to_numpy_indexer(key, shape): | |
return _outer_to_vectorized_indexer(key, shape) | ||
|
||
|
||
def _decompose_vectorized_indexer(indexer): | ||
""" Decompose vectorized indexer to outer and vectorized indexers, | ||
array[indexer] == array[oindex][vindex] | ||
such that array[oindex].shape becomes smallest. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Becomes smallest what? Just trying to make sure I understand the intent here. Is this algorithm trying to optimize (minimize) the size of |
||
""" | ||
oindex = [] | ||
vindex = [] | ||
for k in indexer.tuple: | ||
if isinstance(k, slice): | ||
oindex.append(k) | ||
vindex.append(slice(None)) | ||
else: # np.ndarray | ||
oind, vind = np.unique(k, return_inverse=True) | ||
oindex.append(oind) | ||
vindex.append(vind.reshape(*k.shape)) | ||
return OuterIndexer(tuple(oindex)), VectorizedIndexer(tuple(vindex)) | ||
|
||
|
||
def _arrayize_vectorized_indexer(indexer, shape): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oops -- I wrote a basically identical helper function. Can you confirm that we can remove the zarr helper function, and that all its unit tests work with this function instead? I don't have a preference on which implementation to use, assuming that all tests pass with both. |
||
""" Return an identical vindex but slices are replaced by arrays """ | ||
slices = [v for v in indexer.tuple if isinstance(v, slice)] | ||
if len(slices) == 0: | ||
return indexer | ||
|
||
arrays = [v for v in indexer.tuple if isinstance(v, np.ndarray)] | ||
n_dim = arrays[0].ndim if len(arrays) > 0 else 0 | ||
i_dim = 0 | ||
new_key = [] | ||
for v, size in zip(indexer.tuple, shape): | ||
if isinstance(v, np.ndarray): | ||
new_key.append(np.reshape(v, v.shape + (1, ) * len(slices))) | ||
else: # slice | ||
shape = ((1,) * (n_dim + i_dim) + (-1,) + | ||
(1,) * (len(slices) - i_dim - 1)) | ||
new_key.append(np.arange(*v.indices(size)).reshape(shape)) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. According to the tests,
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks. |
||
i_dim += 1 | ||
return VectorizedIndexer(tuple(new_key)) | ||
|
||
|
||
def _dask_array_with_chunks_hint(array, chunks): | ||
"""Create a dask array using the chunks hint for dimensions of size > 1.""" | ||
import dask.array as da | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe we should rename the original `LazilyIndexedArray` to `LazilyOuterIndexedArray`? Also, should we add a `transpose` method?