Skip to content

Vectorized lazy indexing #1899

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 37 commits into from
Mar 6, 2018
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
dceb298
Start working
fujiisoup Feb 9, 2018
c1b4b60
First support of lazy vectorized indexing.
fujiisoup Feb 9, 2018
d989a15
Some optimization.
fujiisoup Feb 9, 2018
218763c
Use unique to decompose vectorized indexing.
fujiisoup Feb 10, 2018
541fca3
Consolidate vectorizedIndexing
fujiisoup Feb 10, 2018
3e05a16
Support vectorized_indexing in h5py
fujiisoup Feb 11, 2018
030a2c4
Refactoring backend array. Added indexing.decompose_indexers. Drop un…
fujiisoup Feb 13, 2018
b9f97b4
typo
fujiisoup Feb 14, 2018
850f29c
bugfix and typo
fujiisoup Feb 14, 2018
943ec78
Fix based on @WeatherGod comments.
fujiisoup Feb 14, 2018
91aae64
Use enum-like object to indicate indexing-support types.
fujiisoup Feb 15, 2018
991c1da
Update test_decompose_indexers.
fujiisoup Feb 15, 2018
936954a
Bugfix and benchmarks.
fujiisoup Feb 15, 2018
9144965
fix: support outer/basic indexer in LazilyVectorizedIndexedArray
fujiisoup Feb 16, 2018
d1cb976
Merge branch 'master' into vectorized_lazy_indexing
fujiisoup Feb 16, 2018
95e1f1c
More comments.
fujiisoup Feb 16, 2018
180c4f5
Fixing style errors.
stickler-ci Feb 16, 2018
dbbe531
Remove unintended dupicate
fujiisoup Feb 16, 2018
c2e61ad
combine indexers for on-memory np.ndarray.
fujiisoup Feb 16, 2018
b545c3e
fix whats new
fujiisoup Feb 16, 2018
872de73
fix pydap
fujiisoup Feb 16, 2018
bb5d1f6
Update comments.
fujiisoup Feb 16, 2018
cfe29bb
Support VectorizedIndexing for rasterio. Some bugfix.
fujiisoup Feb 17, 2018
17a7dac
Merge branch 'master' into vectorized_lazy_indexing
fujiisoup Feb 18, 2018
2dff278
flake8
fujiisoup Feb 18, 2018
ead6327
More tests
fujiisoup Feb 18, 2018
a90ac05
Use LazilyIndexedArray for scalar array instead of loading.
fujiisoup Feb 18, 2018
73f4958
Support negative step slice in rasterio.
fujiisoup Feb 18, 2018
fd04966
Make slice-step always positive
fujiisoup Feb 18, 2018
b3c3d80
Bugfix in slice-slice
fujiisoup Feb 25, 2018
259f36c
Add pydap support.
fujiisoup Feb 25, 2018
0e7eb2e
Merge branch 'master' into vectorized_lazy_indexing
fujiisoup Feb 26, 2018
0c2e31b
Merge branch 'master' into vectorized_lazy_indexing
fujiisoup Mar 1, 2018
d8421a5
Merge branch 'master' into vectorized_lazy_indexing
fujiisoup Mar 1, 2018
7e0959c
Rename LazilyIndexedArray -> LazilyOuterIndexedArray. Remove duplicat…
fujiisoup Mar 2, 2018
4fccdee
flake8
fujiisoup Mar 2, 2018
8e96710
Added transpose to LazilyOuterIndexedArray
fujiisoup Mar 3, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ Documentation

Enhancements
~~~~~~~~~~~~
- Lazy vectorized-indexing support. (:issue:`1897`)
By `Keisuke Fujii <https://github.com/fujiisoup>`_.
- reduce methods such as :py:func:`DataArray.sum()` now accept a ``dtype``
argument. (:issue:`1838`)
By `Keisuke Fujii <https://github.com/fujiisoup>`_.
Expand Down
34 changes: 28 additions & 6 deletions xarray/backends/h5netcdf_.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,35 @@ class H5NetCDFArrayWrapper(BaseNetCDF4Array):
def __getitem__(self, key):
    """Read a selection from the underlying h5netcdf variable.

    ``key`` is an explicit indexer (Basic/Outer only). h5py supports at
    most one list-style "fancy" index per selection and requires it to be
    increasing and duplicate-free, so the indexer is decomposed into:

    * ``proper_key`` -- applied to the on-disk array: keeps only the most
      selective array index (as a sorted, unique list) and widens every
      other array index into a covering slice, and
    * ``eager_key`` -- applied in memory afterwards to recover the exact
      requested selection (original ordering and duplicates).
    """
    key = indexing.unwrap_explicit_indexer(
        key, self, allow=(indexing.BasicIndexer, indexing.OuterIndexer))

    # Normalize negative indices so the min/max arithmetic below is valid.
    key = [np.where(k < 0, k + s, k) if isinstance(k, np.ndarray) else k
           for k, s in zip(key, self.shape)]
    # "gain" estimates how much extra data a covering slice would read
    # relative to the exact fancy index; keep the array index with the
    # largest gain as the single fancy index h5py allows.
    gains = [(np.max(k) - np.min(k) + 1.0) / len(np.unique(k))
             if isinstance(k, np.ndarray) else 0 for k in key]
    array_index = np.argmax(np.array(gains)) if len(gains) > 0 else None
    proper_key, eager_key = [], []
    for i, k in enumerate(key):
        if isinstance(k, np.ndarray) and i != array_index:
            # Widen to a covering slice on disk; re-select in memory.
            proper_key.append(slice(np.min(k), np.max(k) + 1))
            eager_key.append(k - np.min(k))
        elif isinstance(k, np.ndarray):
            # h5py requires increasing, non-duplicated key
            pkey, ekey = np.unique(k, return_inverse=True)
            # h5py requires using lists for fancy indexing:
            # https://github.com/h5py/h5py/issues/992
            proper_key.append(list(pkey))
            eager_key.append(ekey)
        else:
            proper_key.append(k)
            eager_key.append(slice(None))

    with self.datastore.ensure_open(autoclose=True):
        array = np.asarray(self.get_array()[tuple(proper_key)],
                           dtype=self.dtype)

    # Apply the remaining (in-memory) selection with NumPy semantics.
    key = indexing._outer_to_numpy_indexer(tuple(eager_key), self.shape)
    return array[key]


def maybe_decode_bytes(txt):
Expand Down
115 changes: 104 additions & 11 deletions xarray/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,13 +485,6 @@ def __init__(self, array, key=None):
self.key = key

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should we rename the original LazilyIndexedArray to LazilyOuterIndexedArray? Also, should we add a transpose method?

def _updated_key(self, new_key):
# TODO should suport VectorizedIndexer
if isinstance(new_key, VectorizedIndexer):
raise NotImplementedError(
'Vectorized indexing for {} is not implemented. Load your '
'data first with .load() or .compute(), or disable caching by '
'setting cache=False in open_dataset.'.format(type(self)))

iter_new_key = iter(expanded_indexer(new_key.tuple, self.ndim))
full_key = []
for size, k in zip(self.array.shape, self.key.tuple):
Expand Down Expand Up @@ -520,9 +513,16 @@ def __array__(self, dtype=None):
return np.asarray(array[self.key], dtype=None)

def __getitem__(self, indexer):
    """Lazily apply ``indexer``, returning a new lazy array wrapper.

    Vectorized indexers are delegated to LazilyVectorizedIndexedArray
    (they cannot be merged into the outer-style ``self.key``); basic and
    outer indexers are composed into ``self.key`` via ``_updated_key``.
    """
    if isinstance(indexer, VectorizedIndexer):
        array = LazilyVectorizedIndexedArray(self.array, self.key)
        return array[indexer]
    return type(self)(self.array, self._updated_key(indexer))

def __setitem__(self, key, value):
    """Assign ``value`` at ``key`` by writing through to the wrapped array."""
    # Lazy assignment through a vectorized indexer is unsupported; tell
    # the user to materialize the data first.
    if isinstance(key, VectorizedIndexer):
        raise NotImplementedError(
            'Lazy item assignment with the vectorized indexer is not yet '
            'implemented. Load your data first by .load() or compute().')
    # Resolve ``key`` against the stored lazy key, then write in place.
    full_key = self._updated_key(key)
    self.array[full_key] = value

Expand All @@ -531,6 +531,60 @@ def __repr__(self):
(type(self).__name__, self.array, self.key))


class LazilyVectorizedIndexedArray(ExplicitlyIndexedNDArrayMixin):
    """Wrap an array to make vectorized indexing lazy.
    """

    def __init__(self, array, key):
        """
        Parameters
        ----------
        array : array_like
            Array like object to index.
        key : VectorizedIndexer
        """
        # Normalize any supported indexer to a fully "arrayized"
        # VectorizedIndexer (no slices), so that subsequent keys can be
        # composed by fancy-indexing the stored index arrays.
        if isinstance(key, (BasicIndexer, OuterIndexer)):
            self.key = VectorizedIndexer(
                _outer_to_vectorized_indexer(key.tuple, array.shape))
        else:
            self.key = _arrayize_vectorized_indexer(key, array.shape)
        self.array = as_indexable(array)
        self.order = np.arange(self.ndim)

    @property
    def shape(self):
        # Broadcasting the index arrays together gives the result shape.
        return np.broadcast(*self.key.tuple).shape

    def __array__(self, dtype=None):
        try:
            array = np.asarray(self.array[self.key], dtype=None)
        except NotImplementedError:
            # if vectorized indexing is not supported by the backend:
            # load a minimal outer selection, then apply the remaining
            # vectorized selection in memory.
            oind, vind = _decompose_vectorized_indexer(self.key)
            array = NumpyIndexingAdapter(np.asarray(self.array[oind],
                                                    dtype=None))[vind]
        return array

    def _updated_key(self, new_key):
        # Compose keys: fancy-index each broadcast index array by new_key.
        return VectorizedIndexer(tuple(o[new_key.tuple] for o in
                                 np.broadcast_arrays(*self.key.tuple)))

    def __getitem__(self, indexer):
        return type(self)(self.array, self._updated_key(indexer))

    def transpose(self, order):
        # Transposing the index arrays is equivalent to transposing the
        # (lazy) result, so no data needs to be loaded.
        key = VectorizedIndexer(tuple(
            k.transpose(order) for k in self.key.tuple))
        return type(self)(self.array, key)

    def __setitem__(self, key, value):
        raise NotImplementedError(
            'Lazy item assignment with the vectorized indexer is not yet '
            'implemented. Load your data first by .load() or .compute().')

    def __repr__(self):
        return ('%s(array=%r, key=%r)' %
                (type(self).__name__, self.array, self.key))


def _wrap_numpy_scalars(array):
"""Wrap NumPy scalars in 0d arrays."""
if np.isscalar(array):
Expand Down Expand Up @@ -602,23 +656,23 @@ def _outer_to_vectorized_indexer(key, shape):
Parameters
----------
key : tuple
Tuple from an OuterIndexer to convert.
Tuple from an Basic/OuterIndexer to convert.
shape : tuple
Shape of the array subject to the indexing.

Returns
-------
tuple
Tuple suitable for use to index a NumPy array with vectorized indexing.
Each element is an integer or array: broadcasting them together gives
the shape of the result.
Each element is an array: broadcasting them together gives the shape of
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Each element is an array

the result.
"""
n_dim = len([k for k in key if not isinstance(k, integer_types)])
i_dim = 0
new_key = []
for k, size in zip(key, shape):
if isinstance(k, integer_types):
new_key.append(k)
new_key.append(np.array(k).reshape((1,) * n_dim))
else: # np.ndarray or slice
if isinstance(k, slice):
k = np.arange(*k.indices(size))
Expand Down Expand Up @@ -654,6 +708,45 @@ def _outer_to_numpy_indexer(key, shape):
return _outer_to_vectorized_indexer(key, shape)


def _decompose_vectorized_indexer(indexer):
    """Decompose a vectorized indexer into outer and vectorized parts.

    Returns ``(oindex, vindex)`` such that

        array[indexer] == array[oindex][vindex]

    while keeping ``array[oindex]`` as small as possible: for each array
    index only its sorted unique values are loaded, and the duplicates
    and original ordering are reconstructed afterwards by ``vindex``.

    Parameters
    ----------
    indexer : VectorizedIndexer
        Indexer whose tuple contains np.ndarray and slice entries only.

    Returns
    -------
    (OuterIndexer, VectorizedIndexer)
    """
    oindex = []
    vindex = []
    for k in indexer.tuple:
        if isinstance(k, slice):
            # Slices can be handled directly by the outer indexer.
            oindex.append(k)
            vindex.append(slice(None))
        else:  # np.ndarray
            # Load each needed element once; the inverse mapping restores
            # duplicates and ordering in memory.
            oind, vind = np.unique(k, return_inverse=True)
            oindex.append(oind)
            vindex.append(vind.reshape(*k.shape))
    return OuterIndexer(tuple(oindex)), VectorizedIndexer(tuple(vindex))


def _arrayize_vectorized_indexer(indexer, shape):
    """Return an identical vindex but with slices replaced by arrays.

    Parameters
    ----------
    indexer : VectorizedIndexer
        Indexer whose tuple may contain np.ndarray and slice entries.
    shape : tuple
        Shape of the array being indexed (used to resolve slice bounds).

    Returns
    -------
    VectorizedIndexer
        Equivalent indexer in which every slice has been converted to an
        integer array, with all entries reshaped so they broadcast
        together to the selection's shape.
    """
    slices = [v for v in indexer.tuple if isinstance(v, slice)]
    if len(slices) == 0:
        return indexer

    arrays = [v for v in indexer.tuple if isinstance(v, np.ndarray)]
    n_dim = arrays[0].ndim if len(arrays) > 0 else 0
    i_dim = 0
    new_key = []
    for v, size in zip(indexer.tuple, shape):
        if isinstance(v, np.ndarray):
            # Append one broadcast dimension per converted slice.
            new_key.append(np.reshape(v, v.shape + (1,) * len(slices)))
        else:  # slice
            # The i_dim-th slice becomes its own broadcast dimension,
            # placed after the n_dim dimensions shared by the arrays.
            # (Renamed from ``shape`` to avoid shadowing the parameter.)
            arange_shape = ((1,) * (n_dim + i_dim) + (-1,) +
                            (1,) * (len(slices) - i_dim - 1))
            new_key.append(np.arange(*v.indices(size)).reshape(arange_shape))
            i_dim += 1
    return VectorizedIndexer(tuple(new_key))


def _dask_array_with_chunks_hint(array, chunks):
"""Create a dask array using the chunks hint for dimensions of size > 1."""
import dask.array as da
Expand Down
38 changes: 9 additions & 29 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,28 +411,20 @@ def test_orthogonal_indexing(self):
actual = on_disk.isel(**indexers)
assert_identical(expected, actual)

def _test_vectorized_indexing(self, vindex_support=True):
# Make sure vectorized_indexing works or at least raises
# NotImplementedError
def test_vectorized_indexing(self):
in_memory = create_test_data()
with self.roundtrip(in_memory) as on_disk:
indexers = {'dim1': DataArray([0, 2, 0], dims='a'),
'dim2': DataArray([0, 2, 3], dims='a')}
expected = in_memory.isel(**indexers)
if vindex_support:
actual = on_disk.isel(**indexers)
assert_identical(expected, actual)
# do it twice, to make sure we're switched from
# orthogonal -> numpy when we cached the values
actual = on_disk.isel(**indexers)
assert_identical(expected, actual)
else:
with raises_regex(NotImplementedError, 'Vectorized indexing '):
actual = on_disk.isel(**indexers)

def test_vectorized_indexing(self):
# This test should be overwritten if vindex is supported
self._test_vectorized_indexing(vindex_support=False)
actual = on_disk.isel(**indexers)
# make sure the array is not yet loaded
assert not actual['var1'].variable._in_memory
assert_identical(expected, actual.load())
# do it twice, to make sure we're switched from
# orthogonal -> numpy when we cached the values
actual = on_disk.isel(**indexers)
assert_identical(expected, actual)

def test_isel_dataarray(self):
# Make sure isel works lazily. GH:issue:1688
Expand Down Expand Up @@ -738,9 +730,6 @@ def test_append_with_invalid_dim_raises(self):
'Unable to update size for existing dimension'):
self.save(data, tmp_file, mode='a')

def test_vectorized_indexing(self):
self._test_vectorized_indexing(vindex_support=False)

def test_multiindex_not_implemented(self):
ds = (Dataset(coords={'y': ('x', [1, 2]), 'z': ('x', ['a', 'b'])})
.set_index(x=['y', 'z']))
Expand Down Expand Up @@ -1101,9 +1090,6 @@ def test_dataset_caching(self):
# caching behavior differs for dask
pass

def test_vectorized_indexing(self):
self._test_vectorized_indexing(vindex_support=True)


class NetCDF4ViaDaskDataTestAutocloseTrue(NetCDF4ViaDaskDataTest):
autoclose = True
Expand Down Expand Up @@ -1220,9 +1206,6 @@ def test_chunk_encoding_with_dask(self):
with self.roundtrip(ds_chunk4) as actual:
pass

def test_vectorized_indexing(self):
self._test_vectorized_indexing(vindex_support=True)

def test_hidden_zarr_keys(self):
expected = create_test_data()
with self.create_store() as store:
Expand Down Expand Up @@ -2031,9 +2014,6 @@ def test_dataarray_compute(self):
self.assertTrue(computed._in_memory)
assert_allclose(actual, computed, decode_bytes=False)

def test_vectorized_indexing(self):
self._test_vectorized_indexing(vindex_support=True)


class DaskTestAutocloseTrue(DaskTest):
autoclose = True
Expand Down
60 changes: 60 additions & 0 deletions xarray/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,42 @@ def test_lazily_indexed_array(self):
assert isinstance(actual._data.array,
indexing.NumpyIndexingAdapter)

def test_vectorized_lazily_indexed_array(self):
    """Vectorized indexing through LazilyIndexedArray stays lazy and
    matches eager results, including when indexers are chained."""
    original = np.random.rand(10, 20, 30)
    x = indexing.NumpyIndexingAdapter(original)
    v_eager = Variable(['i', 'j', 'k'], x)
    lazy = indexing.LazilyIndexedArray(x)
    v_lazy = Variable(['i', 'j', 'k'], lazy)
    I = ReturnItem()  # noqa: E741  # allow ambiguous name

    def check_indexing(v_eager, v_lazy, indexers):
        # Apply each indexer in sequence to both variables, verifying
        # lazy == eager at every step; each result feeds the next step,
        # exercising composition of lazy keys.
        for indexer in indexers:
            actual = v_lazy[indexer]
            expected = v_eager[indexer]
            assert expected.shape == actual.shape
            # The result must still be wrapped lazily, not loaded.
            assert isinstance(actual._data,
                              (indexing.LazilyVectorizedIndexedArray,
                               indexing.LazilyIndexedArray))
            assert_array_equal(expected, actual)
            v_eager = expected
            v_lazy = actual

    # test orthogonal indexing
    indexers = [(I[:], 0, 1), (Variable('i', [0, 1]), )]
    check_indexing(v_eager, v_lazy, indexers)

    # vectorized indexing
    indexers = [
        (Variable('i', [0, 1]), Variable('i', [0, 1]), slice(None)),
        (slice(1, 3, 2), 0)]
    check_indexing(v_eager, v_lazy, indexers)

    indexers = [
        (slice(None, None, 2), 0, slice(None, 10)),
        (Variable('i', [3, 2, 4, 3]), Variable('i', [3, 2, 1, 0])),
        (Variable(['i', 'j'], [[0, 1], [1, 2]]), )]
    check_indexing(v_eager, v_lazy, indexers)


class TestCopyOnWriteArray(TestCase):
def test_setitem(self):
Expand Down Expand Up @@ -334,6 +370,30 @@ def test_vectorized_indexer():
np.arange(5, dtype=np.int64)))


class Test_vectorized_indexer(TestCase):
    """Exercise helpers that transform VectorizedIndexer instances."""

    def setUp(self):
        # A random 3-D array plus a mix of array and slice indexers; the
        # tests below take all 3-way combinations of these indexers.
        self.data = indexing.NumpyIndexingAdapter(np.random.randn(10, 12, 13))
        self.indexers = [
            np.array([[0, 3, 2], ]),
            np.array([[0, 3, 3], [4, 6, 7]]),
            slice(2, -2, 2),
            slice(2, -2, 3),
            slice(None),
        ]

    def test_decompose_vectorized_indexer(self):
        # Decomposition must satisfy data[vindex] == data[oind][vind].
        for combo in itertools.product(self.indexers, repeat=3):
            vindex = indexing.VectorizedIndexer(tuple(combo))
            oind, vind = indexing._decompose_vectorized_indexer(vindex)
            expected = self.data[vindex]
            actual = indexing.NumpyIndexingAdapter(self.data[oind])[vind]
            np.testing.assert_array_equal(expected, actual)

    def test_arrayize_vectorized_indexer(self):
        # Replacing slices with arrays must not change the selection.
        for combo in itertools.product(self.indexers, repeat=3):
            vindex = indexing.VectorizedIndexer(tuple(combo))
            arrayized = indexing._arrayize_vectorized_indexer(
                vindex, self.data.shape)
            np.testing.assert_array_equal(
                self.data[vindex], self.data[arrayized])


def test_unwrap_explicit_indexer():
indexer = indexing.BasicIndexer((1, 2))
target = None
Expand Down
6 changes: 2 additions & 4 deletions xarray/tests/test_variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -1893,8 +1893,7 @@ def test_NumpyIndexingAdapter(self):
def test_LazilyIndexedArray(self):
v = Variable(dims=('x', 'y'), data=LazilyIndexedArray(self.d))
self.check_orthogonal_indexing(v)
with raises_regex(NotImplementedError, 'Vectorized indexing for '):
self.check_vectorized_indexing(v)
self.check_vectorized_indexing(v)
# doubly wrapping
v = Variable(dims=('x', 'y'),
data=LazilyIndexedArray(LazilyIndexedArray(self.d)))
Expand All @@ -1912,8 +1911,7 @@ def test_CopyOnWriteArray(self):
v = Variable(dims=('x', 'y'),
data=CopyOnWriteArray(LazilyIndexedArray(self.d)))
self.check_orthogonal_indexing(v)
with raises_regex(NotImplementedError, 'Vectorized indexing for '):
self.check_vectorized_indexing(v)
self.check_vectorized_indexing(v)

def test_MemoryCachedArray(self):
v = Variable(dims=('x', 'y'), data=MemoryCachedArray(self.d))
Expand Down