
Commit 9e8707d

TomNicholas authored and shoyer committed
Feature: N-dimensional auto_combine (#2553)
* Concatenates along a single dimension
* Wrote function to find correct tile_IDs from nested list of datasets
* Wrote function to check that combined_tile_ids structure is valid
* Added test of 2d-concatenation
* Tests now check that dataset ordering is correct
* Test concatenation along a new dimension
* Started generalising auto_combine to N-D by integrating the N-D concatenation algorithm
* All unit tests now passing
* Fixed a failing test which I didn't notice because I don't have pseudoNetCDF
* Began updating open_mfdataset to handle N-D input
* Refactored to remove duplicate logic in open_mfdataset & auto_combine
* Implemented Shoyer's suggestion in #2553 to rewrite the recursive nested list traverser as an iterator
* --amend
* Now raises ValueError if input not ordered correctly before concatenation
* Added some more prototype tests defining desired behaviour more clearly
* Now raises informative errors on invalid forms of input
* Refactoring to also merge along each dimension
* Refactored to literally just apply the old auto_combine along each dimension
* Added unit tests for open_mfdataset
* Removed TODOs
* Removed format strings
* test_get_new_tile_ids now doesn't assume dicts are ordered
* Fixed failing tests on python3.5 caused by accidentally assuming dict was ordered
* Test for getting new tile id
* Fixed itertoolz import so that it's compatible with older versions
* Increased test coverage
* Added toolz as an explicit dependency to pass tests on python2.7
* Updated 'what's new'
* No longer attempts to shortcut all concatenation at once if concat_dims=None
* Rewrote using itertools.groupby instead of toolz.itertoolz.groupby to remove hidden dependency on toolz
* Fixed erroneous removal of utils import
* Updated docstrings to include an example of multidimensional concatenation
* Clarified auto_combine docstring for N-D behaviour
* Added unit test for nested list of Datasets with different variables
* Minor spelling and pep8 fixes
* Reverted API so that N-D generalisation is hidden
* Removed infer_order_from_coords argument
1 parent 2223445 commit 9e8707d
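
To make the new behaviour concrete, here is a minimal sketch of the N-D combine this commit introduces. It calls the private _auto_combine entry point with the signature shown in the xarray/core/combine.py diff below; this is internal machinery rather than public API (the commit deliberately hides the N-D generalisation behind the old signature), and the toy datasets are invented for illustration.

    import numpy as np
    import xarray as xr
    from xarray.core.combine import _auto_combine

    # A 2x2 nested list of tiles: the outer list is concatenated along 't',
    # the inner lists along 'x'
    tiles = [[xr.Dataset({'v': (('t', 'x'), np.full((2, 2), 10 * j + i))})
              for i in range(2)] for j in range(2)]

    combined = _auto_combine(tiles, concat_dims=['t', 'x'],
                             compat='no_conflicts', data_vars='all',
                             coords='different',
                             infer_order_from_coords=False, ids=False)
    print(combined['v'].shape)  # (4, 4)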

File tree

7 files changed: +623, -48 lines


doc/whats-new.rst

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ What's New
     always be available to python 2.7 users. For more information see the
     following references

-    - `Xarray Github issue discussing dropping Python 2 <https://github.com/pydata/xarray/issues/1829>`__
+    - `Xarray Github issue discussing dropping Python 2 <https://github.com/pydata/xarray/issues/1829>`__
     - `Python 3 Statement <http://www.python3statement.org/>`__
     - `Tips on porting to Python 3 <https://docs.python.org/3/howto/pyporting.html>`__

xarray/backends/api.py

Lines changed: 31 additions & 12 deletions
@@ -10,7 +10,7 @@

 from .. import Dataset, backends, conventions
 from ..core import indexing
-from ..core.combine import auto_combine
+from ..core.combine import _infer_concat_order_from_positions, _auto_combine
 from ..core.pycompat import basestring, path_type
 from ..core.utils import close_on_error, is_remote_uri, is_grib_path
 from .common import ArrayWriter
@@ -485,10 +485,8 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
                    lock=None, data_vars='all', coords='different',
                    autoclose=None, parallel=False, **kwargs):
     """Open multiple files as a single dataset.
-
     Requires dask to be installed. See documentation for details on dask [1].
     Attributes from the first dataset file are used for the combined dataset.
-
     Parameters
     ----------
     paths : str or sequence
@@ -515,7 +513,6 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
         'no_conflicts'}, optional
         String indicating how to compare variables of the same name for
         potential conflicts when merging:
-
         - 'broadcast_equals': all values must be equal when variables are
           broadcast against each other to ensure common dimensions.
         - 'equals': all values and dimensions must be the same.
@@ -578,6 +575,7 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,

     References
     ----------
+
     .. [1] http://xarray.pydata.org/en/stable/dask.html
     .. [2] http://xarray.pydata.org/en/stable/dask.html#chunking-and-performance
     """
@@ -594,6 +592,25 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
     if not paths:
         raise IOError('no files to open')

+    # Coerce 1D input into ND to maintain backwards-compatible API until API
+    # for N-D combine decided
+    # (see https://github.com/pydata/xarray/pull/2553/#issuecomment-445892746)
+    if concat_dim is None or concat_dim == _CONCAT_DIM_DEFAULT:
+        concat_dims = concat_dim
+    elif not isinstance(concat_dim, list):
+        concat_dims = [concat_dim]
+    else:
+        concat_dims = concat_dim
+    infer_order_from_coords = False
+
+    # If infer_order_from_coords=True then this is unnecessary, but quick.
+    # If infer_order_from_coords=False then this creates a flat list which is
+    # easier to iterate over, while saving the originally-supplied structure
+    combined_ids_paths, concat_dims = _infer_concat_order_from_positions(
+        paths, concat_dims)
+    ids, paths = (
+        list(combined_ids_paths.keys()), list(combined_ids_paths.values()))
+
     open_kwargs = dict(engine=engine, chunks=chunks or {}, lock=lock,
                        autoclose=autoclose, **kwargs)
@@ -618,15 +635,17 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
     # the underlying datasets will still be stored as dask arrays
     datasets, file_objs = dask.compute(datasets, file_objs)

-    # close datasets in case of a ValueError
+    # Close datasets in case of a ValueError
     try:
-        if concat_dim is _CONCAT_DIM_DEFAULT:
-            combined = auto_combine(datasets, compat=compat,
-                                    data_vars=data_vars, coords=coords)
-        else:
-            combined = auto_combine(datasets, concat_dim=concat_dim,
-                                    compat=compat,
-                                    data_vars=data_vars, coords=coords)
+        if infer_order_from_coords:
+            # Discard ordering because it should be redone from coordinates
+            ids = False
+
+        combined = _auto_combine(datasets, concat_dims=concat_dims,
+                                 compat=compat,
+                                 data_vars=data_vars, coords=coords,
+                                 infer_order_from_coords=infer_order_from_coords,
+                                 ids=ids)
     except ValueError:
         for ds in datasets:
             ds.close()
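
As a sketch of the new ordering step above, the helper imported at the top of this diff can also be called directly on a nested list of paths (the filenames here are hypothetical): it flattens the structure into tile-ID/path pairs while recording each file's position.

    from xarray.core.combine import _infer_concat_order_from_positions

    paths = [['t0_x0.nc', 't0_x1.nc'],
             ['t1_x0.nc', 't1_x1.nc']]
    combined_ids_paths, concat_dims = _infer_concat_order_from_positions(
        paths, ['time', 'x'])
    print(list(combined_ids_paths.items()))
    # [((0, 0), 't0_x0.nc'), ((0, 1), 't0_x1.nc'),
    #  ((1, 0), 't1_x0.nc'), ((1, 1), 't1_x1.nc')]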

xarray/core/combine.py

Lines changed: 199 additions & 23 deletions
@@ -1,6 +1,8 @@
 from __future__ import absolute_import, division, print_function

 import warnings
+import itertools
+from collections import Counter

 import pandas as pd

@@ -369,24 +371,195 @@ def _auto_concat(datasets, dim=None, data_vars='all', coords='different'):
 _CONCAT_DIM_DEFAULT = '__infer_concat_dim__'


-def auto_combine(datasets,
-                 concat_dim=_CONCAT_DIM_DEFAULT,
-                 compat='no_conflicts',
-                 data_vars='all', coords='different'):
-    """Attempt to auto-magically combine the given datasets into one.
+def _infer_concat_order_from_positions(datasets, concat_dims):
+
+    combined_ids = OrderedDict(_infer_tile_ids_from_nested_list(datasets, ()))
+
+    tile_id, ds = list(combined_ids.items())[0]
+    n_dims = len(tile_id)
+    if concat_dims == _CONCAT_DIM_DEFAULT or concat_dims is None:
+        concat_dims = [concat_dims] * n_dims
+    else:
+        if len(concat_dims) != n_dims:
+            raise ValueError("concat_dims has length {} but the datasets "
+                             "passed are nested in a {}-dimensional "
+                             "structure".format(str(len(concat_dims)),
+                                                str(n_dims)))
+
+    return combined_ids, concat_dims
+
+
+def _infer_tile_ids_from_nested_list(entry, current_pos):
+    """
+    Given a list of lists (of lists...) of objects, returns an iterator
+    which yields a tuple containing the index of each object in the nested
+    list structure as the key, and the object. This can then be called by the
+    dict constructor to create a dictionary of the objects organised by their
+    position in the original nested list.
+
+    Recursively traverses the given structure, while keeping track of the
+    current position. Should work for any type of object which isn't a list.
+
+    Parameters
+    ----------
+    entry : list[list[obj, obj, ...]]
+        List of lists of arbitrary depth, containing objects in the order
+        they are to be concatenated.
+
+    Returns
+    -------
+    combined_tile_ids : dict[tuple(int, ...), obj]
+    """
+
+    if isinstance(entry, list):
+        for i, item in enumerate(entry):
+            for result in _infer_tile_ids_from_nested_list(item,
+                                                           current_pos + (i,)):
+                yield result
+    else:
+        yield current_pos, entry
+
+
+def _check_shape_tile_ids(combined_tile_ids):
+    tile_ids = combined_tile_ids.keys()
+
+    # Check all tuples are the same length
+    # i.e. check that all lists are nested to the same depth
+    nesting_depths = [len(tile_id) for tile_id in tile_ids]
+    if not set(nesting_depths) == {nesting_depths[0]}:
+        raise ValueError("The supplied objects do not form a hypercube "
+                         "because sub-lists do not have consistent depths")
+
+    # Check all lists along one dimension are same length
+    for dim in range(nesting_depths[0]):
+        indices_along_dim = [tile_id[dim] for tile_id in tile_ids]
+        occurrences = Counter(indices_along_dim)
+        if len(set(occurrences.values())) != 1:
+            raise ValueError("The supplied objects do not form a hypercube "
+                             "because sub-lists do not have consistent "
+                             "lengths along dimension " + str(dim))
+
+
+def _combine_nd(combined_ids, concat_dims, data_vars='all',
+                coords='different', compat='no_conflicts'):
+    """
+    Concatenates and merges an N-dimensional structure of datasets.
+
+    No checks are performed on the consistency of the datasets, concat_dims or
+    tile_IDs, because it is assumed that this has already been done.
+
+    Parameters
+    ----------
+    combined_ids : Dict[Tuple[int, ...], xarray.Dataset]
+        Structure containing all datasets to be concatenated with "tile_IDs"
+        as keys, which specify position within the desired final combined
+        result.
+    concat_dims : sequence of str
+        The dimensions along which the datasets should be concatenated. Must
+        be in order, and the length must match the length of the tile_IDs.
+
+    Returns
+    -------
+    combined_ds : xarray.Dataset
+    """
+
+    # Perform N-D dimensional concatenation
+    # Each iteration of this loop reduces the length of the tile_ids tuples
+    # by one. It always combines along the first dimension, removing the first
+    # element of the tuple
+    for concat_dim in concat_dims:
+        combined_ids = _auto_combine_all_along_first_dim(combined_ids,
+                                                         dim=concat_dim,
+                                                         data_vars=data_vars,
+                                                         coords=coords,
+                                                         compat=compat)
+    combined_ds = list(combined_ids.values())[0]
+    return combined_ds
+
+
+def _auto_combine_all_along_first_dim(combined_ids, dim, data_vars,
+                                      coords, compat):
+    # Group into lines of datasets which must be combined along dim
+    # need to sort by _new_tile_id first for groupby to work
+    # TODO remove all these sorted OrderedDicts once python >= 3.6 only
+    combined_ids = OrderedDict(sorted(combined_ids.items(), key=_new_tile_id))
+    grouped = itertools.groupby(combined_ids.items(), key=_new_tile_id)
+
+    new_combined_ids = {}
+    for new_id, group in grouped:
+        combined_ids = OrderedDict(sorted(group))
+        datasets = combined_ids.values()
+        new_combined_ids[new_id] = _auto_combine_1d(datasets, dim, compat,
+                                                    data_vars, coords)
+    return new_combined_ids
+
+
+def _auto_combine_1d(datasets, concat_dim=_CONCAT_DIM_DEFAULT,
+                     compat='no_conflicts',
+                     data_vars='all', coords='different'):
+    # This is just the old auto_combine function (which only worked along 1D)
+    if concat_dim is not None:
+        dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim
+        grouped = itertools.groupby(datasets, key=lambda ds: tuple(sorted(ds)))
+        concatenated = [_auto_concat(list(ds_group), dim=dim,
+                                     data_vars=data_vars, coords=coords)
+                        for id, ds_group in grouped]
+    else:
+        concatenated = datasets
+    merged = merge(concatenated, compat=compat)
+    return merged
+
+
+def _new_tile_id(single_id_ds_pair):
+    tile_id, ds = single_id_ds_pair
+    return tile_id[1:]
+
+
+def _auto_combine(datasets, concat_dims, compat, data_vars, coords,
+                  infer_order_from_coords, ids):
+    """
+    Calls logic to decide concatenation order before concatenating.
+    """
+
+    # Arrange datasets for concatenation
+    if infer_order_from_coords:
+        raise NotImplementedError
+        # TODO Use coordinates to determine tile_ID for each dataset in N-D
+        # Ignore how they were ordered previously
+        # Should look like:
+        # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets,
+        #                                                         concat_dims)
+    else:
+        # Use information from the shape of the user input
+        if not ids:
+            # Determine tile_IDs by structure of input in N-D
+            # (i.e. ordering in list-of-lists)
+            combined_ids, concat_dims = _infer_concat_order_from_positions(
+                datasets, concat_dims)
+        else:
+            # Already sorted so just use the ids already passed
+            combined_ids = OrderedDict(zip(ids, datasets))
+
+    # Check that the inferred shape is combinable
+    _check_shape_tile_ids(combined_ids)

+    # Repeatedly concatenate then merge along each dimension
+    combined = _combine_nd(combined_ids, concat_dims, compat=compat,
+                           data_vars=data_vars, coords=coords)
+    return combined
+
+
+def auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT,
+                 compat='no_conflicts', data_vars='all', coords='different'):
+    """Attempt to auto-magically combine the given datasets into one.

     This method attempts to combine a list of datasets into a single entity by
     inspecting metadata and using a combination of concat and merge.
-
     It does not concatenate along more than one dimension or sort data under
     any circumstances. It does align coordinates, but different variables on
     datasets can cause it to fail under some scenarios. In complex cases, you
     may need to clean up your data and use ``concat``/``merge`` explicitly.
-
     ``auto_combine`` works well if you have N years of data and M data
     variables, and each combination of a distinct time period and set of data
     variables is saved as its own dataset.
-
     Parameters
     ----------
     datasets : sequence of xarray.Dataset
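
A quick sketch of how the helpers above fit together: _infer_tile_ids_from_nested_list assigns each object its position in the nested list, and _check_shape_tile_ids rejects ragged input before any concatenation is attempted (string placeholders stand in for datasets here).

    from collections import OrderedDict
    from xarray.core.combine import (_check_shape_tile_ids,
                                     _infer_tile_ids_from_nested_list)

    ragged = [['ds00', 'ds01'], ['ds10']]  # second row has only one tile
    tile_ids = OrderedDict(_infer_tile_ids_from_nested_list(ragged, ()))
    print(list(tile_ids))  # [(0, 0), (0, 1), (1, 0)]
    try:
        _check_shape_tile_ids(tile_ids)
    except ValueError as err:
        print(err)  # the supplied objects do not form a hypercube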
@@ -404,7 +577,6 @@ def auto_combine(datasets,
         'no_conflicts'}, optional
         String indicating how to compare variables of the same name for
         potential conflicts:
-
         - 'broadcast_equals': all values must be equal when variables are
           broadcast against each other to ensure common dimensions.
         - 'equals': all values and dimensions must be the same.
@@ -415,9 +587,8 @@ def auto_combine(datasets,
         of all non-null values.
     data_vars : {'minimal', 'different', 'all' or list of str}, optional
         Details are in the documentation of concat
-    coords : {'minimal', 'different', 'all' o list of str}, optional
-        Details are in the documentation of concat
-
+    coords : {'minimal', 'different', 'all' or list of str}, optional
+        Details are in the documentation of concat
     Returns
     -------
     combined : xarray.Dataset
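
The reduction in _combine_nd above is driven by _new_tile_id, which simply drops the leading index from each tile ID, so each pass over a concat_dim collapses one dimension of the grid. A tiny sketch, again with strings standing in for datasets:

    from xarray.core.combine import _new_tile_id

    pairs = [((0, 0), 'ds00'), ((0, 1), 'ds01'),
             ((1, 0), 'ds10'), ((1, 1), 'ds11')]
    # After one combine pass, the 2x2 grid collapses to two groups of tiles,
    # keyed by what remains of their tile IDs
    print(sorted({_new_tile_id(p) for p in pairs}))  # [(0,), (1,)]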
@@ -427,15 +598,20 @@ def auto_combine(datasets,
     concat
     Dataset.merge
     """
-    from toolz import itertoolz
-    if concat_dim is not None:
-        dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim
-        grouped = itertoolz.groupby(lambda ds: tuple(sorted(ds.data_vars)),
-                                    datasets).values()
-        concatenated = [_auto_concat(ds, dim=dim,
-                                     data_vars=data_vars, coords=coords)
-                        for ds in grouped]
+
+    # Coerce 1D input into ND to maintain backwards-compatible API until API
+    # for N-D combine decided
+    # (see https://github.com/pydata/xarray/pull/2553/#issuecomment-445892746)
+    if concat_dim is None or concat_dim == _CONCAT_DIM_DEFAULT:
+        concat_dims = concat_dim
+    elif not isinstance(concat_dim, list):
+        concat_dims = [concat_dim]
     else:
-        concatenated = datasets
-    merged = merge(concatenated, compat=compat)
-    return merged
+        concat_dims = concat_dim
+    infer_order_from_coords = False
+
+    # The ids argument tells _auto_combine that the datasets are not yet sorted
+    return _auto_combine(datasets, concat_dims=concat_dims, compat=compat,
+                         data_vars=data_vars, coords=coords,
+                         infer_order_from_coords=infer_order_from_coords,
+                         ids=False)
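
Because the N-D generalisation is hidden behind the old signature, the public auto_combine keeps its 1D behaviour. A minimal sketch of the unchanged usage (toy datasets; this assumes an xarray version of this era, where auto_combine is still exported):

    import numpy as np
    import xarray as xr

    parts = [xr.Dataset({'v': ('t', np.arange(2))}),
             xr.Dataset({'v': ('t', np.arange(2, 5))})]
    combined = xr.auto_combine(parts, concat_dim='t')
    print(combined['v'].shape)  # (5,)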

xarray/testing.py

Lines changed: 8 additions & 0 deletions
@@ -138,3 +138,11 @@ def assert_allclose(a, b, rtol=1e-05, atol=1e-08, decode_bytes=True):
     else:
         raise TypeError('{} not supported by assertion comparison'
                         .format(type(a)))
+
+
+def assert_combined_tile_ids_equal(dict1, dict2):
+    assert len(dict1) == len(dict2)
+    for k, v in dict1.items():
+        assert k in dict2.keys()
+        assert_equal(dict1[k], dict2[k])
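
A sketch of the new helper in use, with a trivially-equal pair of tile-ID dictionaries (the dataset is a placeholder): it checks that both dictionaries have the same keys and hold equal datasets under each key.

    import numpy as np
    import xarray as xr
    from xarray.testing import assert_combined_tile_ids_equal

    ds = xr.Dataset({'v': ('x', np.arange(3))})
    expected = {(0,): ds, (1,): ds}
    actual = {(0,): ds, (1,): ds}
    assert_combined_tile_ids_equal(expected, actual)  # passes silently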

xarray/tests/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
 from xarray.core import utils
 from xarray.core.indexing import ExplicitlyIndexed
 from xarray.testing import (assert_equal, assert_identical,  # noqa: F401
-                            assert_allclose)
+                            assert_allclose, assert_combined_tile_ids_equal)
 from xarray.plot.utils import import_seaborn

 try:
