1
1
from __future__ import absolute_import , division , print_function
2
2
3
3
import warnings
4
+ import itertools
5
+ from collections import Counter
4
6
5
7
import pandas as pd
6
8
@@ -369,24 +371,195 @@ def _auto_concat(datasets, dim=None, data_vars='all', coords='different'):
369
371
_CONCAT_DIM_DEFAULT = '__infer_concat_dim__'
370
372
371
373
372
- def auto_combine (datasets ,
373
- concat_dim = _CONCAT_DIM_DEFAULT ,
374
- compat = 'no_conflicts' ,
375
- data_vars = 'all' , coords = 'different' ):
376
- """Attempt to auto-magically combine the given datasets into one.
374
def _infer_concat_order_from_positions(datasets, concat_dims):
    """Assign tile IDs from the nested-list layout of ``datasets``.

    Builds an OrderedDict mapping tile-ID tuples (positions within the
    nested-list structure) to datasets, and normalises ``concat_dims``
    to contain exactly one entry per level of nesting.

    Raises a ValueError if an explicit ``concat_dims`` does not match
    the nesting depth of the input.
    """
    combined_ids = OrderedDict(_infer_tile_ids_from_nested_list(datasets, ()))

    # The depth of nesting equals the number of dimensions to combine along,
    # so inspect any one tile ID (they all have the same length here).
    a_tile_id = next(iter(combined_ids))
    n_dims = len(a_tile_id)
    if concat_dims == _CONCAT_DIM_DEFAULT or concat_dims is None:
        # No explicit dims given: repeat the sentinel/None for every level
        concat_dims = [concat_dims] * n_dims
    elif len(concat_dims) != n_dims:
        raise ValueError("concat_dims has length {} but the datasets "
                         "passed are nested in a {}-dimensional "
                         "structure".format(str(len(concat_dims)),
                                            str(n_dims)))

    return combined_ids, concat_dims
390
+
391
+
392
+ def _infer_tile_ids_from_nested_list (entry , current_pos ):
393
+ """
394
+ Given a list of lists (of lists...) of objects, returns a iterator
395
+ which returns a tuple containing the index of each object in the nested
396
+ list structure as the key, and the object. This can then be called by the
397
+ dict constructor to create a dictionary of the objects organised by their
398
+ position in the original nested list.
399
+
400
+ Recursively traverses the given structure, while keeping track of the
401
+ current position. Should work for any type of object which isn't a list.
402
+
403
+ Parameters
404
+ ----------
405
+ entry : list[list[obj, obj, ...]]
406
+ List of lists of arbitrary depth, containing objects in the order
407
+ they are to be concatenated.
408
+
409
+ Returns
410
+ -------
411
+ combined_tile_ids : dict[tuple(int, ...), obj]
412
+ """
413
+
414
+ if isinstance (entry , list ):
415
+ for i , item in enumerate (entry ):
416
+ for result in _infer_tile_ids_from_nested_list (item ,
417
+ current_pos + (i ,)):
418
+ yield result
419
+ else :
420
+ yield current_pos , entry
421
+
422
+
423
+ def _check_shape_tile_ids (combined_tile_ids ):
424
+ tile_ids = combined_tile_ids .keys ()
425
+
426
+ # Check all tuples are the same length
427
+ # i.e. check that all lists are nested to the same depth
428
+ nesting_depths = [len (tile_id ) for tile_id in tile_ids ]
429
+ if not set (nesting_depths ) == {nesting_depths [0 ]}:
430
+ raise ValueError ("The supplied objects do not form a hypercube because"
431
+ " sub-lists do not have consistent depths" )
432
+
433
+ # Check all lists along one dimension are same length
434
+ for dim in range (nesting_depths [0 ]):
435
+ indices_along_dim = [tile_id [dim ] for tile_id in tile_ids ]
436
+ occurrences = Counter (indices_along_dim )
437
+ if len (set (occurrences .values ())) != 1 :
438
+ raise ValueError ("The supplied objects do not form a hypercube "
439
+ "because sub-lists do not have consistent "
440
+ "lengths along dimension" + str (dim ))
441
+
442
+
443
def _combine_nd(combined_ids, concat_dims, data_vars='all',
                coords='different', compat='no_conflicts'):
    """
    Concatenates and merges an N-dimensional structure of datasets.

    No checks are performed on the consistency of the datasets, concat_dims or
    tile_IDs, because it is assumed that this has already been done.

    Parameters
    ----------
    combined_ids : Dict[Tuple[int, ...]], xarray.Dataset]
        Structure containing all datasets to be concatenated with "tile_IDs" as
        keys, which specify position within the desired final combined result.
    concat_dims : sequence of str
        The dimensions along which the datasets should be concatenated. Must be
        in order, and the length must match

    Returns
    -------
    combined_ds : xarray.Dataset
    """

    # Collapse one dimension per pass: each call combines every group of
    # datasets along the first remaining dimension and strips the leading
    # element from the tile-ID tuples, so the tuples shrink by one each time.
    current_ids = combined_ids
    for concat_dim in concat_dims:
        current_ids = _auto_combine_all_along_first_dim(
            current_ids, dim=concat_dim, data_vars=data_vars,
            coords=coords, compat=compat)

    # A single fully-combined dataset remains once all dims are collapsed
    return next(iter(current_ids.values()))
477
+
478
+
479
def _auto_combine_all_along_first_dim(combined_ids, dim, data_vars,
                                      coords, compat):
    """Combine, along ``dim``, every group of datasets whose tile IDs differ
    only in their first element; the result's keys drop that leading index.
    """
    # Group into lines of datasets which must be combined along dim.
    # groupby only groups adjacent items, so sort by _new_tile_id first.
    # TODO remove all these sorted OrderedDicts once python >= 3.6 only
    sorted_pairs = sorted(combined_ids.items(), key=_new_tile_id)
    grouped = itertools.groupby(sorted_pairs, key=_new_tile_id)

    new_combined_ids = {}
    for new_id, group in grouped:
        # Order the members of each line by their full tile ID before
        # concatenating them
        line = OrderedDict(sorted(group))
        new_combined_ids[new_id] = _auto_combine_1d(
            line.values(), dim, compat, data_vars, coords)
    return new_combined_ids
494
+
495
+
496
def _auto_combine_1d(datasets, concat_dim=_CONCAT_DIM_DEFAULT,
                     compat='no_conflicts',
                     data_vars='all', coords='different'):
    """Combine datasets along at most one dimension.

    This is the logic of the original (1-D only) ``auto_combine``: datasets
    are grouped by their sorted variable names, each group is concatenated
    along ``concat_dim``, and the per-group results are merged together.

    Parameters
    ----------
    datasets : iterable of xarray.Dataset
        Datasets to combine, already in concatenation order.
    concat_dim : str or None, optional
        Dimension to concatenate along; the _CONCAT_DIM_DEFAULT sentinel
        means "let _auto_concat infer it", and None disables concatenation
        entirely (datasets are only merged).
    compat, data_vars, coords : str, optional
        Passed through to ``merge`` / ``_auto_concat``.
    """
    if concat_dim is not None:
        dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim
        grouped = itertools.groupby(datasets, key=lambda ds: tuple(sorted(ds)))
        # BUGFIX: the group key previously shadowed the builtin `id`;
        # it is unused, so discard it as `_`
        concatenated = [_auto_concat(list(ds_group), dim=dim,
                                     data_vars=data_vars, coords=coords)
                        for _, ds_group in grouped]
    else:
        concatenated = datasets
    merged = merge(concatenated, compat=compat)
    return merged
510
+
511
+
512
+ def _new_tile_id (single_id_ds_pair ):
513
+ tile_id , ds = single_id_ds_pair
514
+ return tile_id [1 :]
515
+
516
+
517
def _auto_combine(datasets, concat_dims, compat, data_vars, coords,
                  infer_order_from_coords, ids):
    """
    Calls logic to decide concatenation order before concatenating.
    """

    # Arrange datasets for concatenation
    if infer_order_from_coords:
        # TODO Use coordinates to determine tile_ID for each dataset in N-D
        # Ignore how they were ordered previously
        # Should look like:
        # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets,
        #                                                         concat_dims)
        raise NotImplementedError

    # Use information from the shape of the user input
    if ids:
        # Already sorted so just use the ids already passed
        combined_ids = OrderedDict(zip(ids, datasets))
    else:
        # Determine tile_IDs by structure of input in N-D
        # (i.e. ordering in list-of-lists)
        combined_ids, concat_dims = _infer_concat_order_from_positions(
            datasets, concat_dims)

    # Check that the inferred shape is combinable
    _check_shape_tile_ids(combined_ids)

    # Repeatedly concatenate then merge along each dimension
    return _combine_nd(combined_ids, concat_dims, compat=compat,
                       data_vars=data_vars, coords=coords)
549
+
550
+
551
def auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT,
                 compat='no_conflicts', data_vars='all', coords='different'):
    """Attempt to auto-magically combine the given datasets into one.

    This method attempts to combine a list of datasets into a single entity by
    inspecting metadata and using a combination of concat and merge.
    It does not concatenate along more than one dimension or sort data under
    any circumstances. It does align coordinates, but different variables on
    datasets can cause it to fail under some scenarios. In complex cases, you
    may need to clean up your data and use ``concat``/``merge`` explicitly.
    ``auto_combine`` works well if you have N years of data and M data
    variables, and each combination of a distinct time period and set of data
    variables is saved its own dataset.

    Parameters
    ----------
    datasets : sequence of xarray.Dataset
        Dataset objects to merge.
    concat_dim : str, optional
        Dimension along which to concatenate; by default it is inferred.
        Set to None to disable concatenation and only merge.
    compat : {'identical', 'equals', 'broadcast_equals',
              'no_conflicts'}, optional
        String indicating how to compare variables of the same name for
        potential conflicts:

        - 'broadcast_equals': all values must be equal when variables are
          broadcast against each other to ensure common dimensions.
        - 'equals': all values and dimensions must be the same.
        - 'identical': all values, dimensions and attributes must be the
          same.
        - 'no_conflicts': only values which are not null in both datasets
          must be equal. The returned dataset then contains the combination
          of all non-null values.
    data_vars : {'minimal', 'different', 'all' or list of str}, optional
        Details are in the documentation of concat
    coords : {'minimal', 'different', 'all' or list of str}, optional
        Details are in the documentation of concat

    Returns
    -------
    combined : xarray.Dataset

    See also
    --------
    concat
    Dataset.merge
    """

    # Coerce 1D input into ND to maintain backwards-compatible API until API
    # for N-D combine decided
    # (see https://github.com/pydata/xarray/pull/2553/#issuecomment-445892746)
    if concat_dim is None or concat_dim == _CONCAT_DIM_DEFAULT:
        concat_dims = concat_dim
    elif not isinstance(concat_dim, list):
        # A single dimension: wrap it so _auto_combine sees a 1-element list
        concat_dims = [concat_dim]
    else:
        concat_dims = concat_dim

    # Ordering by coordinate values is not implemented yet (see _auto_combine)
    infer_order_from_coords = False

    # The IDs argument tells _auto_combine that the datasets are not yet sorted
    return _auto_combine(datasets, concat_dims=concat_dims, compat=compat,
                         data_vars=data_vars, coords=coords,
                         infer_order_from_coords=infer_order_from_coords,
                         ids=False)
0 commit comments