Skip to content

Commit 5aaa654

Browse files
zdgriffith authored and shoyer committed
[WIP] Custom fill value for reindex, align, and merge operations (pydata#2920)
* add fill_value option to align and reindex functions
* add fill_value tests for reindex and align
* add fill_value option for merge functions
* add tests for fill_value merge implementation
* implement and test fill_value option in dataarray reindex methods
* fix PEP 8 issue
* move function signature onto function
* Add fill_value enhancement note
1 parent ccd0b04 commit 5aaa654

File tree

8 files changed

+159
-48
lines changed

8 files changed

+159
-48
lines changed

doc/whats-new.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ v0.12.2 (unreleased)
2121
Enhancements
2222
~~~~~~~~~~~~
2323

24+
- Add ``fill_value`` argument for reindex, align, and merge operations
25+
to enable custom fill values. (:issue:`2876`)
26+
By `Zach Griffith <https://github.com/zdgriffith>`_.
2427
- Character arrays' character dimension name decoding and encoding handled by
2528
``var.encoding['char_dim_name']`` (:issue:`2895`)
2629
By `James McCreight <https://github.com/jmccreight>`_.

xarray/core/alignment.py

Lines changed: 19 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import numpy as np
99
import pandas as pd
1010

11-
from . import utils
11+
from . import utils, dtypes
1212
from .indexing import get_indexer_nd
1313
from .utils import is_dict_like, is_full_slice
1414
from .variable import IndexVariable, Variable
@@ -31,20 +31,17 @@ def _get_joiner(join):
3131
raise ValueError('invalid value for join: %s' % join)
3232

3333

34-
_DEFAULT_EXCLUDE = frozenset() # type: frozenset
35-
36-
37-
def align(*objects, **kwargs):
38-
"""align(*objects, join='inner', copy=True, indexes=None,
39-
exclude=frozenset())
40-
34+
def align(*objects, join='inner', copy=True, indexes=None, exclude=frozenset(),
35+
fill_value=dtypes.NA):
36+
"""
4137
Given any number of Dataset and/or DataArray objects, returns new
4238
objects with aligned indexes and dimension sizes.
4339
4440
Array from the aligned objects are suitable as input to mathematical
4541
operators, because along each dimension they have the same index and size.
4642
47-
Missing values (if ``join != 'inner'``) are filled with NaN.
43+
Missing values (if ``join != 'inner'``) are filled with ``fill_value``.
44+
The default fill value is NaN.
4845
4946
Parameters
5047
----------
@@ -65,11 +62,13 @@ def align(*objects, **kwargs):
6562
``copy=False`` and reindexing is unnecessary, or can be performed with
6663
only slice operations, then the output may share memory with the input.
6764
In either case, new xarray objects are always returned.
68-
exclude : sequence of str, optional
69-
Dimensions that must be excluded from alignment
7065
indexes : dict-like, optional
7166
Any indexes explicitly provided with the `indexes` argument should be
7267
used in preference to the aligned indexes.
68+
exclude : sequence of str, optional
69+
Dimensions that must be excluded from alignment
70+
fill_value : scalar, optional
71+
Value to use for newly missing values
7372
7473
Returns
7574
-------
@@ -82,15 +81,8 @@ def align(*objects, **kwargs):
8281
If any dimensions without labels on the arguments have different sizes,
8382
or a different size than the size of the aligned dimension labels.
8483
"""
85-
join = kwargs.pop('join', 'inner')
86-
copy = kwargs.pop('copy', True)
87-
indexes = kwargs.pop('indexes', None)
88-
exclude = kwargs.pop('exclude', _DEFAULT_EXCLUDE)
8984
if indexes is None:
9085
indexes = {}
91-
if kwargs:
92-
raise TypeError('align() got unexpected keyword arguments: %s'
93-
% list(kwargs))
9486

9587
if not indexes and len(objects) == 1:
9688
# fast path for the trivial case
@@ -162,15 +154,17 @@ def align(*objects, **kwargs):
162154
# fast path for no reindexing necessary
163155
new_obj = obj.copy(deep=copy)
164156
else:
165-
new_obj = obj.reindex(copy=copy, **valid_indexers)
157+
new_obj = obj.reindex(copy=copy, fill_value=fill_value,
158+
**valid_indexers)
166159
new_obj.encoding = obj.encoding
167160
result.append(new_obj)
168161

169162
return tuple(result)
170163

171164

172165
def deep_align(objects, join='inner', copy=True, indexes=None,
173-
exclude=frozenset(), raise_on_invalid=True):
166+
exclude=frozenset(), raise_on_invalid=True,
167+
fill_value=dtypes.NA):
174168
"""Align objects for merging, recursing into dictionary values.
175169
176170
This function is not public API.
@@ -214,7 +208,7 @@ def is_alignable(obj):
214208
out.append(variables)
215209

216210
aligned = align(*targets, join=join, copy=copy, indexes=indexes,
217-
exclude=exclude)
211+
exclude=exclude, fill_value=fill_value)
218212

219213
for position, key, aligned_obj in zip(positions, keys, aligned):
220214
if key is no_key:
@@ -270,6 +264,7 @@ def reindex_variables(
270264
method: Optional[str] = None,
271265
tolerance: Any = None,
272266
copy: bool = True,
267+
fill_value: Optional[Any] = dtypes.NA,
273268
) -> 'Tuple[OrderedDict[Any, Variable], OrderedDict[Any, pd.Index]]':
274269
"""Conform a dictionary of aligned variables onto a new set of variables,
275270
filling in missing values with ``fill_value`` (NaN by default).
@@ -305,6 +300,8 @@ def reindex_variables(
305300
``copy=False`` and reindexing is unnecessary, or can be performed
306301
with only slice operations, then the output may share memory with
307302
the input. In either case, new xarray objects are always returned.
303+
fill_value : scalar, optional
304+
Value to use for newly missing values
308305
309306
Returns
310307
-------
@@ -380,7 +377,7 @@ def reindex_variables(
380377
needs_masking = any(d in masked_dims for d in var.dims)
381378

382379
if needs_masking:
383-
new_var = var._getitem_with_mask(key)
380+
new_var = var._getitem_with_mask(key, fill_value=fill_value)
384381
elif all(is_full_slice(k) for k in key):
385382
# no reindexing necessary
386383
# here we need to manually deal with copying data, since

xarray/core/dataarray.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -879,9 +879,10 @@ def sel_points(self, dim='points', method=None, tolerance=None,
879879
dim=dim, method=method, tolerance=tolerance, **indexers)
880880
return self._from_temp_dataset(ds)
881881

882-
def reindex_like(self, other, method=None, tolerance=None, copy=True):
883-
"""Conform this object onto the indexes of another object, filling
884-
in missing values with NaN.
882+
def reindex_like(self, other, method=None, tolerance=None, copy=True,
883+
fill_value=dtypes.NA):
884+
"""Conform this object onto the indexes of another object, filling in
885+
missing values with ``fill_value``. The default fill value is NaN.
885886
886887
Parameters
887888
----------
@@ -910,6 +911,8 @@ def reindex_like(self, other, method=None, tolerance=None, copy=True):
910911
``copy=False`` and reindexing is unnecessary, or can be performed
911912
with only slice operations, then the output may share memory with
912913
the input. In either case, a new xarray object is always returned.
914+
fill_value : scalar, optional
915+
Value to use for newly missing values
913916
914917
Returns
915918
-------
@@ -924,12 +927,12 @@ def reindex_like(self, other, method=None, tolerance=None, copy=True):
924927
"""
925928
indexers = reindex_like_indexers(self, other)
926929
return self.reindex(method=method, tolerance=tolerance, copy=copy,
927-
**indexers)
930+
fill_value=fill_value, **indexers)
928931

929932
def reindex(self, indexers=None, method=None, tolerance=None, copy=True,
930-
**indexers_kwargs):
931-
"""Conform this object onto a new set of indexes, filling in
932-
missing values with NaN.
933+
fill_value=dtypes.NA, **indexers_kwargs):
934+
"""Conform this object onto the indexes of another object, filling in
935+
missing values with ``fill_value``. The default fill value is NaN.
933936
934937
Parameters
935938
----------
@@ -956,6 +959,8 @@ def reindex(self, indexers=None, method=None, tolerance=None, copy=True,
956959
Maximum distance between original and new labels for inexact
957960
matches. The values of the index at the matching locations must
958961
satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
962+
fill_value : scalar, optional
963+
Value to use for newly missing values
959964
**indexers_kwargs : {dim: indexer, ...}, optional
960965
The keyword arguments form of ``indexers``.
961966
One of indexers or indexers_kwargs must be provided.
@@ -974,7 +979,8 @@ def reindex(self, indexers=None, method=None, tolerance=None, copy=True,
974979
indexers = either_dict_or_kwargs(
975980
indexers, indexers_kwargs, 'reindex')
976981
ds = self._to_temp_dataset().reindex(
977-
indexers=indexers, method=method, tolerance=tolerance, copy=copy)
982+
indexers=indexers, method=method, tolerance=tolerance, copy=copy,
983+
fill_value=fill_value)
978984
return self._from_temp_dataset(ds)
979985

980986
def interp(self, coords=None, method='linear', assume_sorted=False,

xarray/core/dataset.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1932,9 +1932,10 @@ def sel_points(self, dim='points', method=None, tolerance=None,
19321932
)
19331933
return self.isel_points(dim=dim, **pos_indexers)
19341934

1935-
def reindex_like(self, other, method=None, tolerance=None, copy=True):
1936-
"""Conform this object onto the indexes of another object, filling
1937-
in missing values with NaN.
1935+
def reindex_like(self, other, method=None, tolerance=None, copy=True,
1936+
fill_value=dtypes.NA):
1937+
"""Conform this object onto the indexes of another object, filling in
1938+
missing values with ``fill_value``. The default fill value is NaN.
19381939
19391940
Parameters
19401941
----------
@@ -1963,6 +1964,8 @@ def reindex_like(self, other, method=None, tolerance=None, copy=True):
19631964
``copy=False`` and reindexing is unnecessary, or can be performed
19641965
with only slice operations, then the output may share memory with
19651966
the input. In either case, a new xarray object is always returned.
1967+
fill_value : scalar, optional
1968+
Value to use for newly missing values
19661969
19671970
Returns
19681971
-------
@@ -1977,12 +1980,12 @@ def reindex_like(self, other, method=None, tolerance=None, copy=True):
19771980
"""
19781981
indexers = alignment.reindex_like_indexers(self, other)
19791982
return self.reindex(indexers=indexers, method=method, copy=copy,
1980-
tolerance=tolerance)
1983+
fill_value=fill_value, tolerance=tolerance)
19811984

19821985
def reindex(self, indexers=None, method=None, tolerance=None, copy=True,
1983-
**indexers_kwargs):
1986+
fill_value=dtypes.NA, **indexers_kwargs):
19841987
"""Conform this object onto a new set of indexes, filling in
1985-
missing values with NaN.
1988+
missing values with ``fill_value``. The default fill value is NaN.
19861989
19871990
Parameters
19881991
----------
@@ -2010,6 +2013,8 @@ def reindex(self, indexers=None, method=None, tolerance=None, copy=True,
20102013
``copy=False`` and reindexing is unnecessary, or can be performed
20112014
with only slice operations, then the output may share memory with
20122015
the input. In either case, a new xarray object is always returned.
2016+
fill_value : scalar, optional
2017+
Value to use for newly missing values
20132018
**indexers_kwargs : {dim: indexer, ...}, optional
20142019
Keyword arguments in the same form as ``indexers``.
20152020
One of indexers or indexers_kwargs must be provided.
@@ -2034,7 +2039,7 @@ def reindex(self, indexers=None, method=None, tolerance=None, copy=True,
20342039

20352040
variables, indexes = alignment.reindex_variables(
20362041
self.variables, self.sizes, self.indexes, indexers, method,
2037-
tolerance, copy=copy)
2042+
tolerance, copy=copy, fill_value=fill_value)
20382043
coord_names = set(self._coord_names)
20392044
coord_names.update(indexers)
20402045
return self._replace_with_new_dims(
@@ -2752,7 +2757,7 @@ def update(self, other, inplace=None):
27522757
inplace=inplace)
27532758

27542759
def merge(self, other, inplace=None, overwrite_vars=frozenset(),
2755-
compat='no_conflicts', join='outer'):
2760+
compat='no_conflicts', join='outer', fill_value=dtypes.NA):
27562761
"""Merge the arrays of two datasets into a single dataset.
27572762
27582763
This method generally does not allow for overriding data, with the exception
@@ -2790,6 +2795,8 @@ def merge(self, other, inplace=None, overwrite_vars=frozenset(),
27902795
- 'left': use indexes from ``self``
27912796
- 'right': use indexes from ``other``
27922797
- 'exact': error instead of aligning non-equal indexes
2798+
fill_value : scalar, optional
2799+
Value to use for newly missing values
27932800
27942801
Returns
27952802
-------
@@ -2804,7 +2811,7 @@ def merge(self, other, inplace=None, overwrite_vars=frozenset(),
28042811
inplace = _check_inplace(inplace)
28052812
variables, coord_names, dims = dataset_merge_method(
28062813
self, other, overwrite_vars=overwrite_vars, compat=compat,
2807-
join=join)
2814+
join=join, fill_value=fill_value)
28082815

28092816
return self._replace_vars_and_dims(variables, coord_names, dims,
28102817
inplace=inplace)

xarray/core/merge.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import pandas as pd
66

7+
from . import dtypes
78
from .alignment import deep_align
89
from .pycompat import TYPE_CHECKING
910
from .utils import Frozen
@@ -349,7 +350,7 @@ def expand_and_merge_variables(objs, priority_arg=None):
349350

350351

351352
def merge_coords(objs, compat='minimal', join='outer', priority_arg=None,
352-
indexes=None):
353+
indexes=None, fill_value=dtypes.NA):
353354
"""Merge coordinate variables.
354355
355356
See merge_core below for argument descriptions. This works similarly to
@@ -358,7 +359,8 @@ def merge_coords(objs, compat='minimal', join='outer', priority_arg=None,
358359
"""
359360
_assert_compat_valid(compat)
360361
coerced = coerce_pandas_values(objs)
361-
aligned = deep_align(coerced, join=join, copy=False, indexes=indexes)
362+
aligned = deep_align(coerced, join=join, copy=False, indexes=indexes,
363+
fill_value=fill_value)
362364
expanded = expand_variable_dicts(aligned)
363365
priority_vars = _get_priority_vars(aligned, priority_arg, compat=compat)
364366
variables = merge_variables(expanded, priority_vars, compat=compat)
@@ -404,7 +406,8 @@ def merge_core(objs,
404406
join='outer',
405407
priority_arg=None,
406408
explicit_coords=None,
407-
indexes=None):
409+
indexes=None,
410+
fill_value=dtypes.NA):
408411
"""Core logic for merging labeled objects.
409412
410413
This is not public API.
@@ -423,6 +426,8 @@ def merge_core(objs,
423426
An explicit list of variables from `objs` that are coordinates.
424427
indexes : dict, optional
425428
Dictionary with values given by pandas.Index objects.
429+
fill_value : scalar, optional
430+
Value to use for newly missing values
426431
427432
Returns
428433
-------
@@ -442,7 +447,8 @@ def merge_core(objs,
442447
_assert_compat_valid(compat)
443448

444449
coerced = coerce_pandas_values(objs)
445-
aligned = deep_align(coerced, join=join, copy=False, indexes=indexes)
450+
aligned = deep_align(coerced, join=join, copy=False, indexes=indexes,
451+
fill_value=fill_value)
446452
expanded = expand_variable_dicts(aligned)
447453

448454
coord_names, noncoord_names = determine_coords(coerced)
@@ -470,7 +476,7 @@ def merge_core(objs,
470476
return variables, coord_names, dict(dims)
471477

472478

473-
def merge(objects, compat='no_conflicts', join='outer'):
479+
def merge(objects, compat='no_conflicts', join='outer', fill_value=dtypes.NA):
474480
"""Merge any number of xarray objects into a single Dataset as variables.
475481
476482
Parameters
@@ -492,6 +498,8 @@ def merge(objects, compat='no_conflicts', join='outer'):
492498
of all non-null values.
493499
join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
494500
How to combine objects with different indexes.
501+
fill_value : scalar, optional
502+
Value to use for newly missing values
495503
496504
Returns
497505
-------
@@ -529,15 +537,17 @@ def merge(objects, compat='no_conflicts', join='outer'):
529537
obj.to_dataset() if isinstance(obj, DataArray) else obj
530538
for obj in objects]
531539

532-
variables, coord_names, dims = merge_core(dict_like_objects, compat, join)
540+
variables, coord_names, dims = merge_core(dict_like_objects, compat, join,
541+
fill_value=fill_value)
533542
# TODO: don't always recompute indexes
534543
merged = Dataset._construct_direct(
535544
variables, coord_names, dims, indexes=None)
536545

537546
return merged
538547

539548

540-
def dataset_merge_method(dataset, other, overwrite_vars, compat, join):
549+
def dataset_merge_method(dataset, other, overwrite_vars, compat, join,
550+
fill_value=dtypes.NA):
541551
"""Guts of the Dataset.merge method."""
542552

543553
# we are locked into supporting overwrite_vars for the Dataset.merge
@@ -565,7 +575,8 @@ def dataset_merge_method(dataset, other, overwrite_vars, compat, join):
565575
objs = [dataset, other_no_overwrite, other_overwrite]
566576
priority_arg = 2
567577

568-
return merge_core(objs, compat, join, priority_arg=priority_arg)
578+
return merge_core(objs, compat, join, priority_arg=priority_arg,
579+
fill_value=fill_value)
569580

570581

571582
def dataset_update_method(dataset, other):

0 commit comments

Comments
 (0)