
sparse=True option for from_dataframe and from_series #3210


Merged · 5 commits · Aug 27, 2019
6 changes: 6 additions & 0 deletions doc/whats-new.rst
@@ -32,6 +32,12 @@ New functions/methods
By `Nezar Abdennur <https://github.com/nvictus>`_
and `Guido Imperiale <https://github.com/crusaderky>`_.

- :py:meth:`~Dataset.from_dataframe` and :py:meth:`~DataArray.from_series` now
support ``sparse=True`` for converting pandas objects into xarray objects
wrapping sparse arrays. This is particularly useful with sparsely populated
hierarchical indexes. (:issue:`3206`)
By `Stephan Hoyer <https://github.com/shoyer>`_.
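As a quick illustration of the new option, a minimal sketch (assumes the pydata/sparse package is installed; the output comment is indicative):

    import pandas as pd
    import xarray as xr

    # Only 2 of the 4 possible (x, y) combinations are populated.
    index = pd.MultiIndex.from_tuples([("a", 0), ("b", 1)], names=["x", "y"])
    series = pd.Series([1.0, 2.0], index=index)

    arr = xr.DataArray.from_series(series, sparse=True)
    print(type(arr.data))  # a sparse.COO array instead of a dense numpy array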

- The xarray package is now discoverable by mypy (although typing hints coverage is not
complete yet). mypy type checking is now enforced by CI. Libraries that depend on
xarray and use mypy can now remove from their setup.cfg the lines::
28 changes: 17 additions & 11 deletions xarray/core/dataarray.py
Expand Up @@ -727,7 +727,7 @@ def reset_coords(
else:
if self.name is None:
raise ValueError(
"cannot reset_coords with drop=False " "on an unnamed DataArrray"
"cannot reset_coords with drop=False on an unnamed DataArrray"
)
dataset[self.name] = self.variable
return dataset
@@ -1468,9 +1468,7 @@ def expand_dims(
This object, but with an additional dimension(s).
"""
if isinstance(dim, int):
-     raise TypeError(
-         "dim should be hashable or sequence/mapping of " "hashables"
-     )
+     raise TypeError("dim should be hashable or sequence/mapping of hashables")
elif isinstance(dim, Sequence) and not isinstance(dim, str):
if len(dim) != len(set(dim)):
raise ValueError("dims should not contain duplicate values.")
@@ -2295,19 +2293,27 @@ def from_dict(cls, d: dict) -> "DataArray":
return obj

@classmethod
- def from_series(cls, series: pd.Series) -> "DataArray":
+ def from_series(cls, series: pd.Series, sparse: bool = False) -> "DataArray":
"""Convert a pandas.Series into an xarray.DataArray.

If the series's index is a MultiIndex, it will be expanded into a
tensor product of one-dimensional coordinates (filling in missing
values with NaN). Thus this operation should be the inverse of the
`to_series` method.

If sparse=True, creates a sparse array instead of a dense NumPy array.
Requires the pydata/sparse package.

See also
--------
xarray.Dataset.from_dataframe
"""
# TODO: add a 'name' parameter
- name = series.name
- df = pd.DataFrame({name: series})
- ds = Dataset.from_dataframe(df)
- return ds[name]
+ temp_name = "__temporary_name"
+ df = pd.DataFrame({temp_name: series})
+ ds = Dataset.from_dataframe(df, sparse=sparse)
+ result = cast(DataArray, ds[temp_name])
Contributor: I think I can get rid of the cast by changing Dataset.__getitem__. Let's merge this one first though.

Contributor (@crusaderky, Aug 27, 2019): ... or not. Unlike @functools.single_dispatch, @typing.overload does not allow for a more specific signature followed by a more generic one :(

    @overload
    def __getitem__(self, key: Mapping) -> "Dataset":
        ...

    @overload
    def __getitem__(self, key: Hashable) -> "DataArray":
        ...

    @overload
    def __getitem__(self, key: Any) -> "Dataset":
        ...

    def __getitem__(self, key):

mypy complains:


xarray/core/dataset.py:1218: error: Overloaded function signatures 1 and 2 overlap with incompatible return types
xarray/core/dataset.py:1222: error: Overloaded function signatures 2 and 3 overlap with incompatible return types

Member Author: Yeah, I tried something very similar. This is why I wrote the "TODO" note above mentioning python/mypy#7328.

+ result.name = series.name
+ return result
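To illustrate the inverse relationship the docstring describes, a minimal round-trip sketch (toy data with a fully populated MultiIndex, so no NaN filling occurs):

    import pandas as pd
    import xarray as xr

    index = pd.MultiIndex.from_product([["a", "b"], [0, 1]], names=["x", "y"])
    s = pd.Series([1.0, 2.0, 3.0, 4.0], index=index, name="s")

    arr = xr.DataArray.from_series(s)  # dense by default
    # Since every (x, y) combination exists, to_series should recover
    # the original series exactly.
    assert arr.to_series().equals(s)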

def to_cdms2(self) -> "cdms2_Variable":
"""Convert this array into a cdms2.Variable
@@ -2722,7 +2728,7 @@ def dot(
"""
if isinstance(other, Dataset):
raise NotImplementedError(
"dot products are not yet supported " "with Dataset objects."
"dot products are not yet supported with Dataset objects."
)
if not isinstance(other, DataArray):
raise TypeError("dot only operates on DataArrays.")
101 changes: 85 additions & 16 deletions xarray/core/dataset.py
@@ -1205,12 +1205,13 @@ def loc(self) -> _LocIndexer:
"""
return _LocIndexer(self)

- def __getitem__(self, key: object) -> "Union[DataArray, Dataset]":
+ def __getitem__(self, key: Any) -> "Union[DataArray, Dataset]":
"""Access variables or coordinates this dataset as a
:py:class:`~xarray.DataArray`.

Indexing with a list of names will return a new ``Dataset`` object.
"""
# TODO(shoyer): type this properly: https://github.com/python/mypy/issues/7328
if utils.is_dict_like(key):
return self.isel(**cast(Mapping, key))

@@ -4071,8 +4072,61 @@ def to_dataframe(self):
"""
return self._to_dataframe(self.dims)

def _set_sparse_data_from_dataframe(
self, dataframe: pd.DataFrame, dims: tuple, shape: Tuple[int, ...]
) -> None:
from sparse import COO

idx = dataframe.index
if isinstance(idx, pd.MultiIndex):
try:
codes = idx.codes
except AttributeError:
# deprecated since pandas 0.24
codes = idx.labels
coords = np.stack([np.asarray(code) for code in codes], axis=0)
is_sorted = idx.is_lexsorted()  # is_lexsorted is a method, so it must be called
else:
coords = np.arange(idx.size).reshape(1, -1)
is_sorted = True

for name, series in dataframe.items():
# Cast to a NumPy array first, in case the Series is a pandas
# Extension array (which doesn't have a valid NumPy dtype)
values = np.asarray(series)

# In virtually all real use cases, the sparse array will now have
# missing values and needs a fill_value. For consistency, don't
# special case the rare exceptions (e.g., dtype=int without a
# MultiIndex).
dtype, fill_value = dtypes.maybe_promote(values.dtype)
values = np.asarray(values, dtype=dtype)

data = COO(
coords,
values,
shape,
has_duplicates=False,
sorted=is_sorted,
fill_value=fill_value,
)
self[name] = (dims, data)
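A standalone sketch of what this helper does with a MultiIndex: the integer codes of each level serve directly as the COO coordinates (names here are illustrative, not from the PR):

    import numpy as np
    import pandas as pd
    from sparse import COO

    index = pd.MultiIndex.from_tuples([("a", 0), ("b", 1)], names=["x", "y"])
    values = np.array([1.0, 2.0])

    # Row ("a", 0) maps to cell (0, 0); row ("b", 1) maps to cell (1, 1).
    coords = np.stack([np.asarray(code) for code in index.codes], axis=0)
    shape = tuple(level.size for level in index.levels)  # (2, 2)

    data = COO(coords, values, shape=shape, fill_value=np.nan)
    print(data.todense())
    # [[ 1. nan]
    #  [nan  2.]]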

def _set_numpy_data_from_dataframe(
self, dataframe: pd.DataFrame, dims: tuple, shape: Tuple[int, ...]
) -> None:
idx = dataframe.index
if isinstance(idx, pd.MultiIndex):
# expand the DataFrame to include the product of all levels
full_idx = pd.MultiIndex.from_product(idx.levels, names=idx.names)
dataframe = dataframe.reindex(full_idx)

for name, series in dataframe.items():
data = np.asarray(series).reshape(shape)
self[name] = (dims, data)
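For contrast, the dense path reaches the same 2 x 2 result by reindexing to the full product of the levels and reshaping (same toy data as the sparse sketch above):

    import numpy as np
    import pandas as pd

    index = pd.MultiIndex.from_tuples([("a", 0), ("b", 1)], names=["x", "y"])
    series = pd.Series([1.0, 2.0], index=index)

    # Expand to all 4 (x, y) combinations; missing rows become NaN.
    full_idx = pd.MultiIndex.from_product(index.levels, names=index.names)
    expanded = series.reindex(full_idx)
    dense = np.asarray(expanded).reshape(2, 2)  # shape = the level sizes
    print(dense)
    # [[ 1. nan]
    #  [nan  2.]]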

@classmethod
- def from_dataframe(cls, dataframe):
+ def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> "Dataset":
"""Convert a pandas.DataFrame into an xarray.Dataset

Each column will be converted into an independent variable in the
@@ -4081,7 +4135,24 @@ def from_dataframe(cls, dataframe):
values with NaN). This method will produce a Dataset very similar to
that on which the 'to_dataframe' method was called, except with
possibly redundant dimensions (since all dataset variables will have
- the same dimensionality).
+ the same dimensionality)

Parameters
----------
dataframe : pandas.DataFrame
DataFrame from which to copy data and indices.
sparse : bool
If True, create sparse arrays instead of dense numpy arrays. This
can potentially save a large amount of memory if the DataFrame has
a MultiIndex. Requires the sparse package (sparse.pydata.org).

Returns
-------
New Dataset.

See also
--------
xarray.DataArray.from_series
"""
# TODO: Add an option to remove dimensions along which the variables
# are constant, to enable consistent serialization to/from a dataframe,
@@ -4094,25 +4165,23 @@
obj = cls()

if isinstance(idx, pd.MultiIndex):
- # it's a multi-index
- # expand the DataFrame to include the product of all levels
- full_idx = pd.MultiIndex.from_product(idx.levels, names=idx.names)
- dataframe = dataframe.reindex(full_idx)
- dims = [
+ dims = tuple(
      name if name is not None else "level_%i" % n
      for n, name in enumerate(idx.names)
- ]
+ )
for dim, lev in zip(dims, idx.levels):
obj[dim] = (dim, lev)
- shape = [lev.size for lev in idx.levels]
+ shape = tuple(lev.size for lev in idx.levels)
else:
- dims = (idx.name if idx.name is not None else "index",)
- obj[dims[0]] = (dims, idx)
- shape = -1
+ index_name = idx.name if idx.name is not None else "index"
+ dims = (index_name,)
+ obj[index_name] = (dims, idx)
+ shape = (idx.size,)

- for name, series in dataframe.items():
-     data = np.asarray(series).reshape(shape)
-     obj[name] = (dims, data)
+ if sparse:
+     obj._set_sparse_data_from_dataframe(dataframe, dims, shape)
+ else:
+     obj._set_numpy_data_from_dataframe(dataframe, dims, shape)
return obj
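Finally, a hedged usage sketch with synthetic data showing the case where sparse=True pays off, i.e. a MultiIndex whose full product is much larger than the number of rows:

    import numpy as np
    import pandas as pd
    import xarray as xr

    # ~1,000 observations scattered over a large (x, y) grid.
    rng = np.random.default_rng(0)
    df = pd.DataFrame(
        {
            "x": rng.integers(0, 1000, size=1000),
            "y": rng.integers(0, 1000, size=1000),
            "z": rng.normal(size=1000),
        }
    )
    df = df.drop_duplicates(["x", "y"]).set_index(["x", "y"]).sort_index()

    ds = xr.Dataset.from_dataframe(df, sparse=True)
    # Only the observed cells are stored; the dense equivalent would
    # materialize the full outer product of the index levels.
    print(ds["z"].data.nnz, ds["z"].shape)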

def to_dask_dataframe(self, dim_order=None, set_index=False):
1 change: 1 addition & 0 deletions xarray/tests/__init__.py
@@ -84,6 +84,7 @@ def LooseVersion(vstring):
has_iris, requires_iris = _importorskip("iris")
has_cfgrib, requires_cfgrib = _importorskip("cfgrib")
has_numbagg, requires_numbagg = _importorskip("numbagg")
has_sparse, requires_sparse = _importorskip("sparse")

# some special cases
has_h5netcdf07, requires_h5netcdf07 = _importorskip("h5netcdf", minversion="0.7")
14 changes: 14 additions & 0 deletions xarray/tests/test_dataarray.py
Expand Up @@ -30,6 +30,7 @@
requires_np113,
requires_numbagg,
requires_scipy,
requires_sparse,
source_ndarray,
)

@@ -3374,6 +3375,19 @@ def test_to_and_from_series(self):
expected_da = self.dv.rename(None)
assert_identical(expected_da, DataArray.from_series(actual).drop(["x", "y"]))

@requires_sparse
def test_from_series_sparse(self):
import sparse

series = pd.Series([1, 2], index=[("a", 1), ("b", 2)])

actual_sparse = DataArray.from_series(series, sparse=True)
actual_dense = DataArray.from_series(series, sparse=False)

assert isinstance(actual_sparse.data, sparse.COO)
actual_sparse.data = actual_sparse.data.todense()
assert_identical(actual_sparse, actual_dense)

def test_to_and_from_empty_series(self):
# GH697
expected = pd.Series([])
23 changes: 23 additions & 0 deletions xarray/tests/test_dataset.py
Expand Up @@ -45,6 +45,7 @@
requires_dask,
requires_numbagg,
requires_scipy,
requires_sparse,
source_ndarray,
)

@@ -3674,6 +3675,28 @@ def test_to_and_from_dataframe(self):
expected = pd.DataFrame([[]], index=idx)
assert expected.equals(actual), (expected, actual)

@requires_sparse
def test_from_dataframe_sparse(self):
import sparse

df_base = pd.DataFrame(
{"x": range(10), "y": list("abcdefghij"), "z": np.arange(0, 100, 10)}
)

ds_sparse = Dataset.from_dataframe(df_base.set_index("x"), sparse=True)
ds_dense = Dataset.from_dataframe(df_base.set_index("x"), sparse=False)
assert isinstance(ds_sparse["y"].data, sparse.COO)
assert isinstance(ds_sparse["z"].data, sparse.COO)
ds_sparse["y"].data = ds_sparse["y"].data.todense()
ds_sparse["z"].data = ds_sparse["z"].data.todense()
assert_identical(ds_dense, ds_sparse)

ds_sparse = Dataset.from_dataframe(df_base.set_index(["x", "y"]), sparse=True)
ds_dense = Dataset.from_dataframe(df_base.set_index(["x", "y"]), sparse=False)
assert isinstance(ds_sparse["z"].data, sparse.COO)
ds_sparse["z"].data = ds_sparse["z"].data.todense()
assert_identical(ds_dense, ds_sparse)

def test_to_and_from_empty_dataframe(self):
# GH697
expected = pd.DataFrame({"foo": []})
4 changes: 2 additions & 2 deletions xarray/tests/test_duck_array_ops.py
Expand Up @@ -245,9 +245,9 @@ def construct_dataarray(dim_num, dtype, contains_nan, dask):


def from_series_or_scalar(se):
- try:
+ if isinstance(se, pd.Series):
      return DataArray.from_series(se)
- except AttributeError:  # scalar case
+ else:  # scalar case
return DataArray(se)

