Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/transform.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def one_hot_encode(Column input_column, Column categories):
move(c_result.second),
owner=owner,
column_names=[
x if x is not None else 'null' for x in pylist_categories
x if x is not None else '<NA>' for x in pylist_categories
]
)
return encodings
Expand Down
40 changes: 20 additions & 20 deletions python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -609,7 +609,7 @@ def get_dummies(
cats=None,
sparse=False,
drop_first=False,
dtype="uint8",
dtype="bool",
):
"""Returns a dataframe whose columns are the one hot encodings of all
columns in `df`
Expand Down Expand Up @@ -640,23 +640,23 @@ def get_dummies(
columns. Note this is different from pandas default behavior, which
encodes all columns with dtype object or categorical
dtype : str, optional
Output dtype, default 'uint8'
Output dtype, default 'bool'

Examples
--------
>>> import cudf
>>> df = cudf.DataFrame({"a": ["value1", "value2", None], "b": [0, 0, 0]})
>>> cudf.get_dummies(df)
b a_value1 a_value2
0 0 1 0
1 0 0 1
2 0 0 0
0 0 True False
1 0 False True
2 0 False False

>>> cudf.get_dummies(df, dummy_na=True)
b a_None a_value1 a_value2
0 0 0 1 0
1 0 0 0 1
2 0 1 0 0
b a_<NA> a_value1 a_value2
0 0 False True False
1 0 False False True
2 0 True False False

>>> import numpy as np
>>> df = cudf.DataFrame({"a":cudf.Series([1, 2, np.nan, None],
Expand All @@ -669,11 +669,11 @@ def get_dummies(
3 <NA>

>>> cudf.get_dummies(df, dummy_na=True, columns=["a"])
a_1.0 a_2.0 a_nan a_null
0 1 0 0 0
1 0 1 0 0
2 0 0 1 0
3 0 0 0 1
a_<NA> a_1.0 a_2.0 a_nan
0 False True False False
1 False False True False
2 False False False True
3 True False False False

>>> series = cudf.Series([1, 2, None, 2, 4])
>>> series
Expand All @@ -684,12 +684,12 @@ def get_dummies(
4 4
dtype: int64
>>> cudf.get_dummies(series, dummy_na=True)
null 1 2 4
0 0 1 0 0
1 0 0 1 0
2 1 0 0 0
3 0 0 1 0
4 0 0 0 1
<NA> 1 2 4
0 False True False False
1 False False True False
2 True False False False
3 False False True False
4 False False False True
"""

if cats is None:
Expand Down
47 changes: 19 additions & 28 deletions python/cudf/cudf/tests/test_onehot.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2022, NVIDIA CORPORATION.
# Copyright (c) 2018-2023, NVIDIA CORPORATION.

from string import ascii_lowercase

Expand All @@ -23,19 +23,13 @@
(range(10), [1, 2, 3, 4, 5] * 2),
],
)
def test_get_dummies(data, index):
@pytest.mark.parametrize("dtype", ["bool", "uint8"])
def test_get_dummies(data, index, dtype):
gdf = DataFrame({"x": data}, index=index)
pdf = pd.DataFrame({"x": data}, index=index)

encoded_expected = pd.get_dummies(pdf, prefix="test")
encoded_actual = cudf.get_dummies(gdf, prefix="test")

utils.assert_eq(
encoded_expected,
encoded_actual,
check_dtype=len(data) != 0,
)
encoded_actual = cudf.get_dummies(gdf, prefix="test", dtype=np.uint8)
encoded_expected = pd.get_dummies(pdf, prefix="test", dtype=dtype)
encoded_actual = cudf.get_dummies(gdf, prefix="test", dtype=dtype)

utils.assert_eq(
encoded_expected,
Expand Down Expand Up @@ -63,16 +57,13 @@ def test_onehot_get_dummies_multicol(n_cols):
@pytest.mark.parametrize("nan_as_null", [True, False])
@pytest.mark.parametrize("dummy_na", [True, False])
def test_onehost_get_dummies_dummy_na(nan_as_null, dummy_na):
pdf = pd.DataFrame({"a": [0, 1, np.nan]})
df = DataFrame.from_pandas(pdf, nan_as_null=nan_as_null)
df = cudf.DataFrame({"a": [0, 1, np.nan]}, nan_as_null=nan_as_null)
pdf = df.to_pandas(nullable=nan_as_null)

expected = pd.get_dummies(pdf, dummy_na=dummy_na, columns=["a"])
got = cudf.get_dummies(df, dummy_na=dummy_na, columns=["a"])

if dummy_na and nan_as_null:
got = got.rename(columns={"a_null": "a_nan"})[expected.columns]

utils.assert_eq(expected, got)
utils.assert_eq(expected, got, check_like=True)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -120,12 +111,12 @@ def test_get_dummies_with_nan():
)
expected = cudf.DataFrame(
{
"a_null": [0, 0, 0, 1],
"a_1.0": [1, 0, 0, 0],
"a_2.0": [0, 1, 0, 0],
"a_nan": [0, 0, 1, 0],
"a_<NA>": [False, False, False, True],
"a_1.0": [True, False, False, False],
"a_2.0": [False, True, False, False],
"a_nan": [False, False, True, False],
},
dtype="uint8",
dtype="bool",
)
actual = cudf.get_dummies(df, dummy_na=True, columns=["a"])

Expand Down Expand Up @@ -163,13 +154,13 @@ def test_get_dummies_array_like_with_nan():
ser = cudf.Series([0.1, 2, 3, None, np.nan], nan_as_null=False)
expected = cudf.DataFrame(
{
"a_null": [0, 0, 0, 1, 0],
"a_0.1": [1, 0, 0, 0, 0],
"a_2.0": [0, 1, 0, 0, 0],
"a_3.0": [0, 0, 1, 0, 0],
"a_nan": [0, 0, 0, 0, 1],
"a_<NA>": [False, False, False, True, False],
"a_0.1": [True, False, False, False, False],
"a_2.0": [False, True, False, False, False],
"a_3.0": [False, False, True, False, False],
"a_nan": [False, False, False, False, True],
},
dtype="uint8",
dtype="bool",
)
actual = cudf.get_dummies(ser, dummy_na=True, prefix="a", prefix_sep="_")

Expand Down