From 0327bfde2576f41ac24aac4769fd35a422fab3fc Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 13 Jan 2022 15:45:03 -0800 Subject: [PATCH 1/4] BUG: Series[Interval[int]][1] = np.nan incorrect coercion/raising --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/core/dtypes/cast.py | 13 +++++++++++-- pandas/core/dtypes/dtypes.py | 12 ++++++++++++ pandas/core/indexes/base.py | 13 +++++++++---- pandas/core/internals/blocks.py | 5 ++++- pandas/tests/series/indexing/test_setitem.py | 15 ++++++++++++++- .../tests/series/methods/test_convert_dtypes.py | 10 ++-------- 7 files changed, 53 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index b259f182a1197..182760a043dbe 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -210,6 +210,7 @@ Indexing ^^^^^^^^ - Bug in :meth:`loc.__getitem__` with a list of keys causing an internal inconsistency that could lead to a disconnect between ``frame.at[x, y]`` vs ``frame[y].loc[x]`` (:issue:`22372`) - Bug in :meth:`DataFrame.iloc` where indexing a single row on a :class:`DataFrame` with a single ExtensionDtype column gave a copy instead of a view on the underlying data (:issue:`45241`) +- Bug in setting a NA value (``None`` or ``np.nan``) into a :class:`Series` with int-based :class:`IntervalDtype` incorrectly casting to object dtype instead of a float-based :class:`IntervalDtype` (:issue:`??`) - Bug in :meth:`Series.__setitem__` with a non-integer :class:`Index` when using an integer key to set a value that cannot be set inplace where a ``ValueError`` was raised insead of casting to a common dtype (:issue:`45070`) - Bug when setting a value too large for a :class:`Series` dtype failing to coerce to a common type (:issue:`26049`, :issue:`32878`) - diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 141a609e86809..bf11dd3564948 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -470,8 +470,13 @@ def ensure_dtype_can_hold_na(dtype: DtypeObj) -> DtypeObj: If we have a dtype that cannot hold NA values, find the best match that can. """ if isinstance(dtype, ExtensionDtype): - # TODO: ExtensionDtype.can_hold_na? - return dtype + if dtype._can_hold_na: + return dtype + elif isinstance(dtype, IntervalDtype): + # TODO(GH#45349): don't special-case IntervalDtype, allow + # overriding instead of returning object below. + return IntervalDtype(np.float64, closed=dtype.closed) + return _dtype_obj elif dtype.kind == "b": return _dtype_obj elif dtype.kind in ["i", "u"]: @@ -1470,6 +1475,10 @@ def find_result_type(left: ArrayLike, right: Any) -> DtypeObj: new_dtype = np.result_type(left, right) + elif is_valid_na_for_dtype(right, left.dtype): + # e.g. IntervalDtype[int] and None/np.nan + new_dtype = ensure_dtype_can_hold_na(left.dtype) + else: dtype, _ = infer_dtype_from(right, pandas_dtype=True) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index e74d73b84e94b..9a6d4748380a6 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1117,6 +1117,18 @@ def __new__(cls, subtype=None, closed: str_type | None = None): cls._cache_dtypes[key] = u return u + @cache_readonly + def _can_hold_na(self) -> bool: + subtype = self._subtype + if subtype is None: + # partially-initialized + raise NotImplementedError( + "_can_hold_na is not defined for partially-initialized IntervalDtype" + ) + if subtype.kind in ["i", "u"]: + return False + return True + @property def closed(self): return self._closed diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index bb262a9ba416a..d7350551e0ae6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -67,6 +67,7 @@ from pandas.core.dtypes.cast import ( can_hold_element, common_dtype_categorical_compat, + ensure_dtype_can_hold_na, find_common_type, infer_dtype_from, maybe_cast_pointwise_result, @@ -175,7 +176,6 @@ from pandas import ( CategoricalIndex, DataFrame, - IntervalIndex, MultiIndex, Series, ) @@ -6024,10 +6024,15 @@ def _find_common_type_compat(self, target) -> DtypeObj: Implementation of find_common_type that adjusts for Index-specific special cases. """ - if is_interval_dtype(self.dtype) and is_valid_na_for_dtype(target, self.dtype): + if is_valid_na_for_dtype(target, self.dtype): # e.g. setting NA value into IntervalArray[int64] - self = cast("IntervalIndex", self) - return IntervalDtype(np.float64, closed=self.closed) + dtype = ensure_dtype_can_hold_na(self.dtype) + if is_dtype_equal(self.dtype, dtype): + raise NotImplementedError( + "This should not be reached. Please report a bug at " + "github.com/pandas-dev/pandas" + ) + return dtype target_dtype, _ = infer_dtype_from(target, pandas_dtype=True) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index ef1cd92a60540..4e3d7449b76d6 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1894,7 +1894,10 @@ def _catch_deprecated_value_error(err: Exception) -> None: if isinstance(err, ValueError): # TODO(2.0): once DTA._validate_setitem_value deprecation # is enforced, stop catching ValueError here altogether - if "Timezones don't match" not in str(err): + if "Cannot set float NaN to integer-backed IntervalArray" in str(err): + # TODO: change the exception raised by IntervalArray to TypeError? + pass + elif "Timezones don't match" not in str(err): raise diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 38952ddfca2bb..d67062f4ea918 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -22,6 +22,7 @@ Timestamp, concat, date_range, + interval_range, period_range, timedelta_range, ) @@ -725,6 +726,17 @@ def test_index_putmask(self, obj, key, expected, val): @pytest.mark.parametrize( "obj,expected,key", [ + pytest.param( + # GH#??? setting a valid NA value into IntervalDtype[int] should + # cast to IntervalDtype[float] + Series(interval_range(1, 5)), + Series( + [Interval(1, 2), np.nan, Interval(3, 4), Interval(4, 5)], + dtype="interval[float64]", + ), + 1, + id="interval_int_na_value", + ), pytest.param( # these induce dtype changes Series([2, 3, 4, 5, 6, 7, 8, 9, 10]), @@ -770,7 +782,8 @@ def test_index_putmask(self, obj, key, expected, val): ], ) class TestSetitemCastingEquivalents(SetitemCastingEquivalents): - @pytest.fixture(params=[np.nan, np.float64("NaN")]) + # TODO: including pd.NA here breaks some tests + @pytest.fixture(params=[np.nan, np.float64("NaN"), None]) def val(self, request): """ One python float NaN, one np.float64. Only np.float64 has a `dtype` diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 1e88ddf3cd943..25cbcf2a84490 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas.core.dtypes.common import is_interval_dtype - import pandas as pd import pandas._testing as tm @@ -203,12 +201,8 @@ def test_convert_dtypes( # Test that it is a copy copy = series.copy(deep=True) - if is_interval_dtype(result.dtype) and result.dtype.subtype.kind in ["i", "u"]: - msg = "Cannot set float NaN to integer-backed IntervalArray" - with pytest.raises(ValueError, match=msg): - result[result.notna()] = np.nan - else: - result[result.notna()] = np.nan + + result[result.notna()] = np.nan # Make sure original not changed tm.assert_series_equal(series, copy) From 895d49f9058086bbd57fb6bbc59f9821241a65e2 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 13 Jan 2022 15:47:14 -0800 Subject: [PATCH 2/4] GH refs --- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/tests/series/indexing/test_setitem.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 182760a043dbe..017c43cad0be9 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -210,7 +210,7 @@ Indexing ^^^^^^^^ - Bug in :meth:`loc.__getitem__` with a list of keys causing an internal inconsistency that could lead to a disconnect between ``frame.at[x, y]`` vs ``frame[y].loc[x]`` (:issue:`22372`) - Bug in :meth:`DataFrame.iloc` where indexing a single row on a :class:`DataFrame` with a single ExtensionDtype column gave a copy instead of a view on the underlying data (:issue:`45241`) -- Bug in setting a NA value (``None`` or ``np.nan``) into a :class:`Series` with int-based :class:`IntervalDtype` incorrectly casting to object dtype instead of a float-based :class:`IntervalDtype` (:issue:`??`) +- Bug in setting a NA value (``None`` or ``np.nan``) into a :class:`Series` with int-based :class:`IntervalDtype` incorrectly casting to object dtype instead of a float-based :class:`IntervalDtype` (:issue:`45356`) - Bug in :meth:`Series.__setitem__` with a non-integer :class:`Index` when using an integer key to set a value that cannot be set inplace where a ``ValueError`` was raised insead of casting to a common dtype (:issue:`45070`) - Bug when setting a value too large for a :class:`Series` dtype failing to coerce to a common type (:issue:`26049`, :issue:`32878`) - diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index d67062f4ea918..17a413d1d2af1 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -727,7 +727,7 @@ def test_index_putmask(self, obj, key, expected, val): "obj,expected,key", [ pytest.param( - # GH#??? setting a valid NA value into IntervalDtype[int] should + # GH#45356 setting a valid NA value into IntervalDtype[int] should # cast to IntervalDtype[float] Series(interval_range(1, 5)), Series( From 9d64fb5b194680d3c7ea3e4c2c2687c64d652e42 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 22 Jan 2022 21:13:38 -0800 Subject: [PATCH 3/4] fix validation with pd.NA --- pandas/core/arrays/interval.py | 29 ++++++----------------------- pandas/core/internals/blocks.py | 5 +---- 2 files changed, 7 insertions(+), 27 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 9ceded00bf159..33732bcaca733 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -18,10 +18,7 @@ from pandas._config import get_option -from pandas._libs import ( - NaT, - lib, -) +from pandas._libs import lib from pandas._libs.interval import ( VALID_CLOSED, Interval, @@ -44,8 +41,6 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, - is_datetime64_dtype, - is_datetime64tz_dtype, is_dtype_equal, is_float_dtype, is_integer_dtype, @@ -54,7 +49,6 @@ is_object_dtype, is_scalar, is_string_dtype, - is_timedelta64_dtype, needs_i8_conversion, pandas_dtype, ) @@ -1103,7 +1097,7 @@ def _validate_scalar(self, value): # TODO: check subdtype match like _validate_setitem_value? elif is_valid_na_for_dtype(value, self.left.dtype): # GH#18295 - left = right = value + left = right = self.left._na_value else: raise TypeError( "can only insert Interval objects and NA into an IntervalArray" @@ -1111,22 +1105,15 @@ def _validate_scalar(self, value): return left, right def _validate_setitem_value(self, value): - needs_float_conversion = False if is_valid_na_for_dtype(value, self.left.dtype): # na value: need special casing to set directly on numpy arrays + value = self.left._na_value if is_integer_dtype(self.dtype.subtype): # can't set NaN on a numpy integer array - needs_float_conversion = True - elif is_datetime64_dtype(self.dtype.subtype): - # need proper NaT to set directly on the numpy array - value = np.datetime64("NaT") - elif is_datetime64tz_dtype(self.dtype.subtype): - # need proper NaT to set directly on the DatetimeArray array - value = NaT - elif is_timedelta64_dtype(self.dtype.subtype): - # need proper NaT to set directly on the numpy array - value = np.timedelta64("NaT") + # GH#45484 TypeError, not ValueError, matches what we get with + # non-NA un-holdable value. + raise TypeError("Cannot set float NaN to integer-backed IntervalArray") value_left, value_right = value, value elif isinstance(value, Interval): @@ -1139,10 +1126,6 @@ def _validate_setitem_value(self, value): else: return self._validate_listlike(value) - if needs_float_conversion: - # GH#45484 TypeError, not ValueError, matches what we get with - # non-NA un-holdable value. - raise TypeError("Cannot set float NaN to integer-backed IntervalArray") return value_left, value_right def value_counts(self, dropna: bool = True): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 563fc0eb7025c..3d4f53530b89c 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1887,10 +1887,7 @@ def _catch_deprecated_value_error(err: Exception) -> None: if isinstance(err, ValueError): # TODO(2.0): once DTA._validate_setitem_value deprecation # is enforced, stop catching ValueError here altogether - if "Cannot set float NaN to integer-backed IntervalArray" in str(err): - # TODO: change the exception raised by IntervalArray to TypeError? - pass - elif "Timezones don't match" not in str(err): + if "Timezones don't match" not in str(err): raise From c294f996ceae626075deafdaa58a9c679c520811 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 22 Jan 2022 21:15:44 -0800 Subject: [PATCH 4/4] update GH ref --- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/tests/series/indexing/test_setitem.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 1115f29bfb7c5..9a30fb80db6f9 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -216,7 +216,7 @@ Indexing ^^^^^^^^ - Bug in :meth:`loc.__getitem__` with a list of keys causing an internal inconsistency that could lead to a disconnect between ``frame.at[x, y]`` vs ``frame[y].loc[x]`` (:issue:`22372`) - Bug in :meth:`DataFrame.iloc` where indexing a single row on a :class:`DataFrame` with a single ExtensionDtype column gave a copy instead of a view on the underlying data (:issue:`45241`) -- Bug in setting a NA value (``None`` or ``np.nan``) into a :class:`Series` with int-based :class:`IntervalDtype` incorrectly casting to object dtype instead of a float-based :class:`IntervalDtype` (:issue:`45356`) +- Bug in setting a NA value (``None`` or ``np.nan``) into a :class:`Series` with int-based :class:`IntervalDtype` incorrectly casting to object dtype instead of a float-based :class:`IntervalDtype` (:issue:`45568`) - Bug in :meth:`Series.__setitem__` with a non-integer :class:`Index` when using an integer key to set a value that cannot be set inplace where a ``ValueError`` was raised insead of casting to a common dtype (:issue:`45070`) - Bug when setting a value too large for a :class:`Series` dtype failing to coerce to a common type (:issue:`26049`, :issue:`32878`) - Bug in :meth:`Series.__setitem__` where setting :attr:`NA` into a numeric-dtpye :class:`Series` would incorrectly upcast to object-dtype rather than treating the value as ``np.nan`` (:issue:`44199`) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 7741a9dd506ef..1567da107238d 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -728,7 +728,7 @@ def test_index_putmask(self, obj, key, expected, val): "obj,expected,key", [ pytest.param( - # GH#45356 setting a valid NA value into IntervalDtype[int] should + # GH#45568 setting a valid NA value into IntervalDtype[int] should # cast to IntervalDtype[float] Series(interval_range(1, 5)), Series(