From df3c432a5201f0857aea0c94ef3f2963754b89d8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 27 Dec 2021 17:54:36 -0800 Subject: [PATCH 1/3] BUG: Unstack/pivot raising ValueError on large result --- doc/source/whatsnew/v1.4.0.rst | 25 ++++++++++++++++++++++--- pandas/core/reshape/reshape.py | 15 +++++++++++---- pandas/tests/reshape/test_pivot.py | 7 +++++-- 3 files changed, 38 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index c743e38a118f7..f7425aa5d0764 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -364,10 +364,29 @@ second column is instead renamed to ``a.2``. res -.. _whatsnew_140.notable_bug_fixes.notable_bug_fix3: +.. _whatsnew_140.notable_bug_fixes.unstack_pivot_int32_limit: -notable_bug_fix3 -^^^^^^^^^^^^^^^^ +unstack or pivot no longer raises ValueError for result that would exceed int32 limit +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously :meth:`DataFrame.pivot_table` and :meth:`DataFrame.unstack` would raise a ``ValueError`` if the operation +could produce a result with more than ``2**31 - 1`` elements. This operation now raises a :class:`errors.PerformanceWarning` +instead (:issue:`26314`). + +*Previous behavior*: + +.. code-block:: ipython + + In [3]: df = DataFrame({"ind1": np.arange(2 ** 16), "ind2": np.arange(2 ** 16), "count": 0}) + In [4]: df.pivot_table(index="ind1", columns="ind2", values="count", aggfunc="count") + ValueError: Unstacked DataFrame is too big, causing int32 overflow + +*New behavior*: + +.. code-block:: python + + In [4]: df.pivot_table(index="ind1", columns="ind2", values="count", aggfunc="count") + PerformanceWarning: The following operation may generate 4294967296 cells in the resulting pandas object. .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c2cd73584b7da..a570af1f949d7 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -2,6 +2,7 @@ import itertools from typing import TYPE_CHECKING +import warnings import numpy as np @@ -11,6 +12,7 @@ Dtype, npt, ) +from pandas.errors import PerformanceWarning from pandas.util._decorators import cache_readonly from pandas.core.dtypes.cast import maybe_promote @@ -125,10 +127,15 @@ def __init__(self, index: MultiIndex, level=-1, constructor=None): num_columns = self.removed_level.size # GH20601: This forces an overflow if the number of cells is too high. - num_cells = np.multiply(num_rows, num_columns, dtype=np.int32) - - if num_rows > 0 and num_columns > 0 and num_cells <= 0: - raise ValueError("Unstacked DataFrame is too big, causing int32 overflow") + num_cells = num_rows * num_columns + + # GH 26314: Previous ValueError raised was too restrictive for many users. 
+ if num_cells > np.iinfo(np.int32).max: + warnings.warn( + f"The following operation may generate {num_cells} cells " + f"in the resulting pandas object.", + PerformanceWarning, + ) self._make_selectors() diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 88607f4b036a0..0556c6cfe3334 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas.errors import PerformanceWarning + import pandas as pd from pandas import ( Categorical, @@ -1991,12 +1993,13 @@ def test_pivot_string_func_vs_func(self, f, f_numpy): @pytest.mark.slow def test_pivot_number_of_levels_larger_than_int32(self): # GH 20601 + # GH 26314: Change ValueError to PerformanceWarning df = DataFrame( {"ind1": np.arange(2 ** 16), "ind2": np.arange(2 ** 16), "count": 0} ) - msg = "Unstacked DataFrame is too big, causing int32 overflow" - with pytest.raises(ValueError, match=msg): + msg = "The following operation may generate" + with tm.assert_produces_warning(PerformanceWarning, match=msg): df.pivot_table( index="ind1", columns="ind2", values="count", aggfunc="count" ) From 6a81ad47bc56c4e3a6564c3dc350d7f120838edb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 27 Dec 2021 17:56:16 -0800 Subject: [PATCH 2/3] fix whatsnew header --- doc/source/whatsnew/v1.4.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index f7425aa5d0764..df3ec3b9da8b1 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -366,8 +366,8 @@ second column is instead renamed to ``a.2``. .. 
_whatsnew_140.notable_bug_fixes.unstack_pivot_int32_limit: -unstack or pivot no longer raises ValueError for result that would exceed int32 limit -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +unstack and pivot_table no longer raise ValueError for result that would exceed int32 limit +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Previously :meth:`DataFrame.pivot_table` and :meth:`DataFrame.unstack` would raise a ``ValueError`` if the operation could produce a result with more than ``2**31 - 1`` elements. This operation now raises a :class:`errors.PerformanceWarning` From f26198dfa09cf61d4800638942461ca4e3547316 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 27 Dec 2021 19:19:54 -0800 Subject: [PATCH 3/3] Ignore memoryerrors as we're checking the warning --- pandas/tests/frame/test_stack_unstack.py | 12 ++++++++++-- pandas/tests/reshape/test_pivot.py | 10 +++++++--- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 689c54b03b507..4b3ddbc6c193c 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.errors import PerformanceWarning + import pandas as pd from pandas import ( DataFrame, @@ -1819,11 +1821,17 @@ def test_unstack_unobserved_keys(self): @pytest.mark.slow def test_unstack_number_of_levels_larger_than_int32(self): # GH#20601 + # GH 26314: Change ValueError to PerformanceWarning df = DataFrame( np.random.randn(2 ** 16, 2), index=[np.arange(2 ** 16), np.arange(2 ** 16)] ) - with pytest.raises(ValueError, match="int32 overflow"): - df.unstack() + msg = "The following operation may generate" + with tm.assert_produces_warning(PerformanceWarning, match=msg): + try: + df.unstack() + except MemoryError: + # Just checking the warning + return def 
test_stack_order_with_unsorted_levels(self): # GH#16323 diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 0556c6cfe3334..9bfda33956287 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2000,9 +2000,13 @@ def test_pivot_number_of_levels_larger_than_int32(self): msg = "The following operation may generate" with tm.assert_produces_warning(PerformanceWarning, match=msg): - df.pivot_table( - index="ind1", columns="ind2", values="count", aggfunc="count" - ) + try: + df.pivot_table( + index="ind1", columns="ind2", values="count", aggfunc="count" + ) + except MemoryError: + # Just checking the warning + return def test_pivot_table_aggfunc_dropna(self, dropna): # GH 22159