From 06f88409783a40ad9dc2fbb337b1d0ac005b2611 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 15 Feb 2018 12:02:38 -0800 Subject: [PATCH 01/20] Added GroupBy tests --- pandas/tests/groupby/test_groupby.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 2429e9975fc8e..7b3ab6a7e4a33 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -9,6 +9,7 @@ from pandas import (date_range, bdate_range, Timestamp, Index, MultiIndex, DataFrame, Series, concat, Panel, DatetimeIndex, read_csv) +from pandas.core.dtypes.missing import isna from pandas.errors import UnsupportedFunctionCall, PerformanceWarning from pandas.util.testing import (assert_frame_equal, assert_index_equal, assert_series_equal, assert_almost_equal) @@ -2116,6 +2117,27 @@ def interweave(list_obj): exp = DataFrame({'key': keys, 'val': _exp_vals}) assert_frame_equal(result, exp) + @pytest.mark.parametrize("skipna", [True, False]) + @pytest.mark.parametrize("vals,exp", [ + (['foo', 'bar', 'baz'], True), (['foo', '', ''], True), + (['', '', ''], False), ([1, 2, 3], True), ([1, 0, 0], True), + ([0, 0, 0], False), ([1., 2., 3.], True), ([1., 0., 0.], True), + ([0., 0., 0.], False), ([True, True, True], True), + ([True, False, False], True), ([False, False, False], False), + ([np.nan, np.nan, np.nan], False) + ]) + def test_groupby_any(self, skipna, vals, exp): + df = DataFrame({'key': ['a'] * 3 + ['b'] * 3, 'val': vals * 2}) + + # edge case for missing data with skipna=False + if not(skipna) and all(isna(vals)): + exp = True + + exp_df = DataFrame([exp] * 2, columns=['val'], index=pd.Index( + ['a', 'b'], name='key')) + result = df.groupby('key').any(skipna=skipna) + assert_frame_equal(result, exp_df) + def test_dont_clobber_name_column(self): df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'], 'name': ['foo', 'bar', 'baz'] * 2}) From c5809cb9c27c21b6ff7d0381fee58464a349ba91 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 15 Feb 2018 12:32:14 -0800 Subject: [PATCH 02/20] Simple stub of group_any --- pandas/_libs/groupby_helper.pxi.in | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index de802f4a72277..0e216b6ce990c 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -791,3 +791,27 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, out[i, j] = mval {{endfor}} + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_any(ndarray[uint8_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[:, :] values, + ndarray[int64_t] labels, + bint skipna): + cdef: + Py_ssize_t i, N + ndarray[uint8_t] mask + + N, _ = ( labels).shape + + out = np.zeros_like(out) + mask = values[:, 0].astype(np.bool) + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + if mask[lab]: + out[lab, 0] = 1 From 0da6ece4ca7c4e31df2c7bab83503b8051766ef7 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 15 Feb 2018 16:37:48 -0800 Subject: [PATCH 03/20] Initial wiring and working func, save all NaNs --- pandas/_libs/groupby_helper.pxi.in | 27 +++++++++++---------------- pandas/core/groupby.py | 20 ++++++++++++++++++++ 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 0e216b6ce990c..b777e2797a3a2 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -794,24 +794,19 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_any(ndarray[uint8_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[:, :] values, +def group_any(ndarray[int64_t] out, + ndarray[int64_t] mask, ndarray[int64_t] labels, bint skipna): cdef: - Py_ssize_t i, N - ndarray[uint8_t] mask - - N, _ = ( labels).shape - - out = np.zeros_like(out) - mask = values[:, 0].astype(np.bool) + Py_ssize_t i, N=len(labels) + int64_t lab - for i in range(N): - lab = labels[i] - if lab < 0: - continue + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - if mask[lab]: - out[lab, 0] = 1 + if mask[i]: + out[lab] = 1 diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 00643614e8803..9be68ed3058e2 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1219,6 +1219,26 @@ class GroupBy(_GroupBy): """ _apply_whitelist = _common_apply_whitelist + @Substitution(name='groupby') + @Appender(_doc_template) + def any(self, skipna=True): + """Returns True if any value in the group is truthful, else False""" + obj = self._obj_with_exclusions + + labels, _, _ = self.grouper.group_info + + output = collections.OrderedDict() + for name, obj in self._iterate_slices(): + result = np.zeros(self.ngroups, dtype=np.int64) + if obj.dtype is np.object: + mask = np.array(bool(x) for x in obj.values) + else: + mask = obj.values.astype(np.bool) + libgroupby.group_any(result, mask.astype(np.int64), labels, skipna) + output[name] = result.astype(np.bool) + + return self._wrap_aggregated_output(output) + @Substitution(name='groupby') @Appender(_doc_template) def count(self): From a225c4db84ce3a15b7be24f03238da4feb135243 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 15 Feb 2018 17:05:44 -0800 Subject: [PATCH 04/20] Cleaned up call signature --- pandas/_libs/groupby_helper.pxi.in | 8 +++++++- pandas/core/groupby.py | 10 ++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index b777e2797a3a2..cf8f07c360af0 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -795,12 +795,18 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, @cython.boundscheck(False) @cython.wraparound(False) def group_any(ndarray[int64_t] out, - ndarray[int64_t] mask, + ndarray values, ndarray[int64_t] labels, bint skipna): cdef: Py_ssize_t i, N=len(labels) int64_t lab + ndarray[int64_t] mask + + if values.dtype is np.object: + mask = np.array(bool(x) for x in values).astype(np.int64) + else: + mask = values.astype(np.bool).astype(np.int64) with nogil: for i in range(N): diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 9be68ed3058e2..13a35748e1948 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1223,18 +1223,12 @@ class GroupBy(_GroupBy): @Appender(_doc_template) def any(self, skipna=True): """Returns True if any value in the group is truthful, else False""" - obj = self._obj_with_exclusions - labels, _, _ = self.grouper.group_info - output = collections.OrderedDict() + for name, obj in self._iterate_slices(): result = np.zeros(self.ngroups, dtype=np.int64) - if obj.dtype is np.object: - mask = np.array(bool(x) for x in obj.values) - else: - mask = obj.values.astype(np.bool) - libgroupby.group_any(result, mask.astype(np.int64), labels, skipna) + libgroupby.group_any(result, obj.values, labels, skipna) output[name] = result.astype(np.bool) return self._wrap_aggregated_output(output) From c873e14d015f5bbee91d24812935143eb6730a7b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 15 Feb 2018 17:40:40 -0800 Subject: [PATCH 05/20] Implemented skipna logic --- pandas/_libs/groupby.pyx | 2 ++ pandas/_libs/groupby_helper.pxi.in | 13 ++++++++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index e3d208a915225..ef1fa84094119 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -20,6 +20,8 @@ from algos cimport (swap, TiebreakEnumType, TIEBREAK_AVERAGE, TIEBREAK_MIN, TIEBREAK_MAX, TIEBREAK_FIRST, TIEBREAK_DENSE) from algos import take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers +import missing + cdef int64_t iNaT = get_nat() cdef double NaN = np.NaN diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index cf8f07c360af0..9169ea9affe83 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -801,12 +801,15 @@ def group_any(ndarray[int64_t] out, cdef: Py_ssize_t i, N=len(labels) int64_t lab - ndarray[int64_t] mask + ndarray[int64_t] bool_mask + ndarray[uint8_t] isna_mask - if values.dtype is np.object: - mask = np.array(bool(x) for x in values).astype(np.int64) + if values.dtype == 'object': + bool_mask = np.array([bool(x) for x in values]).astype(np.int64) + isna_mask = missing.isnaobj(values).astype(np.uint8) else: - mask = values.astype(np.bool).astype(np.int64) + bool_mask = values.astype(np.bool).astype(np.int64) + isna_mask = np.isnan(values).astype(np.uint8) with nogil: for i in range(N): @@ -814,5 +817,5 @@ def group_any(ndarray[int64_t] out, if lab < 0: continue - if mask[i]: + if bool_mask[i] and not (skipna and isna_mask[i]): out[lab] = 1 From 8aaaf326165e0b807c600f48eed22037e214d159 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 15 Feb 2018 18:03:25 -0800 Subject: [PATCH 06/20] Updated whatsnew --- doc/source/whatsnew/v0.23.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 6865428c352c1..cf84db9b2311a 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -700,6 +700,7 @@ Performance Improvements - Improved performance of :func:`DataFrameGroupBy.rank` (:issue:`15779`) - Improved performance of variable ``.rolling()`` on ``.min()`` and ``.max()`` (:issue:`19521`) - Improved performance of ``GroupBy.ffill`` and ``GroupBy.bfill`` (:issue:`11296`) +- Improved performance of :func:`GroupBy.any` (:issue:`15435`) .. _whatsnew_0230.docs: From fd5faf950931db60ccb14b348e2a3bedf020a488 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 15 Feb 2018 22:12:11 -0800 Subject: [PATCH 07/20] Updated docstrings --- pandas/_libs/groupby_helper.pxi.in | 16 ++++++++++++++++ pandas/core/groupby.py | 8 +++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 9169ea9affe83..22f3959ccf8f0 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -798,6 +798,22 @@ def group_any(ndarray[int64_t] out, ndarray values, ndarray[int64_t] labels, bint skipna): + """Aggregated boolean values to show if any group element is truthful + + Parameters + ---------- + out : array of int64_t values which this method will write its results to + values : array of values to be truth-tested + labels : array containing unique label for each group, with its ordering + matching up to the corresponding record in `values` + skipna : boolean + Flag to ignore nan values during truth testing + + Notes + ----- + This method modifies the `out` parameter rather than returning an object. + The returned values will either be 0 or 1 (False or True, respectively). + """ cdef: Py_ssize_t i, N=len(labels) int64_t lab diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 13a35748e1948..8f20418f69dfd 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1222,7 +1222,13 @@ class GroupBy(_GroupBy): @Substitution(name='groupby') @Appender(_doc_template) def any(self, skipna=True): - """Returns True if any value in the group is truthful, else False""" + """Returns True if any value in the group is truthful, else False + + Parameters + ---------- + skipna : bool, default True + Flag to ignore nan values during truth testing + """ labels, _, _ = self.grouper.group_info output = collections.OrderedDict() From 123763756c16857f11030d3cadb815b59a27a341 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 25 Feb 2018 18:50:41 -0800 Subject: [PATCH 08/20] Moved group_any from template to .pyx --- pandas/_libs/groupby.pyx | 45 ++++++++++++++++++++++++++++++ pandas/_libs/groupby_helper.pxi.in | 44 ----------------------------- 2 files changed, 45 insertions(+), 44 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index ef1fa84094119..3da8b541d8b4e 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -312,5 +312,50 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, filled_vals = 0 +@cython.boundscheck(False) +@cython.wraparound(False) +def group_any(ndarray[int64_t] out, + ndarray values, + ndarray[int64_t] labels, + bint skipna): + """Aggregated boolean values to show if any group element is truthful + + Parameters + ---------- + out : array of int64_t values which this method will write its results to + values : array of values to be truth-tested + labels : array containing unique label for each group, with its ordering + matching up to the corresponding record in `values` + skipna : boolean + Flag to ignore nan values during truth testing + + Notes + ----- + This method modifies the `out` parameter rather than returning an object. + The returned values will either be 0 or 1 (False or True, respectively). + """ + cdef: + Py_ssize_t i, N=len(labels) + int64_t lab + ndarray[int64_t] bool_mask + ndarray[uint8_t] isna_mask + + if values.dtype == 'object': + bool_mask = np.array([bool(x) for x in values]).astype(np.int64) + isna_mask = missing.isnaobj(values).astype(np.uint8) + else: + bool_mask = values.astype(np.bool).astype(np.int64) + isna_mask = np.isnan(values).astype(np.uint8) + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + if bool_mask[i] and not (skipna and isna_mask[i]): + out[lab] = 1 + + # generated from template include "groupby_helper.pxi" diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 22f3959ccf8f0..de802f4a72277 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -791,47 +791,3 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, out[i, j] = mval {{endfor}} - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_any(ndarray[int64_t] out, - ndarray values, - ndarray[int64_t] labels, - bint skipna): - """Aggregated boolean values to show if any group element is truthful - - Parameters - ---------- - out : array of int64_t values which this method will write its results to - values : array of values to be truth-tested - labels : array containing unique label for each group, with its ordering - matching up to the corresponding record in `values` - skipna : boolean - Flag to ignore nan values during truth testing - - Notes - ----- - This method modifies the `out` parameter rather than returning an object. - The returned values will either be 0 or 1 (False or True, respectively). - """ - cdef: - Py_ssize_t i, N=len(labels) - int64_t lab - ndarray[int64_t] bool_mask - ndarray[uint8_t] isna_mask - - if values.dtype == 'object': - bool_mask = np.array([bool(x) for x in values]).astype(np.int64) - isna_mask = missing.isnaobj(values).astype(np.uint8) - else: - bool_mask = values.astype(np.bool).astype(np.int64) - isna_mask = np.isnan(values).astype(np.uint8) - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - if bool_mask[i] and not (skipna and isna_mask[i]): - out[lab] = 1 From 2ad0b50fdc7a9fd51b04e1724b09b8c9909ece10 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 25 Feb 2018 18:51:29 -0800 Subject: [PATCH 09/20] Added tests for all implementation --- pandas/tests/groupby/test_groupby.py | 32 +++++++++++++++++----------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 7b3ab6a7e4a33..1dfb2387e1b44 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2117,25 +2117,33 @@ def interweave(list_obj): exp = DataFrame({'key': keys, 'val': _exp_vals}) assert_frame_equal(result, exp) + @pytest.mark.parametrize("agg_func", ['any', 'all']) @pytest.mark.parametrize("skipna", [True, False]) - @pytest.mark.parametrize("vals,exp", [ - (['foo', 'bar', 'baz'], True), (['foo', '', ''], True), - (['', '', ''], False), ([1, 2, 3], True), ([1, 0, 0], True), - ([0, 0, 0], False), ([1., 2., 3.], True), ([1., 0., 0.], True), - ([0., 0., 0.], False), ([True, True, True], True), - ([True, False, False], True), ([False, False, False], False), - ([np.nan, np.nan, np.nan], False) + @pytest.mark.parametrize("vals", [ + ['foo', 'bar', 'baz'], ['foo', '', ''], ['', '', ''], + [1, 2, 3], [1, 0, 0], [0, 0, 0], + [1., 2., 3.], [1., 0., 0.], [0., 0., 0.], + [True, True, True], [True, False, False], [False, False, False], + [np.nan, np.nan, np.nan] ]) - def test_groupby_any(self, skipna, vals, exp): + def test_groupby_bool_aggs(self, agg_func, skipna, vals): df = DataFrame({'key': ['a'] * 3 + ['b'] * 3, 'val': vals * 2}) - # edge case for missing data with skipna=False - if not(skipna) and all(isna(vals)): - exp = True + if compat.PY3: + import builtins as bltins + else: + import __builtins__ as bltins + + # Figure out expectation using Python builtin + exp = getattr(bltins, agg_func)(vals) + + # edge case for missing data with skipna and 'any' + if skipna and all(isna(vals)) and agg_func=='any': + exp = False exp_df = DataFrame([exp] * 2, columns=['val'], index=pd.Index( ['a', 'b'], name='key')) - result = df.groupby('key').any(skipna=skipna) + result = getattr(df.groupby('key'), agg_func)(skipna=skipna) assert_frame_equal(result, exp_df) def test_dont_clobber_name_column(self): From b32b906ad1b681b455ce0d27c038e2cdc86f41f7 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 25 Feb 2018 18:57:23 -0800 Subject: [PATCH 10/20] Cythonized GroupBy all implementation --- pandas/_libs/groupby.pyx | 54 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 3da8b541d8b4e..6c4c140a4250b 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -350,12 +350,62 @@ def group_any(ndarray[int64_t] out, with nogil: for i in range(N): lab = labels[i] - if lab < 0: + if lab < 0 or (skipna and isna_mask[i]): continue - if bool_mask[i] and not (skipna and isna_mask[i]): + if bool_mask[i]: out[lab] = 1 +@cython.boundscheck(False) +@cython.wraparound(False) +def group_all(ndarray[int64_t] out, + ndarray values, + ndarray[int64_t] labels, + bint skipna): + """Aggregated boolean values to show if all group elements are truthful + + Parameters + ---------- + out : array of int64_t values which this method will write its results to + values : array of values to be truth-tested + labels : array containing unique label for each group, with its ordering + matching up to the corresponding record in `values` + skipna : boolean + Flag to ignore nan values during truth testing + + Notes + ----- + This method modifies the `out` parameter rather than returning an object. + The returned values will either be 0 or 1 (False or True, respectively). + """ + cdef: + Py_ssize_t i, N=len(labels) + int64_t lab + ndarray[int64_t] bool_mask + ndarray[uint8_t] isna_mask + + if values.dtype == 'object': + bool_mask = np.array([bool(x) for x in values]).astype(np.int64) + isna_mask = missing.isnaobj(values).astype(np.uint8) + else: + bool_mask = values.astype(np.bool).astype(np.int64) + isna_mask = np.isnan(values).astype(np.uint8) + + # Because the 'all' value of an empty iterable in Python is True we can + # start with an array full of ones and set to zero when a False value is + # encountered + out.fill(1) + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0 or (skipna and isna_mask[i]): + continue + + if not bool_mask[i]: + out[lab] = 0 + + # generated from template include "groupby_helper.pxi" From ae9126fa03b61ab526433036a20048fa1294c38c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 26 Feb 2018 10:53:41 -0800 Subject: [PATCH 11/20] Wired any/all into _get_cythonized_result --- pandas/_libs/groupby.pyx | 33 +++++------ pandas/core/groupby.py | 121 +++++++++++++++++++++++++++++++++------ 2 files changed, 119 insertions(+), 35 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 6c4c140a4250b..a153b3ab443e4 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -314,18 +314,20 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, @cython.boundscheck(False) @cython.wraparound(False) -def group_any(ndarray[int64_t] out, - ndarray values, +def group_any(ndarray[uint8_t] out, ndarray[int64_t] labels, + ndarray[uint8_t] values, + ndarray[uint8_t] mask, bint skipna): """Aggregated boolean values to show if any group element is truthful Parameters ---------- - out : array of int64_t values which this method will write its results to - values : array of values to be truth-tested + out : array of values which this method will write its results to labels : array containing unique label for each group, with its ordering matching up to the corresponding record in `values` + values : array containing the truth value of each element + mask : array indicating whether a value is na or not skipna : boolean Flag to ignore nan values during truth testing @@ -337,40 +339,33 @@ def group_any(ndarray[int64_t] out, cdef: Py_ssize_t i, N=len(labels) int64_t lab - ndarray[int64_t] bool_mask - ndarray[uint8_t] isna_mask - - if values.dtype == 'object': - bool_mask = np.array([bool(x) for x in values]).astype(np.int64) - isna_mask = missing.isnaobj(values).astype(np.uint8) - else: - bool_mask = values.astype(np.bool).astype(np.int64) - isna_mask = np.isnan(values).astype(np.uint8) with nogil: for i in range(N): lab = labels[i] - if lab < 0 or (skipna and isna_mask[i]): + if lab < 0 or (skipna and mask[i]): continue - if bool_mask[i]: + if values[i]: out[lab] = 1 @cython.boundscheck(False) @cython.wraparound(False) -def group_all(ndarray[int64_t] out, - ndarray values, +def group_all(ndarray[uint8_t] out, ndarray[int64_t] labels, + ndarray[uint8_t] values, + ndarray[uint8_t] mask, bint skipna): """Aggregated boolean values to show if all group elements are truthful Parameters ---------- - out : array of int64_t values which this method will write its results to - values : array of values to be truth-tested + out : array of values which this method will write its results to labels : array containing unique label for each group, with its ordering matching up to the corresponding record in `values` + values : array containing the truth value of each element + mask : array indicating whether a value is na or not skipna : boolean Flag to ignore nan values during truth testing diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 8f20418f69dfd..1c643bf5249da 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1219,6 +1219,29 @@ class GroupBy(_GroupBy): """ _apply_whitelist = _common_apply_whitelist + def _bool_agg(self, how, skipna): + """Shared func to call any / all Cython GroupBy implementations""" + + def objs_to_bool(vals): + try: + vals = vals.astype(np.bool) + except ValueError: # for objects + vals = np.array([bool(x) for x in vals]) + + return vals.view(np.uint8) + + def result_to_bool(result): + return result.astype(np.bool, copy=False) + + return self._get_cythonized_result(how, self.grouper, + aggregate=True, + cython_dtype=np.uint8, + needs_values=True, + needs_mask=True, + pre_processing=objs_to_bool, + post_processing=result_to_bool, + skipna=skipna) + @Substitution(name='groupby') @Appender(_doc_template) def any(self, skipna=True): @@ -1229,15 +1252,19 @@ def any(self, skipna=True): skipna : bool, default True Flag to ignore nan values during truth testing """ - labels, _, _ = self.grouper.group_info - output = collections.OrderedDict() + return self._bool_agg('group_any', skipna) - for name, obj in self._iterate_slices(): - result = np.zeros(self.ngroups, dtype=np.int64) - libgroupby.group_any(result, obj.values, labels, skipna) - output[name] = result.astype(np.bool) + @Substitution(name='groupby') + @Appender(_doc_template) + def all(self, skipna=True): + """Returns True if all values in the group are truthful, else False - return self._wrap_aggregated_output(output) + Parameters + ---------- + skipna : bool, default True + Flag to ignore nan values during truth testing + """ + return self._bool_agg('group_all', skipna) @Substitution(name='groupby') @Appender(_doc_template) @@ -1505,6 +1532,8 @@ def _fill(self, direction, limit=None): return self._get_cythonized_result('group_fillna_indexer', self.grouper, needs_mask=True, + cython_dtype=np.int64, + result_is_index=True, direction=direction, limit=limit) @Substitution(name='groupby') @@ -1893,18 +1922,40 @@ def cummax(self, axis=0, **kwargs): return self._cython_transform('cummax', numeric_only=False) - def _get_cythonized_result(self, how, grouper, needs_mask=False, - needs_ngroups=False, **kwargs): + def _get_cythonized_result(self, how, grouper, aggregate=False, + cython_dtype=None, needs_values=False, + needs_mask=False, needs_ngroups=False, + result_is_index=False, + pre_processing=None, post_processing=None, + **kwargs): """Get result for Cythonized functions Parameters ---------- how : str, Cythonized function name to be called grouper : Grouper object containing pertinent group info + aggregate : bool, default False + Whether the result should be aggregated to match the number of + groups + cython_dtype : default None + Type of the array that will be modified by the Cython call. If + `None`, the type will be inferred from the values of each slice + needs_values : bool, default False + Whether the values should be a part of the Cython call + signature needs_mask : bool, default False - Whether boolean mask needs to be part of the Cython call signature + Whether boolean mask needs to be part of the Cython call + signature needs_ngroups : bool, default False - Whether number of groups part of the Cython call signature + Whether number of groups is part of the Cython call signature + result_is_index : bool, default False + Whether the result of the Cython operation is an index of + values to be retrieved, instead of the actual values themselves + pre_processing : function, default None + Function to be applied to `values` prior to passing to Cython + Raises if `needs_values` is False + post_processing : function, default None + Function to be applied to result of Cython function **kwargs : dict Extra arguments to be passed back to Cython funcs @@ -1912,14 +1963,40 @@ def _get_cythonized_result(self, how, grouper, needs_mask=False, ------- `Series` or `DataFrame` with filled values """ + if result_is_index and aggregate: + raise ValueError("'result_is_index' and 'aggregate' cannot both " + "be True!") + if post_processing: + if not callable(pre_processing): + raise ValueError("'post_processing' must be a callable!") + if pre_processing: + if not callable(pre_processing): + raise ValueError("'pre_processing' must be a callable!") + if not needs_values: + raise ValueError("Cannot use 'pre_processing' without " + "specifying 'needs_values'!") labels, _, ngroups = grouper.group_info output = collections.OrderedDict() base_func = getattr(libgroupby, how) for name, obj in self._iterate_slices(): - indexer = np.zeros_like(labels, dtype=np.int64) - func = partial(base_func, indexer, labels) + if aggregate: + result_sz = ngroups + else: + result_sz = len(obj.values) + + if not cython_dtype: + cython_dtype = obj.values.dtype + + result = np.zeros(result_sz, dtype=cython_dtype) + func = partial(base_func, result, labels) + if needs_values: + vals = obj.values + if pre_processing: + vals = pre_processing(vals) + func = partial(func, vals) + if needs_mask: mask = isnull(obj.values).view(np.uint8) func = partial(func, mask) @@ -1928,9 +2005,19 @@ def _get_cythonized_result(self, how, grouper, needs_mask=False, func = partial(func, ngroups) func(**kwargs) # Call func to modify indexer values in place - output[name] = algorithms.take_nd(obj.values, indexer) - return self._wrap_transformed_output(output) + if result_is_index: + result = algorithms.take_nd(obj.values, result) + + if post_processing: + result = post_processing(result) + + output[name] = result + + if aggregate: + return self._wrap_aggregated_output(output) + else: + return self._wrap_transformed_output(output) @Substitution(name='groupby') @Appender(_doc_template) @@ -1950,7 +2037,9 @@ def shift(self, periods=1, freq=None, axis=0): return self.apply(lambda x: x.shift(periods, freq, axis)) return self._get_cythonized_result('group_shift_indexer', - self.grouper, needs_ngroups=True, + self.grouper, cython_dtype=np.int64, + needs_ngroups=True, + result_is_index=True, periods=periods) @Substitution(name='groupby') From e1cdb823dacd651009810f83933f56fb1e0e46ca Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 26 Feb 2018 10:56:07 -0800 Subject: [PATCH 12/20] LINT cleanup --- pandas/tests/groupby/test_groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 1dfb2387e1b44..76179ca9fa6f9 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2138,7 +2138,7 @@ def test_groupby_bool_aggs(self, agg_func, skipna, vals): exp = getattr(bltins, agg_func)(vals) # edge case for missing data with skipna and 'any' - if skipna and all(isna(vals)) and agg_func=='any': + if skipna and all(isna(vals)) and agg_func == 'any': exp = False exp_df = DataFrame([exp] * 2, columns=['val'], index=pd.Index( From aa338daa48871859d80269b0a9c32b3536c1c096 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 26 Feb 2018 11:01:20 -0800 Subject: [PATCH 13/20] Cleaned up group_all code --- pandas/_libs/groupby.pyx | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index a153b3ab443e4..019af1af0071d 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -20,8 +20,6 @@ from algos cimport (swap, TiebreakEnumType, TIEBREAK_AVERAGE, TIEBREAK_MIN, TIEBREAK_MAX, TIEBREAK_FIRST, TIEBREAK_DENSE) from algos import take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers -import missing - cdef int64_t iNaT = get_nat() cdef double NaN = np.NaN @@ -380,13 +378,6 @@ def group_all(ndarray[uint8_t] out, ndarray[int64_t] bool_mask ndarray[uint8_t] isna_mask - if values.dtype == 'object': - bool_mask = np.array([bool(x) for x in values]).astype(np.int64) - isna_mask = missing.isnaobj(values).astype(np.uint8) - else: - bool_mask = values.astype(np.bool).astype(np.int64) - isna_mask = np.isnan(values).astype(np.uint8) - # Because the 'all' value of an empty iterable in Python is True we can # start with an array full of ones and set to zero when a False value is # encountered @@ -395,10 +386,10 @@ def group_all(ndarray[uint8_t] out, with nogil: for i in range(N): lab = labels[i] - if lab < 0 or (skipna and isna_mask[i]): + if lab < 0 or (skipna and mask[i]): continue - if not bool_mask[i]: + if not values[i]: out[lab] = 0 From 1dc67d95896f49677e24e16fce1ff5a3d3dea2cb Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 26 Feb 2018 11:03:54 -0800 Subject: [PATCH 14/20] Added GroupBy.all perf improvement to whatsnew --- doc/source/whatsnew/v0.23.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index cf84db9b2311a..34f15af26f875 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -700,7 +700,7 @@ Performance Improvements - Improved performance of :func:`DataFrameGroupBy.rank` (:issue:`15779`) - Improved performance of variable ``.rolling()`` on ``.min()`` and ``.max()`` (:issue:`19521`) - Improved performance of ``GroupBy.ffill`` and ``GroupBy.bfill`` (:issue:`11296`) -- Improved performance of :func:`GroupBy.any` (:issue:`15435`) +- Improved performance of ``GroupBy.any`` and ``GroupBy.all`` (:issue:`15435`) .. _whatsnew_0230.docs: From 977275b2a3bab8f3043813296f0bd4d1e0e6ba1b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 26 Feb 2018 11:45:11 -0800 Subject: [PATCH 15/20] Changed test to use compat.builtins --- pandas/tests/groupby/test_groupby.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 76179ca9fa6f9..0561b3a1d8592 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2129,13 +2129,8 @@ def interweave(list_obj): def test_groupby_bool_aggs(self, agg_func, skipna, vals): df = DataFrame({'key': ['a'] * 3 + ['b'] * 3, 'val': vals * 2}) - if compat.PY3: - import builtins as bltins - else: - import __builtins__ as bltins - # Figure out expectation using Python builtin - exp = getattr(bltins, agg_func)(vals) + exp = getattr(compat.builtins, agg_func)(vals) # edge case for missing data with skipna and 'any' if skipna and all(isna(vals)) and agg_func == 'any': From 055d8bf4bc4830065b7a226a542907b3665c305d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 27 Feb 2018 09:25:11 -0800 Subject: [PATCH 16/20] Consolidated group_any_all Cython func --- pandas/_libs/groupby.pyx | 83 ++++++++++++++-------------------------- pandas/core/groupby.py | 10 ++--- 2 files changed, 33 insertions(+), 60 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 019af1af0071d..3ee2e45869e86 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -312,58 +312,23 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, @cython.boundscheck(False) @cython.wraparound(False) -def group_any(ndarray[uint8_t] out, - ndarray[int64_t] labels, - ndarray[uint8_t] values, - ndarray[uint8_t] mask, - bint skipna): - """Aggregated boolean values to show if any group element is truthful +def group_any_all(ndarray[uint8_t] out, + ndarray[int64_t] labels, + ndarray[uint8_t] values, + ndarray[uint8_t] mask, + object val_test, + bint skipna): + """Aggregated boolean values to show truthfulness of group elements Parameters ---------- out : array of values which this method will write its results to - labels : array containing unique label for each group, with its ordering - matching up to the corresponding record in `values` - values : array containing the truth value of each element - mask : array indicating whether a value is na or not - skipna : boolean - Flag to ignore nan values during truth testing - - Notes - ----- - This method modifies the `out` parameter rather than returning an object. - The returned values will either be 0 or 1 (False or True, respectively). - """ - cdef: - Py_ssize_t i, N=len(labels) - int64_t lab - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0 or (skipna and mask[i]): - continue - - if values[i]: - out[lab] = 1 - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_all(ndarray[uint8_t] out, - ndarray[int64_t] labels, - ndarray[uint8_t] values, - ndarray[uint8_t] mask, - bint skipna): - """Aggregated boolean values to show if all group elements are truthful - - Parameters - ---------- - out : array of values which this method will write its results to - labels : array containing unique label for each group, with its ordering - matching up to the corresponding record in `values` + labels : array containing unique label for each group, with its + ordering matching up to the corresponding record in `values` values : array containing the truth value of each element mask : array indicating whether a value is na or not + val_test : str {'any', 'all'} + String object dictating whether to use any or all truth testing skipna : boolean Flag to ignore nan values during truth testing @@ -374,14 +339,22 @@ def group_all(ndarray[uint8_t] out, """ cdef: Py_ssize_t i, N=len(labels) - int64_t lab - ndarray[int64_t] bool_mask - ndarray[uint8_t] isna_mask + int64_t lab, flag_val + + if val_test == 'all': + # Because the 'all' value of an empty iterable in Python is True we can + # start with an array full of ones and set to zero when a False value + # is encountered + flag_val = 0 + elif val_test == 'any': + # Because the 'any' value of an empty iterable in Python is False we + # can start with an array full of zeros and set to one only if any + # value encountered is True + flag_val = 1 + else: + raise ValueError("'bool_func' must be either 'any' or 'all'!") - # Because the 'all' value of an empty iterable in Python is True we can - # start with an array full of ones and set to zero when a False value is - # encountered - out.fill(1) + out.fill(1 - flag_val) with nogil: for i in range(N): @@ -389,8 +362,8 @@ def group_all(ndarray[uint8_t] out, if lab < 0 or (skipna and mask[i]): continue - if not values[i]: - out[lab] = 0 + if values[i] == flag_val: + out[lab] = flag_val # generated from template diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 1c643bf5249da..b8ca104c4b2c7 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1219,7 +1219,7 @@ class GroupBy(_GroupBy): """ _apply_whitelist = _common_apply_whitelist - def _bool_agg(self, how, skipna): + def _bool_agg(self, val_test, skipna): """Shared func to call any / all Cython GroupBy implementations""" def objs_to_bool(vals): @@ -1233,14 +1233,14 @@ def objs_to_bool(vals): def result_to_bool(result): return result.astype(np.bool, copy=False) - return self._get_cythonized_result(how, self.grouper, + return self._get_cythonized_result('group_any_all', self.grouper, aggregate=True, cython_dtype=np.uint8, needs_values=True, needs_mask=True, pre_processing=objs_to_bool, post_processing=result_to_bool, - skipna=skipna) + val_test=val_test, skipna=skipna) @Substitution(name='groupby') @Appender(_doc_template) @@ -1252,7 +1252,7 @@ def any(self, skipna=True): skipna : bool, default True Flag to ignore nan values during truth testing """ - return self._bool_agg('group_any', skipna) + return self._bool_agg('any', skipna) @Substitution(name='groupby') @Appender(_doc_template) @@ -1264,7 +1264,7 @@ def all(self, skipna=True): skipna : bool, default True Flag to ignore nan values during truth testing """ - return self._bool_agg('group_all', skipna) + return self._bool_agg('all', skipna) @Substitution(name='groupby') @Appender(_doc_template) From 0754294b9ffd53009cf03de6ee3e6a0a83853687 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 27 Feb 2018 22:23:16 -0800 Subject: [PATCH 17/20] Fixed GroupBy to render func links in whatsnew --- doc/source/api.rst | 5 +++++ doc/source/whatsnew/v0.23.0.txt | 6 +++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 0e47499a03f3a..a5e26bc948a70 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -2179,8 +2179,12 @@ Computations / Descriptive Stats .. autosummary:: :toctree: generated/ + GroupBy.all + GroupBy.any + GroupBy.bfill GroupBy.count GroupBy.cumcount + GroupBy.ffill GroupBy.first GroupBy.head GroupBy.last @@ -2192,6 +2196,7 @@ Computations / Descriptive Stats GroupBy.nth GroupBy.ohlc GroupBy.prod + GroupBy.rank GroupBy.size GroupBy.sem GroupBy.std diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 34f15af26f875..887063d526d16 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -697,10 +697,10 @@ Performance Improvements - Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`) - Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`) - Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`) -- Improved performance of :func:`DataFrameGroupBy.rank` (:issue:`15779`) +- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` (:issue:`15779`) - Improved performance of variable ``.rolling()`` on ``.min()`` and ``.max()`` (:issue:`19521`) -- Improved performance of ``GroupBy.ffill`` and ``GroupBy.bfill`` (:issue:`11296`) -- Improved performance of ``GroupBy.any`` and ``GroupBy.all`` (:issue:`15435`) +- Improved performance of :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` (:issue:`11296`) +- Improved performance of :func:`pandas.core.groupby.GroupBy.any` and :func:`pandas.core.groupby.GroupBy.all` (:issue:`15435`) .. _whatsnew_0230.docs: From 3cd500fe522ddcddc40dd9eee573c9db88309d25 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 27 Feb 2018 22:59:29 -0800 Subject: [PATCH 18/20] Added object benchmark for applicable GroupBy methods --- asv_bench/benchmarks/groupby.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index c347442784d41..81d4b9b6d283f 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -11,6 +11,13 @@ from .pandas_vb_common import setup # noqa +method_blacklist = { + 'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean', + 'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min', + 'var', 'mad', 'describe', 'std'} + } + + class ApplyDictReturn(object): goal_time = 0.2 @@ -153,6 +160,7 @@ def time_frame_nth_any(self, df): def time_frame_nth(self, df): df.groupby(0).nth(0) + def time_series_nth_any(self, df): df[1].groupby(df[0]).nth(0, dropna='any') @@ -369,7 +377,7 @@ class GroupByMethods(object): goal_time = 0.2 param_names = ['dtype', 'method'] - params = [['int', 'float'], + params = [['int', 'float', 'object'], ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head', 'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique', @@ -377,15 +385,19 @@ class GroupByMethods(object): 'std', 'sum', 'tail', 'unique', 'value_counts', 'var']] def setup(self, dtype, method): + if method in method_blacklist.get(dtype, {}): + raise NotImplementedError # skip benchmark ngroups = 1000 size = ngroups * 2 rng = np.arange(ngroups) values = rng.take(np.random.randint(0, ngroups, size=size)) if dtype == 'int': key = np.random.randint(0, size, size=size) - else: + elif dtype == 'float': key = np.concatenate([np.random.random(ngroups) * 0.1, np.random.random(ngroups) * 10.0]) + elif dtype == 'object': + key = ['foo'] * size df = DataFrame({'values': values, 'key': key}) self.df_groupby_method = getattr(df.groupby('key')['values'], method) From 1bd8dc97943efb4e79eff28e16d8b5d08887e092 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 27 Feb 2018 23:28:37 -0800 Subject: [PATCH 19/20] Changed flag_val type to match type of result array --- pandas/_libs/groupby.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 3ee2e45869e86..d3fcd84e5f38d 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -339,7 +339,8 @@ def group_any_all(ndarray[uint8_t] out, """ cdef: Py_ssize_t i, N=len(labels) - int64_t lab, flag_val + int64_t lab + uint8_t flag_val if val_test == 'all': # Because the 'all' value of an empty iterable in Python is True we can From ee4d0bd8a03539fa77ad9a43556aa979dd19bafd Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 27 Feb 2018 23:30:34 -0800 Subject: [PATCH 20/20] LINT fix --- asv_bench/benchmarks/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 81d4b9b6d283f..3e7e5c821b14c 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -15,7 +15,7 @@ 'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean', 'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min', 'var', 'mad', 'describe', 'std'} - } +} class ApplyDictReturn(object):