diff --git a/doc/source/release.rst b/doc/source/release.rst index 47a2ef82c78dc..dcbf8b8c7f271 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -190,6 +190,7 @@ Improvements to existing features - implement joining a single-level indexed DataFrame on a matching column of a multi-indexed DataFrame (:issue:`3662`) - Performance improvement in indexing into a multi-indexed Series (:issue:`5567`) - Testing statements updated to use specialized asserts (:issue:`6175`) +- ``DataFrame.rank()`` now has a percentage rank option (:issue:`5971`) - ``Series.rank()`` now has a percentage rank option (:issue:`5971`) - ``Series.rank()`` and ``DataFrame.rank()`` now accept ``method='dense'`` for ranks without gaps (:issue:`6514`) - ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 14c9ec2f3355d..27e25c3954dad 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -283,7 +283,7 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True, def rank_2d_float64(object in_arr, axis=0, ties_method='average', - ascending=True, na_option='keep'): + ascending=True, na_option='keep', pct=False): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -296,6 +296,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average', float64_t sum_ranks = 0 int tiebreak = 0 bint keep_na = 0 + float count = 0.0 tiebreak = tiebreakers[ties_method] @@ -335,6 +336,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average', for i in range(n): dups = sum_ranks = 0 total_tie_count = 0 + count = 0.0 for j in range(k): sum_ranks += j + 1 dups += 1 @@ -342,6 +344,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average', if val == nan_value and keep_na: ranks[i, argsorted[i, j]] = nan continue + count += 1.0 if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR: if tiebreak == TIEBREAK_AVERAGE: for z in range(j - dups + 1, j + 1): @@ -363,7 +366,8 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average', for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = total_tie_count sum_ranks = dups = 0 - + if pct: + ranks[i, :] /= count if axis == 0: return ranks.T else: @@ -371,7 +375,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average', def rank_2d_int64(object in_arr, axis=0, ties_method='average', - ascending=True, na_option='keep'): + ascending=True, na_option='keep', pct=False): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -384,6 +388,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average', int64_t val float64_t sum_ranks = 0 int tiebreak = 0 + float count = 0.0 tiebreak = tiebreakers[ties_method] if axis == 0: @@ -411,10 +416,12 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average', for i in range(n): dups = sum_ranks = 0 total_tie_count = 0 + count = 0.0 for j in range(k): sum_ranks += j + 1 dups += 1 val = values[i, j] + count += 1.0 if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR: if tiebreak == TIEBREAK_AVERAGE: for z in range(j - dups + 1, j + 1): @@ -436,7 +443,8 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average', for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = total_tie_count sum_ranks = dups = 0 - + if pct: + ranks[i, :] /= count if axis == 0: return ranks.T else: @@ -528,7 +536,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average', ranks[argsorted[j]] = total_tie_count sum_ranks = dups = 0 if pct: - ranks / count + return ranks / count else: return ranks @@ -562,7 +570,7 @@ class NegInfinity(object): __cmp__ = _return_true def rank_2d_generic(object in_arr, axis=0, ties_method='average', - ascending=True, na_option='keep'): + ascending=True, na_option='keep', pct=False): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -577,6 +585,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average', float64_t sum_ranks = 0 int tiebreak = 0 bint keep_na = 0 + float count = 0.0 tiebreak = tiebreakers[ties_method] @@ -611,7 +620,8 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average', for i in range(len(values)): ranks[i] = rank_1d_generic(in_arr[i], ties_method=ties_method, - ascending=ascending) + ascending=ascending, + pct=pct) if axis == 0: return ranks.T else: @@ -626,12 +636,14 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average', for i in range(n): dups = sum_ranks = infs = 0 total_tie_count = 0 + count = 0.0 for j in range(k): val = values[i, j] if val is nan_value and keep_na: ranks[i, argsorted[i, j]] = nan infs += 1 continue + count += 1.0 sum_ranks += (j - infs) + 1 dups += 1 if j == k - 1 or are_diff(values[i, j + 1], val): @@ -652,7 +664,8 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average', for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = total_tie_count sum_ranks = dups = 0 - + if pct: + ranks[i, :] /= count if axis == 0: return ranks.T else: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index f20c316393244..e2ef178c62e71 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -277,7 +277,7 @@ def rank(values, axis=0, method='average', na_option='keep', elif values.ndim == 2: f, values = _get_data_algo(values, _rank2d_functions) ranks = f(values, axis=axis, ties_method=method, - ascending=ascending, na_option=na_option) + ascending=ascending, na_option=na_option, pct=pct) return ranks diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 430b309260f8c..5ecdd4d8b351d 100755 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4182,7 +4182,7 @@ def f(arr): return data.apply(f, axis=axis) def rank(self, axis=0, numeric_only=None, method='average', - na_option='keep', ascending=True): + na_option='keep', ascending=True, pct=False): """ Compute numerical data ranks (1 through n) along axis. Equal values are assigned a rank that is the average of the ranks of those values @@ -4205,6 +4205,8 @@ def rank(self, axis=0, numeric_only=None, method='average', * bottom: smallest rank if descending ascending : boolean, default True False for ranks by high (1) to low (N) + pct : boolean, default False + Computes percentage rank of data Returns ------- @@ -4214,18 +4216,18 @@ def rank(self, axis=0, numeric_only=None, method='average', if numeric_only is None: try: ranks = algos.rank(self.values, axis=axis, method=method, - ascending=ascending, na_option=na_option) + ascending=ascending, na_option=na_option, + pct=pct) return self._constructor(ranks, index=self.index, columns=self.columns) except TypeError: numeric_only = True - if numeric_only: data = self._get_numeric_data() else: data = self ranks = algos.rank(data.values, axis=axis, method=method, - ascending=ascending, na_option=na_option) + ascending=ascending, na_option=na_option, pct=pct) return self._constructor(ranks, index=data.index, columns=data.columns) def to_timestamp(self, freq=None, how='start', axis=0, copy=True): diff --git a/pandas/core/series.py b/pandas/core/series.py index c3300e7b35a8b..47721ab371c3b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1732,7 +1732,7 @@ def rank(self, method='average', na_option='keep', ascending=True, keep: leave NA values where they are ascending : boolean, default True False for ranks by high (1) to low (N) - pct : boolean, defeault False + pct : boolean, default False Computes percentage rank of data Returns diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index d93232c18ee31..a7270dc4517b7 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -10933,12 +10933,24 @@ def test_rank(self): def test_rank2(self): from datetime import datetime + df = DataFrame([[1, 3, 2], [1, 2, 3]]) + expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0 + result = df.rank(1, pct=True) + assert_frame_equal(result, expected) + + df = DataFrame([[1, 3, 2], [1, 2, 3]]) + expected = df.rank(0) / 2.0 + result = df.rank(0, pct=True) + assert_frame_equal(result, expected) + + df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']]) expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]]) result = df.rank(1, numeric_only=False) assert_frame_equal(result, expected) + expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]]) result = df.rank(0, numeric_only=False) assert_frame_equal(result, expected)