Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Reimplement Series.cov using prange #451

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 20 additions & 20 deletions sdc/datatypes/hpat_pandas_series_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from numba.extending import intrinsic
from numba import (types, numpy_support, cgutils)
from numba.typed import Dict
from numba import prange

import sdc
import sdc.datatypes.common_functions as common_functions
Expand Down Expand Up @@ -4638,32 +4639,31 @@ def hpat_pandas_series_cov(self, other, min_periods=None):

def hpat_pandas_series_cov_impl(self, other, min_periods=None):
    """Compute the sample covariance between two series.

    Only the first ``min(len(self._data), len(other._data))`` positions are
    compared, and every position where either value is NaN is ignored.

    Parameters
    ----------
    self, other
        Series objects exposing their values through ``_data``.
    min_periods
        Minimum number of valid (non-NaN) pairs required. ``None`` and any
        value below 2 are treated as 2, because the ``count - 1`` divisor
        needs at least two observations.

    Returns
    -------
    float
        The covariance normalised by ``count - 1``, or ``numpy.nan`` when
        either input is empty or fewer than ``min_periods`` valid pairs exist.
    """
    if min_periods is None or min_periods < 2:
        min_periods = 2

    min_len = min(len(self._data), len(other._data))

    if min_len == 0:
        return numpy.nan

    # Pass 1: parallel reduction collecting the sums and the number of
    # positions where both values are present.
    self_sum = 0.
    other_sum = 0.
    total_count = 0
    for i in prange(min_len):
        s = self._data[i]
        o = other._data[i]
        if not (numpy.isnan(s) or numpy.isnan(o)):
            self_sum += s
            other_sum += o
            total_count += 1

    if total_count < min_periods:
        return numpy.nan

    self_mean = self_sum / total_count
    other_mean = other_sum / total_count

    # Pass 2: accumulate products of deviations from the means. The one-pass
    # form (sum(s*o) - sum(s)*sum(o)/n) subtracts two nearly equal large
    # numbers when the values are large relative to their spread, which
    # loses most significant digits; centring first avoids that cancellation.
    centered_sum = 0.
    for i in prange(min_len):
        s = self._data[i]
        o = other._data[i]
        if not (numpy.isnan(s) or numpy.isnan(o)):
            centered_sum += (s - self_mean) * (o - other_mean)

    # total_count >= min_periods >= 2 here, so the divisor is never zero.
    return centered_sum / (total_count - 1)

return hpat_pandas_series_cov_impl

Expand Down
7 changes: 4 additions & 3 deletions sdc/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -5031,9 +5031,10 @@ def test_series_cov_impl(S1, S2, min_periods=None):
S1 = pd.Series(input_data1)
S2 = pd.Series(input_data2)
for period in [None, 2, 1, 8, -4]:
result_ref = test_series_cov_impl(S1, S2, min_periods=period)
result = hpat_func(S1, S2, min_periods=period)
np.testing.assert_allclose(result, result_ref)
with self.subTest(input_data1=input_data1, input_data2=input_data2, min_periods=period):
result_ref = test_series_cov_impl(S1, S2, min_periods=period)
result = hpat_func(S1, S2, min_periods=period)
np.testing.assert_allclose(result, result_ref)

@skip_sdc_jit('Series.cov() parameter "min_periods" unsupported')
def test_series_cov_unsupported_dtype(self):
Expand Down
17 changes: 15 additions & 2 deletions sdc/tests/tests_perf/test_perf_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,8 +220,8 @@ def usecase_series_dropna(input_data):
finish_time = time.time()

return finish_time - start_time, res


def usecase_series_chain_add_and_sum(A, B):
start_time = time.time()
res = (A + B).sum()
Expand Down Expand Up @@ -259,6 +259,15 @@ def usecase_series_isna(input_data):
return res_time, res


def usecase_series_cov(A, B):
    """Time a single ``A.cov(B)`` call.

    Returns a ``(elapsed, result)`` tuple where ``elapsed`` is the difference
    between two ``time.time()`` readings taken around the call and ``result``
    is whatever ``A.cov(B)`` returned.
    """
    began = time.time()
    covariance = A.cov(B)
    elapsed = time.time() - began
    return elapsed, covariance


# python -m sdc.runtests sdc.tests.tests_perf.test_perf_series.TestSeriesMethods
class TestSeriesMethods(TestBase):
@classmethod
Expand Down Expand Up @@ -292,6 +301,7 @@ def setUpClass(cls):
'series_astype_int': [2 * 10 ** 7],
'series_fillna': [2 * 10 ** 7],
'series_isna': [2 * 10 ** 7],
'series_cov': [10 ** 8]
}

def _test_jitted(self, pyfunc, record, *args, **kwargs):
Expand Down Expand Up @@ -434,3 +444,6 @@ def test_series_float_fillna(self):

def test_series_float_isna(self):
    """Run the ``isna`` use case under the 'series_isna' test-case key."""
    # The use case must match the key: passing usecase_series_fillna here
    # would record fillna results under the 'series_isna' name.
    self._test_case(usecase_series_isna, 'series_isna')

def test_series_float_cov(self):
    """Run the ``cov`` use case under the 'series_cov' test-case key."""
    usecase, case_name = usecase_series_cov, 'series_cov'
    self._test_series_binary_operations(usecase, case_name)