diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py
--- a/sdc/datatypes/hpat_pandas_series_functions.py
+++ b/sdc/datatypes/hpat_pandas_series_functions.py
@@ -42,6 +42,7 @@
 from numba.extending import intrinsic
 from numba import (types, numpy_support, cgutils)
 from numba.typed import Dict
+from numba import prange
 
 import sdc
 import sdc.datatypes.common_functions as common_functions
@@ -4638,32 +4639,45 @@ def hpat_pandas_series_cov(self, other, min_periods=None):
 
     def hpat_pandas_series_cov_impl(self, other, min_periods=None):
 
-        if min_periods is None:
-            min_periods = 1
-
-        if len(self._data) == 0 or len(other._data) == 0:
-            return numpy.nan
-
-        self_arr = self._data[:min(len(self._data), len(other._data))]
-        other_arr = other._data[:min(len(self._data), len(other._data))]
-
-        invalid = numpy.isnan(self_arr) | numpy.isnan(other_arr)
-        if invalid.any():
-            self_arr = self_arr[~invalid]
-            other_arr = other_arr[~invalid]
-
-        if len(self_arr) < min_periods:
-            return numpy.nan
-
-        new_self = pandas.Series(self_arr)
-
-        ma = new_self.mean()
-        mb = other.mean()
-
-        if numpy.isinf(mb):
-            return numpy.nan
-
-        return ((self_arr - ma) * (other_arr - mb)).sum() / (new_self.count() - 1.0)
+        # the sample covariance divides by (count - 1), so it is undefined for fewer
+        # than two valid pairs: clamp None and any smaller/negative min_periods to 2
+        if min_periods is None or min_periods < 2:
+            min_periods = 2
+
+        min_len = min(len(self._data), len(other._data))
+
+        if min_len == 0:
+            return numpy.nan
+
+        # first pass: sums and count over the pairs in which neither value is NaN
+        self_sum = 0.
+        other_sum = 0.
+        total_count = 0
+        for i in prange(min_len):
+            s = self._data[i]
+            o = other._data[i]
+            if not (numpy.isnan(s) or numpy.isnan(o)):
+                self_sum += s
+                other_sum += o
+                total_count += 1
+
+        if total_count < min_periods:
+            return numpy.nan
+
+        self_mean = self_sum / total_count
+        other_mean = other_sum / total_count
+
+        # second pass: sum the products of deviations from the means; the one-pass form
+        # sum(x*y) - sum(x)*sum(y)/n cancels catastrophically when the means are large
+        # and can return +/-inf rather than NaN when the data contains inf
+        dev_prod_sum = 0.
+        for i in prange(min_len):
+            s = self._data[i]
+            o = other._data[i]
+            if not (numpy.isnan(s) or numpy.isnan(o)):
+                dev_prod_sum += (s - self_mean) * (o - other_mean)
+
+        return dev_prod_sum / (total_count - 1)
 
     return hpat_pandas_series_cov_impl
 
diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py
index 1d90a1f0b..8462c943e 100644
--- a/sdc/tests/test_series.py
+++ b/sdc/tests/test_series.py
@@ -5031,9 +5031,10 @@ def test_series_cov_impl(S1, S2, min_periods=None):
                 S1 = pd.Series(input_data1)
                 S2 = pd.Series(input_data2)
                 for period in [None, 2, 1, 8, -4]:
-                    result_ref = test_series_cov_impl(S1, S2, min_periods=period)
-                    result = hpat_func(S1, S2, min_periods=period)
-                    np.testing.assert_allclose(result, result_ref)
+                    with self.subTest(input_data1=input_data1, input_data2=input_data2, min_periods=period):
+                        result_ref = test_series_cov_impl(S1, S2, min_periods=period)
+                        result = hpat_func(S1, S2, min_periods=period)
+                        np.testing.assert_allclose(result, result_ref)
 
     @skip_sdc_jit('Series.cov() parameter "min_periods" unsupported')
     def test_series_cov_unsupported_dtype(self):
diff --git a/sdc/tests/tests_perf/test_perf_series.py b/sdc/tests/tests_perf/test_perf_series.py
--- a/sdc/tests/tests_perf/test_perf_series.py
+++ b/sdc/tests/tests_perf/test_perf_series.py
@@ -259,6 +259,16 @@ def usecase_series_isna(input_data):
     return res_time, res
 
 
+def usecase_series_cov(A, B):
+    """Time one A.cov(B) call and return (elapsed seconds, result)."""
+    start_time = time.time()
+    res = A.cov(B)
+    finish_time = time.time()
+    res_time = finish_time - start_time
+
+    return res_time, res
+
+
 # python -m sdc.runtests sdc.tests.tests_perf.test_perf_series.TestSeriesMethods
 class TestSeriesMethods(TestBase):
     @classmethod
@@ -292,6 +302,7 @@ def setUpClass(cls):
             'series_astype_int': [2 * 10 ** 7],
             'series_fillna': [2 * 10 ** 7],
             'series_isna': [2 * 10 ** 7],
+            'series_cov': [10 ** 8],
         }
 
     def _test_jitted(self, pyfunc, record, *args, **kwargs):
@@ -434,3 +445,6 @@ def test_series_float_fillna(self):
 
     def test_series_float_isna(self):
         self._test_case(usecase_series_fillna, 'series_isna')
+
+    def test_series_float_cov(self):
+        self._test_series_binary_operations(usecase_series_cov, 'series_cov')