Merge pull request #12 from circstat/omnibus-test-overflow

huangziwei · web-flow · commit 5289403dc0e4 · 2025-04-29T11:15:44.000+02:00
fix: `omnibus_test` pval computation overflow for large n
diff --git a/pycircstat2/hypothesis.py b/pycircstat2/hypothesis.py
@@ -361,14 +361,16 @@ def omnibus_test(
     verbose: bool = False,
 ) -> tuple[float, float]:
     """
-    A simple alternative to the Rayleigh test, aka Hodges-Ajne test,
-    which does not assume sampling from a specific distribution. This
-    is called an "omnibus test" because it works well for unimodal,
-    bimodal, and multimodal distributions (for ungrouped data).
+    Hodges–Ajne omnibus test for circular uniformity.
 
     - H0: The population is uniformly distributed around the circle
     - H1: The population is not uniformly distributed.
 
+    This test is distribution-free and handles uni-, bi-, and multimodal
+    alternatives.  The classical p-value involves factorials and
+    overflows for large *n*.  We therefore compute it in log-space
+    (``math.lgamma``) and exponentiate at the very end.
+
     Parameters
     ----------
     alpha: np.array or None
@@ -403,12 +405,41 @@ def omnibus_test(
         lines_rotated > 0.0, lines_rotated < np.round(np.pi, 5)
     ).sum(1)
     m = int(np.min(right))
-    pval = (
-        (n - 2 * m)
-        * math.factorial(n)
-        / (math.factorial(m) * math.factorial(n - m))
-        / 2 ** (n - 1)
+
+    # ------------------------------------------------------------------
+    # 2. p-value   ———  analytical formula and its log form
+    # ------------------------------------------------------------------
+    #     Classical (Zar 2010, eq. 27.7):
+    #
+    #         p  =  (n − 2m) · n! / [ m! · (n − m)! · 2^(n−1) ]            …(1)
+    #       # pval = (
+    #       #    (n - 2 * m)
+    #       #    * math.factorial(n)
+    #       #    / (math.factorial(m) * math.factorial(n - m))
+    #       #    / 2 ** (n - 1)
+    #       # ) # eq(27.7)
+
+    #     Taking natural logs and using  Γ(k+1) = k!  with  log Γ = lgamma:
+    #
+    #         ln p  =  ln(n − 2m)
+    #                 + lgamma(n + 1)
+    #                 − lgamma(m + 1)
+    #                 − lgamma(n − m + 1)
+    #                 − (n − 1)·ln 2                                        …(2)
+    #
+    #     Eq. (2) avoids the factorial overflow for large n; the final exp()
+    #     may still underflow to 0.0.  ln(n − 2m) is undefined when n = 2m.
+    # ------------------------------------------------------------------
+
+    logp = (
+        math.log(n - 2*m)                     
+        + math.lgamma(n + 1)                 
+        - math.lgamma(m + 1)                  
+        - math.lgamma(n - m + 1)              
+        - (n - 1)*math.log(2.0)               
     )
+    pval = np.exp(logp)
+    
     A = np.pi * np.sqrt(n) / (2 * (n - 2 * m))
 
     if verbose:
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "pycircstat2"
-version = "0.1.12"
+version = "0.1.13"
 description = "Circular statistcs with Python."
 authors = [
 ]
diff --git a/tests/test_hypothesis.py b/tests/test_hypothesis.py
@@ -104,6 +104,18 @@ def test_omnibus_test():
 
     np.testing.assert_approx_equal(pval, 0.0043, significant=2)
 
+    # Regression test for large sample sizes:
+    # the factorial-based p-value computation overflowed for large n
+    # (fixed in PR #12).
+    from pycircstat2.distributions import circularuniform, vonmises
+    d0 = vonmises.rvs(mu=0, kappa=1, size=10_000)
+    d1 = circularuniform.rvs(size=10_000)
+
+    _, pval = omnibus_test(alpha=d0)
+    assert pval < 0.05, "Expected significant p-value for von Mises distribution"
+    _, pval = omnibus_test(alpha=d1)
+    assert pval > 0.05, "Expected non-significant p-value for uniform distribution"
+
 
 def test_batschelet_test():
     data_zar_ex5_ch27 = load_data("D8", source="zar")
@@ -752,4 +764,4 @@ def test_equal_median_small_sample():
 
     result = common_median_test([alpha1, alpha2])
     assert result["reject"] is np.False_
-    assert not np.isnan(result["common_median"])
+    assert not np.isnan(result["common_median"])

Original file line number	Diff line number	Diff line change
`@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"`
`4`	`4`
`5`	`5`	`[project]`
`6`	`6`	`name = "pycircstat2"`
`7`		`-version = "0.1.12"`
	`7`	`+version = "0.1.13"`
`8`	`8`	`description = "Circular statistcs with Python."`
`9`	`9`	`authors = [`
`10`	`10`	`]`