
Commit d47843f

WIP: Add PC Relate
1 parent 60c9ab2 commit d47843f


44 files changed, +500980 −0 lines changed


sgkit/stats/pc_relate.py

Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
import dask.array as da
import xarray as xr

from sgkit.typing import ArrayLike


def gramian(a: ArrayLike) -> ArrayLike:
    """Returns the Gramian matrix of the given matrix."""
    return a.T.dot(a)


def pc_relate(ds: xr.Dataset, maf: float = 0.01) -> xr.Dataset:
    """Compute PC-Relate as described in Conomos, et al. 2016 [1].

    Parameters
    ----------
    ds : `xr.Dataset`
        Dataset containing (S = num samples, V = num variants, D = ploidy, PC = num PC):

        * genotype calls: "call_genotype" (SxVxD)
        * genotype calls mask: "call_genotype_mask" (SxVxD)
        * sample PCs: "sample_pcs" (PCxS)
    maf : float
        Individual minor allele frequency filter. If an individual's estimated
        individual-specific minor allele frequency at a SNP is less than this value,
        that SNP will be excluded from the analysis for that individual.
        The default value is 0.01. Must be within (0.0, 1.0).

    Warnings
    --------
    This function is only applicable to diploid, biallelic datasets.

    Returns
    -------
    Dataset
        Dataset containing (S = num samples):

        pc_relate_phi: (S,S) ArrayLike
            pairwise recent kinship estimation matrix as float in [-0.5, 0.5].

    References
    ----------
    - [1] Conomos MP, Reiner AP, Weir BS & Thornton TA. 2016.
        "Model-free Estimation of Recent Genetic Relatedness."
        Am. J. Hum. Genet. 98, 127–148.

    Raises
    ------
    ValueError
        * If ploidy of the provided dataset != 2
        * If the maximum number of alleles in the provided dataset != 2
        * If the input dataset is missing any of the required variables
        * If maf is not in (0.0, 1.0)
    """
    if maf <= 0.0 or maf >= 1.0:
        raise ValueError("MAF must be within (0.0, 1.0)")
    if "ploidy" in ds.dims and ds.dims["ploidy"] != 2:
        raise ValueError("PC Relate only works for diploid genotypes")
    if "alleles" in ds.dims and ds.dims["alleles"] != 2:
        raise ValueError("PC Relate only works for biallelic genotypes")
    if "call_genotype" not in ds:
        raise ValueError("Input dataset must contain call_genotype")
    if "call_genotype_mask" not in ds:
        raise ValueError("Input dataset must contain call_genotype_mask")
    if "sample_pcs" not in ds:
        raise ValueError("Input dataset must contain sample_pcs variable")

    # Collapse the ploidy dimension: a call is missing if any of its alleles is
    # masked, otherwise use the alternate allele count (sum over ploidy).
    call_g_mask = ds["call_genotype_mask"].any(dim="ploidy")
    call_g = xr.where(call_g_mask, -1, ds["call_genotype"].sum(dim="ploidy"))  # type: ignore[no-untyped-call]

    # impute missing calls with the variant mean
    variant_mean = (
        call_g.where(~call_g_mask)
        .mean(dim="samples")
        .expand_dims(dim="samples", axis=1)
    )
    imputed_g = da.where(call_g_mask, variant_mean, call_g)

    # 𝔼[gs|V] = 1β0 + Vβ, where 1 is a length _s_ vector of 1s, and β = (β1,...,βD)^T
    # is a length D vector of regression coefficients for each of the PCs
    pcs = ds["sample_pcs"]
    pcsi = da.concatenate([da.ones((1, pcs.shape[1]), dtype=pcs.dtype), pcs], axis=0)
    # Note: dask QR decomposition requires that one dimension be unchunked, and
    # because the number of components should be smaller than the number of samples
    # in most cases, we disable chunking on the components dimension
    pcsi = pcsi.T.rechunk((None, -1))

    q, r = da.linalg.qr(pcsi)
    # mu, eq: 3
    half_beta = da.linalg.inv(2 * r).dot(q.T).dot(imputed_g.T)
    mu = pcsi.dot(half_beta).T
    # phi, eq: 4
    mask = (mu <= maf) | (mu >= 1.0 - maf) | call_g_mask
    mu_mask = da.ma.masked_array(mu, mask=mask)
    variance = mu_mask.map_blocks(lambda i: i * (1.0 - i))
    variance = da.ma.filled(variance, fill_value=0.0)
    stddev = da.sqrt(variance)
    centered_af = call_g / 2 - mu_mask
    centered_af = da.ma.filled(centered_af, fill_value=0.0)
    # NOTE: gramian could be a performance bottleneck, and we could explore
    # performance improvements like (or maybe something else):
    # * calculating only the pairs we are interested in
    # * using an optimized einsum
    phi = gramian(centered_af) / gramian(stddev)
    return xr.Dataset({"pc_relate_phi": (("sample_x", "sample_y"), phi)})
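
For reference, a sketch of the quantities the code above computes, following Eqs. 3 and 4 of Conomos et al. 2016; the notation here is mine and not part of the commit. With X the S x (PC+1) design matrix pcsi (a column of ones plus the sample PCs) and X = QR its QR decomposition, the individual-specific allele frequency estimates are the OLS fitted values of g/2 regressed on the intercept and the PCs:

\hat{\mu}_{\cdot s} \;=\; X\,(2R)^{-1} Q^{\top} g_{\cdot s} \;=\; \tfrac{1}{2}\, Q Q^{\top} g_{\cdot s}

where g_{\cdot s} is the vector of alternate allele counts of all samples at variant s. The kinship estimate (Eq. 4, with the paper's factor of 4 in the denominator absorbed by working with g/2) is then

\hat{\varphi}_{ij} \;=\; \frac{\sum_{s} \bigl(g_{is}/2 - \hat{\mu}_{is}\bigr)\bigl(g_{js}/2 - \hat{\mu}_{js}\bigr)}{\sum_{s} \sqrt{\hat{\mu}_{is}(1-\hat{\mu}_{is})}\,\sqrt{\hat{\mu}_{js}(1-\hat{\mu}_{js})}}

where entries with \hat{\mu}_{is} \le maf, \hat{\mu}_{is} \ge 1 - maf, or a missing call are zero-filled so those variants effectively drop out of both sums; this is the gramian(centered_af) / gramian(stddev) ratio at the end of the function.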

sgkit/testing.py

Lines changed: 13 additions & 0 deletions
@@ -14,6 +14,7 @@ def simulate_genotype_call_dataset(
    n_allele: int = 2,
    n_contig: int = 1,
    seed: Optional[int] = None,
    missing_pct: Optional[float] = None,
) -> Dataset:
    """Simulate genotype calls and variant/sample data.
@@ -41,16 +42,28 @@ def simulate_genotype_call_dataset(
        will all be 0 by default with `n_contig` == 1.
    seed : int, optional
        Seed for random number generation
    missing_pct : float, optional
        Fraction of calls to mark as missing (e.g. 0.1 for 10%), must be within [0.0, 1.0]

    Returns
    -------
    Dataset
        Dataset from `sgkit.create_genotype_call_dataset`.
    """
    if missing_pct and (missing_pct < 0.0 or missing_pct > 1.0):
        raise ValueError("missing_pct must be within [0.0, 1.0]")
    rs = np.random.RandomState(seed=seed)
    call_genotype = rs.randint(
        0, n_allele, size=(n_variant, n_sample, n_ploidy), dtype=np.int8
    )
    if missing_pct:
        # Mark a random subset of calls as missing (-1), drawing from the seeded
        # RandomState so the result stays reproducible.
        indices = rs.choice(
            np.arange(call_genotype.size),
            replace=False,
            size=int(call_genotype.size * missing_pct),
        )
        call_genotype[np.unravel_index(indices, call_genotype.shape)] = -1

    contig_size = split_array_chunks(n_variant, n_contig)
    contig = np.repeat(np.arange(n_contig), contig_size)
    contig_names = np.unique(contig)
sgkit/tests/test_pc_relate.py

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
from pathlib import Path

import dask.array as da
import numpy as np
import pandas as pd
import xarray as xr

from sgkit.stats.pc_relate import pc_relate

# TODO (rav): finish up tests/validation, clean and split


def test_same_as_reference_implementation() -> None:
    d = Path(__file__).parent.joinpath("test_pc_relate")
    ds = xr.open_zarr(d.joinpath("zarr_data").as_posix())  # type: ignore[no-untyped-call]
    pcs = da.from_array(
        pd.read_csv(d.joinpath("pcs.csv").as_posix(), usecols=[1, 2]).to_numpy()
    ).T
    ds["sample_pcs"] = (("components", "samples"), pcs)
    phi = pc_relate(ds).compute()["pc_relate_phi"]

    assert isinstance(phi, xr.DataArray)
    assert phi.shape == (1000, 1000)

    # Get genesis/reference results:
    genesis_phi = pd.read_csv(d.joinpath("kinbtwe.csv"))
    genesis_phi = genesis_phi[["ID1", "ID2", "kin"]]
    genesis_phi["ID1"], genesis_phi["ID2"] = genesis_phi.ID1 - 1, genesis_phi.ID2 - 1
    indices = (genesis_phi["ID1"] * 1000 + genesis_phi["ID2"]).to_numpy()
    values = genesis_phi["kin"].to_numpy()
    genesis_phi_full = np.zeros((1000, 1000))
    np.put(genesis_phi_full, indices, values)

    # Compare with reference/GENESIS:
    genesis_phi_s = genesis_phi_full[np.triu_indices_from(genesis_phi_full, 1)]
    phi_s = phi.data[np.triu_indices_from(phi.data, 1)]
    assert len(phi_s) == len(genesis_phi_s)
    assert np.allclose(phi_s, genesis_phi_s)
