save about 1% by normalising in the xdbl biscalar ladder

GiacomoPope · GiacomoPope · commit f87b420d1f61 · 2025-05-21T17:05:01.000+01:00
diff --git a/Cargo.toml b/Cargo.toml
@@ -33,3 +33,8 @@ harness = false
 # name = "bench_product_isogeny"
 # path = "benches/bench_product_isogeny.rs"
 # harness = false
+
+# [[bench]]
+# name = "bench_biscalar_ladder"
+# path = "benches/bench_biscalar_ladder.rs"
+# harness = false
diff --git a/benches/bench_biscalar_ladder.rs b/benches/bench_biscalar_ladder.rs
@@ -0,0 +1,83 @@
+//! Criterion benchmark for the x-only biscalar ladder (`ladder_biscalar`), which
+//! computes x([a]P + [b]Q) from x(P), x(Q) and x(P - Q).
+
+#![allow(non_snake_case)]
+
+mod benchmark_biscalar {
+    use std::time::Duration;
+
+    use criterion::{Criterion, black_box, criterion_group};
+    use fp2::fq::Fq as FqTrait;
+    use rand_core::RngCore;
+
+    use isogeny::elliptic::basis::BasisX;
+    use isogeny::elliptic::curve::Curve;
+    use isogeny::fields::sqisign::SqiField248 as Fp2;
+    use isogeny::utilities::test_utils::drng::DRNG;
+
+    /// Benchmarks `ladder_biscalar` with two random 32-byte (256-bit) scalars.
+    ///
+    /// Before timing, the x-only result is checked against [a]P + [b]Q computed
+    /// with full point arithmetic, so a broken ladder is never benchmarked.
+    fn benchmark_ladder_biscalar(c: &mut Criterion) {
+        // RNG seeded from a fixed string.
+        let mut rng = DRNG::from_seed("test_biscalar_ladder".as_bytes());
+
+        let A = Fp2::from_i32(6);
+        let E = Curve::new(&A);
+
+        // Two random 32-byte scalars; every bit length passed below is 32 << 3 = 256.
+        let mut a: [u8; 32] = [0; 32];
+        let mut b: [u8; 32] = [0; 32];
+        rng.fill_bytes(&mut a);
+        rng.fill_bytes(&mut b);
+
+        // Reference value: compute [a]P + [b]Q with projective points.
+        let P = E.rand_point(&mut rng);
+        let Q = E.rand_point(&mut rng);
+        let PQ = E.sub(&P, &Q);
+
+        let aP = E.mul(&P, &a, 32 << 3);
+        let bQ = E.mul(&Q, &b, 32 << 3);
+        let aPbQ = E.add(&aP, &bQ);
+
+        // Same value from the x-only basis (x(P), x(Q), x(P - Q)).
+        let xP = P.to_pointx();
+        let xQ = Q.to_pointx();
+        let xPQ = PQ.to_pointx();
+        let basis = BasisX::from_points(&xP, &xQ, &xPQ);
+        let xaPbQ = E.ladder_biscalar(&basis, &a, &b, 32 << 3, 32 << 3);
+
+        // Ensure both computations agree (`equals` is expected to yield u32::MAX).
+        assert!(
+            xaPbQ.equals(&aPbQ.to_pointx()) == u32::MAX,
+            "x-only biscalar ladder disagrees with [a]P + [b]Q"
+        );
+
+        // The label states the bit length of the 32-byte scalars used above.
+        let bench_id = "Benchmarking biscalar ladder with 256 bit scalars";
+        c.bench_function(bench_id, |bb| {
+            bb.iter(|| {
+                black_box(E).ladder_biscalar(
+                    &black_box(basis),
+                    &black_box(a),
+                    &black_box(b),
+                    black_box(32 << 3),
+                    black_box(32 << 3),
+                )
+            })
+        });
+    }
+
+    // Group configuration: 10 second measurement time for the benchmark target.
+    criterion_group! {
+        name = benchmark_biscalar;
+        config = Criterion::default().measurement_time(Duration::from_secs(10));
+        targets = benchmark_ladder_biscalar,
+    }
+}
+
+/// Runs the `benchmark_biscalar` group defined by `criterion_group!`.
+fn main() {
+    benchmark_biscalar::benchmark_biscalar();
+}
diff --git a/src/elliptic/x_only_arithmetic.rs b/src/elliptic/x_only_arithmetic.rs
@@ -73,6 +73,23 @@ impl<Fq: FqTrait> Curve<Fq> {
         *ZQ = *XPQ * (V1 - V2).square();
     }
 
+    /// x-only differential addition: sets `R` to x(P + Q) given x(P) and x(Q) as `PointX<Fq>`
+    /// and the normalised x-coordinate of P - Q (X/Z, i.e. Z = 1) as a single `Fq` element.
+    #[inline]
+    fn xadd_aff_add_into(R: &mut PointX<Fq>, xP: &PointX<Fq>, xQ: &PointX<Fq>, xPmQ: &Fq) {
+        R.X = xQ.X; // `R` starts as a copy of x(Q); `xadd_aff` then updates it in place.
+        R.Z = xQ.Z;
+        Self::xadd_aff(xPmQ, &xP.X, &xP.Z, &mut R.X, &mut R.Z);
+    }
+
+    /// Return x(P + Q) given x(P), x(Q) as `PointX<Fq>` and normalised x(P - Q) as an `Fq`.
+    #[inline]
+    fn xdiff_add_add(xP: &PointX<Fq>, xQ: &PointX<Fq>, xPmQ: &Fq) -> PointX<Fq> {
+        let mut R = PointX::INFINITY; // Placeholder only: X and Z are overwritten below.
+        Self::xadd_aff_add_into(&mut R, xP, xQ, xPmQ);
+        R
+    }
+
     /// P3 <- n*P, x-only variant.
     /// Integer n is encoded as unsigned little-endian, with length
     /// nbitlen bits. Bits beyond that length are ignored.
@@ -326,11 +343,23 @@ impl<Fq: FqTrait> Curve<Fq> {
         R[2] = T[s1];
 
         // Compute the difference points for T, R
-        let mut D1 = R[1];
-        let mut D2 = R[2];
+        let D1 = R[1];
+        let D2 = R[2];
         R[2] = Self::xdiff_add(&R[1], &R[2], &B.PQ);
-        let mut F1 = R[2];
-        let mut F2 = B.PQ;
+        let F1 = R[2];
+        let F2 = B.PQ;
+
+        // The cost for the main loop is k doublings and 2*k differential additions.
+        // If we normalise D1, D2, F1 and F2 (one batched inversion), each differential
+        // addition saves one multiplication, i.e. 2*k multiplications in total. As this
+        // function is usually called with scalars of size log(p)/2 > 30, it is worth
+        // normalising the points.
+        let mut inverses: [Fq; 4] = [D1.Z, D2.Z, F1.Z, F2.Z];
+        Fq::batch_invert(&mut inverses);
+        let mut xD1 = D1.X * inverses[0];
+        let mut xD2 = D2.X * inverses[1];
+        let mut xF1 = F1.X * inverses[2];
+        let mut xF2 = F2.X * inverses[3];
 
         // Main ladder loop, compute [a]P + [b]Q
         for i in (0..k).rev() {
@@ -345,10 +374,10 @@ impl<Fq: FqTrait> Curve<Fq> {
             T[0] = self.xdouble(&T[h >> 1]);
             T[1] = R[r2];
             T[2] = R[r2 + 1];
-            PointX::condswap(&mut D1, &mut D2, (r2 as u32).wrapping_neg());
-            T[1] = Self::xdiff_add(&T[1], &T[2], &D1);
-            T[2] = Self::xdiff_add(&R[0], &R[2], &F1);
-            PointX::condswap(&mut F1, &mut F2, ((h & 1) as u32).wrapping_neg());
+            Fq::condswap(&mut xD1, &mut xD2, (r2 as u32).wrapping_neg());
+            T[1] = Self::xdiff_add_add(&T[1], &T[2], &xD1);
+            T[2] = Self::xdiff_add_add(&R[0], &R[2], &xF1);
+            Fq::condswap(&mut xF1, &mut xF2, ((h & 1) as u32).wrapping_neg());
 
             // Update R values from T values.
             R = T;
diff --git a/src/theta/theta_gluing.rs b/src/theta/theta_gluing.rs
@@ -77,7 +77,8 @@ impl<Fq: FqTrait> EllipticProduct<Fq> {
         // We can do a single inversion during action by translation for a cost of
         // 21M + 4*(2S + 11M) + 1I to compute the coefficients, then the coefficients below
         // cost 44M to compute. If we model 1I to be log(p) multiplications then this
-        // is ~100M for level 1 bringing the total to about 209 M.
+        // is 30M for level 1, bringing the total to about 140M. For other implementations,
+        // inversion could well be more expensive (without a binary gcd for inversion in Fp).
         //
         // If we don't do any inversions then four action_by_translation costs 4*(9M + 3S)
         // but then the computation of the coefficients grows. For example, the coefficient