From c16676e3b16af91c7c191d776ed41f81d9de91d3 Mon Sep 17 00:00:00 2001 From: Hanno Becker Date: Mon, 23 Jun 2025 15:23:54 +0100 Subject: [PATCH] ML-KEM: Add AArch64 arithmetic backend Context: The ML-KEM implementation in AWS-LC is imported from mlkem-native. mlkem-native comes in a "C-only" version, but also offers AArch64 and x86_64 backends for (a) arithmetic, and (b) FIPS-202. Currently, only the "C-only" version is imported into AWS-LC. This commit adds a custom AArch64 backend to AWS-LC. The backend is essentially the same as in mlkem-native, but its assembly sources are taken from s2n-bignum and its headers are written from scratch. The constant tables used in the backend are copied from mlkem-native. Compared to extending the mlkem-native->AWS-LC importer to include mlkem-native's AArch64 backend, this approach sticks to s2n-bignum as the sole source of verified assembly. It also provides greater flexibility in maintaining and adjusting the backend, both the assembly and the headers. For example, the assembly may be optimized for Graviton cores in the future, or the dispatch in the metadata files adjusted; the latter will mostly be relevant as we integrate x86_64 assembly, for which we aim to use the same methodology. To avoid a symbol clash with s2n-bignum, the mlkem-native namespace is changed from `mlkem` to `mlkem_native`. s2n-bignum is partially re-imported from the development branch https://github.com/jargh/s2n-bignum-dev/tree/mlkem/, restricting to the ML-KEM related files. 
Signed-off-by: Hanno Becker --- crypto/fipsmodule/CMakeLists.txt | 14 + crypto/fipsmodule/ml_kem/aarch64/README.md | 1 + crypto/fipsmodule/ml_kem/aarch64/constants.c | 668 ++++++++++++++++++ crypto/fipsmodule/ml_kem/aarch64/meta.h | 81 +++ crypto/fipsmodule/ml_kem/ml_kem.c | 35 +- .../fipsmodule/ml_kem/mlkem_native_config.h | 8 +- third_party/s2n-bignum/META.yml | 8 +- .../s2n-bignum-imported/arm/mlkem/Makefile | 39 + .../arm/mlkem/mlkem_basemul_k2.S | 210 ++++++ .../arm/mlkem/mlkem_basemul_k3.S | 264 +++++++ .../arm/mlkem/mlkem_basemul_k4.S | 318 +++++++++ .../arm/mlkem/mlkem_intt.S | 412 +++++++++++ .../arm/mlkem/mlkem_mulcache_compute.S | 67 ++ .../s2n-bignum-imported/arm/mlkem/mlkem_ntt.S | 363 ++++++++++ .../arm/mlkem/mlkem_poly_reduce.S | 104 +++ .../arm/mlkem/mlkem_poly_tobytes.S | 115 +++ .../arm/mlkem/mlkem_poly_tomont.S | 85 +++ .../mlkem/mlkem_rej_uniform_VARIABLE_TIME.S | 208 ++++++ .../s2n-bignum-imported/include/s2n-bignum.h | 40 ++ 19 files changed, 3020 insertions(+), 20 deletions(-) create mode 100644 crypto/fipsmodule/ml_kem/aarch64/README.md create mode 100644 crypto/fipsmodule/ml_kem/aarch64/constants.c create mode 100644 crypto/fipsmodule/ml_kem/aarch64/meta.h create mode 100644 third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/Makefile create mode 100644 third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_basemul_k2.S create mode 100644 third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_basemul_k3.S create mode 100644 third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_basemul_k4.S create mode 100644 third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_intt.S create mode 100644 third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_mulcache_compute.S create mode 100644 third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_ntt.S create mode 100644 third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_poly_reduce.S create mode 100644 
third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_poly_tobytes.S create mode 100644 third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_poly_tomont.S create mode 100644 third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_rej_uniform_VARIABLE_TIME.S diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt index 47b00c7f8f..049122668d 100644 --- a/crypto/fipsmodule/CMakeLists.txt +++ b/crypto/fipsmodule/CMakeLists.txt @@ -291,6 +291,20 @@ if((((ARCH STREQUAL "x86_64") AND NOT MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) OR ${S2N_BIGNUM_DIR}/generic/bignum_copy_row_from_table_16.S ${S2N_BIGNUM_DIR}/generic/bignum_copy_row_from_table_32.S ) + + # ML-KEM core arithmetic + list(APPEND BCM_ASM_SOURCES + ${S2N_BIGNUM_DIR}/mlkem/mlkem_basemul_k2.S + ${S2N_BIGNUM_DIR}/mlkem/mlkem_basemul_k3.S + ${S2N_BIGNUM_DIR}/mlkem/mlkem_basemul_k4.S + ${S2N_BIGNUM_DIR}/mlkem/mlkem_intt.S + ${S2N_BIGNUM_DIR}/mlkem/mlkem_mulcache_compute.S + ${S2N_BIGNUM_DIR}/mlkem/mlkem_ntt.S + ${S2N_BIGNUM_DIR}/mlkem/mlkem_poly_reduce.S + ${S2N_BIGNUM_DIR}/mlkem/mlkem_poly_tobytes.S + ${S2N_BIGNUM_DIR}/mlkem/mlkem_poly_tomont.S + ${S2N_BIGNUM_DIR}/mlkem/mlkem_rej_uniform_VARIABLE_TIME.S) + endif() if(BORINGSSL_PREFIX) diff --git a/crypto/fipsmodule/ml_kem/aarch64/README.md b/crypto/fipsmodule/ml_kem/aarch64/README.md new file mode 100644 index 0000000000..c3b66b1973 --- /dev/null +++ b/crypto/fipsmodule/ml_kem/aarch64/README.md @@ -0,0 +1 @@ +This directory contains an AArch64 arithmetic backend for mlkem-native. The core assembly routines are imported from [s2n-bignum](https://github.com/awslabs/s2n-bignum/). 
\ No newline at end of file diff --git a/crypto/fipsmodule/ml_kem/aarch64/constants.c b/crypto/fipsmodule/ml_kem/aarch64/constants.c new file mode 100644 index 0000000000..53d0098bf6 --- /dev/null +++ b/crypto/fipsmodule/ml_kem/aarch64/constants.c @@ -0,0 +1,668 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../internal.h" + +alignas(16) const int16_t mlk_aarch64_ntt_zetas_layer12345[] = { + -1600, -15749, -749, -7373, -40, -394, -687, -6762, 630, 6201, + -1432, -14095, 848, 8347, 0, 0, 1062, 10453, 296, 2914, + -882, -8682, 0, 0, -1410, -13879, 1339, 13180, 1476, 14529, + 0, 0, 193, 1900, -283, -2786, 56, 551, 0, 0, + 797, 7845, -1089, -10719, 1333, 13121, 0, 0, -543, -5345, + 1426, 14036, -1235, -12156, 0, 0, -69, -679, 535, 5266, + -447, -4400, 0, 0, 569, 5601, -936, -9213, -450, -4429, + 0, 0, -1583, -15582, -1355, -13338, 821, 8081, 0, 0, +}; + +alignas(16) const int16_t mlk_aarch64_ntt_zetas_layer67[] = { + 289, 289, 331, 331, -76, -76, -1573, -1573, 2845, + 2845, 3258, 3258, -748, -748, -15483, -15483, 17, 17, + 583, 583, 1637, 1637, -1041, -1041, 167, 167, 5739, + 5739, 16113, 16113, -10247, -10247, -568, -568, -680, -680, + 723, 723, 1100, 1100, -5591, -5591, -6693, -6693, 7117, + 7117, 10828, 10828, 1197, 1197, -1025, -1025, -1052, -1052, + -1274, -1274, 11782, 11782, -10089, -10089, -10355, -10355, -12540, + -12540, 1409, 1409, -48, -48, 756, 756, -314, -314, + 13869, 13869, -472, -472, 7441, 7441, -3091, -3091, -667, + -667, 233, 233, -1173, -1173, -279, -279, -6565, -6565, + 2293, 2293, -11546, -11546, -2746, -2746, 650, 650, -1352, + -1352, -816, -816, 632, 632, 6398, 6398, -13308, -13308, + -8032, -8032, 6221, 6221, -1626, -1626, -540, -540, -1482, + -1482, 1461, 1461, -16005, -16005, -5315, -5315, -14588, -14588, + 14381, 14381, 1651, 1651, -1540, -1540, 952, 952, -642, + -642, 16251, 16251, -15159, -15159, 9371, 9371, -6319, -6319, + -464, -464, 33, 33, 1320, 
1320, -1414, -1414, -4567, + -4567, 325, 325, 12993, 12993, -13918, -13918, 939, 939, + -892, -892, 733, 733, 268, 268, 9243, 9243, -8780, + -8780, 7215, 7215, 2638, 2638, -1021, -1021, -941, -941, + -992, -992, 641, 641, -10050, -10050, -9262, -9262, -9764, + -9764, 6309, 6309, -1010, -1010, 1435, 1435, 807, 807, + 452, 452, -9942, -9942, 14125, 14125, 7943, 7943, 4449, + 4449, 1584, 1584, -1292, -1292, 375, 375, -1239, -1239, + 15592, 15592, -12717, -12717, 3691, 3691, -12196, -12196, -1031, + -1031, -109, -109, -780, -780, 1645, 1645, -10148, -10148, + -1073, -1073, -7678, -7678, 16192, 16192, 1438, 1438, -461, + -461, 1534, 1534, -927, -927, 14155, 14155, -4538, -4538, + 15099, 15099, -9125, -9125, 1063, 1063, -556, -556, -1230, + -1230, -863, -863, 10463, 10463, -5473, -5473, -12107, -12107, + -8495, -8495, 319, 319, 757, 757, 561, 561, -735, + -735, 3140, 3140, 7451, 7451, 5522, 5522, -7235, -7235, + -682, -682, -712, -712, 1481, 1481, 648, 648, -6713, + -6713, -7008, -7008, 14578, 14578, 6378, 6378, -525, -525, + 403, 403, 1143, 1143, -554, -554, -5168, -5168, 3967, + 3967, 11251, 11251, -5453, -5453, 1092, 1092, 1026, 1026, + -1179, -1179, 886, 886, 10749, 10749, 10099, 10099, -11605, + -11605, 8721, 8721, -855, -855, -219, -219, 1227, 1227, + 910, 910, -8416, -8416, -2156, -2156, 12078, 12078, 8957, + 8957, -1607, -1607, -1455, -1455, -1219, -1219, 885, 885, + -15818, -15818, -14322, -14322, -11999, -11999, 8711, 8711, 1212, + 1212, 1029, 1029, -394, -394, -1175, -1175, 11930, 11930, + 10129, 10129, -3878, -3878, -11566, -11566, +}; + +alignas(16) const int16_t mlk_aarch64_invntt_zetas_layer12345[] = { + 1583, 15582, -821, -8081, 1355, 13338, 0, 0, -569, -5601, + 450, 4429, 936, 9213, 0, 0, 69, 679, 447, 4400, + -535, -5266, 0, 0, 543, 5345, 1235, 12156, -1426, -14036, + 0, 0, -797, -7845, -1333, -13121, 1089, 10719, 0, 0, + -193, -1900, -56, -551, 283, 2786, 0, 0, 1410, 13879, + -1476, -14529, -1339, -13180, 0, 0, -1062, -10453, 882, 8682, + -296, -2914, 
0, 0, 1600, 15749, 40, 394, 749, 7373, + -848, -8347, 1432, 14095, -630, -6201, 687, 6762, 0, 0, +}; + +alignas(16) const int16_t mlk_aarch64_invntt_zetas_layer67[] = { + -910, -910, -1227, -1227, 219, 219, 855, 855, -8957, + -8957, -12078, -12078, 2156, 2156, 8416, 8416, 1175, 1175, + 394, 394, -1029, -1029, -1212, -1212, 11566, 11566, 3878, + 3878, -10129, -10129, -11930, -11930, -885, -885, 1219, 1219, + 1455, 1455, 1607, 1607, -8711, -8711, 11999, 11999, 14322, + 14322, 15818, 15818, -648, -648, -1481, -1481, 712, 712, + 682, 682, -6378, -6378, -14578, -14578, 7008, 7008, 6713, + 6713, -886, -886, 1179, 1179, -1026, -1026, -1092, -1092, + -8721, -8721, 11605, 11605, -10099, -10099, -10749, -10749, 554, + 554, -1143, -1143, -403, -403, 525, 525, 5453, 5453, + -11251, -11251, -3967, -3967, 5168, 5168, 927, 927, -1534, + -1534, 461, 461, -1438, -1438, 9125, 9125, -15099, -15099, + 4538, 4538, -14155, -14155, 735, 735, -561, -561, -757, + -757, -319, -319, 7235, 7235, -5522, -5522, -7451, -7451, + -3140, -3140, 863, 863, 1230, 1230, 556, 556, -1063, + -1063, 8495, 8495, 12107, 12107, 5473, 5473, -10463, -10463, + -452, -452, -807, -807, -1435, -1435, 1010, 1010, -4449, + -4449, -7943, -7943, -14125, -14125, 9942, 9942, -1645, -1645, + 780, 780, 109, 109, 1031, 1031, -16192, -16192, 7678, + 7678, 1073, 1073, 10148, 10148, 1239, 1239, -375, -375, + 1292, 1292, -1584, -1584, 12196, 12196, -3691, -3691, 12717, + 12717, -15592, -15592, 1414, 1414, -1320, -1320, -33, -33, + 464, 464, 13918, 13918, -12993, -12993, -325, -325, 4567, + 4567, -641, -641, 992, 992, 941, 941, 1021, 1021, + -6309, -6309, 9764, 9764, 9262, 9262, 10050, 10050, -268, + -268, -733, -733, 892, 892, -939, -939, -2638, -2638, + -7215, -7215, 8780, 8780, -9243, -9243, -632, -632, 816, + 816, 1352, 1352, -650, -650, -6221, -6221, 8032, 8032, + 13308, 13308, -6398, -6398, 642, 642, -952, -952, 1540, + 1540, -1651, -1651, 6319, 6319, -9371, -9371, 15159, 15159, + -16251, -16251, -1461, -1461, 1482, 1482, 
540, 540, 1626, + 1626, -14381, -14381, 14588, 14588, 5315, 5315, 16005, 16005, + 1274, 1274, 1052, 1052, 1025, 1025, -1197, -1197, 12540, + 12540, 10355, 10355, 10089, 10089, -11782, -11782, 279, 279, + 1173, 1173, -233, -233, 667, 667, 2746, 2746, 11546, + 11546, -2293, -2293, 6565, 6565, 314, 314, -756, -756, + 48, 48, -1409, -1409, 3091, 3091, -7441, -7441, 472, + 472, -13869, -13869, 1573, 1573, 76, 76, -331, -331, + -289, -289, 15483, 15483, 748, 748, -3258, -3258, -2845, + -2845, -1100, -1100, -723, -723, 680, 680, 568, 568, + -10828, -10828, -7117, -7117, 6693, 6693, 5591, 5591, 1041, + 1041, -1637, -1637, -583, -583, -17, -17, 10247, 10247, + -16113, -16113, -5739, -5739, -167, -167, +}; + +alignas(16) const int16_t mlk_aarch64_zetas_mulcache_native[] = { + 17, -17, -568, 568, 583, -583, -680, 680, 1637, -1637, 723, + -723, -1041, 1041, 1100, -1100, 1409, -1409, -667, 667, -48, 48, + 233, -233, 756, -756, -1173, 1173, -314, 314, -279, 279, -1626, + 1626, 1651, -1651, -540, 540, -1540, 1540, -1482, 1482, 952, -952, + 1461, -1461, -642, 642, 939, -939, -1021, 1021, -892, 892, -941, + 941, 733, -733, -992, 992, 268, -268, 641, -641, 1584, -1584, + -1031, 1031, -1292, 1292, -109, 109, 375, -375, -780, 780, -1239, + 1239, 1645, -1645, 1063, -1063, 319, -319, -556, 556, 757, -757, + -1230, 1230, 561, -561, -863, 863, -735, 735, -525, 525, 1092, + -1092, 403, -403, 1026, -1026, 1143, -1143, -1179, 1179, -554, 554, + 886, -886, -1607, 1607, 1212, -1212, -1455, 1455, 1029, -1029, -1219, + 1219, -394, 394, 885, -885, -1175, 1175, +}; + +alignas(16) const int16_t mlk_aarch64_zetas_mulcache_twisted_native[] = { + 167, -167, -5591, 5591, 5739, -5739, -6693, 6693, 16113, + -16113, 7117, -7117, -10247, 10247, 10828, -10828, 13869, -13869, + -6565, 6565, -472, 472, 2293, -2293, 7441, -7441, -11546, + 11546, -3091, 3091, -2746, 2746, -16005, 16005, 16251, -16251, + -5315, 5315, -15159, 15159, -14588, 14588, 9371, -9371, 14381, + -14381, -6319, 6319, 9243, -9243, -10050, 
10050, -8780, 8780, + -9262, 9262, 7215, -7215, -9764, 9764, 2638, -2638, 6309, + -6309, 15592, -15592, -10148, 10148, -12717, 12717, -1073, 1073, + 3691, -3691, -7678, 7678, -12196, 12196, 16192, -16192, 10463, + -10463, 3140, -3140, -5473, 5473, 7451, -7451, -12107, 12107, + 5522, -5522, -8495, 8495, -7235, 7235, -5168, 5168, 10749, + -10749, 3967, -3967, 10099, -10099, 11251, -11251, -11605, 11605, + -5453, 5453, 8721, -8721, -15818, 15818, 11930, -11930, -14322, + 14322, 10129, -10129, -11999, 11999, -3878, 3878, 8711, -8711, + -11566, 11566, +}; + +alignas(16) const uint8_t mlk_rej_uniform_table[] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 0 */, + 0, 1, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 1 */, + 2, 3, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 2 */, + 0, 1, 2, 3, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 3 */, + 4, 5, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 4 */, + 0, 1, 4, 5, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 5 */, + 2, 3, 4, 5, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 6 */, + 0, 1, 2, 3, 4, 5, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 7 */, + 6, 7, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 8 */, + 0, 1, 6, 7, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 9 */, + 2, 3, 6, 7, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 10 */, + 0, 1, 2, 3, 6, 7, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 11 */, + 4, 5, 6, 7, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 12 */, + 0, 1, 4, 5, 6, 7, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 13 */, + 2, 3, 4, 5, 6, 7, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 14 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 255, 255, 255, 255, 255, 255, 255, 255 /* 15 */, + 8, 9, 255, 255, 255, 255, 255, 
255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 16 */, + 0, 1, 8, 9, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 17 */, + 2, 3, 8, 9, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 18 */, + 0, 1, 2, 3, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 19 */, + 4, 5, 8, 9, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 20 */, + 0, 1, 4, 5, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 21 */, + 2, 3, 4, 5, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 22 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 255, 255, 255, 255, 255, 255, 255, 255 /* 23 */, + 6, 7, 8, 9, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 24 */, + 0, 1, 6, 7, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 25 */, + 2, 3, 6, 7, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 26 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 255, 255, 255, 255, 255, 255, 255, 255 /* 27 */, + 4, 5, 6, 7, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 28 */, + 0, 1, 4, 5, 6, 7, 8, 9, + 255, 255, 255, 255, 255, 255, 255, 255 /* 29 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 255, 255, 255, 255, 255, 255, 255, 255 /* 30 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 255, 255, 255, 255, 255, 255 /* 31 */, + 10, 11, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 32 */, + 0, 1, 10, 11, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 33 */, + 2, 3, 10, 11, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 34 */, + 0, 1, 2, 3, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 35 */, + 4, 5, 10, 11, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 36 */, + 0, 1, 4, 5, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 37 */, + 2, 3, 4, 5, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 38 */, + 0, 1, 2, 3, 4, 5, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 39 */, + 6, 7, 10, 11, 255, 255, 255, 255, + 255, 255, 255, 
255, 255, 255, 255, 255 /* 40 */, + 0, 1, 6, 7, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 41 */, + 2, 3, 6, 7, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 42 */, + 0, 1, 2, 3, 6, 7, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 43 */, + 4, 5, 6, 7, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 44 */, + 0, 1, 4, 5, 6, 7, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 45 */, + 2, 3, 4, 5, 6, 7, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 46 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 10, 11, 255, 255, 255, 255, 255, 255 /* 47 */, + 8, 9, 10, 11, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 48 */, + 0, 1, 8, 9, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 49 */, + 2, 3, 8, 9, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 50 */, + 0, 1, 2, 3, 8, 9, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 51 */, + 4, 5, 8, 9, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 52 */, + 0, 1, 4, 5, 8, 9, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 53 */, + 2, 3, 4, 5, 8, 9, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 54 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 10, 11, 255, 255, 255, 255, 255, 255 /* 55 */, + 6, 7, 8, 9, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 56 */, + 0, 1, 6, 7, 8, 9, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 57 */, + 2, 3, 6, 7, 8, 9, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 58 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 10, 11, 255, 255, 255, 255, 255, 255 /* 59 */, + 4, 5, 6, 7, 8, 9, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 60 */, + 0, 1, 4, 5, 6, 7, 8, 9, + 10, 11, 255, 255, 255, 255, 255, 255 /* 61 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 255, 255, 255, 255, 255, 255 /* 62 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 255, 255, 255, 255 /* 63 */, + 12, 13, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 64 */, + 0, 1, 12, 13, 255, 255, 255, 255, + 255, 255, 
255, 255, 255, 255, 255, 255 /* 65 */, + 2, 3, 12, 13, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 66 */, + 0, 1, 2, 3, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 67 */, + 4, 5, 12, 13, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 68 */, + 0, 1, 4, 5, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 69 */, + 2, 3, 4, 5, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 70 */, + 0, 1, 2, 3, 4, 5, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 71 */, + 6, 7, 12, 13, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 72 */, + 0, 1, 6, 7, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 73 */, + 2, 3, 6, 7, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 74 */, + 0, 1, 2, 3, 6, 7, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 75 */, + 4, 5, 6, 7, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 76 */, + 0, 1, 4, 5, 6, 7, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 77 */, + 2, 3, 4, 5, 6, 7, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 78 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 12, 13, 255, 255, 255, 255, 255, 255 /* 79 */, + 8, 9, 12, 13, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 80 */, + 0, 1, 8, 9, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 81 */, + 2, 3, 8, 9, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 82 */, + 0, 1, 2, 3, 8, 9, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 83 */, + 4, 5, 8, 9, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 84 */, + 0, 1, 4, 5, 8, 9, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 85 */, + 2, 3, 4, 5, 8, 9, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 86 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 12, 13, 255, 255, 255, 255, 255, 255 /* 87 */, + 6, 7, 8, 9, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 88 */, + 0, 1, 6, 7, 8, 9, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 89 */, 
+ 2, 3, 6, 7, 8, 9, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 90 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 12, 13, 255, 255, 255, 255, 255, 255 /* 91 */, + 4, 5, 6, 7, 8, 9, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 92 */, + 0, 1, 4, 5, 6, 7, 8, 9, + 12, 13, 255, 255, 255, 255, 255, 255 /* 93 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 12, 13, 255, 255, 255, 255, 255, 255 /* 94 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 12, 13, 255, 255, 255, 255 /* 95 */, + 10, 11, 12, 13, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 96 */, + 0, 1, 10, 11, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 97 */, + 2, 3, 10, 11, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 98 */, + 0, 1, 2, 3, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 99 */, + 4, 5, 10, 11, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 100 */, + 0, 1, 4, 5, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 101 */, + 2, 3, 4, 5, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 102 */, + 0, 1, 2, 3, 4, 5, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 103 */, + 6, 7, 10, 11, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 104 */, + 0, 1, 6, 7, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 105 */, + 2, 3, 6, 7, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 106 */, + 0, 1, 2, 3, 6, 7, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 107 */, + 4, 5, 6, 7, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 108 */, + 0, 1, 4, 5, 6, 7, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 109 */, + 2, 3, 4, 5, 6, 7, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 110 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 10, 11, 12, 13, 255, 255, 255, 255 /* 111 */, + 8, 9, 10, 11, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 112 */, + 0, 1, 8, 9, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 113 */, + 2, 3, 8, 9, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 
255 /* 114 */, + 0, 1, 2, 3, 8, 9, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 115 */, + 4, 5, 8, 9, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 116 */, + 0, 1, 4, 5, 8, 9, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 117 */, + 2, 3, 4, 5, 8, 9, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 118 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 10, 11, 12, 13, 255, 255, 255, 255 /* 119 */, + 6, 7, 8, 9, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 120 */, + 0, 1, 6, 7, 8, 9, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 121 */, + 2, 3, 6, 7, 8, 9, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 122 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 10, 11, 12, 13, 255, 255, 255, 255 /* 123 */, + 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 124 */, + 0, 1, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 255, 255, 255, 255 /* 125 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 255, 255, 255, 255 /* 126 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 255, 255 /* 127 */, + 14, 15, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 128 */, + 0, 1, 14, 15, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 129 */, + 2, 3, 14, 15, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 130 */, + 0, 1, 2, 3, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 131 */, + 4, 5, 14, 15, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 132 */, + 0, 1, 4, 5, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 133 */, + 2, 3, 4, 5, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 134 */, + 0, 1, 2, 3, 4, 5, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 135 */, + 6, 7, 14, 15, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 136 */, + 0, 1, 6, 7, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 137 */, + 2, 3, 6, 7, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 138 */, + 0, 1, 2, 3, 6, 7, 14, 15, + 255, 255, 
255, 255, 255, 255, 255, 255 /* 139 */, + 4, 5, 6, 7, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 140 */, + 0, 1, 4, 5, 6, 7, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 141 */, + 2, 3, 4, 5, 6, 7, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 142 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 14, 15, 255, 255, 255, 255, 255, 255 /* 143 */, + 8, 9, 14, 15, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 144 */, + 0, 1, 8, 9, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 145 */, + 2, 3, 8, 9, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 146 */, + 0, 1, 2, 3, 8, 9, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 147 */, + 4, 5, 8, 9, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 148 */, + 0, 1, 4, 5, 8, 9, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 149 */, + 2, 3, 4, 5, 8, 9, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 150 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 14, 15, 255, 255, 255, 255, 255, 255 /* 151 */, + 6, 7, 8, 9, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 152 */, + 0, 1, 6, 7, 8, 9, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 153 */, + 2, 3, 6, 7, 8, 9, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 154 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 14, 15, 255, 255, 255, 255, 255, 255 /* 155 */, + 4, 5, 6, 7, 8, 9, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 156 */, + 0, 1, 4, 5, 6, 7, 8, 9, + 14, 15, 255, 255, 255, 255, 255, 255 /* 157 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 14, 15, 255, 255, 255, 255, 255, 255 /* 158 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 14, 15, 255, 255, 255, 255 /* 159 */, + 10, 11, 14, 15, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 160 */, + 0, 1, 10, 11, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 161 */, + 2, 3, 10, 11, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 162 */, + 0, 1, 2, 3, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 163 */, + 4, 5, 10, 11, 
14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 164 */, + 0, 1, 4, 5, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 165 */, + 2, 3, 4, 5, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 166 */, + 0, 1, 2, 3, 4, 5, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 167 */, + 6, 7, 10, 11, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 168 */, + 0, 1, 6, 7, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 169 */, + 2, 3, 6, 7, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 170 */, + 0, 1, 2, 3, 6, 7, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 171 */, + 4, 5, 6, 7, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 172 */, + 0, 1, 4, 5, 6, 7, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 173 */, + 2, 3, 4, 5, 6, 7, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 174 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 10, 11, 14, 15, 255, 255, 255, 255 /* 175 */, + 8, 9, 10, 11, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 176 */, + 0, 1, 8, 9, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 177 */, + 2, 3, 8, 9, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 178 */, + 0, 1, 2, 3, 8, 9, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 179 */, + 4, 5, 8, 9, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 180 */, + 0, 1, 4, 5, 8, 9, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 181 */, + 2, 3, 4, 5, 8, 9, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 182 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 10, 11, 14, 15, 255, 255, 255, 255 /* 183 */, + 6, 7, 8, 9, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 184 */, + 0, 1, 6, 7, 8, 9, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 185 */, + 2, 3, 6, 7, 8, 9, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 186 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 10, 11, 14, 15, 255, 255, 255, 255 /* 187 */, + 4, 5, 6, 7, 8, 9, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 188 */, + 0, 1, 4, 5, 6, 
7, 8, 9, + 10, 11, 14, 15, 255, 255, 255, 255 /* 189 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 14, 15, 255, 255, 255, 255 /* 190 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 14, 15, 255, 255 /* 191 */, + 12, 13, 14, 15, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 192 */, + 0, 1, 12, 13, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 193 */, + 2, 3, 12, 13, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 194 */, + 0, 1, 2, 3, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 195 */, + 4, 5, 12, 13, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 196 */, + 0, 1, 4, 5, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 197 */, + 2, 3, 4, 5, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 198 */, + 0, 1, 2, 3, 4, 5, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 199 */, + 6, 7, 12, 13, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 200 */, + 0, 1, 6, 7, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 201 */, + 2, 3, 6, 7, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 202 */, + 0, 1, 2, 3, 6, 7, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 203 */, + 4, 5, 6, 7, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 204 */, + 0, 1, 4, 5, 6, 7, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 205 */, + 2, 3, 4, 5, 6, 7, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 206 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 12, 13, 14, 15, 255, 255, 255, 255 /* 207 */, + 8, 9, 12, 13, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 208 */, + 0, 1, 8, 9, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 209 */, + 2, 3, 8, 9, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 210 */, + 0, 1, 2, 3, 8, 9, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 211 */, + 4, 5, 8, 9, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 212 */, + 0, 1, 4, 5, 8, 9, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 213 
*/, + 2, 3, 4, 5, 8, 9, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 214 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 12, 13, 14, 15, 255, 255, 255, 255 /* 215 */, + 6, 7, 8, 9, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 216 */, + 0, 1, 6, 7, 8, 9, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 217 */, + 2, 3, 6, 7, 8, 9, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 218 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 12, 13, 14, 15, 255, 255, 255, 255 /* 219 */, + 4, 5, 6, 7, 8, 9, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 220 */, + 0, 1, 4, 5, 6, 7, 8, 9, + 12, 13, 14, 15, 255, 255, 255, 255 /* 221 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 12, 13, 14, 15, 255, 255, 255, 255 /* 222 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 12, 13, 14, 15, 255, 255 /* 223 */, + 10, 11, 12, 13, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 224 */, + 0, 1, 10, 11, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 225 */, + 2, 3, 10, 11, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 226 */, + 0, 1, 2, 3, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 227 */, + 4, 5, 10, 11, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 228 */, + 0, 1, 4, 5, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 229 */, + 2, 3, 4, 5, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 230 */, + 0, 1, 2, 3, 4, 5, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 231 */, + 6, 7, 10, 11, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 232 */, + 0, 1, 6, 7, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 233 */, + 2, 3, 6, 7, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 234 */, + 0, 1, 2, 3, 6, 7, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 235 */, + 4, 5, 6, 7, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 236 */, + 0, 1, 4, 5, 6, 7, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 237 */, + 2, 3, 4, 5, 6, 7, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 238 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 
10, 11, 12, 13, 14, 15, 255, 255 /* 239 */, + 8, 9, 10, 11, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 240 */, + 0, 1, 8, 9, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 241 */, + 2, 3, 8, 9, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 242 */, + 0, 1, 2, 3, 8, 9, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 243 */, + 4, 5, 8, 9, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 244 */, + 0, 1, 4, 5, 8, 9, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 245 */, + 2, 3, 4, 5, 8, 9, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 246 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 10, 11, 12, 13, 14, 15, 255, 255 /* 247 */, + 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 248 */, + 0, 1, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 249 */, + 2, 3, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 250 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 255, 255 /* 251 */, + 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 252 */, + 0, 1, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 255, 255 /* 253 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 255, 255 /* 254 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15 /* 255 */, +}; diff --git a/crypto/fipsmodule/ml_kem/aarch64/meta.h b/crypto/fipsmodule/ml_kem/aarch64/meta.h new file mode 100644 index 0000000000..0da4388fc4 --- /dev/null +++ b/crypto/fipsmodule/ml_kem/aarch64/meta.h @@ -0,0 +1,81 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC + +#ifndef ML_KEM_AARCH64_BACKEND_H +#define ML_KEM_AARCH64_BACKEND_H + +#include "../mlkem/common.h" +/* Operations served by this backend; every MLK_USE_NATIVE_* flag below has an mlk_*_native wrapper in this header. */ +#define MLK_USE_NATIVE_NTT +#define MLK_USE_NATIVE_INTT +#define MLK_USE_NATIVE_POLY_REDUCE +#define MLK_USE_NATIVE_POLY_TOMONT +#define MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE +#define MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED +#define MLK_USE_NATIVE_POLY_TOBYTES +#define MLK_USE_NATIVE_REJ_UNIFORM +/* Constant tables the wrappers below pass to the assembly routines; only declared here. */ +extern const int16_t mlk_aarch64_ntt_zetas_layer12345[]; +extern const int16_t mlk_aarch64_ntt_zetas_layer67[]; +extern const int16_t mlk_aarch64_invntt_zetas_layer12345[]; +extern const int16_t mlk_aarch64_invntt_zetas_layer67[]; +extern const uint8_t mlk_rej_uniform_table[]; +extern const int16_t mlk_aarch64_zetas_mulcache_native[]; +extern const int16_t mlk_aarch64_zetas_mulcache_twisted_native[]; + +#include "../../../../third_party/s2n-bignum/s2n-bignum_aws-lc.h" +/* Runs mlkem_ntt on data, supplying the layer 1-5 and layer 6-7 forward-NTT zeta tables. */ +static MLK_INLINE void mlk_ntt_native(int16_t data[MLKEM_N]) { + mlkem_ntt(data, mlk_aarch64_ntt_zetas_layer12345, mlk_aarch64_ntt_zetas_layer67); +} +/* Runs mlkem_intt on data, supplying the layer 1-5 and layer 6-7 inverse-NTT zeta tables. */ +static MLK_INLINE void mlk_intt_native(int16_t data[MLKEM_N]) { + mlkem_intt(data, mlk_aarch64_invntt_zetas_layer12345, mlk_aarch64_invntt_zetas_layer67); +} +/* Thin forwarder to mlkem_poly_reduce; data is the only buffer passed. */ +static MLK_INLINE void mlk_poly_reduce_native(int16_t data[MLKEM_N]) { + mlkem_poly_reduce(data); +} +/* Thin forwarder to mlkem_poly_tomont; data is the only buffer passed. */ +static MLK_INLINE void mlk_poly_tomont_native(int16_t data[MLKEM_N]) { + mlkem_poly_tomont(data); +} +/* Passes x (MLKEM_N / 2 entries) and read-only y to mlkem_mulcache_compute together with the two mulcache zeta tables. */ +static MLK_INLINE void mlk_poly_mulcache_compute_native(int16_t x[MLKEM_N / 2], const int16_t y[MLKEM_N]) { + mlkem_mulcache_compute(x, y, mlk_aarch64_zetas_mulcache_native, + mlk_aarch64_zetas_mulcache_twisted_native); +} +/* k = 2 case: forwards r, a, b and b_cache unchanged to mlkem_basemul_k2. */ +static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k2_native( + int16_t r[MLKEM_N], const int16_t a[2 * MLKEM_N], + const int16_t b[2 * MLKEM_N], const int16_t b_cache[2 * (MLKEM_N / 2)]) { + mlkem_basemul_k2(r, a, b, b_cache); +} +/* k = 3 case: forwards r, a, b and b_cache unchanged to mlkem_basemul_k3. */ +static MLK_INLINE void 
mlk_polyvec_basemul_acc_montgomery_cached_k3_native( + int16_t r[MLKEM_N], const int16_t a[3 * MLKEM_N], + const int16_t b[3 * MLKEM_N], const int16_t b_cache[3 * (MLKEM_N / 2)]) { + mlkem_basemul_k3(r, a, b, b_cache); +} +/* k = 4 case: forwards r, a, b and b_cache unchanged to mlkem_basemul_k4. */ +static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k4_native( + int16_t r[MLKEM_N], const int16_t a[4 * MLKEM_N], + const int16_t b[4 * MLKEM_N], const int16_t b_cache[4 * (MLKEM_N / 2)]) { + mlkem_basemul_k4(r, a, b, b_cache); +} +/* Forwards to mlkem_poly_tobytes with output bytes r and input coefficients a. */ +static MLK_INLINE void mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], + const int16_t a[MLKEM_N]) { + mlkem_poly_tobytes(r, a); +} +/* Returns -1 without calling the assembly unless len == MLKEM_N and buflen is a multiple of 24; otherwise returns mlkem_rej_uniform_VARIABLE_TIME's result cast to int. */ +static MLK_INLINE int mlk_rej_uniform_native(int16_t *r, unsigned len, + const uint8_t *buf, + unsigned buflen) { + if (len != MLKEM_N || buflen % 24 != 0) { + return -1; + } + return (int) mlkem_rej_uniform_VARIABLE_TIME(r, buf, buflen, mlk_rej_uniform_table); +} + +#endif /* ML_KEM_AARCH64_BACKEND_H */ diff --git a/crypto/fipsmodule/ml_kem/ml_kem.c b/crypto/fipsmodule/ml_kem/ml_kem.c index e2227e0465..ce67d700a2 100644 --- a/crypto/fipsmodule/ml_kem/ml_kem.c +++ b/crypto/fipsmodule/ml_kem/ml_kem.c @@ -26,6 +26,11 @@ #include "./ml_kem.h" +// AArch64 backend constants; same condition as MLK_CONFIG_USE_NATIVE_BACKEND_ARITH in mlkem_native_config.h +#if defined(OPENSSL_AARCH64) && !defined(OPENSSL_NO_ASM) && (defined(OPENSSL_LINUX) || defined(OPENSSL_APPLE)) +#include "aarch64/constants.c" +#endif + typedef struct { uint8_t *buffer; size_t *length; @@ -92,7 +97,7 @@ int ml_kem_512_keypair_deterministic_no_self_test(uint8_t *public_key /* OUT */ if (!check_buffer(pkey) || !check_buffer(skey)) { return 1; } - const int res = mlkem512_keypair_derand(pkey.buffer, skey.buffer, seed); + const int res = mlkem_native512_keypair_derand(pkey.buffer, skey.buffer, seed); #if defined(AWSLC_FIPS) /* PCT failure is the only failure condition for key generation. 
*/ if (res != 0) { @@ -110,7 +115,7 @@ int ml_kem_512_keypair(uint8_t *public_key /* OUT */, size_t *secret_len /* IN_OUT */) { output_buffer pkey = {public_key, public_len, MLKEM512_PUBLIC_KEY_BYTES}; output_buffer skey = {secret_key, secret_len, MLKEM512_SECRET_KEY_BYTES}; - return ml_kem_common_keypair(mlkem512_keypair, pkey, skey); + return ml_kem_common_keypair(mlkem_native512_keypair, pkey, skey); } int ml_kem_512_encapsulate_deterministic(uint8_t *ciphertext /* OUT */, @@ -131,7 +136,7 @@ int ml_kem_512_encapsulate_deterministic_no_self_test(uint8_t *ciphertext const uint8_t *seed /* IN */) { output_buffer ctext = {ciphertext, ciphertext_len, MLKEM512_CIPHERTEXT_BYTES}; output_buffer ss = {shared_secret, shared_secret_len, MLKEM512_SHARED_SECRET_LEN}; - return ml_kem_common_encapsulate_deterministic(mlkem512_enc_derand, ctext, ss, public_key, seed); + return ml_kem_common_encapsulate_deterministic(mlkem_native512_enc_derand, ctext, ss, public_key, seed); } int ml_kem_512_encapsulate(uint8_t *ciphertext /* OUT */, @@ -141,7 +146,7 @@ int ml_kem_512_encapsulate(uint8_t *ciphertext /* OUT */, const uint8_t *public_key /* IN */) { output_buffer ctext = {ciphertext, ciphertext_len, MLKEM512_CIPHERTEXT_BYTES}; output_buffer ss = {shared_secret, shared_secret_len, MLKEM512_SHARED_SECRET_LEN}; - return ml_kem_common_encapsulate(mlkem512_enc, ctext, ss, public_key); + return ml_kem_common_encapsulate(mlkem_native512_enc, ctext, ss, public_key); } int ml_kem_512_decapsulate(uint8_t *shared_secret /* OUT */, @@ -157,7 +162,7 @@ int ml_kem_512_decapsulate_no_self_test(uint8_t *shared_secret /* OUT */, const uint8_t *ciphertext /* IN */, const uint8_t *secret_key /* IN */) { output_buffer ss = {shared_secret, shared_secret_len, MLKEM512_SHARED_SECRET_LEN}; - return ml_kem_common_decapsulate(mlkem512_dec, ss, ciphertext, secret_key); + return ml_kem_common_decapsulate(mlkem_native512_dec, ss, ciphertext, secret_key); } @@ -181,7 +186,7 @@ int 
ml_kem_768_keypair_deterministic_no_self_test(uint8_t *public_key /* OUT */, if (!check_buffer(pkey) || !check_buffer(skey)) { return 1; } - const int res = mlkem768_keypair_derand(pkey.buffer, skey.buffer, seed); + const int res = mlkem_native768_keypair_derand(pkey.buffer, skey.buffer, seed); #if defined(AWSLC_FIPS) /* PCT failure is the only failure condition for key generation. */ if (res != 0) { @@ -199,7 +204,7 @@ int ml_kem_768_keypair(uint8_t *public_key /* OUT */, size_t *secret_len /* IN_OUT */) { output_buffer pkey = {public_key, public_len, MLKEM768_PUBLIC_KEY_BYTES}; output_buffer skey = {secret_key, secret_len, MLKEM768_SECRET_KEY_BYTES}; - return ml_kem_common_keypair(mlkem768_keypair, pkey, skey); + return ml_kem_common_keypair(mlkem_native768_keypair, pkey, skey); } int ml_kem_768_encapsulate_deterministic(uint8_t *ciphertext /* OUT */, @@ -220,7 +225,7 @@ int ml_kem_768_encapsulate_deterministic_no_self_test(uint8_t *ciphertext const uint8_t *seed /* IN */) { output_buffer ctext = {ciphertext, ciphertext_len, MLKEM768_CIPHERTEXT_BYTES}; output_buffer ss = {shared_secret, shared_secret_len, MLKEM768_SHARED_SECRET_LEN}; - return ml_kem_common_encapsulate_deterministic(mlkem768_enc_derand, ctext, ss, public_key, seed); + return ml_kem_common_encapsulate_deterministic(mlkem_native768_enc_derand, ctext, ss, public_key, seed); } int ml_kem_768_encapsulate(uint8_t *ciphertext /* OUT */, @@ -230,7 +235,7 @@ int ml_kem_768_encapsulate(uint8_t *ciphertext /* OUT */, const uint8_t *public_key /* IN */) { output_buffer ctext = {ciphertext, ciphertext_len, MLKEM768_CIPHERTEXT_BYTES}; output_buffer ss = {shared_secret, shared_secret_len, MLKEM768_SHARED_SECRET_LEN}; - return ml_kem_common_encapsulate(mlkem768_enc, ctext, ss, public_key); + return ml_kem_common_encapsulate(mlkem_native768_enc, ctext, ss, public_key); } int ml_kem_768_decapsulate(uint8_t *shared_secret /* OUT */, @@ -246,7 +251,7 @@ int ml_kem_768_decapsulate_no_self_test(uint8_t *shared_secret 
/* OUT */, const uint8_t *ciphertext /* IN */, const uint8_t *secret_key /* IN */) { output_buffer ss = {shared_secret, shared_secret_len, MLKEM768_SHARED_SECRET_LEN}; - return ml_kem_common_decapsulate(mlkem768_dec, ss, ciphertext, secret_key); + return ml_kem_common_decapsulate(mlkem_native768_dec, ss, ciphertext, secret_key); } int ml_kem_1024_keypair_deterministic(uint8_t *public_key /* OUT */, @@ -268,7 +273,7 @@ int ml_kem_1024_keypair_deterministic_no_self_test(uint8_t *public_key /* OUT */ if (!check_buffer(pkey) || !check_buffer(skey)) { return 1; } - const int res = mlkem1024_keypair_derand(pkey.buffer, skey.buffer, seed); + const int res = mlkem_native1024_keypair_derand(pkey.buffer, skey.buffer, seed); #if defined(AWSLC_FIPS) /* PCT failure is the only failure condition for key generation. */ if (res != 0) { @@ -286,7 +291,7 @@ int ml_kem_1024_keypair(uint8_t *public_key /* OUT */, size_t *secret_len /* IN_OUT */) { output_buffer pkey = {public_key, public_len, MLKEM1024_PUBLIC_KEY_BYTES}; output_buffer skey = {secret_key, secret_len, MLKEM1024_SECRET_KEY_BYTES}; - return ml_kem_common_keypair(mlkem1024_keypair, pkey, skey); + return ml_kem_common_keypair(mlkem_native1024_keypair, pkey, skey); } int ml_kem_1024_encapsulate_deterministic(uint8_t *ciphertext /* OUT */, @@ -307,7 +312,7 @@ int ml_kem_1024_encapsulate_deterministic_no_self_test(uint8_t *ciphertext const uint8_t *seed /* IN */) { output_buffer ctext = {ciphertext, ciphertext_len, MLKEM1024_CIPHERTEXT_BYTES}; output_buffer ss = {shared_secret, shared_secret_len, MLKEM1024_SHARED_SECRET_LEN}; - return ml_kem_common_encapsulate_deterministic(mlkem1024_enc_derand, ctext, ss, public_key, seed); + return ml_kem_common_encapsulate_deterministic(mlkem_native1024_enc_derand, ctext, ss, public_key, seed); } int ml_kem_1024_encapsulate(uint8_t *ciphertext /* OUT */, @@ -317,7 +322,7 @@ int ml_kem_1024_encapsulate(uint8_t *ciphertext /* OUT */, const uint8_t *public_key /* IN */) { output_buffer ctext = 
{ciphertext, ciphertext_len, MLKEM1024_CIPHERTEXT_BYTES}; output_buffer ss = {shared_secret, shared_secret_len, MLKEM1024_SHARED_SECRET_LEN}; - return ml_kem_common_encapsulate(mlkem1024_enc, ctext, ss, public_key); + return ml_kem_common_encapsulate(mlkem_native1024_enc, ctext, ss, public_key); } int ml_kem_1024_decapsulate(uint8_t *shared_secret /* OUT */, @@ -333,7 +338,7 @@ int ml_kem_1024_decapsulate_no_self_test(uint8_t *shared_secret /* OUT */, const uint8_t *ciphertext /* IN */, const uint8_t *secret_key /* IN */) { output_buffer ss = {shared_secret, shared_secret_len, MLKEM1024_SHARED_SECRET_LEN}; - return ml_kem_common_decapsulate(mlkem1024_dec, ss, ciphertext, secret_key); + return ml_kem_common_decapsulate(mlkem_native1024_dec, ss, ciphertext, secret_key); } int ml_kem_common_keypair(int (*keypair)(uint8_t * public_key, uint8_t *secret_key), diff --git a/crypto/fipsmodule/ml_kem/mlkem_native_config.h b/crypto/fipsmodule/ml_kem/mlkem_native_config.h index aa4f24f721..19e4541293 100644 --- a/crypto/fipsmodule/ml_kem/mlkem_native_config.h +++ b/crypto/fipsmodule/ml_kem/mlkem_native_config.h @@ -9,7 +9,7 @@ -// Namespacing: All symbols are of the form mlkem*. Level-specific -// symbols are further prefixed with their security level, e.g. -// mlkem512*, mlkem768*, mlkem1024*. -#define MLK_CONFIG_NAMESPACE_PREFIX mlkem +// Namespacing: All symbols are of the form mlkem_native*. Level-specific +// symbols are further prefixed with their security level, e.g. +// mlkem_native512*, mlkem_native768*, mlkem_native1024*. +#define MLK_CONFIG_NAMESPACE_PREFIX mlkem_native // Replace mlkem-native's FIPS 202 headers with glue code to // AWS-LC's own FIPS 202 implementation. 
@@ -68,4 +68,10 @@ static MLK_INLINE void mlk_randombytes(void *ptr, size_t len) { #define MLK_CONFIG_NO_ASM #endif +#if defined(OPENSSL_AARCH64) && !defined(OPENSSL_NO_ASM) && \ + (defined(OPENSSL_LINUX) || defined(OPENSSL_APPLE)) +#define MLK_CONFIG_USE_NATIVE_BACKEND_ARITH +#define MLK_CONFIG_ARITH_BACKEND_FILE "../aarch64/meta.h" +#endif + #endif // MLkEM_NATIVE_CONFIG_H diff --git a/third_party/s2n-bignum/META.yml b/third_party/s2n-bignum/META.yml index 4949b2bf60..3687f1ad97 100644 --- a/third_party/s2n-bignum/META.yml +++ b/third_party/s2n-bignum/META.yml @@ -1,5 +1,5 @@ name: s2n-bignum-imported -source: awslabs/s2n-bignum.git -commit: 54e1fa5756d6b13961c2f61d90f75426aa25d373 -target: main -imported-at: 2025-04-28T17:22:07+0000 +source: jargh/s2n-bignum-dev.git +commit: ae84a59689cb50ad9b9c6e25cd34037d5b1fb2b4 +target: mlkem +imported-at: 2025-06-23T13:38:02+0000 diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/Makefile b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/Makefile new file mode 100644 index 0000000000..673806915f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/Makefile @@ -0,0 +1,39 @@ +############################################################################# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +############################################################################# + +# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise +# use a cross-assembling version so that the code can still be assembled +# and the proofs checked against the object files (though you won't be able +# to run code without additional emulation infrastructure). 
The aarch64 +# cross-assembling version can be installed manually by something like: +# +# sudo apt-get install binutils-aarch64-linux-gnu + +UNAME_RESULT=$(shell uname -p) + +ifeq ($(UNAME_RESULT),aarch64) +GAS=as +else +GAS=aarch64-linux-gnu-as +endif + +# List of object files + +OBJ = mlkem_basemul_k2.o \ + mlkem_basemul_k3.o \ + mlkem_basemul_k4.o \ + mlkem_intt.o \ + mlkem_mulcache_compute.o \ + mlkem_ntt.o \ + mlkem_poly_reduce.o \ + mlkem_poly_tobytes.o \ + mlkem_poly_tomont.o \ + mlkem_rej_uniform_VARIABLE_TIME.o + +%.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - + +default: $(OBJ); + +clean:; rm -f *.o *.correct unopt/*.o diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_basemul_k2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_basemul_k2.S new file mode 100644 index 0000000000..ff63953c1e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_basemul_k2.S @@ -0,0 +1,210 @@ +// Copyright (c) 2024 The mlkem-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +// ---------------------------------------------------------------------------- +// Scalar product of 2-element polynomial vectors in NTT domain, with mulcache +// Inputs a[512], b[512], bt[256] (signed 16-bit words); output r[256] (signed 16-bit words) +// +// The inputs a and b are considered as 2-element vectors of linear +// polynomials in the NTT domain (in Montgomery form), and the bt +// argument an analogous 2-element vector of mulcaches for the bi: +// +// a0 = a[0..255], a1 = a[256..511] +// b0 = b[0..255], b1 = b[256..511] +// bt0 = bt[0..127], bt1 = bt[128..255] +// +// Scalar multiplication of those 2-element vectors is performed, +// with base multiplication in Fq[X]/(X^2-zeta^i'), with zeta^i' +// being a power of zeta = 17, with i bit-reversed as used for NTTs, +// making use of the mulcache for optimization. 
+// +// All input elements are assumed <= 2^12 and the bts are +// assumed to be as computed by mlkem_mulcache_compute. +// +// extern void mlkem_basemul_k2 +// (int16_t r[256],const int16_t a[512],const int16_t b[512], +// const int16_t bt[256]) +// +// Standard ARM ABI: X0 = r, X1 = a, X2 = b, X3 = bt +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_basemul_k2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_basemul_k2) + .text + .balign 4 + +S2N_BN_SYMBOL(mlkem_basemul_k2): + +// This matches the code in the mlkem-native repository +// https://github.com/pq-code-package/mlkem-native/blob/main/mlkem/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S + + sub sp, sp, #0x40 + stp d8, d9, [sp] + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + mov w14, #0xd01 + dup v0.8h, w14 + mov w14, #0xcff + dup v2.8h, w14 + add x4, x1, #0x200 + add x5, x2, #0x200 + add x6, x3, #0x100 + mov x13, #0x10 + ldr q9, [x4], #0x20 + ldur q5, [x4, #-0x10] + ldr q11, [x5], #0x20 + uzp1 v23.8h, v9.8h, v5.8h + uzp2 v9.8h, v9.8h, v5.8h + ldr q5, [x2], #0x20 + ldur q7, [x5, #-0x10] + ldur q21, [x2, #-0x10] + uzp2 v10.8h, v11.8h, v7.8h + uzp1 v11.8h, v11.8h, v7.8h + uzp1 v7.8h, v5.8h, v21.8h + uzp2 v5.8h, v5.8h, v21.8h + ldr q21, [x1], #0x20 + ldur q25, [x1, #-0x10] + ld1 { v6.8h }, [x3], #16 + uzp1 v26.8h, v21.8h, v25.8h + uzp2 v21.8h, v21.8h, v25.8h + smull v25.4s, v26.4h, v5.4h + smull2 v5.4s, v26.8h, v5.8h + smull v19.4s, v26.4h, v7.4h + smull2 v26.4s, v26.8h, v7.8h + smlal v25.4s, v21.4h, v7.4h + smlal2 v5.4s, v21.8h, v7.8h + smlal v19.4s, v21.4h, v6.4h + smlal2 v26.4s, v21.8h, v6.8h + smlal v25.4s, v23.4h, v10.4h + smlal2 v5.4s, v23.8h, v10.8h + smlal v19.4s, v23.4h, v11.4h + smlal2 v26.4s, v23.8h, v11.8h + ld1 { v23.8h }, [x6], #16 + smlal v25.4s, v9.4h, v11.4h + smlal2 v5.4s, v9.8h, v11.8h + smlal2 v26.4s, v9.8h, v23.8h + smlal v19.4s, 
v9.4h, v23.4h + ldr q9, [x4], #0x20 + uzp1 v11.8h, v25.8h, v5.8h + uzp1 v23.8h, v19.8h, v26.8h + mul v11.8h, v11.8h, v2.8h + mul v23.8h, v23.8h, v2.8h + ldr q7, [x5], #0x20 + smlal2 v5.4s, v11.8h, v0.8h + smlal v25.4s, v11.4h, v0.4h + ldr q11, [x2], #0x20 + ldur q21, [x2, #-0x10] + ldur q6, [x4, #-0x10] + uzp1 v17.8h, v11.8h, v21.8h + ldr q10, [x1], #0x20 + ldur q29, [x1, #-0x10] + uzp2 v11.8h, v11.8h, v21.8h + uzp1 v13.8h, v9.8h, v6.8h + uzp1 v3.8h, v10.8h, v29.8h + uzp2 v10.8h, v10.8h, v29.8h + smull v12.4s, v3.4h, v11.4h + smull2 v11.4s, v3.8h, v11.8h + ldur q21, [x5, #-0x10] + smlal v12.4s, v10.4h, v17.4h + smlal2 v11.4s, v10.8h, v17.8h + uzp2 v29.8h, v7.8h, v21.8h + uzp1 v15.8h, v7.8h, v21.8h + smlal v12.4s, v13.4h, v29.4h + smlal2 v11.4s, v13.8h, v29.8h + uzp2 v28.8h, v9.8h, v6.8h + smlal2 v26.4s, v23.8h, v0.8h + smlal v12.4s, v28.4h, v15.4h + smlal2 v11.4s, v28.8h, v15.8h + smlal v19.4s, v23.4h, v0.4h + uzp2 v27.8h, v25.8h, v5.8h + smull v23.4s, v3.4h, v17.4h + uzp1 v9.8h, v12.8h, v11.8h + uzp2 v19.8h, v19.8h, v26.8h + mul v14.8h, v9.8h, v2.8h + ld1 { v22.8h }, [x6], #16 + zip2 v9.8h, v19.8h, v27.8h + smlal2 v11.4s, v14.8h, v0.8h + ld1 { v4.8h }, [x3], #16 + sub x13, x13, #0x2 + +mlkem_basemul_k2_loop: + smull2 v20.4s, v3.8h, v17.8h + ldr q18, [x4], #0x20 + ldr q30, [x5], #0x20 + smlal2 v20.4s, v10.8h, v4.8h + smlal v12.4s, v14.4h, v0.4h + smlal v23.4s, v10.4h, v4.4h + str q9, [x0, #0x10] + smlal2 v20.4s, v13.8h, v15.8h + ldr q8, [x2], #0x20 + smlal v23.4s, v13.4h, v15.4h + smlal2 v20.4s, v28.8h, v22.8h + zip1 v26.8h, v19.8h, v27.8h + ldur q9, [x2, #-0x10] + smlal v23.4s, v28.4h, v22.4h + uzp2 v27.8h, v12.8h, v11.8h + uzp1 v17.8h, v8.8h, v9.8h + uzp2 v4.8h, v8.8h, v9.8h + uzp1 v5.8h, v23.8h, v20.8h + str q26, [x0], #0x20 + mul v31.8h, v5.8h, v2.8h + ldur q19, [x4, #-0x10] + ldr q29, [x1], #0x20 + ldur q12, [x1, #-0x10] + smlal2 v20.4s, v31.8h, v0.8h + uzp1 v13.8h, v18.8h, v19.8h + uzp1 v3.8h, v29.8h, v12.8h + uzp2 v10.8h, v29.8h, v12.8h + smull v12.4s, 
v3.4h, v4.4h + smull2 v11.4s, v3.8h, v4.8h + ldur q5, [x5, #-0x10] + smlal v12.4s, v10.4h, v17.4h + smlal2 v11.4s, v10.8h, v17.8h + uzp2 v14.8h, v30.8h, v5.8h + uzp1 v15.8h, v30.8h, v5.8h + smlal v12.4s, v13.4h, v14.4h + smlal2 v11.4s, v13.8h, v14.8h + uzp2 v28.8h, v18.8h, v19.8h + smlal v23.4s, v31.4h, v0.4h + smlal v12.4s, v28.4h, v15.4h + smlal2 v11.4s, v28.8h, v15.8h + ld1 { v22.8h }, [x6], #16 + uzp2 v19.8h, v23.8h, v20.8h + uzp1 v1.8h, v12.8h, v11.8h + smull v23.4s, v3.4h, v17.4h + mul v14.8h, v1.8h, v2.8h + zip2 v9.8h, v19.8h, v27.8h + ld1 { v4.8h }, [x3], #16 + smlal2 v11.4s, v14.8h, v0.8h + sub x13, x13, #0x1 + cbnz x13, mlkem_basemul_k2_loop + smull2 v5.4s, v3.8h, v17.8h + smlal v12.4s, v14.4h, v0.4h + smlal v23.4s, v10.4h, v4.4h + str q9, [x0, #0x10] + smlal2 v5.4s, v10.8h, v4.8h + uzp2 v11.8h, v12.8h, v11.8h + zip1 v9.8h, v19.8h, v27.8h + smlal v23.4s, v13.4h, v15.4h + smlal2 v5.4s, v13.8h, v15.8h + str q9, [x0], #0x20 + smlal v23.4s, v28.4h, v22.4h + smlal2 v5.4s, v28.8h, v22.8h + uzp1 v9.8h, v23.8h, v5.8h + mul v9.8h, v9.8h, v2.8h + smlal2 v5.4s, v9.8h, v0.8h + smlal v23.4s, v9.4h, v0.4h + uzp2 v9.8h, v23.8h, v5.8h + zip2 v5.8h, v9.8h, v11.8h + zip1 v9.8h, v9.8h, v11.8h + str q5, [x0, #0x10] + str q9, [x0], #0x20 + ldp d8, d9, [sp] + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + add sp, sp, #0x40 + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_basemul_k3.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_basemul_k3.S new file mode 100644 index 0000000000..9c9d959341 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_basemul_k3.S @@ -0,0 +1,264 @@ +// Copyright (c) 2024 The mlkem-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +// ---------------------------------------------------------------------------- +// Scalar product of 3-element polynomial vectors in NTT domain, with mulcache +// Inputs a[768], b[768], bt[384] (signed 16-bit words); output r[256] (signed 16-bit words) +// +// The inputs a and b are considered as 3-element vectors of linear +// polynomials in the NTT domain (in Montgomery form), and the bt +// argument an analogous 3-element vector of mulcaches for the bi: +// +// a0 = a[0..255], a1 = a[256..511], a2 = a[512..767] +// b0 = b[0..255], b1 = b[256..511], b2 = b[512..767], +// bt0 = bt[0..127], bt1 = bt[128..255], bt2 = bt[256..383] +// +// Scalar multiplication of those 3-element vectors is performed, +// with base multiplication in Fq[X]/(X^2-zeta^i'), with zeta^i' +// being a power of zeta = 17, with i bit-reversed as used for NTTs, +// making use of the mulcache for optimization. +// +// All input elements are assumed <= 2^12 and the bts are +// assumed to be as computed by mlkem_mulcache_compute. 
+// +// extern void mlkem_basemul_k3 +// (int16_t r[256],const int16_t a[768],const int16_t b[768], +// const int16_t bt[384]) +// +// Standard ARM ABI: X0 = r, X1 = a, X2 = b, X3 = bt +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_basemul_k3) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_basemul_k3) + .text + .balign 4 + +S2N_BN_SYMBOL(mlkem_basemul_k3): + +// This matches the code in the mlkem-native repository +// https://github.com/pq-code-package/mlkem-native/blob/main/mlkem/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S + + sub sp, sp, #0x40 + stp d8, d9, [sp] + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + mov w14, #0xd01 + dup v0.8h, w14 + mov w14, #0xcff + dup v2.8h, w14 + add x4, x1, #0x200 + add x5, x2, #0x200 + add x6, x3, #0x100 + add x7, x1, #0x400 + add x8, x2, #0x400 + add x9, x3, #0x200 + mov x13, #0x10 + ldr q7, [x2, #0x10] + ldr q20, [x2], #0x20 + ldr q15, [x1, #0x10] + uzp1 v8.8h, v20.8h, v7.8h + uzp2 v7.8h, v20.8h, v7.8h + ld1 { v20.8h }, [x3], #16 + ldr q30, [x1], #0x20 + ldr q11, [x4], #0x20 + uzp1 v16.8h, v30.8h, v15.8h + uzp2 v15.8h, v30.8h, v15.8h + smull v30.4s, v16.4h, v7.4h + smull2 v7.4s, v16.8h, v7.8h + smull v9.4s, v16.4h, v8.4h + smull2 v16.4s, v16.8h, v8.8h + smlal v30.4s, v15.4h, v8.4h + smlal2 v7.4s, v15.8h, v8.8h + smlal v9.4s, v15.4h, v20.4h + smlal2 v16.4s, v15.8h, v20.8h + ldur q20, [x4, #-0x10] + ldr q15, [x5], #0x20 + uzp1 v8.8h, v11.8h, v20.8h + uzp2 v20.8h, v11.8h, v20.8h + ldur q11, [x5, #-0x10] + ld1 { v27.8h }, [x6], #16 + uzp1 v10.8h, v15.8h, v11.8h + uzp2 v15.8h, v15.8h, v11.8h + smlal v9.4s, v8.4h, v10.4h + smlal2 v16.4s, v8.8h, v10.8h + smlal v30.4s, v8.4h, v15.4h + smlal2 v7.4s, v8.8h, v15.8h + smlal v9.4s, v20.4h, v27.4h + smlal2 v16.4s, v20.8h, v27.8h + smlal v30.4s, v20.4h, v10.4h + smlal2 v7.4s, v20.8h, v10.8h + ldr q20, [x7], #0x20 + ldur q15, [x7, 
#-0x10] + ldr q8, [x8], #0x20 + uzp1 v11.8h, v20.8h, v15.8h + uzp2 v20.8h, v20.8h, v15.8h + ldur q15, [x8, #-0x10] + ld1 { v27.8h }, [x9], #16 + uzp1 v10.8h, v8.8h, v15.8h + uzp2 v15.8h, v8.8h, v15.8h + smlal v9.4s, v11.4h, v10.4h + smlal2 v16.4s, v11.8h, v10.8h + smlal v30.4s, v11.4h, v15.4h + smlal2 v7.4s, v11.8h, v15.8h + smlal v9.4s, v20.4h, v27.4h + smlal2 v16.4s, v20.8h, v27.8h + smlal v30.4s, v20.4h, v10.4h + smlal2 v7.4s, v20.8h, v10.8h + ldr q15, [x2], #0x20 + uzp1 v20.8h, v9.8h, v16.8h + uzp1 v8.8h, v30.8h, v7.8h + mul v20.8h, v20.8h, v2.8h + mul v8.8h, v8.8h, v2.8h + ldr q21, [x4], #0x20 + smlal v9.4s, v20.4h, v0.4h + smlal2 v16.4s, v20.8h, v0.8h + smlal v30.4s, v8.4h, v0.4h + smlal2 v7.4s, v8.8h, v0.8h + ldur q6, [x4, #-0x10] + uzp2 v27.8h, v9.8h, v16.8h + uzp2 v10.8h, v30.8h, v7.8h + ldur q16, [x2, #-0x10] + ldr q30, [x1, #0x10] + ld1 { v9.8h }, [x3], #16 + ldr q1, [x5], #0x20 + ldur q12, [x5, #-0x10] + ld1 { v24.8h }, [x6], #16 + ldr q19, [x7], #0x20 + ldur q31, [x7, #-0x10] + ldr q17, [x8], #0x20 + ldur q18, [x8, #-0x10] + ld1 { v25.8h }, [x9], #16 + sub x13, x13, #0x2 + +mlkem_basemul_k3_loop: + ldr q20, [x1], #0x20 + uzp1 v7.8h, v15.8h, v16.8h + uzp2 v15.8h, v15.8h, v16.8h + uzp1 v8.8h, v20.8h, v30.8h + uzp2 v20.8h, v20.8h, v30.8h + smull v30.4s, v8.4h, v15.4h + smull2 v15.4s, v8.8h, v15.8h + smull v11.4s, v8.4h, v7.4h + smull2 v8.4s, v8.8h, v7.8h + smlal v30.4s, v20.4h, v7.4h + smlal2 v15.4s, v20.8h, v7.8h + smlal v11.4s, v20.4h, v9.4h + smlal2 v8.4s, v20.8h, v9.8h + uzp1 v7.8h, v21.8h, v6.8h + uzp2 v20.8h, v21.8h, v6.8h + uzp1 v16.8h, v1.8h, v12.8h + uzp2 v9.8h, v1.8h, v12.8h + smlal v11.4s, v7.4h, v16.4h + smlal2 v8.4s, v7.8h, v16.8h + smlal v30.4s, v7.4h, v9.4h + smlal2 v15.4s, v7.8h, v9.8h + smlal v11.4s, v20.4h, v24.4h + smlal2 v8.4s, v20.8h, v24.8h + smlal v30.4s, v20.4h, v16.4h + smlal2 v15.4s, v20.8h, v16.8h + uzp1 v7.8h, v19.8h, v31.8h + uzp2 v20.8h, v19.8h, v31.8h + uzp1 v16.8h, v17.8h, v18.8h + uzp2 v9.8h, v17.8h, v18.8h + smlal v11.4s, 
v7.4h, v16.4h + smlal2 v8.4s, v7.8h, v16.8h + smlal v30.4s, v7.4h, v9.4h + smlal2 v15.4s, v7.8h, v9.8h + smlal v11.4s, v20.4h, v25.4h + smlal2 v8.4s, v20.8h, v25.8h + smlal v30.4s, v20.4h, v16.4h + smlal2 v15.4s, v20.8h, v16.8h + ldr q16, [x2, #0x10] + uzp1 v7.8h, v11.8h, v8.8h + uzp1 v20.8h, v30.8h, v15.8h + mul v7.8h, v7.8h, v2.8h + mul v20.8h, v20.8h, v2.8h + zip2 v9.8h, v27.8h, v10.8h + zip1 v27.8h, v27.8h, v10.8h + smlal v11.4s, v7.4h, v0.4h + smlal2 v8.4s, v7.8h, v0.8h + smlal v30.4s, v20.4h, v0.4h + smlal2 v15.4s, v20.8h, v0.8h + str q27, [x0], #0x20 + uzp2 v27.8h, v11.8h, v8.8h + stur q9, [x0, #-0x10] + uzp2 v10.8h, v30.8h, v15.8h + ldr q30, [x1, #0x10] + ldr q15, [x2], #0x20 + ld1 { v9.8h }, [x3], #16 + ldr q21, [x4], #0x20 + ldur q6, [x4, #-0x10] + ldr q1, [x5], #0x20 + ldur q12, [x5, #-0x10] + ld1 { v24.8h }, [x6], #16 + ldr q19, [x7], #0x20 + ldur q31, [x7, #-0x10] + ldr q17, [x8], #0x20 + ldur q18, [x8, #-0x10] + ld1 { v25.8h }, [x9], #16 + sub x13, x13, #0x1 + cbnz x13, mlkem_basemul_k3_loop + ldr q7, [x1], #0x20 + uzp1 v20.8h, v15.8h, v16.8h + uzp2 v15.8h, v15.8h, v16.8h + uzp1 v23.8h, v7.8h, v30.8h + uzp2 v11.8h, v7.8h, v30.8h + smull2 v8.4s, v23.8h, v20.8h + smull v5.4s, v23.4h, v20.4h + smull2 v30.4s, v23.8h, v15.8h + uzp1 v28.8h, v1.8h, v12.8h + smlal2 v8.4s, v11.8h, v9.8h + smlal v5.4s, v11.4h, v9.4h + uzp1 v3.8h, v21.8h, v6.8h + smull v16.4s, v23.4h, v15.4h + smlal2 v8.4s, v3.8h, v28.8h + smlal v5.4s, v3.4h, v28.4h + uzp2 v29.8h, v21.8h, v6.8h + uzp1 v7.8h, v17.8h, v18.8h + smlal2 v8.4s, v29.8h, v24.8h + uzp1 v14.8h, v19.8h, v31.8h + smlal v16.4s, v11.4h, v20.4h + smlal2 v30.4s, v11.8h, v20.8h + smlal2 v8.4s, v14.8h, v7.8h + uzp2 v20.8h, v1.8h, v12.8h + uzp2 v21.8h, v19.8h, v31.8h + smlal2 v30.4s, v3.8h, v20.8h + smlal v16.4s, v3.4h, v20.4h + smlal v5.4s, v29.4h, v24.4h + uzp2 v9.8h, v17.8h, v18.8h + smlal2 v30.4s, v29.8h, v28.8h + smlal v16.4s, v29.4h, v28.4h + smlal v5.4s, v14.4h, v7.4h + smlal2 v8.4s, v21.8h, v25.8h + smlal2 v30.4s, v14.8h, 
v9.8h + smlal v16.4s, v14.4h, v9.4h + smlal v5.4s, v21.4h, v25.4h + zip1 v20.8h, v27.8h, v10.8h + smlal2 v30.4s, v21.8h, v7.8h + smlal v16.4s, v21.4h, v7.4h + uzp1 v7.8h, v5.8h, v8.8h + str q20, [x0], #0x20 + mul v15.8h, v7.8h, v2.8h + uzp1 v7.8h, v16.8h, v30.8h + zip2 v31.8h, v27.8h, v10.8h + mul v20.8h, v7.8h, v2.8h + smlal v5.4s, v15.4h, v0.4h + smlal2 v8.4s, v15.8h, v0.8h + stur q31, [x0, #-0x10] + smlal2 v30.4s, v20.8h, v0.8h + smlal v16.4s, v20.4h, v0.4h + uzp2 v15.8h, v5.8h, v8.8h + uzp2 v20.8h, v16.8h, v30.8h + zip1 v7.8h, v15.8h, v20.8h + zip2 v20.8h, v15.8h, v20.8h + str q7, [x0], #0x20 + stur q20, [x0, #-0x10] + ldp d8, d9, [sp] + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + add sp, sp, #0x40 + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_basemul_k4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_basemul_k4.S new file mode 100644 index 0000000000..177e28f92e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_basemul_k4.S @@ -0,0 +1,318 @@ +// Copyright (c) 2024 The mlkem-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +// ---------------------------------------------------------------------------- +// Scalar product of 4-element polynomial vectors in NTT domain, with mulcache +// Inputs a[1024], b[1024], bt[512] (signed 16-bit words); output r[256] (signed 16-bit words) +// +// The inputs a and b are considered as 4-element vectors of linear +// polynomials in the NTT domain (in Montgomery form), and the bt +// argument an analogous 4-element vector of mulcaches for the bi: +// +// a0 = a[0..255], a1 = a[256..511], a2 = a[512..767], a3 = a[768..1023] +// b0 = b[0..255], b1 = b[256..511], b2 = b[512..767], b3 = b[768..1023] +// bt0 = bt[0..127], bt1 = bt[128..255], bt2 = bt[256..383], bt3 = bt[384..511] +// +// Scalar multiplication of those 4-element vectors is performed, +// with base multiplication in Fq[X]/(X^2-zeta^i'), with zeta^i' +// being a power of zeta = 17, with i bit-reversed as used for NTTs, +// making use of the mulcache for optimization. +// +// All input elements are assumed <= 2^12 and the bts are +// assumed to be as computed by mlkem_mulcache_compute. 
+// +// extern void mlkem_basemul_k4 +// (int16_t r[256],const int16_t a[1024],const int16_t b[1024], +// const int16_t bt[512]) +// +// Standard ARM ABI: X0 = r, X1 = a, X2 = b, X3 = bt +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_basemul_k4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_basemul_k4) + .text + .balign 4 + +S2N_BN_SYMBOL(mlkem_basemul_k4): + +// This matches the code in the mlkem-native repository +// https://github.com/pq-code-package/mlkem-native/blob/main/mlkem/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S + + sub sp, sp, #0x40 + stp d8, d9, [sp] + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + mov w14, #0xd01 + dup v0.8h, w14 + mov w14, #0xcff + dup v2.8h, w14 + add x4, x1, #0x200 + add x5, x2, #0x200 + add x6, x3, #0x100 + add x7, x1, #0x400 + add x8, x2, #0x400 + add x9, x3, #0x200 + add x10, x1, #0x600 + add x11, x2, #0x600 + add x12, x3, #0x300 + mov x13, #0x10 + ldr q23, [x2, #0x10] + ldr q19, [x2], #0x20 + ldr q17, [x5], #0x20 + uzp2 v13.8h, v19.8h, v23.8h + uzp1 v19.8h, v19.8h, v23.8h + ldur q23, [x5, #-0x10] + ldr q30, [x1, #0x10] + uzp2 v9.8h, v17.8h, v23.8h + uzp1 v23.8h, v17.8h, v23.8h + ldr q17, [x1], #0x20 + ldr q10, [x7, #0x10] + uzp1 v12.8h, v17.8h, v30.8h + uzp2 v17.8h, v17.8h, v30.8h + smull2 v30.4s, v12.8h, v13.8h + smull v13.4s, v12.4h, v13.4h + smull2 v22.4s, v12.8h, v19.8h + smull v12.4s, v12.4h, v19.4h + smlal2 v30.4s, v17.8h, v19.8h + smlal v13.4s, v17.4h, v19.4h + ldr q19, [x4], #0x20 + ldur q16, [x4, #-0x10] + ld1 { v8.8h }, [x3], #16 + uzp1 v26.8h, v19.8h, v16.8h + uzp2 v19.8h, v19.8h, v16.8h + smlal2 v30.4s, v26.8h, v9.8h + smlal v13.4s, v26.4h, v9.4h + smlal2 v22.4s, v17.8h, v8.8h + smlal v12.4s, v17.4h, v8.4h + smlal2 v30.4s, v19.8h, v23.8h + smlal v13.4s, v19.4h, v23.4h + smlal2 v22.4s, v26.8h, v23.8h + smlal v12.4s, v26.4h, v23.4h + ldr q23, [x7], #0x20 + 
ldr q17, [x8, #0x10] + uzp1 v9.8h, v23.8h, v10.8h + uzp2 v23.8h, v23.8h, v10.8h + ldr q10, [x10], #0x20 + ldur q16, [x10, #-0x10] + ld1 { v8.8h }, [x12], #16 + uzp1 v26.8h, v10.8h, v16.8h + uzp2 v10.8h, v10.8h, v16.8h + ld1 { v16.8h }, [x6], #16 + ldr q3, [x11, #0x10] + smlal2 v22.4s, v19.8h, v16.8h + smlal v12.4s, v19.4h, v16.4h + ldr q19, [x11], #0x20 + ld1 { v16.8h }, [x9], #16 + uzp1 v4.8h, v19.8h, v3.8h + uzp2 v19.8h, v19.8h, v3.8h + ldr q3, [x8], #0x20 + ldr q31, [x2], #0x20 + uzp1 v6.8h, v3.8h, v17.8h + uzp2 v17.8h, v3.8h, v17.8h + smlal2 v22.4s, v9.8h, v6.8h + smlal2 v30.4s, v9.8h, v17.8h + smlal v13.4s, v9.4h, v17.4h + smlal v12.4s, v9.4h, v6.4h + smlal2 v22.4s, v23.8h, v16.8h + smlal2 v30.4s, v23.8h, v6.8h + smlal v13.4s, v23.4h, v6.4h + smlal v12.4s, v23.4h, v16.4h + smlal2 v22.4s, v26.8h, v4.8h + smlal2 v30.4s, v26.8h, v19.8h + smlal v13.4s, v26.4h, v19.4h + smlal v12.4s, v26.4h, v4.4h + smlal2 v22.4s, v10.8h, v8.8h + smlal2 v30.4s, v10.8h, v4.8h + smlal v13.4s, v10.4h, v4.4h + smlal v12.4s, v10.4h, v8.4h + ldur q19, [x2, #-0x10] + uzp1 v23.8h, v13.8h, v30.8h + uzp1 v17.8h, v12.8h, v22.8h + mul v23.8h, v23.8h, v2.8h + uzp2 v21.8h, v31.8h, v19.8h + uzp1 v19.8h, v31.8h, v19.8h + mul v17.8h, v17.8h, v2.8h + smlal v13.4s, v23.4h, v0.4h + smlal2 v30.4s, v23.8h, v0.8h + ldr q23, [x5], #0x20 + smlal2 v22.4s, v17.8h, v0.8h + uzp2 v15.8h, v13.8h, v30.8h + smlal v12.4s, v17.4h, v0.4h + ldur q17, [x5, #-0x10] + ldr q13, [x1, #0x10] + uzp2 v27.8h, v23.8h, v17.8h + uzp1 v28.8h, v23.8h, v17.8h + uzp2 v7.8h, v12.8h, v22.8h + ldr q23, [x1], #0x20 + zip1 v5.8h, v7.8h, v15.8h + ldr q3, [x7, #0x10] + uzp1 v31.8h, v23.8h, v13.8h + uzp2 v16.8h, v23.8h, v13.8h + smull2 v24.4s, v31.8h, v21.8h + ldr q6, [x8, #0x10] + ldr q23, [x10], #0x20 + smlal2 v24.4s, v16.8h, v19.8h + ldur q17, [x10, #-0x10] + ld1 { v22.8h }, [x12], #16 + uzp1 v30.8h, v23.8h, v17.8h + uzp2 v11.8h, v23.8h, v17.8h + ldr q23, [x4], #0x20 + ldur q17, [x4, #-0x10] + ldr q4, [x7], #0x20 + uzp1 v20.8h, v23.8h, 
v17.8h + uzp2 v26.8h, v23.8h, v17.8h + uzp1 v9.8h, v4.8h, v3.8h + smlal2 v24.4s, v20.8h, v27.8h + ld1 { v8.8h }, [x6], #16 + ldr q25, [x11, #0x10] + ldr q29, [x11], #0x20 + ld1 { v12.8h }, [x9], #16 + uzp1 v10.8h, v29.8h, v25.8h + ldr q14, [x8], #0x20 + ld1 { v23.8h }, [x3], #16 + sub x13, x13, #0x2 + +mlkem_basemul_k4_loop: + smlal2 v24.4s, v26.8h, v28.8h + uzp2 v4.8h, v4.8h, v3.8h + smull2 v13.4s, v31.8h, v19.8h + ldr q3, [x2], #0x20 + uzp2 v1.8h, v29.8h, v25.8h + smlal2 v13.4s, v16.8h, v23.8h + ldur q17, [x2, #-0x10] + smull v18.4s, v31.4h, v19.4h + smlal2 v13.4s, v20.8h, v28.8h + smull v29.4s, v31.4h, v21.4h + ldr q21, [x5], #0x20 + smlal2 v13.4s, v26.8h, v8.8h + smlal v29.4s, v16.4h, v19.4h + ldur q19, [x5, #-0x10] + smlal v18.4s, v16.4h, v23.4h + smlal v29.4s, v20.4h, v27.4h + uzp1 v31.8h, v14.8h, v6.8h + uzp2 v27.8h, v21.8h, v19.8h + smlal v18.4s, v20.4h, v28.4h + ldr q25, [x1, #0x10] + smlal v29.4s, v26.4h, v28.4h + smlal v18.4s, v26.4h, v8.4h + uzp2 v26.8h, v14.8h, v6.8h + smlal2 v13.4s, v9.8h, v31.8h + smlal2 v24.4s, v9.8h, v26.8h + smlal v29.4s, v9.4h, v26.4h + smlal v18.4s, v9.4h, v31.4h + smlal2 v13.4s, v4.8h, v12.8h + smlal2 v24.4s, v4.8h, v31.8h + smlal v29.4s, v4.4h, v31.4h + smlal v18.4s, v4.4h, v12.4h + smlal2 v13.4s, v30.8h, v10.8h + smlal2 v24.4s, v30.8h, v1.8h + smlal v29.4s, v30.4h, v1.4h + smlal v18.4s, v30.4h, v10.4h + smlal2 v13.4s, v11.8h, v22.8h + smlal2 v24.4s, v11.8h, v10.8h + smlal v29.4s, v11.4h, v10.4h + smlal v18.4s, v11.4h, v22.4h + ldr q22, [x1], #0x20 + uzp1 v31.8h, v29.8h, v24.8h + uzp1 v28.8h, v21.8h, v19.8h + mul v19.8h, v31.8h, v2.8h + uzp1 v31.8h, v22.8h, v25.8h + uzp2 v16.8h, v22.8h, v25.8h + uzp2 v21.8h, v3.8h, v17.8h + smlal v29.4s, v19.4h, v0.4h + smlal2 v24.4s, v19.8h, v0.8h + uzp1 v19.8h, v3.8h, v17.8h + uzp1 v26.8h, v18.8h, v13.8h + zip2 v14.8h, v7.8h, v15.8h + mul v23.8h, v26.8h, v2.8h + uzp2 v15.8h, v29.8h, v24.8h + smull2 v24.4s, v31.8h, v21.8h + str q14, [x0, #0x10] + ldr q3, [x7, #0x10] + ldr q6, [x8, #0x10] + 
ldr q8, [x10], #0x20 + ldur q26, [x10, #-0x10] + ld1 { v22.8h }, [x12], #16 + uzp1 v30.8h, v8.8h, v26.8h + uzp2 v11.8h, v8.8h, v26.8h + ldr q8, [x4], #0x20 + ldur q26, [x4, #-0x10] + ldr q4, [x7], #0x20 + uzp1 v20.8h, v8.8h, v26.8h + uzp2 v26.8h, v8.8h, v26.8h + ld1 { v8.8h }, [x6], #16 + uzp1 v9.8h, v4.8h, v3.8h + ldr q25, [x11, #0x10] + ldr q29, [x11], #0x20 + ld1 { v12.8h }, [x9], #16 + ldr q14, [x8], #0x20 + smlal2 v24.4s, v16.8h, v19.8h + smlal2 v13.4s, v23.8h, v0.8h + smlal v18.4s, v23.4h, v0.4h + ld1 { v23.8h }, [x3], #16 + smlal2 v24.4s, v20.8h, v27.8h + uzp2 v7.8h, v18.8h, v13.8h + uzp1 v10.8h, v29.8h, v25.8h + str q5, [x0], #0x20 + zip1 v5.8h, v7.8h, v15.8h + sub x13, x13, #0x1 + cbnz x13, mlkem_basemul_k4_loop + smull2 v17.4s, v31.8h, v19.8h + uzp2 v1.8h, v14.8h, v6.8h + smull v18.4s, v31.4h, v21.4h + smlal2 v24.4s, v26.8h, v28.8h + smlal2 v17.4s, v16.8h, v23.8h + smull v21.4s, v31.4h, v19.4h + smlal v18.4s, v16.4h, v19.4h + uzp2 v31.8h, v4.8h, v3.8h + uzp1 v3.8h, v14.8h, v6.8h + smlal v21.4s, v16.4h, v23.4h + smlal v18.4s, v20.4h, v27.4h + uzp2 v14.8h, v29.8h, v25.8h + smlal2 v17.4s, v20.8h, v28.8h + smlal v21.4s, v20.4h, v28.4h + smlal v18.4s, v26.4h, v28.4h + smlal2 v24.4s, v9.8h, v1.8h + smlal2 v17.4s, v26.8h, v8.8h + smlal v21.4s, v26.4h, v8.4h + smlal v18.4s, v9.4h, v1.4h + smlal2 v24.4s, v31.8h, v3.8h + smlal2 v17.4s, v9.8h, v3.8h + smlal v21.4s, v9.4h, v3.4h + smlal v18.4s, v31.4h, v3.4h + smlal2 v24.4s, v30.8h, v14.8h + smlal2 v17.4s, v31.8h, v12.8h + smlal v21.4s, v31.4h, v12.4h + smlal v18.4s, v30.4h, v14.4h + smlal2 v24.4s, v11.8h, v10.8h + smlal2 v17.4s, v30.8h, v10.8h + smlal v21.4s, v30.4h, v10.4h + smlal v18.4s, v11.4h, v10.4h + zip2 v19.8h, v7.8h, v15.8h + smlal2 v17.4s, v11.8h, v22.8h + smlal v21.4s, v11.4h, v22.4h + uzp1 v23.8h, v18.8h, v24.8h + str q19, [x0, #0x10] + mul v19.8h, v23.8h, v2.8h + uzp1 v23.8h, v21.8h, v17.8h + str q5, [x0], #0x20 + mul v26.8h, v23.8h, v2.8h + smlal v18.4s, v19.4h, v0.4h + smlal2 v24.4s, v19.8h, v0.8h + 
smlal v21.4s, v26.4h, v0.4h + smlal2 v17.4s, v26.8h, v0.8h + uzp2 v13.8h, v18.8h, v24.8h + uzp2 v19.8h, v21.8h, v17.8h + zip1 v23.8h, v19.8h, v13.8h + zip2 v19.8h, v19.8h, v13.8h + str q23, [x0], #0x20 + stur q19, [x0, #-0x10] + ldp d8, d9, [sp] + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + add sp, sp, #0x40 + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_intt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_intt.S new file mode 100644 index 0000000000..1be62d9502 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_intt.S @@ -0,0 +1,412 @@ +// Copyright (c) 2022 Arm Limited +// Copyright (c) 2022 Hanno Becker +// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +// Copyright (c) 2024 The mlkem-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +// ---------------------------------------------------------------------------- +// Inverse number-theoretic transform from ML-KEM +// Input a[256], z_01234[80], z_56[384] (all signed 16-bit words); output a[256] (signed 16-bit words). +// +// The transform is in-place with input and output a[256], with the input in +// bitreversed order and the output mapped into the Montgomery domain via +// x |-> (2^16 * x) mod 3329. The two other parameters are expected to point to +// tables of constants whose definitions can be found in the mlkem-native +// repo (mlkem/native/aarch64/src/aarch64_zetas.c) or our "tests/test.c". 
+// +// extern void mlkem_intt(int16_t a[256],const int16_t z_01234[80],const int16_t z_56[384]); +// +// Standard ARM ABI: X0 = a, X1 = z_01234, X2 = z_56 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_intt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_intt) + .text + .balign 4 + +S2N_BN_SYMBOL(mlkem_intt): + +// This implementation is generated by SLOTHY, set up to optimize for +// the Neoverse N1 microarchitecture, starting from the clean version +// +// https://github.com/pq-code-package/mlkem-native/blob/main/mlkem/native/aarch64/intt_clean.S +// +// in the mlkem-native repository. + + sub sp, sp, #0x40 // reserve 64 bytes of stack for d8-d15 + stp d8, d9, [sp] // save d8-d15; they are reloaded just before ret + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + mov w5, #0xd01 // 0xd01 = 3329 + mov v7.h[0], w5 // v7.h[0] = 3329, the modulus used by every mls + mov w5, #0x4ebf // 0x4ebf = 20159, about 2^26 / 3329 + mov v7.h[1], w5 // v7.h[1] = 20159, used with sqdmulh and srshr #11 + mov w5, #0x200 // 0x200 = 512 + dup v29.8h, w5 // v29 = 512 in every lane: factor applied in scale_start + mov w5, #0x13b0 // 0x13b0 = 5040, about 512 * 2^15 / 3329 + dup v30.8h, w5 // v30 = 5040 in every lane: sqrdmulh companion of 512 + mov x3, x0 // x3 walks over a; x0 keeps the base + mov x4, #0x8 // 8 iterations of 64 bytes cover a[0..255] + +scale_start: + ldr q8, [x3] + ldr q9, [x3, #16] + ldr q10, [x3, #32] + ldr q11, [x3, #48] + sqrdmulh v27.8h, v8.8h, v30.8h // v27 = rounded high half of 2 * v8 * 5040 + mul v8.8h, v8.8h, v29.8h // v8 = low 16 bits of 512 * v8 + mls v8.8h, v27.8h, v7.h[0] // v8 = v8 - v27 * 3329; same three steps for v9-v11 + sqrdmulh v27.8h, v9.8h, v30.8h + mul v9.8h, v9.8h, v29.8h + mls v9.8h, v27.8h, v7.h[0] + sqrdmulh v27.8h, v10.8h, v30.8h + mul v10.8h, v10.8h, v29.8h + mls v10.8h, v27.8h, v7.h[0] + sqrdmulh v27.8h, v11.8h, v30.8h + mul v11.8h, v11.8h, v29.8h + mls v11.8h, v27.8h, v7.h[0] + str q8, [x3], #64 + stur q9, [x3, #-48] + stur q10, [x3, #-32] + stur q11, [x3, #-16] + subs x4, x4, #0x1 // NOTE(review): the flags set here do not appear to be consumed; cbnz tests x4 + cbnz x4, scale_start + + mov x3, x0 // restart at the base of a for the next loop + mov x4, #0x8 // iteration count for layer3456_start + ldr q1, [x3, #32] + ldr q18, [x3, #48] + ldr q15, [x3] + ldr q21, [x3, #16] + ldr q3, [x2], #96 + ldur q16, [x2, #-48] + ldr q4, [x1], #16 + ldur q30, [x2, #-32] + trn1 v11.4s, v1.4s, v18.4s + trn2 v18.4s, v1.4s, v18.4s + trn1 v20.4s, v15.4s, v21.4s + trn2 v1.4s, v15.4s, v21.4s + ldur q0, [x2, #-16] + ldur q22, [x2, #-80] + trn1 v8.2d, v20.2d, v11.2d + trn1 v6.2d, v1.2d, v18.2d + trn2 v1.2d, 
v1.2d, v18.2d + trn2 v21.2d, v20.2d, v11.2d + sub v11.8h, v8.8h, v6.8h + add v20.8h, v8.8h, v6.8h + add v14.8h, v21.8h, v1.8h + sub v15.8h, v21.8h, v1.8h + sqrdmulh v16.8h, v11.8h, v16.8h + ldur q6, [x2, #-64] + sub v18.8h, v20.8h, v14.8h + add v21.8h, v20.8h, v14.8h + sqrdmulh v0.8h, v15.8h, v0.8h + mul v11.8h, v11.8h, v6.8h + mul v1.8h, v15.8h, v30.8h + mls v11.8h, v16.8h, v7.h[0] + mls v1.8h, v0.8h, v7.h[0] + sqrdmulh v0.8h, v18.8h, v22.8h + mul v16.8h, v18.8h, v3.8h + sub v18.8h, v11.8h, v1.8h + add v13.8h, v11.8h, v1.8h + sqrdmulh v11.8h, v18.8h, v22.8h + trn1 v20.4s, v21.4s, v13.4s + trn2 v1.4s, v21.4s, v13.4s + mls v16.8h, v0.8h, v7.h[0] + mul v3.8h, v18.8h, v3.8h + mls v3.8h, v11.8h, v7.h[0] + trn2 v11.4s, v16.4s, v3.4s + trn1 v16.4s, v16.4s, v3.4s + trn2 v21.2d, v1.2d, v11.2d + trn2 v0.2d, v20.2d, v16.2d + trn1 v1.2d, v1.2d, v11.2d + trn1 v11.2d, v20.2d, v16.2d + sub v13.8h, v0.8h, v21.8h + add v29.8h, v0.8h, v21.8h + add v9.8h, v11.8h, v1.8h + sub v23.8h, v11.8h, v1.8h + sqdmulh v1.8h, v29.8h, v7.h[1] + sqdmulh v27.8h, v9.8h, v7.h[1] + sqrdmulh v16.8h, v13.8h, v4.h[5] + srshr v14.8h, v1.8h, #11 + sub x4, x4, #0x1 + +layer3456_start: + mls v29.8h, v14.8h, v7.h[0] + ldr q3, [x3, #96] + ldr q30, [x3, #112] + ldr q20, [x2, #32] + mul v6.8h, v23.8h, v4.h[2] + ldr q24, [x2, #16] + ldr q0, [x3, #64] + ldr q14, [x3, #80] + srshr v25.8h, v27.8h, #11 + mul v15.8h, v13.8h, v4.h[4] + trn1 v18.4s, v3.4s, v30.4s + ldr q28, [x2], #96 + trn2 v5.4s, v3.4s, v30.4s + sqrdmulh v19.8h, v23.8h, v4.h[3] + trn2 v26.4s, v0.4s, v14.4s + trn1 v11.4s, v0.4s, v14.4s + mls v9.8h, v25.8h, v7.h[0] + trn2 v0.2d, v26.2d, v5.2d + ldur q17, [x2, #-16] + mls v15.8h, v16.8h, v7.h[0] + trn2 v23.2d, v11.2d, v18.2d + trn1 v30.2d, v26.2d, v5.2d + ldur q1, [x2, #-32] + mls v6.8h, v19.8h, v7.h[0] + sub v14.8h, v23.8h, v0.8h + trn1 v19.2d, v11.2d, v18.2d + ldur q10, [x2, #-48] + add v31.8h, v23.8h, v0.8h + sqrdmulh v27.8h, v14.8h, v17.8h + add v18.8h, v19.8h, v30.8h + mul v13.8h, v14.8h, v1.8h + sub 
v22.8h, v19.8h, v30.8h + sub v26.8h, v18.8h, v31.8h + sqrdmulh v10.8h, v22.8h, v10.8h + sub v25.8h, v9.8h, v29.8h + add v9.8h, v9.8h, v29.8h + mls v13.8h, v27.8h, v7.h[0] + add v2.8h, v18.8h, v31.8h + str q9, [x3], #64 + sub v23.8h, v6.8h, v15.8h + add v21.8h, v6.8h, v15.8h + mul v3.8h, v22.8h, v20.8h + mls v3.8h, v10.8h, v7.h[0] + sqrdmulh v20.8h, v26.8h, v24.8h + mul v22.8h, v26.8h, v28.8h + add v12.8h, v3.8h, v13.8h + sub v15.8h, v3.8h, v13.8h + sqrdmulh v13.8h, v23.8h, v4.h[1] + sqrdmulh v0.8h, v15.8h, v24.8h + mul v27.8h, v15.8h, v28.8h + mls v22.8h, v20.8h, v7.h[0] + mls v27.8h, v0.8h, v7.h[0] + sqdmulh v19.8h, v21.8h, v7.h[1] + trn1 v26.4s, v2.4s, v12.4s + mul v10.8h, v23.8h, v4.h[0] + trn2 v2.4s, v2.4s, v12.4s + trn2 v12.4s, v22.4s, v27.4s + trn1 v8.4s, v22.4s, v27.4s + mul v31.8h, v25.8h, v4.h[0] + trn2 v3.2d, v2.2d, v12.2d + sqrdmulh v11.8h, v25.8h, v4.h[1] + trn2 v0.2d, v26.2d, v8.2d + srshr v16.8h, v19.8h, #11 + ldr q4, [x1], #16 + mls v10.8h, v13.8h, v7.h[0] + add v29.8h, v0.8h, v3.8h + trn1 v18.2d, v26.2d, v8.2d + trn1 v20.2d, v2.2d, v12.2d + sqdmulh v15.8h, v29.8h, v7.h[1] + sub v13.8h, v0.8h, v3.8h + mls v21.8h, v16.8h, v7.h[0] + add v9.8h, v18.8h, v20.8h + stur q10, [x3, #-16] + sub v23.8h, v18.8h, v20.8h + mls v31.8h, v11.8h, v7.h[0] + srshr v14.8h, v15.8h, #11 + sqrdmulh v16.8h, v13.8h, v4.h[5] + stur q21, [x3, #-48] + sqdmulh v27.8h, v9.8h, v7.h[1] + stur q31, [x3, #-32] + sub x4, x4, #0x1 + cbnz x4, layer3456_start + + mls v29.8h, v14.8h, v7.h[0] + srshr v1.8h, v27.8h, #11 + mul v11.8h, v13.8h, v4.h[4] + mls v9.8h, v1.8h, v7.h[0] + sqrdmulh v1.8h, v23.8h, v4.h[3] + mul v20.8h, v23.8h, v4.h[2] + sub v21.8h, v9.8h, v29.8h + add v0.8h, v9.8h, v29.8h + mls v11.8h, v16.8h, v7.h[0] + mls v20.8h, v1.8h, v7.h[0] + str q0, [x3], #64 + mul v1.8h, v21.8h, v4.h[0] + sqrdmulh v16.8h, v21.8h, v4.h[1] + add v21.8h, v20.8h, v11.8h + sub v11.8h, v20.8h, v11.8h + sqdmulh v20.8h, v21.8h, v7.h[1] + sqrdmulh v0.8h, v11.8h, v4.h[1] + mul v11.8h, v11.8h, v4.h[0] + 
srshr v20.8h, v20.8h, #11 + mls v1.8h, v16.8h, v7.h[0] + mls v11.8h, v0.8h, v7.h[0] + mls v21.8h, v20.8h, v7.h[0] + stur q1, [x3, #-32] + stur q11, [x3, #-16] + stur q21, [x3, #-48] + mov x4, #0x4 + ldr q0, [x1], #32 + ldur q1, [x1, #-16] + ldr q6, [x0, #64] + ldr q16, [x0] + ldr q18, [x0, #192] + ldr q27, [x0, #128] + ldr q26, [x0, #320] + ldr q5, [x0, #256] + ldr q4, [x0, #448] + ldr q2, [x0, #384] + add v12.8h, v16.8h, v6.8h + sub v11.8h, v16.8h, v6.8h + add v3.8h, v27.8h, v18.8h + sub v21.8h, v27.8h, v18.8h + sub v18.8h, v5.8h, v26.8h + mul v10.8h, v11.8h, v0.h[6] + add v24.8h, v5.8h, v26.8h + sqrdmulh v27.8h, v18.8h, v1.h[3] + sub v19.8h, v12.8h, v3.8h + add v29.8h, v12.8h, v3.8h + mul v14.8h, v18.8h, v1.h[2] + sub v13.8h, v2.8h, v4.8h + sqrdmulh v31.8h, v21.8h, v1.h[1] + sqrdmulh v26.8h, v11.8h, v0.h[7] + mul v21.8h, v21.8h, v1.h[0] + sub x4, x4, #0x1 + +layer012_start: + mls v14.8h, v27.8h, v7.h[0] + ldr q15, [x0, #16] + ldr q9, [x0, #208] + add v18.8h, v2.8h, v4.8h + mul v17.8h, v13.8h, v1.h[4] + ldr q20, [x0, #80] + ldr q2, [x0, #400] + ldr q5, [x0, #272] + sub v11.8h, v24.8h, v18.8h + sqrdmulh v8.8h, v13.8h, v1.h[5] + ldr q23, [x0, #336] + sqrdmulh v16.8h, v11.8h, v0.h[5] + sub v12.8h, v15.8h, v20.8h + ldr q3, [x0, #144] + add v28.8h, v15.8h, v20.8h + add v4.8h, v24.8h, v18.8h + mul v30.8h, v11.8h, v0.h[4] + sub v20.8h, v5.8h, v23.8h + add v24.8h, v5.8h, v23.8h + mls v17.8h, v8.8h, v7.h[0] + sub v11.8h, v29.8h, v4.8h + mls v30.8h, v16.8h, v7.h[0] + sqrdmulh v27.8h, v20.8h, v1.h[3] + add v16.8h, v14.8h, v17.8h + sub v13.8h, v14.8h, v17.8h + sqrdmulh v23.8h, v19.8h, v0.h[3] + sub v25.8h, v3.8h, v9.8h + add v5.8h, v3.8h, v9.8h + mul v6.8h, v19.8h, v0.h[2] + mul v8.8h, v11.8h, v0.h[0] + mls v10.8h, v26.8h, v7.h[0] + sqrdmulh v26.8h, v12.8h, v0.h[7] + mul v14.8h, v20.8h, v1.h[2] + mul v22.8h, v13.8h, v0.h[4] + mls v21.8h, v31.8h, v7.h[0] + sqrdmulh v9.8h, v11.8h, v0.h[1] + sqrdmulh v20.8h, v13.8h, v0.h[5] + sub v13.8h, v10.8h, v21.8h + add v15.8h, v10.8h, 
v21.8h + sqrdmulh v31.8h, v25.8h, v1.h[1] + sqrdmulh v21.8h, v13.8h, v0.h[3] + sub v18.8h, v15.8h, v16.8h + add v3.8h, v15.8h, v16.8h + add v4.8h, v29.8h, v4.8h + mls v22.8h, v20.8h, v7.h[0] + sub v19.8h, v28.8h, v5.8h + mls v6.8h, v23.8h, v7.h[0] + str q4, [x0], #16 + mul v29.8h, v13.8h, v0.h[2] + mls v29.8h, v21.8h, v7.h[0] + add v11.8h, v6.8h, v30.8h + mls v8.8h, v9.8h, v7.h[0] + str q11, [x0, #112] + sub v11.8h, v6.8h, v30.8h + sqrdmulh v21.8h, v18.8h, v0.h[1] + sqrdmulh v4.8h, v11.8h, v0.h[1] + str q8, [x0, #240] + sub v16.8h, v29.8h, v22.8h + str q3, [x0, #48] + mul v20.8h, v11.8h, v0.h[0] + sqrdmulh v11.8h, v16.8h, v0.h[1] + mls v20.8h, v4.8h, v7.h[0] + mul v23.8h, v16.8h, v0.h[0] + mls v23.8h, v11.8h, v7.h[0] + str q20, [x0, #368] + mul v11.8h, v18.8h, v0.h[0] + mls v11.8h, v21.8h, v7.h[0] + str q23, [x0, #432] + ldr q4, [x0, #448] + mul v10.8h, v12.8h, v0.h[6] + add v12.8h, v29.8h, v22.8h + add v29.8h, v28.8h, v5.8h + mul v21.8h, v25.8h, v1.h[0] + str q12, [x0, #176] + str q11, [x0, #304] + sub v13.8h, v2.8h, v4.8h + sub x4, x4, #0x1 + cbnz x4, layer012_start + + mls v21.8h, v31.8h, v7.h[0] + add v22.8h, v2.8h, v4.8h + sqrdmulh v15.8h, v13.8h, v1.h[5] + add v8.8h, v24.8h, v22.8h + sub v17.8h, v29.8h, v8.8h + mul v28.8h, v13.8h, v1.h[4] + add v29.8h, v29.8h, v8.8h + sub v13.8h, v24.8h, v22.8h + sqrdmulh v25.8h, v17.8h, v0.h[1] + str q29, [x0], #16 + mls v28.8h, v15.8h, v7.h[0] + mls v10.8h, v26.8h, v7.h[0] + mul v29.8h, v17.8h, v0.h[0] + mls v29.8h, v25.8h, v7.h[0] + mls v14.8h, v27.8h, v7.h[0] + sqrdmulh v20.8h, v13.8h, v0.h[5] + str q29, [x0, #240] + mul v4.8h, v13.8h, v0.h[4] + add v12.8h, v10.8h, v21.8h + add v22.8h, v14.8h, v28.8h + sub v8.8h, v10.8h, v21.8h + sqrdmulh v11.8h, v19.8h, v0.h[3] + add v6.8h, v12.8h, v22.8h + sub v3.8h, v14.8h, v28.8h + mls v4.8h, v20.8h, v7.h[0] + str q6, [x0, #48] + sub v16.8h, v12.8h, v22.8h + mul v12.8h, v19.8h, v0.h[2] + mul v14.8h, v3.8h, v0.h[4] + sqrdmulh v22.8h, v3.8h, v0.h[5] + mls v12.8h, v11.8h, v7.h[0] + mul 
v20.8h, v8.8h, v0.h[2] + mls v14.8h, v22.8h, v7.h[0] + add v5.8h, v12.8h, v4.8h + sub v21.8h, v12.8h, v4.8h + sqrdmulh v4.8h, v8.8h, v0.h[3] + str q5, [x0, #112] + sqrdmulh v9.8h, v21.8h, v0.h[1] + mul v19.8h, v21.8h, v0.h[0] + mls v20.8h, v4.8h, v7.h[0] + mls v19.8h, v9.8h, v7.h[0] + sqrdmulh v9.8h, v16.8h, v0.h[1] + sub v5.8h, v20.8h, v14.8h + add v4.8h, v20.8h, v14.8h + mul v20.8h, v16.8h, v0.h[0] + str q4, [x0, #176] + sqrdmulh v18.8h, v5.8h, v0.h[1] + str q19, [x0, #368] + mul v23.8h, v5.8h, v0.h[0] + mls v20.8h, v9.8h, v7.h[0] + mls v23.8h, v18.8h, v7.h[0] + str q20, [x0, #304] + str q23, [x0, #432] + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #0x40 + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_mulcache_compute.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_mulcache_compute.S new file mode 100644 index 0000000000..cebd18dcb2 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_mulcache_compute.S @@ -0,0 +1,67 @@ +// Copyright (c) 2024 The mlkem-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +// ---------------------------------------------------------------------------- +// Precompute the mulcache data for a polynomial in the NTT domain +// Inputs a[256], z[128] and t[128] (signed 16-bit words); output x[128] (signed 16-bit words) +// +// The input array a is assumed to represent 128 linear polynomials +// in the NTT domain, p_i = a[2i] + a[2i+1] * X where each p_i is in +// Fq[X]/(X^2-zeta^i'), with zeta^i' being a power of zeta = 17, with i +// bit-reversed as used for NTTs. For each such polynomial, the mulcache +// value is a[2i+1] * zeta^i' (modulo 3329 as usual), a value useful to +// perform base multiplication of polynomials efficiently. 
The two other +// table arguments z = zetas and t = twisted zetas are expected to point +// to tables of zeta-related constants whose definitions can be found in +// the mlkem-native repo (mlkem/native/aarch64/src/aarch64_zetas.c) or +// our "tests/test.c", as "mulcache_zetas" and "mulcache_zetas_twisted". +// +// extern void mlkem_mulcache_compute +// (int16_t x[128],const int16_t a[256], +// const int16_t z[128],const int16_t t[128]); +// +// Standard ARM ABI: X0 = x, X1 = a, X2 = z, X3 = t +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_mulcache_compute) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_mulcache_compute) + .text + .balign 4 + +S2N_BN_SYMBOL(mlkem_mulcache_compute): + +// This matches the code in the mlkem-native repository +// https://github.com/pq-code-package/mlkem-native/blob/main/mlkem/native/aarch64/src/poly_mulcache_compute_asm.S + + mov w5, #0xd01 // 0xd01 = 3329 + dup v6.8h, w5 // v6 = 3329 (the modulus) in every 16-bit lane + mov w5, #0x4ebf // 0x4ebf = 20159 + dup v7.8h, w5 // NOTE(review): v7 is reloaded from t in the loop before any visible read; confirm this setup is needed + mov x4, #0x10 // 16 blocks, each consuming 16 coefficients of a + ldr q1, [x1, #0x10] + ldr q27, [x1], #0x20 + ldr q23, [x2], #0x10 // next 8 entries of z + uzp2 v27.8h, v27.8h, v1.8h // keep the odd-indexed coefficients a[2i+1] + ldr q1, [x3], #0x10 // next 8 entries of t + mul v2.8h, v27.8h, v23.8h // low 16 bits of a[2i+1] * z + sqrdmulh v27.8h, v27.8h, v1.8h // rounded high half of 2 * a[2i+1] * t + sub x4, x4, #0x1 // the first block was started above the loop + +mlkem_mulcache_compute_loop: + ldr q29, [x1, #0x10] + ldr q21, [x2], #0x10 + mls v2.8h, v27.8h, v6.h[0] // finish previous block: v2 = v2 - v27 * 3329 + ldr q27, [x1], #0x20 + ldr q7, [x3], #0x10 + uzp2 v28.8h, v27.8h, v29.8h + str q2, [x0], #0x10 // store 8 mulcache values + mul v2.8h, v28.8h, v21.8h + sqrdmulh v27.8h, v28.8h, v7.8h + sub x4, x4, #0x1 + cbnz x4, mlkem_mulcache_compute_loop + + mls v2.8h, v27.8h, v6.h[0] // finish the last block + str q2, [x0], #0x10 + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_ntt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_ntt.S new file mode 100644 index 0000000000..fda5504d7c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_ntt.S @@ -0,0 +1,363 @@ +// Copyright (c) 2022 Arm Limited +// Copyright (c) 2022 Hanno Becker 
+// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +// Copyright (c) 2024 The mlkem-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +// ---------------------------------------------------------------------------- +// Forward number-theoretic transform from ML-KEM +// Input a[256], z_01234[80], z_56[384] (all signed 16-bit words); output a[256] (signed 16-bit words). +// +// The transform is in-place with input and output a[256], with the output +// in bitreversed order. The two other parameters are expected to point to +// tables of constants whose definitions can be found in the mlkem-native +// repo (mlkem/native/aarch64/src/aarch64_zetas.c) or our "tests/test.c". +// +// extern void mlkem_ntt(int16_t a[256],const int16_t z_01234[80],const int16_t z_56[384]); +// +// Standard ARM ABI: X0 = a, X1 = z_01234, X2 = z_56 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_ntt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_ntt) + .text + .balign 4 + +S2N_BN_SYMBOL(mlkem_ntt): + +// This implementation is generated by SLOTHY, set up to optimize for +// the Neoverse N1 microarchitecture, starting from the clean version +// +// https://github.com/pq-code-package/mlkem-native/blob/main/mlkem/native/aarch64/src/ntt_clean.S +// +// in the mlkem-native repository. 
+ + sub sp, sp, #0x40 /* reserve 64 bytes of stack for d8-d15 */ + stp d8, d9, [sp] /* save d8-d15; they are reloaded just before ret */ + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + mov w5, #0xd01 + mov v7.h[0], w5 /* v7.h[0] = 0xd01 = 3329, used by the mls instructions below */ + mov w5, #0x4ebf + mov v7.h[1], w5 /* v7.h[1] = 0x4ebf = 20159; NOTE(review): no read of v7.h[1] is visible in this routine, confirm */ + mov x3, x0 /* keep the base of a; x0 is reset from x3 after the first loop */ + mov x4, #0x4 /* iteration count for layer123_start */ + ldr q0, [x1], #32 + ldur q1, [x1, #-16] /* v0, v1 = first 16 constants of z_01234 */ + ldr q26, [x0, #384] + ldr q14, [x0, #256] + ldr q12, [x0, #64] + ldr q4, [x0, #192] + ldr q11, [x0] + ldr q22, [x0, #320] + ldr q10, [x0, #448] + ldr q28, [x0, #128] + sqrdmulh v23.8h, v26.8h, v0.h[1] + mul v26.8h, v26.8h, v0.h[0] + sqrdmulh v24.8h, v14.8h, v0.h[1] + mul v9.8h, v14.8h, v0.h[0] + sqrdmulh v14.8h, v22.8h, v0.h[1] + mul v22.8h, v22.8h, v0.h[0] + sqrdmulh v29.8h, v10.8h, v0.h[1] + mls v22.8h, v14.8h, v7.h[0] + mls v26.8h, v23.8h, v7.h[0] + mul v14.8h, v10.8h, v0.h[0] + add v10.8h, v12.8h, v22.8h + sub v12.8h, v12.8h, v22.8h + mls v14.8h, v29.8h, v7.h[0] + sub v22.8h, v28.8h, v26.8h + add v23.8h, v28.8h, v26.8h + mls v9.8h, v24.8h, v7.h[0] + sqrdmulh v26.8h, v22.8h, v0.h[5] + sub v28.8h, v4.8h, v14.8h + add v14.8h, v4.8h, v14.8h + mul v13.8h, v22.8h, v0.h[4] + sub v18.8h, v11.8h, v9.8h + sqrdmulh v4.8h, v28.8h, v0.h[5] + sqrdmulh v22.8h, v14.8h, v0.h[3] + mul v28.8h, v28.8h, v0.h[4] + mls v28.8h, v4.8h, v7.h[0] + mul v14.8h, v14.8h, v0.h[2] + mls v14.8h, v22.8h, v7.h[0] + add v4.8h, v12.8h, v28.8h + sub v12.8h, v12.8h, v28.8h + mls v13.8h, v26.8h, v7.h[0] + sqrdmulh v26.8h, v4.8h, v1.h[3] + sub v22.8h, v10.8h, v14.8h + add v19.8h, v10.8h, v14.8h + mul v3.8h, v4.8h, v1.h[2] + sqrdmulh v14.8h, v22.8h, v1.h[1] + mul v28.8h, v22.8h, v1.h[0] + mls v3.8h, v26.8h, v7.h[0] + mls v28.8h, v14.8h, v7.h[0] + sqrdmulh v14.8h, v12.8h, v1.h[5] + mul v26.8h, v12.8h, v1.h[4] + sqrdmulh v21.8h, v19.8h, v0.h[7] + mls v26.8h, v14.8h, v7.h[0] + sub x4, x4, #0x1 /* the first block was started above the loop */ + +layer123_start: + ldr q17, [x0, #400] + mul v22.8h, v19.8h, v0.h[6] + ldr q24, [x0, #272] + add v19.8h, v18.8h, v13.8h + mls v22.8h, v21.8h, v7.h[0] + sub v21.8h, v18.8h, v13.8h + ldr q13, [x0, #208] + ldr q18, [x0, #80] + sqrdmulh v14.8h, v17.8h, v0.h[1] + 
ldr q30, [x0, #336] + ldr q4, [x0, #464] + add v2.8h, v11.8h, v9.8h + sqrdmulh v27.8h, v23.8h, v0.h[3] + ldr q11, [x0, #16] + sub v16.8h, v21.8h, v26.8h + ldr q6, [x0, #144] + sub v29.8h, v19.8h, v3.8h + mul v15.8h, v23.8h, v0.h[2] + add v12.8h, v21.8h, v26.8h + sqrdmulh v26.8h, v30.8h, v0.h[1] + add v3.8h, v19.8h, v3.8h + str q29, [x0, #320] + mls v15.8h, v27.8h, v7.h[0] + str q3, [x0, #256] + str q12, [x0, #384] + mul v10.8h, v17.8h, v0.h[0] + str q16, [x0, #448] + mls v10.8h, v14.8h, v7.h[0] + sub v17.8h, v2.8h, v15.8h + mul v12.8h, v30.8h, v0.h[0] + sub v16.8h, v17.8h, v28.8h + mls v12.8h, v26.8h, v7.h[0] + sub v25.8h, v6.8h, v10.8h + str q16, [x0, #192] + sqrdmulh v8.8h, v24.8h, v0.h[1] + add v23.8h, v6.8h, v10.8h + sqrdmulh v14.8h, v4.8h, v0.h[1] + add v16.8h, v18.8h, v12.8h + mul v9.8h, v24.8h, v0.h[0] + sub v5.8h, v18.8h, v12.8h + add v26.8h, v17.8h, v28.8h + mul v21.8h, v4.8h, v0.h[0] + add v27.8h, v2.8h, v15.8h + str q26, [x0, #128] + mls v21.8h, v14.8h, v7.h[0] + sub v12.8h, v27.8h, v22.8h + add v29.8h, v27.8h, v22.8h + mls v9.8h, v8.8h, v7.h[0] + str q12, [x0, #64] + str q29, [x0], #16 + sqrdmulh v18.8h, v25.8h, v0.h[5] + sub v26.8h, v13.8h, v21.8h + add v21.8h, v13.8h, v21.8h + mul v13.8h, v25.8h, v0.h[4] + sqrdmulh v28.8h, v26.8h, v0.h[5] + sqrdmulh v15.8h, v21.8h, v0.h[3] + mul v31.8h, v26.8h, v0.h[4] + mls v31.8h, v28.8h, v7.h[0] + mul v27.8h, v21.8h, v0.h[2] + mls v27.8h, v15.8h, v7.h[0] + add v10.8h, v5.8h, v31.8h + mls v13.8h, v18.8h, v7.h[0] + sub v18.8h, v11.8h, v9.8h + sub v14.8h, v5.8h, v31.8h + sqrdmulh v25.8h, v10.8h, v1.h[3] + sub v26.8h, v16.8h, v27.8h + add v19.8h, v16.8h, v27.8h + mul v3.8h, v10.8h, v1.h[2] + sqrdmulh v29.8h, v26.8h, v1.h[1] + mul v28.8h, v26.8h, v1.h[0] + mls v3.8h, v25.8h, v7.h[0] + mls v28.8h, v29.8h, v7.h[0] + sqrdmulh v4.8h, v14.8h, v1.h[5] + mul v26.8h, v14.8h, v1.h[4] + sqrdmulh v21.8h, v19.8h, v0.h[7] + mls v26.8h, v4.8h, v7.h[0] + sub x4, x4, #0x1 + cbnz x4, layer123_start + + add v20.8h, v18.8h, v13.8h + 
sqrdmulh v16.8h, v23.8h, v0.h[3] + sub v8.8h, v18.8h, v13.8h + sub v5.8h, v20.8h, v3.8h + mul v27.8h, v23.8h, v0.h[2] + add v31.8h, v11.8h, v9.8h + add v14.8h, v8.8h, v26.8h + sub v26.8h, v8.8h, v26.8h + str q5, [x0, #320] + mls v27.8h, v16.8h, v7.h[0] + str q14, [x0, #384] + mul v25.8h, v19.8h, v0.h[6] + str q26, [x0, #448] + mls v25.8h, v21.8h, v7.h[0] + sub v26.8h, v31.8h, v27.8h + add v15.8h, v20.8h, v3.8h + add v29.8h, v31.8h, v27.8h + sub v22.8h, v26.8h, v28.8h + str q15, [x0, #256] + add v26.8h, v26.8h, v28.8h + sub v27.8h, v29.8h, v25.8h + add v20.8h, v29.8h, v25.8h + str q26, [x0, #128] + str q22, [x0, #192] + str q27, [x0, #64] + str q20, [x0], #16 + mov x0, x3 + mov x4, #0x8 + ldr q11, [x1], #16 + ldr q9, [x0, #48] + ldr q30, [x0, #32] + ldr q12, [x2], #96 + ldur q5, [x2, #-64] + sqrdmulh v23.8h, v9.8h, v11.h[1] + ldr q15, [x0] + mul v6.8h, v9.8h, v11.h[0] + ldr q28, [x0, #16] + sqrdmulh v10.8h, v30.8h, v11.h[1] + mls v6.8h, v23.8h, v7.h[0] + mul v0.8h, v30.8h, v11.h[0] + mls v0.8h, v10.8h, v7.h[0] + ldur q21, [x2, #-48] + add v27.8h, v28.8h, v6.8h + sub v25.8h, v28.8h, v6.8h + mul v6.8h, v27.8h, v11.h[2] + sqrdmulh v17.8h, v27.8h, v11.h[3] + sqrdmulh v19.8h, v25.8h, v11.h[5] + ldur q18, [x2, #-80] + mul v22.8h, v25.8h, v11.h[4] + mls v6.8h, v17.8h, v7.h[0] + mls v22.8h, v19.8h, v7.h[0] + sub x4, x4, #0x1 + +layer4567_start: + ldr q27, [x0, #112] + ldr q4, [x1], #16 + sub v1.8h, v15.8h, v0.8h + ldr q30, [x0, #96] + add v23.8h, v15.8h, v0.8h + add v11.8h, v1.8h, v22.8h + ldur q10, [x2, #-16] + sub v24.8h, v23.8h, v6.8h + sqrdmulh v25.8h, v27.8h, v4.h[1] + add v9.8h, v23.8h, v6.8h + sub v16.8h, v1.8h, v22.8h + mul v31.8h, v27.8h, v4.h[0] + trn2 v14.4s, v9.4s, v24.4s + sqrdmulh v1.8h, v30.8h, v4.h[1] + trn2 v8.4s, v11.4s, v16.4s + trn1 v11.4s, v11.4s, v16.4s + trn2 v19.2d, v14.2d, v8.2d + mls v31.8h, v25.8h, v7.h[0] + ldr q15, [x0, #80] + trn1 v2.4s, v9.4s, v24.4s + sqrdmulh v17.8h, v19.8h, v18.8h + trn1 v9.2d, v14.2d, v8.2d + mul v29.8h, v19.8h, v12.8h + 
trn2 v27.2d, v2.2d, v11.2d + add v0.8h, v15.8h, v31.8h + sub v16.8h, v15.8h, v31.8h + sqrdmulh v13.8h, v27.8h, v18.8h + mls v29.8h, v17.8h, v7.h[0] + ldur q17, [x2, #-32] + ldr q15, [x0, #64] + mul v24.8h, v27.8h, v12.8h + trn1 v19.2d, v2.2d, v11.2d + ldr q18, [x2, #16] + mul v6.8h, v0.8h, v4.h[2] + ldr q12, [x2], #96 + add v25.8h, v9.8h, v29.8h + mls v24.8h, v13.8h, v7.h[0] + sub v20.8h, v9.8h, v29.8h + sqrdmulh v23.8h, v25.8h, v21.8h + mul v3.8h, v25.8h, v5.8h + add v13.8h, v19.8h, v24.8h + sqrdmulh v27.8h, v20.8h, v10.8h + mls v3.8h, v23.8h, v7.h[0] + sub v29.8h, v19.8h, v24.8h + mul v26.8h, v20.8h, v17.8h + mls v26.8h, v27.8h, v7.h[0] + add v27.8h, v13.8h, v3.8h + sqrdmulh v14.8h, v0.8h, v4.h[3] + sub v19.8h, v13.8h, v3.8h + ldur q5, [x2, #-64] + mul v0.8h, v30.8h, v4.h[0] + trn1 v8.4s, v27.4s, v19.4s + sub v21.8h, v29.8h, v26.8h + add v11.8h, v29.8h, v26.8h + trn2 v30.4s, v27.4s, v19.4s + sqrdmulh v19.8h, v16.8h, v4.h[5] + trn2 v10.4s, v11.4s, v21.4s + mul v22.8h, v16.8h, v4.h[4] + trn1 v28.4s, v11.4s, v21.4s + trn1 v29.2d, v30.2d, v10.2d + ldur q21, [x2, #-48] + mls v0.8h, v1.8h, v7.h[0] + trn2 v20.2d, v8.2d, v28.2d + trn2 v26.2d, v30.2d, v10.2d + str q29, [x0, #16] + trn1 v3.2d, v8.2d, v28.2d + mls v22.8h, v19.8h, v7.h[0] + str q20, [x0, #32] + str q26, [x0, #48] + mls v6.8h, v14.8h, v7.h[0] + str q3, [x0], #64 + sub x4, x4, #0x1 + cbnz x4, layer4567_start + + add v4.8h, v15.8h, v0.8h + sub v14.8h, v15.8h, v0.8h + ldur q26, [x2, #-16] + add v8.8h, v14.8h, v22.8h + sub v27.8h, v14.8h, v22.8h + sub v13.8h, v4.8h, v6.8h + add v29.8h, v4.8h, v6.8h + trn2 v11.4s, v8.4s, v27.4s + trn2 v1.4s, v29.4s, v13.4s + trn1 v27.4s, v8.4s, v27.4s + trn1 v17.4s, v29.4s, v13.4s + trn2 v28.2d, v1.2d, v11.2d + trn1 v16.2d, v1.2d, v11.2d + sqrdmulh v0.8h, v28.8h, v18.8h + trn2 v19.2d, v17.2d, v27.2d + trn1 v1.2d, v17.2d, v27.2d + mul v27.8h, v28.8h, v12.8h + sqrdmulh v9.8h, v19.8h, v18.8h + mls v27.8h, v0.8h, v7.h[0] + mul v20.8h, v19.8h, v12.8h + mls v20.8h, v9.8h, v7.h[0] + sub 
v30.8h, v16.8h, v27.8h + add v25.8h, v16.8h, v27.8h + sqrdmulh v14.8h, v30.8h, v26.8h + ldur q26, [x2, #-32] + sqrdmulh v31.8h, v25.8h, v21.8h + add v27.8h, v1.8h, v20.8h + mul v15.8h, v30.8h, v26.8h + mls v15.8h, v14.8h, v7.h[0] + sub v26.8h, v1.8h, v20.8h + mul v12.8h, v25.8h, v5.8h + mls v12.8h, v31.8h, v7.h[0] + sub v2.8h, v26.8h, v15.8h + add v0.8h, v26.8h, v15.8h + trn2 v14.4s, v0.4s, v2.4s + sub v26.8h, v27.8h, v12.8h + add v27.8h, v27.8h, v12.8h + trn1 v17.4s, v0.4s, v2.4s + trn1 v5.4s, v27.4s, v26.4s + trn2 v26.4s, v27.4s, v26.4s + trn1 v25.2d, v26.2d, v14.2d + trn1 v0.2d, v5.2d, v17.2d + trn2 v21.2d, v26.2d, v14.2d + trn2 v26.2d, v5.2d, v17.2d + str q25, [x0, #16] + str q0, [x0], #64 + stur q21, [x0, #-16] + stur q26, [x0, #-32] + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #0x40 + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_poly_reduce.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_poly_reduce.S new file mode 100644 index 0000000000..704edaa67b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_poly_reduce.S @@ -0,0 +1,104 @@ +// Copyright (c) 2024 The mlkem-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +// ---------------------------------------------------------------------------- +// Canonical reduction of polynomial coefficients for ML-KEM +// Input a[256] (signed 16-bit words); output a[256] (signed 16-bit words) +// +// This reduces each element of the 256-element array of 16-bit signed +// integers modulo 3329 with the result being 0 <= r < 3329, in-place. +// This is intended for use when that array represents polynomial +// coefficients for ML-KEM, but that is not relevant to its operation. 
+// +// extern void mlkem_poly_reduce(int16_t a[256]); +// +// Standard ARM ABI: X0 = a +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_poly_reduce) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_poly_reduce) + .text + .balign 4 + +S2N_BN_SYMBOL(mlkem_poly_reduce): + +// This matches the code in the mlkem-native repository +// https://github.com/pq-code-package/mlkem-native/blob/main/mlkem/native/aarch64/src/poly_reduce_asm.S + + mov w2, #0xd01 + dup v3.8h, w2 + mov w2, #0x4ebf + dup v4.8h, w2 + mov x1, #0x8 + ldr q21, [x0, #0x20] + ldr q23, [x0, #0x30] + sqdmulh v7.8h, v21.8h, v4.h[0] + sqdmulh v30.8h, v23.8h, v4.h[0] + srshr v7.8h, v7.8h, #0xb + srshr v30.8h, v30.8h, #0xb + mls v21.8h, v7.8h, v3.h[0] + mls v23.8h, v30.8h, v3.h[0] + ldr q5, [x0, #0x10] + sshr v7.8h, v21.8h, #0xf + sshr v30.8h, v23.8h, #0xf + and v7.16b, v3.16b, v7.16b + add v21.8h, v21.8h, v7.8h + and v7.16b, v3.16b, v30.16b + add v16.8h, v23.8h, v7.8h + sub x1, x1, #0x1 + +mlkem_poly_reduce_loop: + ldr q6, [x0], #0x40 + ldr q30, [x0, #0x20] + sqdmulh v31.8h, v6.8h, v4.h[0] + sqdmulh v29.8h, v5.8h, v4.h[0] + sqdmulh v22.8h, v30.8h, v4.h[0] + stur q16, [x0, #-0x10] + srshr v20.8h, v31.8h, #0xb + srshr v28.8h, v29.8h, #0xb + stur q21, [x0, #-0x20] + mls v6.8h, v20.8h, v3.h[0] + mls v5.8h, v28.8h, v3.h[0] + ldr q2, [x0, #0x30] + sshr v31.8h, v6.8h, #0xf + srshr v19.8h, v22.8h, #0xb + and v22.16b, v3.16b, v31.16b + add v0.8h, v6.8h, v22.8h + mls v30.8h, v19.8h, v3.h[0] + sshr v26.8h, v5.8h, #0xf + sqdmulh v25.8h, v2.8h, v4.h[0] + and v17.16b, v3.16b, v26.16b + add v1.8h, v5.8h, v17.8h + sshr v31.8h, v30.8h, #0xf + srshr v25.8h, v25.8h, #0xb + stur q1, [x0, #-0x30] + and v18.16b, v3.16b, v31.16b + mls v2.8h, v25.8h, v3.h[0] + add v21.8h, v30.8h, v18.8h + ldr q5, [x0, #0x10] + sshr v18.8h, v2.8h, #0xf + stur q0, [x0, #-0x40] + and v27.16b, v3.16b, v18.16b + add v16.8h, v2.8h, v27.8h + sub x1, x1, #0x1 + 
cbnz x1, mlkem_poly_reduce_loop + sqdmulh v20.8h, v5.8h, v4.h[0] + ldr q24, [x0], #0x40 + stur q21, [x0, #-0x20] + srshr v20.8h, v20.8h, #0xb + sqdmulh v25.8h, v24.8h, v4.h[0] + stur q16, [x0, #-0x10] + mls v5.8h, v20.8h, v3.h[0] + srshr v20.8h, v25.8h, #0xb + sshr v2.8h, v5.8h, #0xf + mls v24.8h, v20.8h, v3.h[0] + and v20.16b, v3.16b, v2.16b + add v31.8h, v5.8h, v20.8h + sshr v20.8h, v24.8h, #0xf + stur q31, [x0, #-0x30] + and v31.16b, v3.16b, v20.16b + add v24.8h, v24.8h, v31.8h + stur q24, [x0, #-0x40] + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_poly_tobytes.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_poly_tobytes.S new file mode 100644 index 0000000000..9335d6367a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_poly_tobytes.S @@ -0,0 +1,115 @@ +// Copyright (c) 2024 The mlkem-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +// ---------------------------------------------------------------------------- +// Pack ML-KEM polynomial coefficients as 12-bit numbers +// Input a[256] (signed 16-bit words); output r[384] (bytes) +// +// This accepts an array of 256 16-bit numbers assumed to be in the range +// 0 <= a[i] < 2^12 (typically they will be < 3329, the ML-KEM prime). +// It packs them into the output array as 12-bit unsigned numbers. 
+// +// extern void mlkem_poly_tobytes(uint8_t r[384],const int16_t a[256]); +// +// Standard ARM ABI: X0 = r, X1 = a +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_poly_tobytes) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_poly_tobytes) + .text + .balign 4 + +// This code is essentially a verbatim copy of the mlkem-native version +// https://github.com/pq-code-package/mlkem-native/blob/main/mlkem/native/aarch64/src/poly_tobytes_asm.S + +S2N_BN_SYMBOL(mlkem_poly_tobytes): + mov x2, #0x10 + ldr q6, [x1], #0x20 + ldur q24, [x1, #-0x10] + ldr q30, [x1], #0x20 + ldur q22, [x1, #-0x10] + ldr q5, [x1], #0x20 + ldur q17, [x1, #-0x10] + ldr q19, [x1], #0x20 + ldur q4, [x1, #-0x10] + lsr x2, x2, #2 + sub x2, x2, #0x1 + +mlkem_poly_tobytes_asm_asm_loop_start: + uzp1 v25.8h, v6.8h, v24.8h + uzp2 v6.8h, v6.8h, v24.8h + xtn v24.8b, v25.8h + shrn v25.8b, v25.8h, #0x8 + xtn v18.8b, v6.8h + shrn v26.8b, v6.8h, #0x4 + sli v25.8b, v18.8b, #0x4 + st3 { v24.8b, v25.8b, v26.8b }, [x0], #24 + uzp1 v25.8h, v30.8h, v22.8h + uzp2 v6.8h, v30.8h, v22.8h + xtn v24.8b, v25.8h + xtn v18.8b, v6.8h + uzp1 v30.8h, v5.8h, v17.8h + uzp2 v22.8h, v5.8h, v17.8h + xtn v5.8b, v30.8h + xtn v17.8b, v22.8h + uzp1 v28.8h, v19.8h, v4.8h + uzp2 v19.8h, v19.8h, v4.8h + xtn v4.8b, v28.8h + xtn v20.8b, v19.8h + shrn v25.8b, v25.8h, #0x8 + sli v25.8b, v18.8b, #0x4 + shrn v26.8b, v6.8h, #0x4 + st3 { v24.8b, v25.8b, v26.8b }, [x0], #24 + shrn v6.8b, v30.8h, #0x8 + sli v6.8b, v17.8b, #0x4 + shrn v7.8b, v22.8h, #0x4 + st3 { v5.8b, v6.8b, v7.8b }, [x0], #24 + shrn v5.8b, v28.8h, #0x8 + shrn v6.8b, v19.8h, #0x4 + sli v5.8b, v20.8b, #0x4 + st3 { v4.8b, v5.8b, v6.8b }, [x0], #24 + ldr q6, [x1], #0x20 + ldur q24, [x1, #-0x10] + ldr q30, [x1], #0x20 + ldur q22, [x1, #-0x10] + ldr q5, [x1], #0x20 + ldur q17, [x1, #-0x10] + ldr q19, [x1], #0x20 + ldur q4, [x1, #-0x10] + sub x2, x2, #0x1 + cbnz x2, 
mlkem_poly_tobytes_asm_asm_loop_start + uzp1 v25.8h, v30.8h, v22.8h + uzp2 v18.8h, v30.8h, v22.8h + uzp1 v30.8h, v6.8h, v24.8h + uzp2 v6.8h, v6.8h, v24.8h + uzp1 v24.8h, v5.8h, v17.8h + uzp2 v22.8h, v5.8h, v17.8h + uzp1 v5.8h, v19.8h, v4.8h + uzp2 v17.8h, v19.8h, v4.8h + xtn v19.8b, v25.8h + shrn v20.8b, v25.8h, #0x8 + xtn v25.8b, v18.8h + shrn v21.8b, v18.8h, #0x4 + xtn v28.8b, v30.8h + shrn v29.8b, v30.8h, #0x8 + xtn v18.8b, v6.8h + shrn v30.8b, v6.8h, #0x4 + xtn v1.8b, v24.8h + shrn v2.8b, v24.8h, #0x8 + xtn v6.8b, v22.8h + shrn v3.8b, v22.8h, #0x4 + xtn v22.8b, v5.8h + shrn v23.8b, v5.8h, #0x8 + xtn v5.8b, v17.8h + shrn v24.8b, v17.8h, #0x4 + sli v20.8b, v25.8b, #0x4 + sli v29.8b, v18.8b, #0x4 + st3 { v28.8b, v29.8b, v30.8b }, [x0], #24 + st3 { v19.8b, v20.8b, v21.8b }, [x0], #24 + sli v2.8b, v6.8b, #0x4 + st3 { v1.8b, v2.8b, v3.8b }, [x0], #24 + sli v23.8b, v5.8b, #0x4 + st3 { v22.8b, v23.8b, v24.8b }, [x0], #24 + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_poly_tomont.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_poly_tomont.S new file mode 100644 index 0000000000..d3cd763bc0 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_poly_tomont.S @@ -0,0 +1,85 @@ +// Copyright (c) 2024 The mlkem-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +// ---------------------------------------------------------------------------- +// Conversion of ML-KEM polynomial coefficients to Montgomery form +// Input a[256] (signed 16-bit words); output a[256] (signed 16-bit words) +// +// This converts each element of the 256-element array of 16-bit signed +// integers modulo 3329 into Montgomery form, giving a signed result +// satisfying (output[i] == 2^16 * input[i]) (mod 3329), without full +// modular reduction but with |output[i]| < 3329 guaranteed. 
+// +// extern void mlkem_poly_tomont(int16_t a[256]); +// +// Standard ARM ABI: X0 = a +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_poly_tomont) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_poly_tomont) + .text + .balign 4 + +S2N_BN_SYMBOL(mlkem_poly_tomont): + +// This matches the code in the mlkem-native repository +// https://github.com/pq-code-package/mlkem-native/blob/main/mlkem/native/aarch64/src/poly_tomont_asm.S + + mov w2, #0xd01 + dup v4.8h, w2 + mov w2, #0x4ebf + dup v5.8h, w2 + mov w2, #-0x414 + dup v2.8h, w2 + mov w2, #-0x2824 + dup v3.8h, w2 + mov x1, #0x8 + ldr q26, [x0, #0x30] + ldr q23, [x0, #0x10] + mul v17.8h, v26.8h, v2.8h + sqrdmulh v7.8h, v26.8h, v3.8h + ldr q27, [x0, #0x20] + sub x1, x1, #0x1 + +mlkem_poly_tomont_loop: + mls v17.8h, v7.8h, v4.h[0] + sqrdmulh v5.8h, v23.8h, v3.8h + ldr q7, [x0], #0x40 + stur q17, [x0, #-0x10] + sqrdmulh v29.8h, v27.8h, v3.8h + sqrdmulh v19.8h, v7.8h, v3.8h + mul v25.8h, v23.8h, v2.8h + mul v0.8h, v7.8h, v2.8h + mul v26.8h, v27.8h, v2.8h + ldr q7, [x0, #0x30] + mls v25.8h, v5.8h, v4.h[0] + ldr q23, [x0, #0x10] + mls v26.8h, v29.8h, v4.h[0] + mls v0.8h, v19.8h, v4.h[0] + stur q25, [x0, #-0x30] + mul v17.8h, v7.8h, v2.8h + sqrdmulh v7.8h, v7.8h, v3.8h + stur q0, [x0, #-0x40] + ldr q27, [x0, #0x20] + stur q26, [x0, #-0x20] + sub x1, x1, #0x1 + cbnz x1, mlkem_poly_tomont_loop + + mls v17.8h, v7.8h, v4.h[0] + sqrdmulh v7.8h, v23.8h, v3.8h + mul v26.8h, v23.8h, v2.8h + sqrdmulh v25.8h, v27.8h, v3.8h + ldr q23, [x0], #0x40 + mul v27.8h, v27.8h, v2.8h + mls v26.8h, v7.8h, v4.h[0] + sqrdmulh v7.8h, v23.8h, v3.8h + mul v23.8h, v23.8h, v2.8h + stur q17, [x0, #-0x10] + mls v27.8h, v25.8h, v4.h[0] + stur q26, [x0, #-0x30] + mls v23.8h, v7.8h, v4.h[0] + stur q27, [x0, #-0x20] + stur q23, [x0, #-0x40] + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_rej_uniform_VARIABLE_TIME.S 
b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_rej_uniform_VARIABLE_TIME.S new file mode 100644 index 0000000000..e2ebcf5aee --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_rej_uniform_VARIABLE_TIME.S @@ -0,0 +1,208 @@ +// Copyright (c) 2024 The mlkem-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +// ---------------------------------------------------------------------------- +// Uniform rejection sampling for ML-KEM +// Inputs *buf (unsigned bytes), buflen, table (unsigned bytes); output r[256] (signed 16-bit words), return +// +// extern uint64_t mlkem_rej_uniform_VARIABLE_TIME +// (int16_t r[S2N_BIGNUM_STATIC 256], +// const uint8_t *buf,uint64_t buflen, +// const uint8_t *table); +// +// Interprets the input buffer as packed 12-bit numbers with a length of +// buflen bytes, assumed to be a multiple of 24. Fills the output array +// with those numbers from the packed buffer that are < 3329, in the order +// of appearance, returning the total number of entries written, with a +// maximum of 256. The table argument is a specific precomputed table of +// constants that is defined in the following file (see also our test code): +// +// https://github.com/pq-code-package/mlkem-native/blob/main/mlkem/native/aarch64/src/rej_uniform_table.c +// +// Unique (at the moment) among s2n-bignum functions, this is *not* a +// constant-time function. The time taken depends not only on the +// buffer size "buflen", but also on how many elements of the buffer are +// needed to provide the 256 entries for the output. 
+// +// Standard ARM ABI: X0 = r, X1 = buf, X2 = buflen, X3 = table +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_rej_uniform_VARIABLE_TIME) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_rej_uniform_VARIABLE_TIME) + .text + .balign 4 + +S2N_BN_SYMBOL(mlkem_rej_uniform_VARIABLE_TIME): + +// This is almost identical to the code from mlkem-native: +// +// https://github.com/pq-code-package/mlkem-native/blob/main/mlkem/native/aarch64/src/rej_uniform_asm.S +// +// The only difference is systematic use of full-length scalar registers +// Xnn instead of the mixed use of 32-bit counterparts Wnn in most +// settings where that is applicable. + + sub sp, sp, #0x240 + mov x7, #0x1 + movk x7, #0x2, lsl #16 + movk x7, #0x4, lsl #32 + movk x7, #0x8, lsl #48 + mov v31.d[0], x7 + mov x7, #0x10 + movk x7, #0x20, lsl #16 + movk x7, #0x40, lsl #32 + movk x7, #0x80, lsl #48 + mov v31.d[1], x7 + mov w11, #0xd01 + dup v30.8h, w11 + mov x8, sp + mov x7, x8 + mov w11, #0x0 + eor v16.16b, v16.16b, v16.16b +mlkem_rej_uniform_initial_zero: + str q16, [x7], #0x40 + stur q16, [x7, #-0x30] + stur q16, [x7, #-0x20] + stur q16, [x7, #-0x10] + add x11, x11, #0x20 + cmp x11, #0x100 + b.lt mlkem_rej_uniform_initial_zero + mov x7, x8 + mov w9, #0x0 + mov w4, #0x100 + cmp x2, #0x30 + b.lo mlkem_rej_uniform_loop48_end + +mlkem_rej_uniform_loop48: + cmp x9, x4 + b.hs mlkem_rej_uniform_memory_copy + sub x2, x2, #0x30 + ld3 { v0.16b, v1.16b, v2.16b }, [x1], #48 + zip1 v4.16b, v0.16b, v1.16b + zip2 v5.16b, v0.16b, v1.16b + zip1 v6.16b, v1.16b, v2.16b + zip2 v7.16b, v1.16b, v2.16b + bic v4.8h, #0xf0, lsl #8 + bic v5.8h, #0xf0, lsl #8 + ushr v6.8h, v6.8h, #0x4 + ushr v7.8h, v7.8h, #0x4 + zip1 v16.8h, v4.8h, v6.8h + zip2 v17.8h, v4.8h, v6.8h + zip1 v18.8h, v5.8h, v7.8h + zip2 v19.8h, v5.8h, v7.8h + cmhi v4.8h, v30.8h, v16.8h + cmhi v5.8h, v30.8h, v17.8h + cmhi v6.8h, v30.8h, v18.8h + cmhi v7.8h, v30.8h, 
v19.8h + and v4.16b, v4.16b, v31.16b + and v5.16b, v5.16b, v31.16b + and v6.16b, v6.16b, v31.16b + and v7.16b, v7.16b, v31.16b + uaddlv s20, v4.8h + uaddlv s21, v5.8h + uaddlv s22, v6.8h + uaddlv s23, v7.8h + fmov w12, s20 + fmov w13, s21 + fmov w14, s22 + fmov w15, s23 + ldr q24, [x3, x12, lsl #4] + ldr q25, [x3, x13, lsl #4] + ldr q26, [x3, x14, lsl #4] + ldr q27, [x3, x15, lsl #4] + cnt v4.16b, v4.16b + cnt v5.16b, v5.16b + cnt v6.16b, v6.16b + cnt v7.16b, v7.16b + uaddlv s20, v4.8h + uaddlv s21, v5.8h + uaddlv s22, v6.8h + uaddlv s23, v7.8h + fmov w12, s20 + fmov w13, s21 + fmov w14, s22 + fmov w15, s23 + tbl v16.16b, { v16.16b }, v24.16b + tbl v17.16b, { v17.16b }, v25.16b + tbl v18.16b, { v18.16b }, v26.16b + tbl v19.16b, { v19.16b }, v27.16b + str q16, [x7] + add x7, x7, x12, lsl #1 + str q17, [x7] + add x7, x7, x13, lsl #1 + str q18, [x7] + add x7, x7, x14, lsl #1 + str q19, [x7] + add x7, x7, x15, lsl #1 + add x12, x12, x13 + add x14, x14, x15 + add x9, x9, x12 + add x9, x9, x14 + cmp x2, #0x30 + b.hs mlkem_rej_uniform_loop48 + +mlkem_rej_uniform_loop48_end: + cmp x9, x4 + b.hs mlkem_rej_uniform_memory_copy + cmp x2, #0x18 + b.lo mlkem_rej_uniform_memory_copy + sub x2, x2, #0x18 + ld3 { v0.8b, v1.8b, v2.8b }, [x1], #24 + zip1 v4.16b, v0.16b, v1.16b + zip1 v5.16b, v1.16b, v2.16b + bic v4.8h, #0xf0, lsl #8 + ushr v5.8h, v5.8h, #0x4 + zip1 v16.8h, v4.8h, v5.8h + zip2 v17.8h, v4.8h, v5.8h + cmhi v4.8h, v30.8h, v16.8h + cmhi v5.8h, v30.8h, v17.8h + and v4.16b, v4.16b, v31.16b + and v5.16b, v5.16b, v31.16b + uaddlv s20, v4.8h + uaddlv s21, v5.8h + fmov w12, s20 + fmov w13, s21 + ldr q24, [x3, x12, lsl #4] + ldr q25, [x3, x13, lsl #4] + cnt v4.16b, v4.16b + cnt v5.16b, v5.16b + uaddlv s20, v4.8h + uaddlv s21, v5.8h + fmov w12, s20 + fmov w13, s21 + tbl v16.16b, { v16.16b }, v24.16b + tbl v17.16b, { v17.16b }, v25.16b + str q16, [x7] + add x7, x7, x12, lsl #1 + str q17, [x7] + add x7, x7, x13, lsl #1 + add x9, x9, x12 + add x9, x9, x13 + 
+mlkem_rej_uniform_memory_copy: + cmp x9, x4 + csel x9, x9, x4, lo + mov x11, #0x0 + mov x7, x8 + +mlkem_rej_uniform_final_copy: + ldr q16, [x7], #0x40 + ldur q17, [x7, #-0x30] + ldur q18, [x7, #-0x20] + ldur q19, [x7, #-0x10] + str q16, [x0], #0x40 + stur q17, [x0, #-0x30] + stur q18, [x0, #-0x20] + stur q19, [x0, #-0x10] + add x11, x11, #0x20 + cmp x11, #0x100 + b.lt mlkem_rej_uniform_final_copy + mov x0, x9 + b mlkem_rej_uniform_return + +mlkem_rej_uniform_return: + add sp, sp, #0x240 + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/include/s2n-bignum.h b/third_party/s2n-bignum/s2n-bignum-imported/include/s2n-bignum.h index faecfec52a..2d740073fe 100644 --- a/third_party/s2n-bignum/s2n-bignum-imported/include/s2n-bignum.h +++ b/third_party/s2n-bignum/s2n-bignum-imported/include/s2n-bignum.h @@ -978,6 +978,46 @@ extern void edwards25519_scalarmulbase_alt(uint64_t res[S2N_BIGNUM_STATIC 8],con extern void edwards25519_scalarmuldouble(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4], const uint64_t point[S2N_BIGNUM_STATIC 8],const uint64_t bscalar[S2N_BIGNUM_STATIC 4]); extern void edwards25519_scalarmuldouble_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4], const uint64_t point[S2N_BIGNUM_STATIC 8],const uint64_t bscalar[S2N_BIGNUM_STATIC 4]); +// Scalar product of 2-element polynomial vectors in NTT domain, with mulcache +// Inputs a[512], b[512], bt[256] (signed 16-bit words); output r[256] (signed 16-bit words) +extern void mlkem_basemul_k2(int16_t r[S2N_BIGNUM_STATIC 256],const int16_t a[S2N_BIGNUM_STATIC 512],const int16_t b[S2N_BIGNUM_STATIC 512],const int16_t bt[S2N_BIGNUM_STATIC 256]); + +// Scalar product of 3-element polynomial vectors in NTT domain, with mulcache +// Inputs a[768], b[768], bt[384] (signed 16-bit words); output r[256] (signed 16-bit words) +extern void mlkem_basemul_k3(int16_t r[S2N_BIGNUM_STATIC 256],const int16_t a[S2N_BIGNUM_STATIC 768],const int16_t 
b[S2N_BIGNUM_STATIC 768],const int16_t bt[S2N_BIGNUM_STATIC 384]); + +// Scalar product of 4-element polynomial vectors in NTT domain, with mulcache +// Inputs a[1024], b[1024], bt[512] (signed 16-bit words); output r[256] (signed 16-bit words) +extern void mlkem_basemul_k4(int16_t r[S2N_BIGNUM_STATIC 256],const int16_t a[S2N_BIGNUM_STATIC 1024],const int16_t b[S2N_BIGNUM_STATIC 1024],const int16_t bt[S2N_BIGNUM_STATIC 512]); + +// Inverse number-theoretic transform from ML-KEM +// Input a[256] (signed 16-bit words), z_01234[80] (signed 16-bit words), z_56[384] (signed 16-bit words); output a[256] (signed 16-bit words) +extern void mlkem_intt(int16_t a[S2N_BIGNUM_STATIC 256],const int16_t z_01234[S2N_BIGNUM_STATIC 80],const int16_t z_56[S2N_BIGNUM_STATIC 384]); + +// Precompute the mulcache data for a polynomial in the NTT domain +// Inputs a[256], z[128] and t[128] (signed 16-bit words); output x[128] (signed 16-bit words) +extern void mlkem_mulcache_compute(int16_t x[S2N_BIGNUM_STATIC 128],const int16_t a[S2N_BIGNUM_STATIC 256],const int16_t z[S2N_BIGNUM_STATIC 128],const int16_t t[S2N_BIGNUM_STATIC 128]); + +// Forward number-theoretic transform from ML-KEM +// Input a[256] (signed 16-bit words), z_01234[80] (signed 16-bit words), z_56[384] (signed 16-bit words); output a[256] (signed 16-bit words) +extern void mlkem_ntt(int16_t a[S2N_BIGNUM_STATIC 256],const int16_t z_01234[S2N_BIGNUM_STATIC 80],const int16_t z_56[S2N_BIGNUM_STATIC 384]); + +// Canonical modular reduction of polynomial coefficients for ML-KEM +// Input a[256] (signed 16-bit words); output a[256] (signed 16-bit words) +extern void mlkem_poly_reduce(int16_t a[S2N_BIGNUM_STATIC 256]); + +// Pack ML-KEM polynomial coefficients as 12-bit numbers +// Input a[256] (signed 16-bit words); output r[384] (bytes) +extern void mlkem_poly_tobytes(uint8_t r[S2N_BIGNUM_STATIC 384],const int16_t a[S2N_BIGNUM_STATIC 256]); + +// Conversion of ML-KEM polynomial coefficients to Montgomery form +// Input a[256] 
(signed 16-bit words); output a[256] (signed 16-bit words) +extern void mlkem_poly_tomont(int16_t a[S2N_BIGNUM_STATIC 256]); + +// Uniform rejection sampling for ML-KEM +// Inputs *buf (unsigned bytes), buflen, table (unsigned bytes); output r[256] (signed 16-bit words), return +extern uint64_t mlkem_rej_uniform_VARIABLE_TIME(int16_t r[S2N_BIGNUM_STATIC 256],const uint8_t *buf,uint64_t buflen,const uint8_t *table); + // Point addition on NIST curve P-256 in Montgomery-Jacobian coordinates // Inputs p1[12], p2[12]; output p3[12] extern void p256_montjadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);