diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt index 47b00c7f8f..049122668d 100644 --- a/crypto/fipsmodule/CMakeLists.txt +++ b/crypto/fipsmodule/CMakeLists.txt @@ -291,6 +291,20 @@ if((((ARCH STREQUAL "x86_64") AND NOT MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) OR ${S2N_BIGNUM_DIR}/generic/bignum_copy_row_from_table_16.S ${S2N_BIGNUM_DIR}/generic/bignum_copy_row_from_table_32.S ) + + # ML-KEM core arithmetic + list(APPEND BCM_ASM_SOURCES + ${S2N_BIGNUM_DIR}/mlkem/mlkem_basemul_k2.S + ${S2N_BIGNUM_DIR}/mlkem/mlkem_basemul_k3.S + ${S2N_BIGNUM_DIR}/mlkem/mlkem_basemul_k4.S + ${S2N_BIGNUM_DIR}/mlkem/mlkem_intt.S + ${S2N_BIGNUM_DIR}/mlkem/mlkem_mulcache_compute.S + ${S2N_BIGNUM_DIR}/mlkem/mlkem_ntt.S + ${S2N_BIGNUM_DIR}/mlkem/mlkem_poly_reduce.S + ${S2N_BIGNUM_DIR}/mlkem/mlkem_poly_tobytes.S + ${S2N_BIGNUM_DIR}/mlkem/mlkem_poly_tomont.S + ${S2N_BIGNUM_DIR}/mlkem/mlkem_rej_uniform_VARIABLE_TIME.S) + endif() if(BORINGSSL_PREFIX) diff --git a/crypto/fipsmodule/ml_kem/aarch64/README.md b/crypto/fipsmodule/ml_kem/aarch64/README.md new file mode 100644 index 0000000000..c3b66b1973 --- /dev/null +++ b/crypto/fipsmodule/ml_kem/aarch64/README.md @@ -0,0 +1 @@ +This directory contains an AArch64 arithmetic backend for mlkem-native. The core assembly routines are imported from [s2n-bignum](https://github.com/awslabs/s2n-bignum/). 
\ No newline at end of file diff --git a/crypto/fipsmodule/ml_kem/aarch64/constants.c b/crypto/fipsmodule/ml_kem/aarch64/constants.c new file mode 100644 index 0000000000..53d0098bf6 --- /dev/null +++ b/crypto/fipsmodule/ml_kem/aarch64/constants.c @@ -0,0 +1,668 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../internal.h" + +alignas(16) const int16_t mlk_aarch64_ntt_zetas_layer12345[] = { + -1600, -15749, -749, -7373, -40, -394, -687, -6762, 630, 6201, + -1432, -14095, 848, 8347, 0, 0, 1062, 10453, 296, 2914, + -882, -8682, 0, 0, -1410, -13879, 1339, 13180, 1476, 14529, + 0, 0, 193, 1900, -283, -2786, 56, 551, 0, 0, + 797, 7845, -1089, -10719, 1333, 13121, 0, 0, -543, -5345, + 1426, 14036, -1235, -12156, 0, 0, -69, -679, 535, 5266, + -447, -4400, 0, 0, 569, 5601, -936, -9213, -450, -4429, + 0, 0, -1583, -15582, -1355, -13338, 821, 8081, 0, 0, +}; + +alignas(16) const int16_t mlk_aarch64_ntt_zetas_layer67[] = { + 289, 289, 331, 331, -76, -76, -1573, -1573, 2845, + 2845, 3258, 3258, -748, -748, -15483, -15483, 17, 17, + 583, 583, 1637, 1637, -1041, -1041, 167, 167, 5739, + 5739, 16113, 16113, -10247, -10247, -568, -568, -680, -680, + 723, 723, 1100, 1100, -5591, -5591, -6693, -6693, 7117, + 7117, 10828, 10828, 1197, 1197, -1025, -1025, -1052, -1052, + -1274, -1274, 11782, 11782, -10089, -10089, -10355, -10355, -12540, + -12540, 1409, 1409, -48, -48, 756, 756, -314, -314, + 13869, 13869, -472, -472, 7441, 7441, -3091, -3091, -667, + -667, 233, 233, -1173, -1173, -279, -279, -6565, -6565, + 2293, 2293, -11546, -11546, -2746, -2746, 650, 650, -1352, + -1352, -816, -816, 632, 632, 6398, 6398, -13308, -13308, + -8032, -8032, 6221, 6221, -1626, -1626, -540, -540, -1482, + -1482, 1461, 1461, -16005, -16005, -5315, -5315, -14588, -14588, + 14381, 14381, 1651, 1651, -1540, -1540, 952, 952, -642, + -642, 16251, 16251, -15159, -15159, 9371, 9371, -6319, -6319, + -464, -464, 33, 33, 1320, 
1320, -1414, -1414, -4567, + -4567, 325, 325, 12993, 12993, -13918, -13918, 939, 939, + -892, -892, 733, 733, 268, 268, 9243, 9243, -8780, + -8780, 7215, 7215, 2638, 2638, -1021, -1021, -941, -941, + -992, -992, 641, 641, -10050, -10050, -9262, -9262, -9764, + -9764, 6309, 6309, -1010, -1010, 1435, 1435, 807, 807, + 452, 452, -9942, -9942, 14125, 14125, 7943, 7943, 4449, + 4449, 1584, 1584, -1292, -1292, 375, 375, -1239, -1239, + 15592, 15592, -12717, -12717, 3691, 3691, -12196, -12196, -1031, + -1031, -109, -109, -780, -780, 1645, 1645, -10148, -10148, + -1073, -1073, -7678, -7678, 16192, 16192, 1438, 1438, -461, + -461, 1534, 1534, -927, -927, 14155, 14155, -4538, -4538, + 15099, 15099, -9125, -9125, 1063, 1063, -556, -556, -1230, + -1230, -863, -863, 10463, 10463, -5473, -5473, -12107, -12107, + -8495, -8495, 319, 319, 757, 757, 561, 561, -735, + -735, 3140, 3140, 7451, 7451, 5522, 5522, -7235, -7235, + -682, -682, -712, -712, 1481, 1481, 648, 648, -6713, + -6713, -7008, -7008, 14578, 14578, 6378, 6378, -525, -525, + 403, 403, 1143, 1143, -554, -554, -5168, -5168, 3967, + 3967, 11251, 11251, -5453, -5453, 1092, 1092, 1026, 1026, + -1179, -1179, 886, 886, 10749, 10749, 10099, 10099, -11605, + -11605, 8721, 8721, -855, -855, -219, -219, 1227, 1227, + 910, 910, -8416, -8416, -2156, -2156, 12078, 12078, 8957, + 8957, -1607, -1607, -1455, -1455, -1219, -1219, 885, 885, + -15818, -15818, -14322, -14322, -11999, -11999, 8711, 8711, 1212, + 1212, 1029, 1029, -394, -394, -1175, -1175, 11930, 11930, + 10129, 10129, -3878, -3878, -11566, -11566, +}; + +alignas(16) const int16_t mlk_aarch64_invntt_zetas_layer12345[] = { + 1583, 15582, -821, -8081, 1355, 13338, 0, 0, -569, -5601, + 450, 4429, 936, 9213, 0, 0, 69, 679, 447, 4400, + -535, -5266, 0, 0, 543, 5345, 1235, 12156, -1426, -14036, + 0, 0, -797, -7845, -1333, -13121, 1089, 10719, 0, 0, + -193, -1900, -56, -551, 283, 2786, 0, 0, 1410, 13879, + -1476, -14529, -1339, -13180, 0, 0, -1062, -10453, 882, 8682, + -296, -2914, 
0, 0, 1600, 15749, 40, 394, 749, 7373, + -848, -8347, 1432, 14095, -630, -6201, 687, 6762, 0, 0, +}; + +alignas(16) const int16_t mlk_aarch64_invntt_zetas_layer67[] = { + -910, -910, -1227, -1227, 219, 219, 855, 855, -8957, + -8957, -12078, -12078, 2156, 2156, 8416, 8416, 1175, 1175, + 394, 394, -1029, -1029, -1212, -1212, 11566, 11566, 3878, + 3878, -10129, -10129, -11930, -11930, -885, -885, 1219, 1219, + 1455, 1455, 1607, 1607, -8711, -8711, 11999, 11999, 14322, + 14322, 15818, 15818, -648, -648, -1481, -1481, 712, 712, + 682, 682, -6378, -6378, -14578, -14578, 7008, 7008, 6713, + 6713, -886, -886, 1179, 1179, -1026, -1026, -1092, -1092, + -8721, -8721, 11605, 11605, -10099, -10099, -10749, -10749, 554, + 554, -1143, -1143, -403, -403, 525, 525, 5453, 5453, + -11251, -11251, -3967, -3967, 5168, 5168, 927, 927, -1534, + -1534, 461, 461, -1438, -1438, 9125, 9125, -15099, -15099, + 4538, 4538, -14155, -14155, 735, 735, -561, -561, -757, + -757, -319, -319, 7235, 7235, -5522, -5522, -7451, -7451, + -3140, -3140, 863, 863, 1230, 1230, 556, 556, -1063, + -1063, 8495, 8495, 12107, 12107, 5473, 5473, -10463, -10463, + -452, -452, -807, -807, -1435, -1435, 1010, 1010, -4449, + -4449, -7943, -7943, -14125, -14125, 9942, 9942, -1645, -1645, + 780, 780, 109, 109, 1031, 1031, -16192, -16192, 7678, + 7678, 1073, 1073, 10148, 10148, 1239, 1239, -375, -375, + 1292, 1292, -1584, -1584, 12196, 12196, -3691, -3691, 12717, + 12717, -15592, -15592, 1414, 1414, -1320, -1320, -33, -33, + 464, 464, 13918, 13918, -12993, -12993, -325, -325, 4567, + 4567, -641, -641, 992, 992, 941, 941, 1021, 1021, + -6309, -6309, 9764, 9764, 9262, 9262, 10050, 10050, -268, + -268, -733, -733, 892, 892, -939, -939, -2638, -2638, + -7215, -7215, 8780, 8780, -9243, -9243, -632, -632, 816, + 816, 1352, 1352, -650, -650, -6221, -6221, 8032, 8032, + 13308, 13308, -6398, -6398, 642, 642, -952, -952, 1540, + 1540, -1651, -1651, 6319, 6319, -9371, -9371, 15159, 15159, + -16251, -16251, -1461, -1461, 1482, 1482, 
540, 540, 1626, + 1626, -14381, -14381, 14588, 14588, 5315, 5315, 16005, 16005, + 1274, 1274, 1052, 1052, 1025, 1025, -1197, -1197, 12540, + 12540, 10355, 10355, 10089, 10089, -11782, -11782, 279, 279, + 1173, 1173, -233, -233, 667, 667, 2746, 2746, 11546, + 11546, -2293, -2293, 6565, 6565, 314, 314, -756, -756, + 48, 48, -1409, -1409, 3091, 3091, -7441, -7441, 472, + 472, -13869, -13869, 1573, 1573, 76, 76, -331, -331, + -289, -289, 15483, 15483, 748, 748, -3258, -3258, -2845, + -2845, -1100, -1100, -723, -723, 680, 680, 568, 568, + -10828, -10828, -7117, -7117, 6693, 6693, 5591, 5591, 1041, + 1041, -1637, -1637, -583, -583, -17, -17, 10247, 10247, + -16113, -16113, -5739, -5739, -167, -167, +}; + +alignas(16) const int16_t mlk_aarch64_zetas_mulcache_native[] = { + 17, -17, -568, 568, 583, -583, -680, 680, 1637, -1637, 723, + -723, -1041, 1041, 1100, -1100, 1409, -1409, -667, 667, -48, 48, + 233, -233, 756, -756, -1173, 1173, -314, 314, -279, 279, -1626, + 1626, 1651, -1651, -540, 540, -1540, 1540, -1482, 1482, 952, -952, + 1461, -1461, -642, 642, 939, -939, -1021, 1021, -892, 892, -941, + 941, 733, -733, -992, 992, 268, -268, 641, -641, 1584, -1584, + -1031, 1031, -1292, 1292, -109, 109, 375, -375, -780, 780, -1239, + 1239, 1645, -1645, 1063, -1063, 319, -319, -556, 556, 757, -757, + -1230, 1230, 561, -561, -863, 863, -735, 735, -525, 525, 1092, + -1092, 403, -403, 1026, -1026, 1143, -1143, -1179, 1179, -554, 554, + 886, -886, -1607, 1607, 1212, -1212, -1455, 1455, 1029, -1029, -1219, + 1219, -394, 394, 885, -885, -1175, 1175, +}; + +alignas(16) const int16_t mlk_aarch64_zetas_mulcache_twisted_native[] = { + 167, -167, -5591, 5591, 5739, -5739, -6693, 6693, 16113, + -16113, 7117, -7117, -10247, 10247, 10828, -10828, 13869, -13869, + -6565, 6565, -472, 472, 2293, -2293, 7441, -7441, -11546, + 11546, -3091, 3091, -2746, 2746, -16005, 16005, 16251, -16251, + -5315, 5315, -15159, 15159, -14588, 14588, 9371, -9371, 14381, + -14381, -6319, 6319, 9243, -9243, -10050, 
10050, -8780, 8780, + -9262, 9262, 7215, -7215, -9764, 9764, 2638, -2638, 6309, + -6309, 15592, -15592, -10148, 10148, -12717, 12717, -1073, 1073, + 3691, -3691, -7678, 7678, -12196, 12196, 16192, -16192, 10463, + -10463, 3140, -3140, -5473, 5473, 7451, -7451, -12107, 12107, + 5522, -5522, -8495, 8495, -7235, 7235, -5168, 5168, 10749, + -10749, 3967, -3967, 10099, -10099, 11251, -11251, -11605, 11605, + -5453, 5453, 8721, -8721, -15818, 15818, 11930, -11930, -14322, + 14322, 10129, -10129, -11999, 11999, -3878, 3878, 8711, -8711, + -11566, 11566, +}; + +alignas(16) const uint8_t mlk_rej_uniform_table[] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 0 */, + 0, 1, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 1 */, + 2, 3, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 2 */, + 0, 1, 2, 3, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 3 */, + 4, 5, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 4 */, + 0, 1, 4, 5, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 5 */, + 2, 3, 4, 5, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 6 */, + 0, 1, 2, 3, 4, 5, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 7 */, + 6, 7, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 8 */, + 0, 1, 6, 7, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 9 */, + 2, 3, 6, 7, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 10 */, + 0, 1, 2, 3, 6, 7, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 11 */, + 4, 5, 6, 7, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 12 */, + 0, 1, 4, 5, 6, 7, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 13 */, + 2, 3, 4, 5, 6, 7, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 14 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 255, 255, 255, 255, 255, 255, 255, 255 /* 15 */, + 8, 9, 255, 255, 255, 255, 255, 
255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 16 */, + 0, 1, 8, 9, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 17 */, + 2, 3, 8, 9, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 18 */, + 0, 1, 2, 3, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 19 */, + 4, 5, 8, 9, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 20 */, + 0, 1, 4, 5, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 21 */, + 2, 3, 4, 5, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 22 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 255, 255, 255, 255, 255, 255, 255, 255 /* 23 */, + 6, 7, 8, 9, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 24 */, + 0, 1, 6, 7, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 25 */, + 2, 3, 6, 7, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 26 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 255, 255, 255, 255, 255, 255, 255, 255 /* 27 */, + 4, 5, 6, 7, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 28 */, + 0, 1, 4, 5, 6, 7, 8, 9, + 255, 255, 255, 255, 255, 255, 255, 255 /* 29 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 255, 255, 255, 255, 255, 255, 255, 255 /* 30 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 255, 255, 255, 255, 255, 255 /* 31 */, + 10, 11, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 32 */, + 0, 1, 10, 11, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 33 */, + 2, 3, 10, 11, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 34 */, + 0, 1, 2, 3, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 35 */, + 4, 5, 10, 11, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 36 */, + 0, 1, 4, 5, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 37 */, + 2, 3, 4, 5, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 38 */, + 0, 1, 2, 3, 4, 5, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 39 */, + 6, 7, 10, 11, 255, 255, 255, 255, + 255, 255, 255, 
255, 255, 255, 255, 255 /* 40 */, + 0, 1, 6, 7, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 41 */, + 2, 3, 6, 7, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 42 */, + 0, 1, 2, 3, 6, 7, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 43 */, + 4, 5, 6, 7, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 44 */, + 0, 1, 4, 5, 6, 7, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 45 */, + 2, 3, 4, 5, 6, 7, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 46 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 10, 11, 255, 255, 255, 255, 255, 255 /* 47 */, + 8, 9, 10, 11, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 48 */, + 0, 1, 8, 9, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 49 */, + 2, 3, 8, 9, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 50 */, + 0, 1, 2, 3, 8, 9, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 51 */, + 4, 5, 8, 9, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 52 */, + 0, 1, 4, 5, 8, 9, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 53 */, + 2, 3, 4, 5, 8, 9, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 54 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 10, 11, 255, 255, 255, 255, 255, 255 /* 55 */, + 6, 7, 8, 9, 10, 11, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 56 */, + 0, 1, 6, 7, 8, 9, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 57 */, + 2, 3, 6, 7, 8, 9, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 58 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 10, 11, 255, 255, 255, 255, 255, 255 /* 59 */, + 4, 5, 6, 7, 8, 9, 10, 11, + 255, 255, 255, 255, 255, 255, 255, 255 /* 60 */, + 0, 1, 4, 5, 6, 7, 8, 9, + 10, 11, 255, 255, 255, 255, 255, 255 /* 61 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 255, 255, 255, 255, 255, 255 /* 62 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 255, 255, 255, 255 /* 63 */, + 12, 13, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 64 */, + 0, 1, 12, 13, 255, 255, 255, 255, + 255, 255, 
255, 255, 255, 255, 255, 255 /* 65 */, + 2, 3, 12, 13, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 66 */, + 0, 1, 2, 3, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 67 */, + 4, 5, 12, 13, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 68 */, + 0, 1, 4, 5, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 69 */, + 2, 3, 4, 5, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 70 */, + 0, 1, 2, 3, 4, 5, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 71 */, + 6, 7, 12, 13, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 72 */, + 0, 1, 6, 7, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 73 */, + 2, 3, 6, 7, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 74 */, + 0, 1, 2, 3, 6, 7, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 75 */, + 4, 5, 6, 7, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 76 */, + 0, 1, 4, 5, 6, 7, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 77 */, + 2, 3, 4, 5, 6, 7, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 78 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 12, 13, 255, 255, 255, 255, 255, 255 /* 79 */, + 8, 9, 12, 13, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 80 */, + 0, 1, 8, 9, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 81 */, + 2, 3, 8, 9, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 82 */, + 0, 1, 2, 3, 8, 9, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 83 */, + 4, 5, 8, 9, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 84 */, + 0, 1, 4, 5, 8, 9, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 85 */, + 2, 3, 4, 5, 8, 9, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 86 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 12, 13, 255, 255, 255, 255, 255, 255 /* 87 */, + 6, 7, 8, 9, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 88 */, + 0, 1, 6, 7, 8, 9, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 89 */, 
+ 2, 3, 6, 7, 8, 9, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 90 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 12, 13, 255, 255, 255, 255, 255, 255 /* 91 */, + 4, 5, 6, 7, 8, 9, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 92 */, + 0, 1, 4, 5, 6, 7, 8, 9, + 12, 13, 255, 255, 255, 255, 255, 255 /* 93 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 12, 13, 255, 255, 255, 255, 255, 255 /* 94 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 12, 13, 255, 255, 255, 255 /* 95 */, + 10, 11, 12, 13, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 96 */, + 0, 1, 10, 11, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 97 */, + 2, 3, 10, 11, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 98 */, + 0, 1, 2, 3, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 99 */, + 4, 5, 10, 11, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 100 */, + 0, 1, 4, 5, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 101 */, + 2, 3, 4, 5, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 102 */, + 0, 1, 2, 3, 4, 5, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 103 */, + 6, 7, 10, 11, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 104 */, + 0, 1, 6, 7, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 105 */, + 2, 3, 6, 7, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 106 */, + 0, 1, 2, 3, 6, 7, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 107 */, + 4, 5, 6, 7, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 108 */, + 0, 1, 4, 5, 6, 7, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 109 */, + 2, 3, 4, 5, 6, 7, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 110 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 10, 11, 12, 13, 255, 255, 255, 255 /* 111 */, + 8, 9, 10, 11, 12, 13, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 112 */, + 0, 1, 8, 9, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 113 */, + 2, 3, 8, 9, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 
255 /* 114 */, + 0, 1, 2, 3, 8, 9, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 115 */, + 4, 5, 8, 9, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 116 */, + 0, 1, 4, 5, 8, 9, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 117 */, + 2, 3, 4, 5, 8, 9, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 118 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 10, 11, 12, 13, 255, 255, 255, 255 /* 119 */, + 6, 7, 8, 9, 10, 11, 12, 13, + 255, 255, 255, 255, 255, 255, 255, 255 /* 120 */, + 0, 1, 6, 7, 8, 9, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 121 */, + 2, 3, 6, 7, 8, 9, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 122 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 10, 11, 12, 13, 255, 255, 255, 255 /* 123 */, + 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 255, 255, 255, 255, 255, 255 /* 124 */, + 0, 1, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 255, 255, 255, 255 /* 125 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 255, 255, 255, 255 /* 126 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 255, 255 /* 127 */, + 14, 15, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 128 */, + 0, 1, 14, 15, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 129 */, + 2, 3, 14, 15, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 130 */, + 0, 1, 2, 3, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 131 */, + 4, 5, 14, 15, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 132 */, + 0, 1, 4, 5, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 133 */, + 2, 3, 4, 5, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 134 */, + 0, 1, 2, 3, 4, 5, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 135 */, + 6, 7, 14, 15, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 136 */, + 0, 1, 6, 7, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 137 */, + 2, 3, 6, 7, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 138 */, + 0, 1, 2, 3, 6, 7, 14, 15, + 255, 255, 
255, 255, 255, 255, 255, 255 /* 139 */, + 4, 5, 6, 7, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 140 */, + 0, 1, 4, 5, 6, 7, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 141 */, + 2, 3, 4, 5, 6, 7, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 142 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 14, 15, 255, 255, 255, 255, 255, 255 /* 143 */, + 8, 9, 14, 15, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 144 */, + 0, 1, 8, 9, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 145 */, + 2, 3, 8, 9, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 146 */, + 0, 1, 2, 3, 8, 9, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 147 */, + 4, 5, 8, 9, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 148 */, + 0, 1, 4, 5, 8, 9, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 149 */, + 2, 3, 4, 5, 8, 9, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 150 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 14, 15, 255, 255, 255, 255, 255, 255 /* 151 */, + 6, 7, 8, 9, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 152 */, + 0, 1, 6, 7, 8, 9, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 153 */, + 2, 3, 6, 7, 8, 9, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 154 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 14, 15, 255, 255, 255, 255, 255, 255 /* 155 */, + 4, 5, 6, 7, 8, 9, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 156 */, + 0, 1, 4, 5, 6, 7, 8, 9, + 14, 15, 255, 255, 255, 255, 255, 255 /* 157 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 14, 15, 255, 255, 255, 255, 255, 255 /* 158 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 14, 15, 255, 255, 255, 255 /* 159 */, + 10, 11, 14, 15, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 160 */, + 0, 1, 10, 11, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 161 */, + 2, 3, 10, 11, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 162 */, + 0, 1, 2, 3, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 163 */, + 4, 5, 10, 11, 
14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 164 */, + 0, 1, 4, 5, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 165 */, + 2, 3, 4, 5, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 166 */, + 0, 1, 2, 3, 4, 5, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 167 */, + 6, 7, 10, 11, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 168 */, + 0, 1, 6, 7, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 169 */, + 2, 3, 6, 7, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 170 */, + 0, 1, 2, 3, 6, 7, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 171 */, + 4, 5, 6, 7, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 172 */, + 0, 1, 4, 5, 6, 7, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 173 */, + 2, 3, 4, 5, 6, 7, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 174 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 10, 11, 14, 15, 255, 255, 255, 255 /* 175 */, + 8, 9, 10, 11, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 176 */, + 0, 1, 8, 9, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 177 */, + 2, 3, 8, 9, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 178 */, + 0, 1, 2, 3, 8, 9, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 179 */, + 4, 5, 8, 9, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 180 */, + 0, 1, 4, 5, 8, 9, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 181 */, + 2, 3, 4, 5, 8, 9, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 182 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 10, 11, 14, 15, 255, 255, 255, 255 /* 183 */, + 6, 7, 8, 9, 10, 11, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 184 */, + 0, 1, 6, 7, 8, 9, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 185 */, + 2, 3, 6, 7, 8, 9, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 186 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 10, 11, 14, 15, 255, 255, 255, 255 /* 187 */, + 4, 5, 6, 7, 8, 9, 10, 11, + 14, 15, 255, 255, 255, 255, 255, 255 /* 188 */, + 0, 1, 4, 5, 6, 
7, 8, 9, + 10, 11, 14, 15, 255, 255, 255, 255 /* 189 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 14, 15, 255, 255, 255, 255 /* 190 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 14, 15, 255, 255 /* 191 */, + 12, 13, 14, 15, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 192 */, + 0, 1, 12, 13, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 193 */, + 2, 3, 12, 13, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 194 */, + 0, 1, 2, 3, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 195 */, + 4, 5, 12, 13, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 196 */, + 0, 1, 4, 5, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 197 */, + 2, 3, 4, 5, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 198 */, + 0, 1, 2, 3, 4, 5, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 199 */, + 6, 7, 12, 13, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 200 */, + 0, 1, 6, 7, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 201 */, + 2, 3, 6, 7, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 202 */, + 0, 1, 2, 3, 6, 7, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 203 */, + 4, 5, 6, 7, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 204 */, + 0, 1, 4, 5, 6, 7, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 205 */, + 2, 3, 4, 5, 6, 7, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 206 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 12, 13, 14, 15, 255, 255, 255, 255 /* 207 */, + 8, 9, 12, 13, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 208 */, + 0, 1, 8, 9, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 209 */, + 2, 3, 8, 9, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 210 */, + 0, 1, 2, 3, 8, 9, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 211 */, + 4, 5, 8, 9, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 212 */, + 0, 1, 4, 5, 8, 9, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 213 
*/, + 2, 3, 4, 5, 8, 9, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 214 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 12, 13, 14, 15, 255, 255, 255, 255 /* 215 */, + 6, 7, 8, 9, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 216 */, + 0, 1, 6, 7, 8, 9, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 217 */, + 2, 3, 6, 7, 8, 9, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 218 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 12, 13, 14, 15, 255, 255, 255, 255 /* 219 */, + 4, 5, 6, 7, 8, 9, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 220 */, + 0, 1, 4, 5, 6, 7, 8, 9, + 12, 13, 14, 15, 255, 255, 255, 255 /* 221 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 12, 13, 14, 15, 255, 255, 255, 255 /* 222 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 12, 13, 14, 15, 255, 255 /* 223 */, + 10, 11, 12, 13, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255 /* 224 */, + 0, 1, 10, 11, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 225 */, + 2, 3, 10, 11, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 226 */, + 0, 1, 2, 3, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 227 */, + 4, 5, 10, 11, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 228 */, + 0, 1, 4, 5, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 229 */, + 2, 3, 4, 5, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 230 */, + 0, 1, 2, 3, 4, 5, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 231 */, + 6, 7, 10, 11, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 232 */, + 0, 1, 6, 7, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 233 */, + 2, 3, 6, 7, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 234 */, + 0, 1, 2, 3, 6, 7, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 235 */, + 4, 5, 6, 7, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 236 */, + 0, 1, 4, 5, 6, 7, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 237 */, + 2, 3, 4, 5, 6, 7, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 238 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 
10, 11, 12, 13, 14, 15, 255, 255 /* 239 */, + 8, 9, 10, 11, 12, 13, 14, 15, + 255, 255, 255, 255, 255, 255, 255, 255 /* 240 */, + 0, 1, 8, 9, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 241 */, + 2, 3, 8, 9, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 242 */, + 0, 1, 2, 3, 8, 9, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 243 */, + 4, 5, 8, 9, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 244 */, + 0, 1, 4, 5, 8, 9, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 245 */, + 2, 3, 4, 5, 8, 9, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 246 */, + 0, 1, 2, 3, 4, 5, 8, 9, + 10, 11, 12, 13, 14, 15, 255, 255 /* 247 */, + 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 255, 255, 255, 255, 255, 255 /* 248 */, + 0, 1, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 249 */, + 2, 3, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 250 */, + 0, 1, 2, 3, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 255, 255 /* 251 */, + 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 255, 255, 255, 255 /* 252 */, + 0, 1, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 255, 255 /* 253 */, + 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 255, 255 /* 254 */, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15 /* 255 */, +}; diff --git a/crypto/fipsmodule/ml_kem/aarch64/meta.h b/crypto/fipsmodule/ml_kem/aarch64/meta.h new file mode 100644 index 0000000000..0da4388fc4 --- /dev/null +++ b/crypto/fipsmodule/ml_kem/aarch64/meta.h @@ -0,0 +1,81 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC + +#ifndef ML_KEM_AARCH64_BACKEND_H +#define ML_KEM_AARCH64_BACKEND_H + +#include "../mlkem/common.h" + +#define MLK_USE_NATIVE_NTT +#define MLK_USE_NATIVE_INTT +#define MLK_USE_NATIVE_POLY_REDUCE +#define MLK_USE_NATIVE_POLY_TOMONT +#define MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE +#define MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED +#define MLK_USE_NATIVE_POLY_TOBYTES +#define MLK_USE_NATIVE_REJ_UNIFORM + +extern const int16_t mlk_aarch64_ntt_zetas_layer12345[]; +extern const int16_t mlk_aarch64_ntt_zetas_layer67[]; +extern const int16_t mlk_aarch64_invntt_zetas_layer12345[]; +extern const int16_t mlk_aarch64_invntt_zetas_layer67[]; +extern const uint8_t mlk_rej_uniform_table[]; +extern const int16_t mlk_aarch64_zetas_mulcache_native[]; +extern const int16_t mlk_aarch64_zetas_mulcache_twisted_native[]; + +#include "../../../../third_party/s2n-bignum/s2n-bignum_aws-lc.h" + +static MLK_INLINE void mlk_ntt_native(int16_t data[MLKEM_N]) { + mlkem_ntt(data, mlk_aarch64_ntt_zetas_layer12345, mlk_aarch64_ntt_zetas_layer67); +} + +static MLK_INLINE void mlk_intt_native(int16_t data[MLKEM_N]) { + mlkem_intt(data, mlk_aarch64_invntt_zetas_layer12345, mlk_aarch64_invntt_zetas_layer67); +} + +static MLK_INLINE void mlk_poly_reduce_native(int16_t data[MLKEM_N]) { + mlkem_poly_reduce(data); +} + +static MLK_INLINE void mlk_poly_tomont_native(int16_t data[MLKEM_N]) { + mlkem_poly_tomont(data); +} + +static MLK_INLINE void mlk_poly_mulcache_compute_native(int16_t x[MLKEM_N / 2], const int16_t y[MLKEM_N]) { + mlkem_mulcache_compute(x, y, mlk_aarch64_zetas_mulcache_native, + mlk_aarch64_zetas_mulcache_twisted_native); +} + +static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k2_native( + int16_t r[MLKEM_N], const int16_t a[2 * MLKEM_N], + const int16_t b[2 * MLKEM_N], const int16_t b_cache[2 * (MLKEM_N / 2)]) { + mlkem_basemul_k2(r, a, b, b_cache); +} + +static MLK_INLINE void 
mlk_polyvec_basemul_acc_montgomery_cached_k3_native( + int16_t r[MLKEM_N], const int16_t a[3 * MLKEM_N], + const int16_t b[3 * MLKEM_N], const int16_t b_cache[3 * (MLKEM_N / 2)]) { + mlkem_basemul_k3(r, a, b, b_cache); +} + +static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k4_native( + int16_t r[MLKEM_N], const int16_t a[4 * MLKEM_N], + const int16_t b[4 * MLKEM_N], const int16_t b_cache[4 * (MLKEM_N / 2)]) { + mlkem_basemul_k4(r, a, b, b_cache); +} + +static MLK_INLINE void mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], + const int16_t a[MLKEM_N]) { + mlkem_poly_tobytes(r, a); +} + +static MLK_INLINE int mlk_rej_uniform_native(int16_t *r, unsigned len, + const uint8_t *buf, + unsigned buflen) { + if (len != MLKEM_N || buflen % 24 != 0) { + return -1; + } + return (int) mlkem_rej_uniform_VARIABLE_TIME(r, buf, buflen, mlk_rej_uniform_table); +} + +#endif /* ML_KEM_AARCH64_BACKEND_H */ diff --git a/crypto/fipsmodule/ml_kem/ml_kem.c b/crypto/fipsmodule/ml_kem/ml_kem.c index e2227e0465..ce67d700a2 100644 --- a/crypto/fipsmodule/ml_kem/ml_kem.c +++ b/crypto/fipsmodule/ml_kem/ml_kem.c @@ -26,6 +26,11 @@ #include "./ml_kem.h" +// AArch64 backend +#if defined(OPENSSL_AARCH64) && !defined(OPENSSL_NO_ASM) +#include "aarch64/constants.c" +#endif + typedef struct { uint8_t *buffer; size_t *length; @@ -92,7 +97,7 @@ int ml_kem_512_keypair_deterministic_no_self_test(uint8_t *public_key /* OUT */ if (!check_buffer(pkey) || !check_buffer(skey)) { return 1; } - const int res = mlkem512_keypair_derand(pkey.buffer, skey.buffer, seed); + const int res = mlkem_native512_keypair_derand(pkey.buffer, skey.buffer, seed); #if defined(AWSLC_FIPS) /* PCT failure is the only failure condition for key generation. 
*/ if (res != 0) { @@ -110,7 +115,7 @@ int ml_kem_512_keypair(uint8_t *public_key /* OUT */, size_t *secret_len /* IN_OUT */) { output_buffer pkey = {public_key, public_len, MLKEM512_PUBLIC_KEY_BYTES}; output_buffer skey = {secret_key, secret_len, MLKEM512_SECRET_KEY_BYTES}; - return ml_kem_common_keypair(mlkem512_keypair, pkey, skey); + return ml_kem_common_keypair(mlkem_native512_keypair, pkey, skey); } int ml_kem_512_encapsulate_deterministic(uint8_t *ciphertext /* OUT */, @@ -131,7 +136,7 @@ int ml_kem_512_encapsulate_deterministic_no_self_test(uint8_t *ciphertext const uint8_t *seed /* IN */) { output_buffer ctext = {ciphertext, ciphertext_len, MLKEM512_CIPHERTEXT_BYTES}; output_buffer ss = {shared_secret, shared_secret_len, MLKEM512_SHARED_SECRET_LEN}; - return ml_kem_common_encapsulate_deterministic(mlkem512_enc_derand, ctext, ss, public_key, seed); + return ml_kem_common_encapsulate_deterministic(mlkem_native512_enc_derand, ctext, ss, public_key, seed); } int ml_kem_512_encapsulate(uint8_t *ciphertext /* OUT */, @@ -141,7 +146,7 @@ int ml_kem_512_encapsulate(uint8_t *ciphertext /* OUT */, const uint8_t *public_key /* IN */) { output_buffer ctext = {ciphertext, ciphertext_len, MLKEM512_CIPHERTEXT_BYTES}; output_buffer ss = {shared_secret, shared_secret_len, MLKEM512_SHARED_SECRET_LEN}; - return ml_kem_common_encapsulate(mlkem512_enc, ctext, ss, public_key); + return ml_kem_common_encapsulate(mlkem_native512_enc, ctext, ss, public_key); } int ml_kem_512_decapsulate(uint8_t *shared_secret /* OUT */, @@ -157,7 +162,7 @@ int ml_kem_512_decapsulate_no_self_test(uint8_t *shared_secret /* OUT */, const uint8_t *ciphertext /* IN */, const uint8_t *secret_key /* IN */) { output_buffer ss = {shared_secret, shared_secret_len, MLKEM512_SHARED_SECRET_LEN}; - return ml_kem_common_decapsulate(mlkem512_dec, ss, ciphertext, secret_key); + return ml_kem_common_decapsulate(mlkem_native512_dec, ss, ciphertext, secret_key); } @@ -181,7 +186,7 @@ int 
ml_kem_768_keypair_deterministic_no_self_test(uint8_t *public_key /* OUT */, if (!check_buffer(pkey) || !check_buffer(skey)) { return 1; } - const int res = mlkem768_keypair_derand(pkey.buffer, skey.buffer, seed); + const int res = mlkem_native768_keypair_derand(pkey.buffer, skey.buffer, seed); #if defined(AWSLC_FIPS) /* PCT failure is the only failure condition for key generation. */ if (res != 0) { @@ -199,7 +204,7 @@ int ml_kem_768_keypair(uint8_t *public_key /* OUT */, size_t *secret_len /* IN_OUT */) { output_buffer pkey = {public_key, public_len, MLKEM768_PUBLIC_KEY_BYTES}; output_buffer skey = {secret_key, secret_len, MLKEM768_SECRET_KEY_BYTES}; - return ml_kem_common_keypair(mlkem768_keypair, pkey, skey); + return ml_kem_common_keypair(mlkem_native768_keypair, pkey, skey); } int ml_kem_768_encapsulate_deterministic(uint8_t *ciphertext /* OUT */, @@ -220,7 +225,7 @@ int ml_kem_768_encapsulate_deterministic_no_self_test(uint8_t *ciphertext const uint8_t *seed /* IN */) { output_buffer ctext = {ciphertext, ciphertext_len, MLKEM768_CIPHERTEXT_BYTES}; output_buffer ss = {shared_secret, shared_secret_len, MLKEM768_SHARED_SECRET_LEN}; - return ml_kem_common_encapsulate_deterministic(mlkem768_enc_derand, ctext, ss, public_key, seed); + return ml_kem_common_encapsulate_deterministic(mlkem_native768_enc_derand, ctext, ss, public_key, seed); } int ml_kem_768_encapsulate(uint8_t *ciphertext /* OUT */, @@ -230,7 +235,7 @@ int ml_kem_768_encapsulate(uint8_t *ciphertext /* OUT */, const uint8_t *public_key /* IN */) { output_buffer ctext = {ciphertext, ciphertext_len, MLKEM768_CIPHERTEXT_BYTES}; output_buffer ss = {shared_secret, shared_secret_len, MLKEM768_SHARED_SECRET_LEN}; - return ml_kem_common_encapsulate(mlkem768_enc, ctext, ss, public_key); + return ml_kem_common_encapsulate(mlkem_native768_enc, ctext, ss, public_key); } int ml_kem_768_decapsulate(uint8_t *shared_secret /* OUT */, @@ -246,7 +251,7 @@ int ml_kem_768_decapsulate_no_self_test(uint8_t *shared_secret 
/* OUT */, const uint8_t *ciphertext /* IN */, const uint8_t *secret_key /* IN */) { output_buffer ss = {shared_secret, shared_secret_len, MLKEM768_SHARED_SECRET_LEN}; - return ml_kem_common_decapsulate(mlkem768_dec, ss, ciphertext, secret_key); + return ml_kem_common_decapsulate(mlkem_native768_dec, ss, ciphertext, secret_key); } int ml_kem_1024_keypair_deterministic(uint8_t *public_key /* OUT */, @@ -268,7 +273,7 @@ int ml_kem_1024_keypair_deterministic_no_self_test(uint8_t *public_key /* OUT */ if (!check_buffer(pkey) || !check_buffer(skey)) { return 1; } - const int res = mlkem1024_keypair_derand(pkey.buffer, skey.buffer, seed); + const int res = mlkem_native1024_keypair_derand(pkey.buffer, skey.buffer, seed); #if defined(AWSLC_FIPS) /* PCT failure is the only failure condition for key generation. */ if (res != 0) { @@ -286,7 +291,7 @@ int ml_kem_1024_keypair(uint8_t *public_key /* OUT */, size_t *secret_len /* IN_OUT */) { output_buffer pkey = {public_key, public_len, MLKEM1024_PUBLIC_KEY_BYTES}; output_buffer skey = {secret_key, secret_len, MLKEM1024_SECRET_KEY_BYTES}; - return ml_kem_common_keypair(mlkem1024_keypair, pkey, skey); + return ml_kem_common_keypair(mlkem_native1024_keypair, pkey, skey); } int ml_kem_1024_encapsulate_deterministic(uint8_t *ciphertext /* OUT */, @@ -307,7 +312,7 @@ int ml_kem_1024_encapsulate_deterministic_no_self_test(uint8_t *ciphertext const uint8_t *seed /* IN */) { output_buffer ctext = {ciphertext, ciphertext_len, MLKEM1024_CIPHERTEXT_BYTES}; output_buffer ss = {shared_secret, shared_secret_len, MLKEM1024_SHARED_SECRET_LEN}; - return ml_kem_common_encapsulate_deterministic(mlkem1024_enc_derand, ctext, ss, public_key, seed); + return ml_kem_common_encapsulate_deterministic(mlkem_native1024_enc_derand, ctext, ss, public_key, seed); } int ml_kem_1024_encapsulate(uint8_t *ciphertext /* OUT */, @@ -317,7 +322,7 @@ int ml_kem_1024_encapsulate(uint8_t *ciphertext /* OUT */, const uint8_t *public_key /* IN */) { output_buffer ctext = 
{ciphertext, ciphertext_len, MLKEM1024_CIPHERTEXT_BYTES}; output_buffer ss = {shared_secret, shared_secret_len, MLKEM1024_SHARED_SECRET_LEN}; - return ml_kem_common_encapsulate(mlkem1024_enc, ctext, ss, public_key); + return ml_kem_common_encapsulate(mlkem_native1024_enc, ctext, ss, public_key); } int ml_kem_1024_decapsulate(uint8_t *shared_secret /* OUT */, @@ -333,7 +338,7 @@ int ml_kem_1024_decapsulate_no_self_test(uint8_t *shared_secret /* OUT */, const uint8_t *ciphertext /* IN */, const uint8_t *secret_key /* IN */) { output_buffer ss = {shared_secret, shared_secret_len, MLKEM1024_SHARED_SECRET_LEN}; - return ml_kem_common_decapsulate(mlkem1024_dec, ss, ciphertext, secret_key); + return ml_kem_common_decapsulate(mlkem_native1024_dec, ss, ciphertext, secret_key); } int ml_kem_common_keypair(int (*keypair)(uint8_t * public_key, uint8_t *secret_key), diff --git a/crypto/fipsmodule/ml_kem/mlkem_native_config.h b/crypto/fipsmodule/ml_kem/mlkem_native_config.h index aa4f24f721..19e4541293 100644 --- a/crypto/fipsmodule/ml_kem/mlkem_native_config.h +++ b/crypto/fipsmodule/ml_kem/mlkem_native_config.h @@ -9,7 +9,7 @@ -// Namespacing: All symbols are of the form mlkem*. Level-specific +// Namespacing: All symbols are of the form mlkem_native*. Level-specific // symbols are further prefixed with their security level, e.g. -// mlkem512*, mlkem768*, mlkem1024*. -#define MLK_CONFIG_NAMESPACE_PREFIX mlkem +// mlkem_native512*, mlkem_native768*, mlkem_native1024*. +#define MLK_CONFIG_NAMESPACE_PREFIX mlkem_native // Replace mlkem-native's FIPS 202 headers with glue code to // AWS-LC's own FIPS 202 implementation. 
@@ -68,4 +68,10 @@ static MLK_INLINE void mlk_randombytes(void *ptr, size_t len) { #define MLK_CONFIG_NO_ASM #endif +#if defined(OPENSSL_AARCH64) && !defined(OPENSSL_NO_ASM) && \ + (defined(OPENSSL_LINUX) || defined(OPENSSL_APPLE)) +#define MLK_CONFIG_USE_NATIVE_BACKEND_ARITH +#define MLK_CONFIG_ARITH_BACKEND_FILE "../aarch64/meta.h" +#endif + #endif // MLkEM_NATIVE_CONFIG_H diff --git a/third_party/s2n-bignum/META.yml b/third_party/s2n-bignum/META.yml index 4949b2bf60..3687f1ad97 100644 --- a/third_party/s2n-bignum/META.yml +++ b/third_party/s2n-bignum/META.yml @@ -1,5 +1,5 @@ name: s2n-bignum-imported -source: awslabs/s2n-bignum.git -commit: 54e1fa5756d6b13961c2f61d90f75426aa25d373 -target: main -imported-at: 2025-04-28T17:22:07+0000 +source: jargh/s2n-bignum-dev.git +commit: ae84a59689cb50ad9b9c6e25cd34037d5b1fb2b4 +target: mlkem +imported-at: 2025-06-23T13:38:02+0000 diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/Makefile b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/Makefile new file mode 100644 index 0000000000..673806915f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/Makefile @@ -0,0 +1,39 @@ +############################################################################# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +############################################################################# + +# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise +# use a cross-assembling version so that the code can still be assembled +# and the proofs checked against the object files (though you won't be able +# to run code without additional emulation infrastructure). 
The aarch64 +# cross-assembling version can be installed manually by something like: +# +# sudo apt-get install binutils-aarch64-linux-gnu + +UNAME_RESULT=$(shell uname -p) + +ifeq ($(UNAME_RESULT),aarch64) +GAS=as +else +GAS=aarch64-linux-gnu-as +endif + +# List of object files + +OBJ = mlkem_basemul_k2.o \ + mlkem_basemul_k3.o \ + mlkem_basemul_k4.o \ + mlkem_intt.o \ + mlkem_mulcache_compute.o \ + mlkem_ntt.o \ + mlkem_poly_reduce.o \ + mlkem_poly_tobytes.o \ + mlkem_poly_tomont.o \ + mlkem_rej_uniform_VARIABLE_TIME.o + +%.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - + +default: $(OBJ); + +clean:; rm -f *.o *.correct unopt/*.o diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_basemul_k2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_basemul_k2.S new file mode 100644 index 0000000000..ff63953c1e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_basemul_k2.S @@ -0,0 +1,210 @@ +// Copyright (c) 2024 The mlkem-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +// ---------------------------------------------------------------------------- +// Scalar product of 2-element polynomial vectors in NTT domain, with mulcache +// Inputs a[512], b[512], bt[256] (signed 16-bit words); output r[256] (signed 16-bit words) +// +// The inputs a and b are considered as 2-element vectors of linear +// polynomials in the NTT domain (in Montgomery form), and the bt +// argument an analogous 2-element vector of mulcaches for the bi: +// +// a0 = a[0..255], a1 = a[256..511] +// b0 = b[0..255], b1 = b[256..511] +// bt0 = bt[0..127], bt1 = bt[128..255] +// +// Scalar multiplication of those 2-element vectors is performed, +// with base multiplication in Fq[X]/(X^2-zeta^i'), with zeta^i' +// being a power of zeta = 17, with i bit-reversed as used for NTTs, +// making use of the mulcache for optimization. 
+// +// All input elements are assumed <= 2^12 and the bts are +// assumed to be as computed by mlkem_mulcache_compute. +// +// extern void mlkem_basemul_k2 +// (int16_t r[256],const int16_t a[512],const int16_t b[512], +// const int16_t bt[256]) +// +// Standard ARM ABI: X0 = r, X1 = a, X2 = b, X3 = bt +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_basemul_k2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_basemul_k2) + .text + .balign 4 + +S2N_BN_SYMBOL(mlkem_basemul_k2): + +// This matches the code in the mlkem-native repository +// https://github.com/pq-code-package/mlkem-native/blob/main/mlkem/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S + + sub sp, sp, #0x40 + stp d8, d9, [sp] + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + mov w14, #0xd01 + dup v0.8h, w14 + mov w14, #0xcff + dup v2.8h, w14 + add x4, x1, #0x200 + add x5, x2, #0x200 + add x6, x3, #0x100 + mov x13, #0x10 + ldr q9, [x4], #0x20 + ldur q5, [x4, #-0x10] + ldr q11, [x5], #0x20 + uzp1 v23.8h, v9.8h, v5.8h + uzp2 v9.8h, v9.8h, v5.8h + ldr q5, [x2], #0x20 + ldur q7, [x5, #-0x10] + ldur q21, [x2, #-0x10] + uzp2 v10.8h, v11.8h, v7.8h + uzp1 v11.8h, v11.8h, v7.8h + uzp1 v7.8h, v5.8h, v21.8h + uzp2 v5.8h, v5.8h, v21.8h + ldr q21, [x1], #0x20 + ldur q25, [x1, #-0x10] + ld1 { v6.8h }, [x3], #16 + uzp1 v26.8h, v21.8h, v25.8h + uzp2 v21.8h, v21.8h, v25.8h + smull v25.4s, v26.4h, v5.4h + smull2 v5.4s, v26.8h, v5.8h + smull v19.4s, v26.4h, v7.4h + smull2 v26.4s, v26.8h, v7.8h + smlal v25.4s, v21.4h, v7.4h + smlal2 v5.4s, v21.8h, v7.8h + smlal v19.4s, v21.4h, v6.4h + smlal2 v26.4s, v21.8h, v6.8h + smlal v25.4s, v23.4h, v10.4h + smlal2 v5.4s, v23.8h, v10.8h + smlal v19.4s, v23.4h, v11.4h + smlal2 v26.4s, v23.8h, v11.8h + ld1 { v23.8h }, [x6], #16 + smlal v25.4s, v9.4h, v11.4h + smlal2 v5.4s, v9.8h, v11.8h + smlal2 v26.4s, v9.8h, v23.8h + smlal v19.4s, 
v9.4h, v23.4h + ldr q9, [x4], #0x20 + uzp1 v11.8h, v25.8h, v5.8h + uzp1 v23.8h, v19.8h, v26.8h + mul v11.8h, v11.8h, v2.8h + mul v23.8h, v23.8h, v2.8h + ldr q7, [x5], #0x20 + smlal2 v5.4s, v11.8h, v0.8h + smlal v25.4s, v11.4h, v0.4h + ldr q11, [x2], #0x20 + ldur q21, [x2, #-0x10] + ldur q6, [x4, #-0x10] + uzp1 v17.8h, v11.8h, v21.8h + ldr q10, [x1], #0x20 + ldur q29, [x1, #-0x10] + uzp2 v11.8h, v11.8h, v21.8h + uzp1 v13.8h, v9.8h, v6.8h + uzp1 v3.8h, v10.8h, v29.8h + uzp2 v10.8h, v10.8h, v29.8h + smull v12.4s, v3.4h, v11.4h + smull2 v11.4s, v3.8h, v11.8h + ldur q21, [x5, #-0x10] + smlal v12.4s, v10.4h, v17.4h + smlal2 v11.4s, v10.8h, v17.8h + uzp2 v29.8h, v7.8h, v21.8h + uzp1 v15.8h, v7.8h, v21.8h + smlal v12.4s, v13.4h, v29.4h + smlal2 v11.4s, v13.8h, v29.8h + uzp2 v28.8h, v9.8h, v6.8h + smlal2 v26.4s, v23.8h, v0.8h + smlal v12.4s, v28.4h, v15.4h + smlal2 v11.4s, v28.8h, v15.8h + smlal v19.4s, v23.4h, v0.4h + uzp2 v27.8h, v25.8h, v5.8h + smull v23.4s, v3.4h, v17.4h + uzp1 v9.8h, v12.8h, v11.8h + uzp2 v19.8h, v19.8h, v26.8h + mul v14.8h, v9.8h, v2.8h + ld1 { v22.8h }, [x6], #16 + zip2 v9.8h, v19.8h, v27.8h + smlal2 v11.4s, v14.8h, v0.8h + ld1 { v4.8h }, [x3], #16 + sub x13, x13, #0x2 + +mlkem_basemul_k2_loop: + smull2 v20.4s, v3.8h, v17.8h + ldr q18, [x4], #0x20 + ldr q30, [x5], #0x20 + smlal2 v20.4s, v10.8h, v4.8h + smlal v12.4s, v14.4h, v0.4h + smlal v23.4s, v10.4h, v4.4h + str q9, [x0, #0x10] + smlal2 v20.4s, v13.8h, v15.8h + ldr q8, [x2], #0x20 + smlal v23.4s, v13.4h, v15.4h + smlal2 v20.4s, v28.8h, v22.8h + zip1 v26.8h, v19.8h, v27.8h + ldur q9, [x2, #-0x10] + smlal v23.4s, v28.4h, v22.4h + uzp2 v27.8h, v12.8h, v11.8h + uzp1 v17.8h, v8.8h, v9.8h + uzp2 v4.8h, v8.8h, v9.8h + uzp1 v5.8h, v23.8h, v20.8h + str q26, [x0], #0x20 + mul v31.8h, v5.8h, v2.8h + ldur q19, [x4, #-0x10] + ldr q29, [x1], #0x20 + ldur q12, [x1, #-0x10] + smlal2 v20.4s, v31.8h, v0.8h + uzp1 v13.8h, v18.8h, v19.8h + uzp1 v3.8h, v29.8h, v12.8h + uzp2 v10.8h, v29.8h, v12.8h + smull v12.4s, 
v3.4h, v4.4h + smull2 v11.4s, v3.8h, v4.8h + ldur q5, [x5, #-0x10] + smlal v12.4s, v10.4h, v17.4h + smlal2 v11.4s, v10.8h, v17.8h + uzp2 v14.8h, v30.8h, v5.8h + uzp1 v15.8h, v30.8h, v5.8h + smlal v12.4s, v13.4h, v14.4h + smlal2 v11.4s, v13.8h, v14.8h + uzp2 v28.8h, v18.8h, v19.8h + smlal v23.4s, v31.4h, v0.4h + smlal v12.4s, v28.4h, v15.4h + smlal2 v11.4s, v28.8h, v15.8h + ld1 { v22.8h }, [x6], #16 + uzp2 v19.8h, v23.8h, v20.8h + uzp1 v1.8h, v12.8h, v11.8h + smull v23.4s, v3.4h, v17.4h + mul v14.8h, v1.8h, v2.8h + zip2 v9.8h, v19.8h, v27.8h + ld1 { v4.8h }, [x3], #16 + smlal2 v11.4s, v14.8h, v0.8h + sub x13, x13, #0x1 + cbnz x13, mlkem_basemul_k2_loop + smull2 v5.4s, v3.8h, v17.8h + smlal v12.4s, v14.4h, v0.4h + smlal v23.4s, v10.4h, v4.4h + str q9, [x0, #0x10] + smlal2 v5.4s, v10.8h, v4.8h + uzp2 v11.8h, v12.8h, v11.8h + zip1 v9.8h, v19.8h, v27.8h + smlal v23.4s, v13.4h, v15.4h + smlal2 v5.4s, v13.8h, v15.8h + str q9, [x0], #0x20 + smlal v23.4s, v28.4h, v22.4h + smlal2 v5.4s, v28.8h, v22.8h + uzp1 v9.8h, v23.8h, v5.8h + mul v9.8h, v9.8h, v2.8h + smlal2 v5.4s, v9.8h, v0.8h + smlal v23.4s, v9.4h, v0.4h + uzp2 v9.8h, v23.8h, v5.8h + zip2 v5.8h, v9.8h, v11.8h + zip1 v9.8h, v9.8h, v11.8h + str q5, [x0, #0x10] + str q9, [x0], #0x20 + ldp d8, d9, [sp] + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + add sp, sp, #0x40 + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_basemul_k3.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_basemul_k3.S new file mode 100644 index 0000000000..9c9d959341 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_basemul_k3.S @@ -0,0 +1,264 @@ +// Copyright (c) 2024 The mlkem-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +// ---------------------------------------------------------------------------- +// Scalar product of 3-element polynomial vectors in NTT domain, with mulcache +// Inputs a[768], b[768], bt[384] (signed 16-bit words); output r[256] (signed 16-bit words) +// +// The inputs a and b are considered as 3-element vectors of linear +// polynomials in the NTT domain (in Montgomery form), and the bt +// argument an analogous 3-element vector of mulcaches for the bi: +// +// a0 = a[0..255], a1 = a[256..511], a2 = a[512..767] +// b0 = b[0..255], b1 = b[256..511], b2 = b[512..767], +// bt0 = bt[0..127], bt1 = bt[128..255], bt2 = bt[256..383] +// +// Scalar multiplication of those 3-element vectors is performed, +// with base multiplication in Fq[X]/(X^2-zeta^i'), with zeta^i' +// being a power of zeta = 17, with i bit-reversed as used for NTTs, +// making use of the mulcache for optimization. +// +// All input elements are assumed <= 2^12 and the bts are +// assumed to be as computed by mlkem_mulcache_compute. 
+// +// extern void mlkem_basemul_k3 +// (int16_t r[256],const int16_t a[768],const int16_t b[768], +// const int16_t bt[384]) +// +// Standard ARM ABI: X0 = r, X1 = a, X2 = b, X3 = bt +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_basemul_k3) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_basemul_k3) + .text + .balign 4 + +S2N_BN_SYMBOL(mlkem_basemul_k3): + +// This matches the code in the mlkem-native repository +// https://github.com/pq-code-package/mlkem-native/blob/main/mlkem/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S + + sub sp, sp, #0x40 + stp d8, d9, [sp] + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + mov w14, #0xd01 + dup v0.8h, w14 + mov w14, #0xcff + dup v2.8h, w14 + add x4, x1, #0x200 + add x5, x2, #0x200 + add x6, x3, #0x100 + add x7, x1, #0x400 + add x8, x2, #0x400 + add x9, x3, #0x200 + mov x13, #0x10 + ldr q7, [x2, #0x10] + ldr q20, [x2], #0x20 + ldr q15, [x1, #0x10] + uzp1 v8.8h, v20.8h, v7.8h + uzp2 v7.8h, v20.8h, v7.8h + ld1 { v20.8h }, [x3], #16 + ldr q30, [x1], #0x20 + ldr q11, [x4], #0x20 + uzp1 v16.8h, v30.8h, v15.8h + uzp2 v15.8h, v30.8h, v15.8h + smull v30.4s, v16.4h, v7.4h + smull2 v7.4s, v16.8h, v7.8h + smull v9.4s, v16.4h, v8.4h + smull2 v16.4s, v16.8h, v8.8h + smlal v30.4s, v15.4h, v8.4h + smlal2 v7.4s, v15.8h, v8.8h + smlal v9.4s, v15.4h, v20.4h + smlal2 v16.4s, v15.8h, v20.8h + ldur q20, [x4, #-0x10] + ldr q15, [x5], #0x20 + uzp1 v8.8h, v11.8h, v20.8h + uzp2 v20.8h, v11.8h, v20.8h + ldur q11, [x5, #-0x10] + ld1 { v27.8h }, [x6], #16 + uzp1 v10.8h, v15.8h, v11.8h + uzp2 v15.8h, v15.8h, v11.8h + smlal v9.4s, v8.4h, v10.4h + smlal2 v16.4s, v8.8h, v10.8h + smlal v30.4s, v8.4h, v15.4h + smlal2 v7.4s, v8.8h, v15.8h + smlal v9.4s, v20.4h, v27.4h + smlal2 v16.4s, v20.8h, v27.8h + smlal v30.4s, v20.4h, v10.4h + smlal2 v7.4s, v20.8h, v10.8h + ldr q20, [x7], #0x20 + ldur q15, [x7, 
#-0x10] + ldr q8, [x8], #0x20 + uzp1 v11.8h, v20.8h, v15.8h + uzp2 v20.8h, v20.8h, v15.8h + ldur q15, [x8, #-0x10] + ld1 { v27.8h }, [x9], #16 + uzp1 v10.8h, v8.8h, v15.8h + uzp2 v15.8h, v8.8h, v15.8h + smlal v9.4s, v11.4h, v10.4h + smlal2 v16.4s, v11.8h, v10.8h + smlal v30.4s, v11.4h, v15.4h + smlal2 v7.4s, v11.8h, v15.8h + smlal v9.4s, v20.4h, v27.4h + smlal2 v16.4s, v20.8h, v27.8h + smlal v30.4s, v20.4h, v10.4h + smlal2 v7.4s, v20.8h, v10.8h + ldr q15, [x2], #0x20 + uzp1 v20.8h, v9.8h, v16.8h + uzp1 v8.8h, v30.8h, v7.8h + mul v20.8h, v20.8h, v2.8h + mul v8.8h, v8.8h, v2.8h + ldr q21, [x4], #0x20 + smlal v9.4s, v20.4h, v0.4h + smlal2 v16.4s, v20.8h, v0.8h + smlal v30.4s, v8.4h, v0.4h + smlal2 v7.4s, v8.8h, v0.8h + ldur q6, [x4, #-0x10] + uzp2 v27.8h, v9.8h, v16.8h + uzp2 v10.8h, v30.8h, v7.8h + ldur q16, [x2, #-0x10] + ldr q30, [x1, #0x10] + ld1 { v9.8h }, [x3], #16 + ldr q1, [x5], #0x20 + ldur q12, [x5, #-0x10] + ld1 { v24.8h }, [x6], #16 + ldr q19, [x7], #0x20 + ldur q31, [x7, #-0x10] + ldr q17, [x8], #0x20 + ldur q18, [x8, #-0x10] + ld1 { v25.8h }, [x9], #16 + sub x13, x13, #0x2 + +mlkem_basemul_k3_loop: + ldr q20, [x1], #0x20 + uzp1 v7.8h, v15.8h, v16.8h + uzp2 v15.8h, v15.8h, v16.8h + uzp1 v8.8h, v20.8h, v30.8h + uzp2 v20.8h, v20.8h, v30.8h + smull v30.4s, v8.4h, v15.4h + smull2 v15.4s, v8.8h, v15.8h + smull v11.4s, v8.4h, v7.4h + smull2 v8.4s, v8.8h, v7.8h + smlal v30.4s, v20.4h, v7.4h + smlal2 v15.4s, v20.8h, v7.8h + smlal v11.4s, v20.4h, v9.4h + smlal2 v8.4s, v20.8h, v9.8h + uzp1 v7.8h, v21.8h, v6.8h + uzp2 v20.8h, v21.8h, v6.8h + uzp1 v16.8h, v1.8h, v12.8h + uzp2 v9.8h, v1.8h, v12.8h + smlal v11.4s, v7.4h, v16.4h + smlal2 v8.4s, v7.8h, v16.8h + smlal v30.4s, v7.4h, v9.4h + smlal2 v15.4s, v7.8h, v9.8h + smlal v11.4s, v20.4h, v24.4h + smlal2 v8.4s, v20.8h, v24.8h + smlal v30.4s, v20.4h, v16.4h + smlal2 v15.4s, v20.8h, v16.8h + uzp1 v7.8h, v19.8h, v31.8h + uzp2 v20.8h, v19.8h, v31.8h + uzp1 v16.8h, v17.8h, v18.8h + uzp2 v9.8h, v17.8h, v18.8h + smlal v11.4s, 
v7.4h, v16.4h + smlal2 v8.4s, v7.8h, v16.8h + smlal v30.4s, v7.4h, v9.4h + smlal2 v15.4s, v7.8h, v9.8h + smlal v11.4s, v20.4h, v25.4h + smlal2 v8.4s, v20.8h, v25.8h + smlal v30.4s, v20.4h, v16.4h + smlal2 v15.4s, v20.8h, v16.8h + ldr q16, [x2, #0x10] + uzp1 v7.8h, v11.8h, v8.8h + uzp1 v20.8h, v30.8h, v15.8h + mul v7.8h, v7.8h, v2.8h + mul v20.8h, v20.8h, v2.8h + zip2 v9.8h, v27.8h, v10.8h + zip1 v27.8h, v27.8h, v10.8h + smlal v11.4s, v7.4h, v0.4h + smlal2 v8.4s, v7.8h, v0.8h + smlal v30.4s, v20.4h, v0.4h + smlal2 v15.4s, v20.8h, v0.8h + str q27, [x0], #0x20 + uzp2 v27.8h, v11.8h, v8.8h + stur q9, [x0, #-0x10] + uzp2 v10.8h, v30.8h, v15.8h + ldr q30, [x1, #0x10] + ldr q15, [x2], #0x20 + ld1 { v9.8h }, [x3], #16 + ldr q21, [x4], #0x20 + ldur q6, [x4, #-0x10] + ldr q1, [x5], #0x20 + ldur q12, [x5, #-0x10] + ld1 { v24.8h }, [x6], #16 + ldr q19, [x7], #0x20 + ldur q31, [x7, #-0x10] + ldr q17, [x8], #0x20 + ldur q18, [x8, #-0x10] + ld1 { v25.8h }, [x9], #16 + sub x13, x13, #0x1 + cbnz x13, mlkem_basemul_k3_loop + ldr q7, [x1], #0x20 + uzp1 v20.8h, v15.8h, v16.8h + uzp2 v15.8h, v15.8h, v16.8h + uzp1 v23.8h, v7.8h, v30.8h + uzp2 v11.8h, v7.8h, v30.8h + smull2 v8.4s, v23.8h, v20.8h + smull v5.4s, v23.4h, v20.4h + smull2 v30.4s, v23.8h, v15.8h + uzp1 v28.8h, v1.8h, v12.8h + smlal2 v8.4s, v11.8h, v9.8h + smlal v5.4s, v11.4h, v9.4h + uzp1 v3.8h, v21.8h, v6.8h + smull v16.4s, v23.4h, v15.4h + smlal2 v8.4s, v3.8h, v28.8h + smlal v5.4s, v3.4h, v28.4h + uzp2 v29.8h, v21.8h, v6.8h + uzp1 v7.8h, v17.8h, v18.8h + smlal2 v8.4s, v29.8h, v24.8h + uzp1 v14.8h, v19.8h, v31.8h + smlal v16.4s, v11.4h, v20.4h + smlal2 v30.4s, v11.8h, v20.8h + smlal2 v8.4s, v14.8h, v7.8h + uzp2 v20.8h, v1.8h, v12.8h + uzp2 v21.8h, v19.8h, v31.8h + smlal2 v30.4s, v3.8h, v20.8h + smlal v16.4s, v3.4h, v20.4h + smlal v5.4s, v29.4h, v24.4h + uzp2 v9.8h, v17.8h, v18.8h + smlal2 v30.4s, v29.8h, v28.8h + smlal v16.4s, v29.4h, v28.4h + smlal v5.4s, v14.4h, v7.4h + smlal2 v8.4s, v21.8h, v25.8h + smlal2 v30.4s, v14.8h, 
v9.8h + smlal v16.4s, v14.4h, v9.4h + smlal v5.4s, v21.4h, v25.4h + zip1 v20.8h, v27.8h, v10.8h + smlal2 v30.4s, v21.8h, v7.8h + smlal v16.4s, v21.4h, v7.4h + uzp1 v7.8h, v5.8h, v8.8h + str q20, [x0], #0x20 + mul v15.8h, v7.8h, v2.8h + uzp1 v7.8h, v16.8h, v30.8h + zip2 v31.8h, v27.8h, v10.8h + mul v20.8h, v7.8h, v2.8h + smlal v5.4s, v15.4h, v0.4h + smlal2 v8.4s, v15.8h, v0.8h + stur q31, [x0, #-0x10] + smlal2 v30.4s, v20.8h, v0.8h + smlal v16.4s, v20.4h, v0.4h + uzp2 v15.8h, v5.8h, v8.8h + uzp2 v20.8h, v16.8h, v30.8h + zip1 v7.8h, v15.8h, v20.8h + zip2 v20.8h, v15.8h, v20.8h + str q7, [x0], #0x20 + stur q20, [x0, #-0x10] + ldp d8, d9, [sp] + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + add sp, sp, #0x40 + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_basemul_k4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_basemul_k4.S new file mode 100644 index 0000000000..177e28f92e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_basemul_k4.S @@ -0,0 +1,318 @@ +// Copyright (c) 2024 The mlkem-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +// ---------------------------------------------------------------------------- +// Scalar product of 4-element polynomial vectors in NTT domain, with mulcache +// Inputs a[1024], b[1024], bt[512] (signed 16-bit words); output r[256] (signed 16-bit words) +// +// The inputs a and b are considered as 4-element vectors of linear +// polynomials in the NTT domain (in Montgomery form), and the bt +// argument an analogous 4-element vector of mulcaches for the bi: +// +// a0 = a[0..255], a1 = a[256..511], a2 = a[512..767], a3 = a[768..1023] +// b0 = b[0..255], b1 = b[256..511], b2 = b[512..767], b3 = b[768..1023] +// bt0 = bt[0..127], bt1 = bt[128..255], bt2 = bt[256..383], bt3 = bt[384..511] +// +// Scalar multiplication of those 4-element vectors is performed, +// with base multiplication in Fq[X]/(X^2-zeta^i'), with zeta^i' +// being a power of zeta = 17, with i bit-reversed as used for NTTs, +// making use of the mulcache for optimization. +// +// All input elements are assumed <= 2^12 and the bts are +// assumed to be as computed by mlkem_mulcache_compute. 
+// +// extern void mlkem_basemul_k4 +// (int16_t r[256],const int16_t a[1024],const int16_t b[1024], +// const int16_t bt[512]) +// +// Standard ARM ABI: X0 = r, X1 = a, X2 = b, X3 = bt +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_basemul_k4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_basemul_k4) + .text + .balign 4 + +S2N_BN_SYMBOL(mlkem_basemul_k4): + +// This matches the code in the mlkem-native repository +// https://github.com/pq-code-package/mlkem-native/blob/main/mlkem/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S + + sub sp, sp, #0x40 /* reserve 64 bytes of stack; d8-d15 are saved there and restored before ret */ + stp d8, d9, [sp] + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + mov w14, #0xd01 /* 0xd01 = 3329, the ML-KEM modulus; broadcast into v0 by the next dup */ + dup v0.8h, w14 + mov w14, #0xcff /* 0xcff = 3327; 3329 * 3327 = -1 mod 2^16, the multiplier of the Montgomery reduction; broadcast into v2 */ + dup v2.8h, w14 + add x4, x1, #0x200 /* x4, x7, x10 point 0x200, 0x400, 0x600 bytes into a (steps of 256 int16 elements); x5, x8, x11 do the same for b */ + add x5, x2, #0x200 + add x6, x3, #0x100 /* x6, x9, x12 point 0x100, 0x200, 0x300 bytes into bt (steps of 128 int16 elements) */ + add x7, x1, #0x400 + add x8, x2, #0x400 + add x9, x3, #0x200 + add x10, x1, #0x600 + add x11, x2, #0x600 + add x12, x3, #0x300 + mov x13, #0x10 /* r is written in 16 chunks of 32 bytes */ + ldr q23, [x2, #0x10] + ldr q19, [x2], #0x20 + ldr q17, [x5], #0x20 + uzp2 v13.8h, v19.8h, v23.8h + uzp1 v19.8h, v19.8h, v23.8h + ldur q23, [x5, #-0x10] + ldr q30, [x1, #0x10] + uzp2 v9.8h, v17.8h, v23.8h + uzp1 v23.8h, v17.8h, v23.8h + ldr q17, [x1], #0x20 + ldr q10, [x7, #0x10] + uzp1 v12.8h, v17.8h, v30.8h + uzp2 v17.8h, v17.8h, v30.8h + smull2 v30.4s, v12.8h, v13.8h + smull v13.4s, v12.4h, v13.4h + smull2 v22.4s, v12.8h, v19.8h + smull v12.4s, v12.4h, v19.4h + smlal2 v30.4s, v17.8h, v19.8h + smlal v13.4s, v17.4h, v19.4h + ldr q19, [x4], #0x20 + ldur q16, [x4, #-0x10] + ld1 { v8.8h }, [x3], #16 + uzp1 v26.8h, v19.8h, v16.8h + uzp2 v19.8h, v19.8h, v16.8h + smlal2 v30.4s, v26.8h, v9.8h + smlal v13.4s, v26.4h, v9.4h + smlal2 v22.4s, v17.8h, v8.8h + smlal v12.4s, v17.4h, v8.4h + smlal2 v30.4s, v19.8h, v23.8h + smlal v13.4s, v19.4h, v23.4h + smlal2 v22.4s, v26.8h, v23.8h + smlal v12.4s, v26.4h, v23.4h + ldr q23, [x7], #0x20 +
ldr q17, [x8, #0x10] + uzp1 v9.8h, v23.8h, v10.8h + uzp2 v23.8h, v23.8h, v10.8h + ldr q10, [x10], #0x20 + ldur q16, [x10, #-0x10] + ld1 { v8.8h }, [x12], #16 + uzp1 v26.8h, v10.8h, v16.8h + uzp2 v10.8h, v10.8h, v16.8h + ld1 { v16.8h }, [x6], #16 + ldr q3, [x11, #0x10] + smlal2 v22.4s, v19.8h, v16.8h + smlal v12.4s, v19.4h, v16.4h + ldr q19, [x11], #0x20 + ld1 { v16.8h }, [x9], #16 + uzp1 v4.8h, v19.8h, v3.8h + uzp2 v19.8h, v19.8h, v3.8h + ldr q3, [x8], #0x20 + ldr q31, [x2], #0x20 + uzp1 v6.8h, v3.8h, v17.8h + uzp2 v17.8h, v3.8h, v17.8h + smlal2 v22.4s, v9.8h, v6.8h + smlal2 v30.4s, v9.8h, v17.8h + smlal v13.4s, v9.4h, v17.4h + smlal v12.4s, v9.4h, v6.4h + smlal2 v22.4s, v23.8h, v16.8h + smlal2 v30.4s, v23.8h, v6.8h + smlal v13.4s, v23.4h, v6.4h + smlal v12.4s, v23.4h, v16.4h + smlal2 v22.4s, v26.8h, v4.8h + smlal2 v30.4s, v26.8h, v19.8h + smlal v13.4s, v26.4h, v19.4h + smlal v12.4s, v26.4h, v4.4h + smlal2 v22.4s, v10.8h, v8.8h + smlal2 v30.4s, v10.8h, v4.8h + smlal v13.4s, v10.4h, v4.4h + smlal v12.4s, v10.4h, v8.4h + ldur q19, [x2, #-0x10] + uzp1 v23.8h, v13.8h, v30.8h + uzp1 v17.8h, v12.8h, v22.8h + mul v23.8h, v23.8h, v2.8h /* Montgomery reduction: low halves of the 32-bit sums times 3327; the smlal/smlal2 by v0 then add that times 3329 and uzp2 keeps the high halves */ + uzp2 v21.8h, v31.8h, v19.8h + uzp1 v19.8h, v31.8h, v19.8h + mul v17.8h, v17.8h, v2.8h + smlal v13.4s, v23.4h, v0.4h + smlal2 v30.4s, v23.8h, v0.8h + ldr q23, [x5], #0x20 + smlal2 v22.4s, v17.8h, v0.8h + uzp2 v15.8h, v13.8h, v30.8h + smlal v12.4s, v17.4h, v0.4h + ldur q17, [x5, #-0x10] + ldr q13, [x1, #0x10] + uzp2 v27.8h, v23.8h, v17.8h + uzp1 v28.8h, v23.8h, v17.8h + uzp2 v7.8h, v12.8h, v22.8h + ldr q23, [x1], #0x20 + zip1 v5.8h, v7.8h, v15.8h + ldr q3, [x7, #0x10] + uzp1 v31.8h, v23.8h, v13.8h + uzp2 v16.8h, v23.8h, v13.8h + smull2 v24.4s, v31.8h, v21.8h + ldr q6, [x8, #0x10] + ldr q23, [x10], #0x20 + smlal2 v24.4s, v16.8h, v19.8h + ldur q17, [x10, #-0x10] + ld1 { v22.8h }, [x12], #16 + uzp1 v30.8h, v23.8h, v17.8h + uzp2 v11.8h, v23.8h, v17.8h + ldr q23, [x4], #0x20 + ldur q17, [x4, #-0x10] + ldr q4, [x7], #0x20 + uzp1 v20.8h, v23.8h,
v17.8h + uzp2 v26.8h, v23.8h, v17.8h + uzp1 v9.8h, v4.8h, v3.8h + smlal2 v24.4s, v20.8h, v27.8h + ld1 { v8.8h }, [x6], #16 + ldr q25, [x11, #0x10] + ldr q29, [x11], #0x20 + ld1 { v12.8h }, [x9], #16 + uzp1 v10.8h, v29.8h, v25.8h + ldr q14, [x8], #0x20 + ld1 { v23.8h }, [x3], #16 + sub x13, x13, #0x2 /* 14 loop iterations, each storing one 32-byte chunk; the last two chunks are stored after the loop */ + +mlkem_basemul_k4_loop: + smlal2 v24.4s, v26.8h, v28.8h + uzp2 v4.8h, v4.8h, v3.8h + smull2 v13.4s, v31.8h, v19.8h + ldr q3, [x2], #0x20 + uzp2 v1.8h, v29.8h, v25.8h + smlal2 v13.4s, v16.8h, v23.8h + ldur q17, [x2, #-0x10] + smull v18.4s, v31.4h, v19.4h + smlal2 v13.4s, v20.8h, v28.8h + smull v29.4s, v31.4h, v21.4h + ldr q21, [x5], #0x20 + smlal2 v13.4s, v26.8h, v8.8h + smlal v29.4s, v16.4h, v19.4h + ldur q19, [x5, #-0x10] + smlal v18.4s, v16.4h, v23.4h + smlal v29.4s, v20.4h, v27.4h + uzp1 v31.8h, v14.8h, v6.8h + uzp2 v27.8h, v21.8h, v19.8h + smlal v18.4s, v20.4h, v28.4h + ldr q25, [x1, #0x10] + smlal v29.4s, v26.4h, v28.4h + smlal v18.4s, v26.4h, v8.4h + uzp2 v26.8h, v14.8h, v6.8h + smlal2 v13.4s, v9.8h, v31.8h + smlal2 v24.4s, v9.8h, v26.8h + smlal v29.4s, v9.4h, v26.4h + smlal v18.4s, v9.4h, v31.4h + smlal2 v13.4s, v4.8h, v12.8h + smlal2 v24.4s, v4.8h, v31.8h + smlal v29.4s, v4.4h, v31.4h + smlal v18.4s, v4.4h, v12.4h + smlal2 v13.4s, v30.8h, v10.8h + smlal2 v24.4s, v30.8h, v1.8h + smlal v29.4s, v30.4h, v1.4h + smlal v18.4s, v30.4h, v10.4h + smlal2 v13.4s, v11.8h, v22.8h + smlal2 v24.4s, v11.8h, v10.8h + smlal v29.4s, v11.4h, v10.4h + smlal v18.4s, v11.4h, v22.4h + ldr q22, [x1], #0x20 + uzp1 v31.8h, v29.8h, v24.8h + uzp1 v28.8h, v21.8h, v19.8h + mul v19.8h, v31.8h, v2.8h + uzp1 v31.8h, v22.8h, v25.8h + uzp2 v16.8h, v22.8h, v25.8h + uzp2 v21.8h, v3.8h, v17.8h + smlal v29.4s, v19.4h, v0.4h + smlal2 v24.4s, v19.8h, v0.8h + uzp1 v19.8h, v3.8h, v17.8h + uzp1 v26.8h, v18.8h, v13.8h + zip2 v14.8h, v7.8h, v15.8h + mul v23.8h, v26.8h, v2.8h + uzp2 v15.8h, v29.8h, v24.8h + smull2 v24.4s, v31.8h, v21.8h + str q14, [x0, #0x10] + ldr q3, [x7, #0x10] + ldr q6, [x8, #0x10] +
ldr q8, [x10], #0x20 + ldur q26, [x10, #-0x10] + ld1 { v22.8h }, [x12], #16 + uzp1 v30.8h, v8.8h, v26.8h + uzp2 v11.8h, v8.8h, v26.8h + ldr q8, [x4], #0x20 + ldur q26, [x4, #-0x10] + ldr q4, [x7], #0x20 + uzp1 v20.8h, v8.8h, v26.8h + uzp2 v26.8h, v8.8h, v26.8h + ld1 { v8.8h }, [x6], #16 + uzp1 v9.8h, v4.8h, v3.8h + ldr q25, [x11, #0x10] + ldr q29, [x11], #0x20 + ld1 { v12.8h }, [x9], #16 + ldr q14, [x8], #0x20 + smlal2 v24.4s, v16.8h, v19.8h + smlal2 v13.4s, v23.8h, v0.8h + smlal v18.4s, v23.4h, v0.4h + ld1 { v23.8h }, [x3], #16 + smlal2 v24.4s, v20.8h, v27.8h + uzp2 v7.8h, v18.8h, v13.8h + uzp1 v10.8h, v29.8h, v25.8h + str q5, [x0], #0x20 /* completes one 32-byte chunk of r (its upper 16 bytes were stored earlier in this iteration) and advances r */ + zip1 v5.8h, v7.8h, v15.8h + sub x13, x13, #0x1 + cbnz x13, mlkem_basemul_k4_loop + smull2 v17.4s, v31.8h, v19.8h + uzp2 v1.8h, v14.8h, v6.8h + smull v18.4s, v31.4h, v21.4h + smlal2 v24.4s, v26.8h, v28.8h + smlal2 v17.4s, v16.8h, v23.8h + smull v21.4s, v31.4h, v19.4h + smlal v18.4s, v16.4h, v19.4h + uzp2 v31.8h, v4.8h, v3.8h + uzp1 v3.8h, v14.8h, v6.8h + smlal v21.4s, v16.4h, v23.4h + smlal v18.4s, v20.4h, v27.4h + uzp2 v14.8h, v29.8h, v25.8h + smlal2 v17.4s, v20.8h, v28.8h + smlal v21.4s, v20.4h, v28.4h + smlal v18.4s, v26.4h, v28.4h + smlal2 v24.4s, v9.8h, v1.8h + smlal2 v17.4s, v26.8h, v8.8h + smlal v21.4s, v26.4h, v8.4h + smlal v18.4s, v9.4h, v1.4h + smlal2 v24.4s, v31.8h, v3.8h + smlal2 v17.4s, v9.8h, v3.8h + smlal v21.4s, v9.4h, v3.4h + smlal v18.4s, v31.4h, v3.4h + smlal2 v24.4s, v30.8h, v14.8h + smlal2 v17.4s, v31.8h, v12.8h + smlal v21.4s, v31.4h, v12.4h + smlal v18.4s, v30.4h, v14.4h + smlal2 v24.4s, v11.8h, v10.8h + smlal2 v17.4s, v30.8h, v10.8h + smlal v21.4s, v30.4h, v10.4h + smlal v18.4s, v11.4h, v10.4h + zip2 v19.8h, v7.8h, v15.8h + smlal2 v17.4s, v11.8h, v22.8h + smlal v21.4s, v11.4h, v22.4h + uzp1 v23.8h, v18.8h, v24.8h + str q19, [x0, #0x10] + mul v19.8h, v23.8h, v2.8h + uzp1 v23.8h, v21.8h, v17.8h + str q5, [x0], #0x20 + mul v26.8h, v23.8h, v2.8h + smlal v18.4s, v19.4h, v0.4h + smlal2 v24.4s, v19.8h, v0.8h +
smlal v21.4s, v26.4h, v0.4h + smlal2 v17.4s, v26.8h, v0.8h + uzp2 v13.8h, v18.8h, v24.8h + uzp2 v19.8h, v21.8h, v17.8h + zip1 v23.8h, v19.8h, v13.8h + zip2 v19.8h, v19.8h, v13.8h + str q23, [x0], #0x20 + stur q19, [x0, #-0x10] + ldp d8, d9, [sp] /* restore d8-d15 and release the 64-byte stack frame */ + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + add sp, sp, #0x40 + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_intt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_intt.S new file mode 100644 index 0000000000..1be62d9502 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_intt.S @@ -0,0 +1,412 @@ +// Copyright (c) 2022 Arm Limited +// Copyright (c) 2022 Hanno Becker +// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +// Copyright (c) 2024 The mlkem-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +// ---------------------------------------------------------------------------- +// Inverse number-theoretic transform from ML-KEM +// Input a[256], z_01234[80], z_56[384] (all signed 16-bit words); output a[256] (signed 16-bit words). +// +// The transform is in-place with input and output a[256], with the input in +// bitreversed order and the output mapped into the Montgomery domain via +// x |-> (2^16 * x) mod 3329. The two other parameters are expected to point to +// tables of constants whose definitions can be found in the mlkem-native +// repo (mlkem/native/aarch64/src/aarch64_zetas.c) or our "tests/test.c".
+// +// extern void mlkem_intt(int16_t a[256],const int16_t z_01234[80],const int16_t z_56[384]); +// +// Standard ARM ABI: X0 = a, X1 = z_01234, X2 = z_56 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_intt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_intt) + .text + .balign 4 + +S2N_BN_SYMBOL(mlkem_intt): + +// This implementation is generated by SLOTHY, set up to optimize for +// the Neoverse N1 microarchitecture, starting from the clean version +// +// https://github.com/pq-code-package/mlkem-native/blob/main/mlkem/native/aarch64/intt_clean.S +// +// in the mlkem-native repository. + + sub sp, sp, #0x40 /* reserve 64 bytes of stack; d8-d15 are saved there and restored before ret */ + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + mov w5, #0xd01 /* v7.h[0] = 3329, the modulus */ + mov v7.h[0], w5 + mov w5, #0x4ebf /* v7.h[1] = 20159 = round(2^26 / 3329), the multiplier of the sqdmulh / srshr #11 Barrett reductions */ + mov v7.h[1], w5 + mov w5, #0x200 /* v29 = 512 in every lane; 512 = 2^16 * 128^-1 mod 3329 */ + dup v29.8h, w5 + mov w5, #0x13b0 /* v30 = 5040 = round(512 * 2^15 / 3329), the sqrdmulh companion of 512 */ + dup v30.8h, w5 + mov x3, x0 + mov x4, #0x8 /* scaling pass: 8 iterations of 64 bytes cover all 256 coefficients */ + +scale_start: + ldr q8, [x3] + ldr q9, [x3, #16] + ldr q10, [x3, #32] + ldr q11, [x3, #48] + sqrdmulh v27.8h, v8.8h, v30.8h + mul v8.8h, v8.8h, v29.8h + mls v8.8h, v27.8h, v7.h[0] /* v8 = a representative of v8 * 512 modulo 3329 (low product minus estimated quotient times 3329) */ + sqrdmulh v27.8h, v9.8h, v30.8h + mul v9.8h, v9.8h, v29.8h + mls v9.8h, v27.8h, v7.h[0] + sqrdmulh v27.8h, v10.8h, v30.8h + mul v10.8h, v10.8h, v29.8h + mls v10.8h, v27.8h, v7.h[0] + sqrdmulh v27.8h, v11.8h, v30.8h + mul v11.8h, v11.8h, v29.8h + mls v11.8h, v27.8h, v7.h[0] + str q8, [x3], #64 + stur q9, [x3, #-48] + stur q10, [x3, #-32] + stur q11, [x3, #-16] + subs x4, x4, #0x1 /* the flags set by subs are not read; the branch below tests x4 with cbnz */ + cbnz x4, scale_start + + mov x3, x0 /* restart at the beginning of a */ + mov x4, #0x8 + ldr q1, [x3, #32] + ldr q18, [x3, #48] + ldr q15, [x3] + ldr q21, [x3, #16] + ldr q3, [x2], #96 + ldur q16, [x2, #-48] + ldr q4, [x1], #16 + ldur q30, [x2, #-32] + trn1 v11.4s, v1.4s, v18.4s + trn2 v18.4s, v1.4s, v18.4s + trn1 v20.4s, v15.4s, v21.4s + trn2 v1.4s, v15.4s, v21.4s + ldur q0, [x2, #-16] + ldur q22, [x2, #-80] + trn1 v8.2d, v20.2d, v11.2d + trn1 v6.2d, v1.2d, v18.2d + trn2 v1.2d,
v1.2d, v18.2d + trn2 v21.2d, v20.2d, v11.2d + sub v11.8h, v8.8h, v6.8h + add v20.8h, v8.8h, v6.8h + add v14.8h, v21.8h, v1.8h + sub v15.8h, v21.8h, v1.8h + sqrdmulh v16.8h, v11.8h, v16.8h + ldur q6, [x2, #-64] + sub v18.8h, v20.8h, v14.8h + add v21.8h, v20.8h, v14.8h + sqrdmulh v0.8h, v15.8h, v0.8h + mul v11.8h, v11.8h, v6.8h + mul v1.8h, v15.8h, v30.8h + mls v11.8h, v16.8h, v7.h[0] + mls v1.8h, v0.8h, v7.h[0] + sqrdmulh v0.8h, v18.8h, v22.8h + mul v16.8h, v18.8h, v3.8h + sub v18.8h, v11.8h, v1.8h + add v13.8h, v11.8h, v1.8h + sqrdmulh v11.8h, v18.8h, v22.8h + trn1 v20.4s, v21.4s, v13.4s + trn2 v1.4s, v21.4s, v13.4s + mls v16.8h, v0.8h, v7.h[0] + mul v3.8h, v18.8h, v3.8h + mls v3.8h, v11.8h, v7.h[0] + trn2 v11.4s, v16.4s, v3.4s + trn1 v16.4s, v16.4s, v3.4s + trn2 v21.2d, v1.2d, v11.2d + trn2 v0.2d, v20.2d, v16.2d + trn1 v1.2d, v1.2d, v11.2d + trn1 v11.2d, v20.2d, v16.2d + sub v13.8h, v0.8h, v21.8h + add v29.8h, v0.8h, v21.8h + add v9.8h, v11.8h, v1.8h + sub v23.8h, v11.8h, v1.8h + sqdmulh v1.8h, v29.8h, v7.h[1] + sqdmulh v27.8h, v9.8h, v7.h[1] + sqrdmulh v16.8h, v13.8h, v4.h[5] + srshr v14.8h, v1.8h, #11 /* Barrett quotient estimate for v29; the mls at the top of the loop subtracts it times 3329 */ + sub x4, x4, #0x1 /* 7 loop iterations, each storing one 64-byte block; the eighth block is stored after the loop */ + +layer3456_start: + mls v29.8h, v14.8h, v7.h[0] + ldr q3, [x3, #96] + ldr q30, [x3, #112] + ldr q20, [x2, #32] + mul v6.8h, v23.8h, v4.h[2] + ldr q24, [x2, #16] + ldr q0, [x3, #64] + ldr q14, [x3, #80] + srshr v25.8h, v27.8h, #11 + mul v15.8h, v13.8h, v4.h[4] + trn1 v18.4s, v3.4s, v30.4s + ldr q28, [x2], #96 + trn2 v5.4s, v3.4s, v30.4s + sqrdmulh v19.8h, v23.8h, v4.h[3] + trn2 v26.4s, v0.4s, v14.4s + trn1 v11.4s, v0.4s, v14.4s + mls v9.8h, v25.8h, v7.h[0] + trn2 v0.2d, v26.2d, v5.2d + ldur q17, [x2, #-16] + mls v15.8h, v16.8h, v7.h[0] + trn2 v23.2d, v11.2d, v18.2d + trn1 v30.2d, v26.2d, v5.2d + ldur q1, [x2, #-32] + mls v6.8h, v19.8h, v7.h[0] + sub v14.8h, v23.8h, v0.8h + trn1 v19.2d, v11.2d, v18.2d + ldur q10, [x2, #-48] + add v31.8h, v23.8h, v0.8h + sqrdmulh v27.8h, v14.8h, v17.8h + add v18.8h, v19.8h, v30.8h + mul v13.8h, v14.8h, v1.8h + sub
v22.8h, v19.8h, v30.8h + sub v26.8h, v18.8h, v31.8h + sqrdmulh v10.8h, v22.8h, v10.8h + sub v25.8h, v9.8h, v29.8h + add v9.8h, v9.8h, v29.8h + mls v13.8h, v27.8h, v7.h[0] + add v2.8h, v18.8h, v31.8h + str q9, [x3], #64 /* store one result vector and advance x3 by 64 bytes; the stur stores later in the iteration fill the other 48 bytes of the block */ + sub v23.8h, v6.8h, v15.8h + add v21.8h, v6.8h, v15.8h + mul v3.8h, v22.8h, v20.8h + mls v3.8h, v10.8h, v7.h[0] + sqrdmulh v20.8h, v26.8h, v24.8h + mul v22.8h, v26.8h, v28.8h + add v12.8h, v3.8h, v13.8h + sub v15.8h, v3.8h, v13.8h + sqrdmulh v13.8h, v23.8h, v4.h[1] + sqrdmulh v0.8h, v15.8h, v24.8h + mul v27.8h, v15.8h, v28.8h + mls v22.8h, v20.8h, v7.h[0] + mls v27.8h, v0.8h, v7.h[0] + sqdmulh v19.8h, v21.8h, v7.h[1] + trn1 v26.4s, v2.4s, v12.4s + mul v10.8h, v23.8h, v4.h[0] + trn2 v2.4s, v2.4s, v12.4s + trn2 v12.4s, v22.4s, v27.4s + trn1 v8.4s, v22.4s, v27.4s + mul v31.8h, v25.8h, v4.h[0] + trn2 v3.2d, v2.2d, v12.2d + sqrdmulh v11.8h, v25.8h, v4.h[1] + trn2 v0.2d, v26.2d, v8.2d + srshr v16.8h, v19.8h, #11 + ldr q4, [x1], #16 + mls v10.8h, v13.8h, v7.h[0] + add v29.8h, v0.8h, v3.8h + trn1 v18.2d, v26.2d, v8.2d + trn1 v20.2d, v2.2d, v12.2d + sqdmulh v15.8h, v29.8h, v7.h[1] + sub v13.8h, v0.8h, v3.8h + mls v21.8h, v16.8h, v7.h[0] + add v9.8h, v18.8h, v20.8h + stur q10, [x3, #-16] + sub v23.8h, v18.8h, v20.8h + mls v31.8h, v11.8h, v7.h[0] + srshr v14.8h, v15.8h, #11 + sqrdmulh v16.8h, v13.8h, v4.h[5] + stur q21, [x3, #-48] + sqdmulh v27.8h, v9.8h, v7.h[1] + stur q31, [x3, #-32] + sub x4, x4, #0x1 + cbnz x4, layer3456_start + + mls v29.8h, v14.8h, v7.h[0] + srshr v1.8h, v27.8h, #11 + mul v11.8h, v13.8h, v4.h[4] + mls v9.8h, v1.8h, v7.h[0] + sqrdmulh v1.8h, v23.8h, v4.h[3] + mul v20.8h, v23.8h, v4.h[2] + sub v21.8h, v9.8h, v29.8h + add v0.8h, v9.8h, v29.8h + mls v11.8h, v16.8h, v7.h[0] + mls v20.8h, v1.8h, v7.h[0] + str q0, [x3], #64 + mul v1.8h, v21.8h, v4.h[0] + sqrdmulh v16.8h, v21.8h, v4.h[1] + add v21.8h, v20.8h, v11.8h + sub v11.8h, v20.8h, v11.8h + sqdmulh v20.8h, v21.8h, v7.h[1] + sqrdmulh v0.8h, v11.8h, v4.h[1] + mul v11.8h, v11.8h, v4.h[0] +
srshr v20.8h, v20.8h, #11 + mls v1.8h, v16.8h, v7.h[0] + mls v11.8h, v0.8h, v7.h[0] + mls v21.8h, v20.8h, v7.h[0] + stur q1, [x3, #-32] + stur q11, [x3, #-16] + stur q21, [x3, #-48] + mov x4, #0x4 /* layer012 part: 4 passes, each working on 16-byte vectors at offsets 0, 64, ..., 448 from x0 */ + ldr q0, [x1], #32 + ldur q1, [x1, #-16] + ldr q6, [x0, #64] + ldr q16, [x0] + ldr q18, [x0, #192] + ldr q27, [x0, #128] + ldr q26, [x0, #320] + ldr q5, [x0, #256] + ldr q4, [x0, #448] + ldr q2, [x0, #384] + add v12.8h, v16.8h, v6.8h + sub v11.8h, v16.8h, v6.8h + add v3.8h, v27.8h, v18.8h + sub v21.8h, v27.8h, v18.8h + sub v18.8h, v5.8h, v26.8h + mul v10.8h, v11.8h, v0.h[6] + add v24.8h, v5.8h, v26.8h + sqrdmulh v27.8h, v18.8h, v1.h[3] + sub v19.8h, v12.8h, v3.8h + add v29.8h, v12.8h, v3.8h + mul v14.8h, v18.8h, v1.h[2] + sub v13.8h, v2.8h, v4.8h + sqrdmulh v31.8h, v21.8h, v1.h[1] + sqrdmulh v26.8h, v11.8h, v0.h[7] + mul v21.8h, v21.8h, v1.h[0] + sub x4, x4, #0x1 + +layer012_start: + mls v14.8h, v27.8h, v7.h[0] + ldr q15, [x0, #16] + ldr q9, [x0, #208] + add v18.8h, v2.8h, v4.8h + mul v17.8h, v13.8h, v1.h[4] + ldr q20, [x0, #80] + ldr q2, [x0, #400] + ldr q5, [x0, #272] + sub v11.8h, v24.8h, v18.8h + sqrdmulh v8.8h, v13.8h, v1.h[5] + ldr q23, [x0, #336] + sqrdmulh v16.8h, v11.8h, v0.h[5] + sub v12.8h, v15.8h, v20.8h + ldr q3, [x0, #144] + add v28.8h, v15.8h, v20.8h + add v4.8h, v24.8h, v18.8h + mul v30.8h, v11.8h, v0.h[4] + sub v20.8h, v5.8h, v23.8h + add v24.8h, v5.8h, v23.8h + mls v17.8h, v8.8h, v7.h[0] + sub v11.8h, v29.8h, v4.8h + mls v30.8h, v16.8h, v7.h[0] + sqrdmulh v27.8h, v20.8h, v1.h[3] + add v16.8h, v14.8h, v17.8h + sub v13.8h, v14.8h, v17.8h + sqrdmulh v23.8h, v19.8h, v0.h[3] + sub v25.8h, v3.8h, v9.8h + add v5.8h, v3.8h, v9.8h + mul v6.8h, v19.8h, v0.h[2] + mul v8.8h, v11.8h, v0.h[0] + mls v10.8h, v26.8h, v7.h[0] + sqrdmulh v26.8h, v12.8h, v0.h[7] + mul v14.8h, v20.8h, v1.h[2] + mul v22.8h, v13.8h, v0.h[4] + mls v21.8h, v31.8h, v7.h[0] + sqrdmulh v9.8h, v11.8h, v0.h[1] + sqrdmulh v20.8h, v13.8h, v0.h[5] + sub v13.8h, v10.8h, v21.8h + add v15.8h, v10.8h,
v21.8h + sqrdmulh v31.8h, v25.8h, v1.h[1] + sqrdmulh v21.8h, v13.8h, v0.h[3] + sub v18.8h, v15.8h, v16.8h + add v3.8h, v15.8h, v16.8h + add v4.8h, v29.8h, v4.8h + mls v22.8h, v20.8h, v7.h[0] + sub v19.8h, v28.8h, v5.8h + mls v6.8h, v23.8h, v7.h[0] + str q4, [x0], #16 /* x0 moves on by 16 bytes here, so the store offsets later in this iteration are 16 less (for example #112 addresses the old x0 offset 128) */ + mul v29.8h, v13.8h, v0.h[2] + mls v29.8h, v21.8h, v7.h[0] + add v11.8h, v6.8h, v30.8h + mls v8.8h, v9.8h, v7.h[0] + str q11, [x0, #112] + sub v11.8h, v6.8h, v30.8h + sqrdmulh v21.8h, v18.8h, v0.h[1] + sqrdmulh v4.8h, v11.8h, v0.h[1] + str q8, [x0, #240] + sub v16.8h, v29.8h, v22.8h + str q3, [x0, #48] + mul v20.8h, v11.8h, v0.h[0] + sqrdmulh v11.8h, v16.8h, v0.h[1] + mls v20.8h, v4.8h, v7.h[0] + mul v23.8h, v16.8h, v0.h[0] + mls v23.8h, v11.8h, v7.h[0] + str q20, [x0, #368] + mul v11.8h, v18.8h, v0.h[0] + mls v11.8h, v21.8h, v7.h[0] + str q23, [x0, #432] + ldr q4, [x0, #448] + mul v10.8h, v12.8h, v0.h[6] + add v12.8h, v29.8h, v22.8h + add v29.8h, v28.8h, v5.8h + mul v21.8h, v25.8h, v1.h[0] + str q12, [x0, #176] + str q11, [x0, #304] + sub v13.8h, v2.8h, v4.8h + sub x4, x4, #0x1 + cbnz x4, layer012_start + + mls v21.8h, v31.8h, v7.h[0] + add v22.8h, v2.8h, v4.8h + sqrdmulh v15.8h, v13.8h, v1.h[5] + add v8.8h, v24.8h, v22.8h + sub v17.8h, v29.8h, v8.8h + mul v28.8h, v13.8h, v1.h[4] + add v29.8h, v29.8h, v8.8h + sub v13.8h, v24.8h, v22.8h + sqrdmulh v25.8h, v17.8h, v0.h[1] + str q29, [x0], #16 + mls v28.8h, v15.8h, v7.h[0] + mls v10.8h, v26.8h, v7.h[0] + mul v29.8h, v17.8h, v0.h[0] + mls v29.8h, v25.8h, v7.h[0] + mls v14.8h, v27.8h, v7.h[0] + sqrdmulh v20.8h, v13.8h, v0.h[5] + str q29, [x0, #240] + mul v4.8h, v13.8h, v0.h[4] + add v12.8h, v10.8h, v21.8h + add v22.8h, v14.8h, v28.8h + sub v8.8h, v10.8h, v21.8h + sqrdmulh v11.8h, v19.8h, v0.h[3] + add v6.8h, v12.8h, v22.8h + sub v3.8h, v14.8h, v28.8h + mls v4.8h, v20.8h, v7.h[0] + str q6, [x0, #48] + sub v16.8h, v12.8h, v22.8h + mul v12.8h, v19.8h, v0.h[2] + mul v14.8h, v3.8h, v0.h[4] + sqrdmulh v22.8h, v3.8h, v0.h[5] + mls v12.8h, v11.8h, v7.h[0] + mul
v20.8h, v8.8h, v0.h[2] + mls v14.8h, v22.8h, v7.h[0] + add v5.8h, v12.8h, v4.8h + sub v21.8h, v12.8h, v4.8h + sqrdmulh v4.8h, v8.8h, v0.h[3] + str q5, [x0, #112] + sqrdmulh v9.8h, v21.8h, v0.h[1] + mul v19.8h, v21.8h, v0.h[0] + mls v20.8h, v4.8h, v7.h[0] + mls v19.8h, v9.8h, v7.h[0] + sqrdmulh v9.8h, v16.8h, v0.h[1] + sub v5.8h, v20.8h, v14.8h + add v4.8h, v20.8h, v14.8h + mul v20.8h, v16.8h, v0.h[0] + str q4, [x0, #176] + sqrdmulh v18.8h, v5.8h, v0.h[1] + str q19, [x0, #368] + mul v23.8h, v5.8h, v0.h[0] + mls v20.8h, v9.8h, v7.h[0] + mls v23.8h, v18.8h, v7.h[0] + str q20, [x0, #304] + str q23, [x0, #432] + ldp d8, d9, [sp] /* restore d8-d15 and release the 64-byte stack frame */ + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #0x40 + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_mulcache_compute.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_mulcache_compute.S new file mode 100644 index 0000000000..cebd18dcb2 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_mulcache_compute.S @@ -0,0 +1,67 @@ +// Copyright (c) 2024 The mlkem-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +// ---------------------------------------------------------------------------- +// Precompute the mulcache data for a polynomial in the NTT domain +// Inputs a[256], z[128] and t[128] (signed 16-bit words); output x[128] (signed 16-bit words) +// +// The input array a is assumed to represent 128 linear polynomials +// in the NTT domain, p_i = a[2i] + a[2i+1] * X where each p_i is in +// Fq[X]/(X^2-zeta^i'), with zeta^i' being a power of zeta = 17, with i +// bit-reversed as used for NTTs. For each such polynomial, the mulcache +// value is a[2i+1] * zeta^i' (modulo 3329 as usual), a value useful to +// perform base multiplication of polynomials efficiently.
The two other +// table arguments z = zetas and t = twisted zetas are expected to point +// to tables of zeta-related constants whose definitions can be found in +// the mlkem-native repo (mlkem/native/aarch64/src/aarch64_zetas.c) or +// our "tests/test.c", as "mulcache_zetas" and "mulcache_zetas_twisted" +// +// extern void mlkem_mulcache_compute +// (int16_t x[128],const int16_t a[256], +// const int16_t z[128],const int16_t t[128]); +// +// Standard ARM ABI: X0 = x, X1 = a, X2 = z, X3 = t +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_mulcache_compute) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_mulcache_compute) + .text + .balign 4 + +S2N_BN_SYMBOL(mlkem_mulcache_compute): + +// This matches the code in the mlkem-native repository +// https://github.com/pq-code-package/mlkem-native/blob/main/mlkem/native/aarch64/src/poly_mulcache_compute_asm.S + + mov w5, #0xd01 /* v6 = 3329, the modulus, in every lane */ + dup v6.8h, w5 + mov w5, #0x4ebf /* v7 = 20159 in every lane. NOTE(review): v7 is reloaded by the ldr q7 in the loop before anything reads it, so this constant appears to be unused here */ + dup v7.8h, w5 + mov x4, #0x10 /* 16 chunks: each consumes 32 bytes of a and 16 bytes of z and t, and emits 16 bytes of x */ + ldr q1, [x1, #0x10] + ldr q27, [x1], #0x20 + ldr q23, [x2], #0x10 + uzp2 v27.8h, v27.8h, v1.8h /* keep the odd-indexed coefficients a[2i+1] of the 16 values just loaded */ + ldr q1, [x3], #0x10 + mul v2.8h, v27.8h, v23.8h /* low 16 bits of the product with the zeta from z */ + sqrdmulh v27.8h, v27.8h, v1.8h /* rounded high half of the doubled product with the twisted zeta from t; the mls by 3329 in the loop finishes the modular multiply */ + sub x4, x4, #0x1 /* 15 loop iterations; the result of the last chunk is stored after the loop */ + +mlkem_mulcache_compute_loop: + ldr q29, [x1, #0x10] + ldr q21, [x2], #0x10 + mls v2.8h, v27.8h, v6.h[0] + ldr q27, [x1], #0x20 + ldr q7, [x3], #0x10 + uzp2 v28.8h, v27.8h, v29.8h + str q2, [x0], #0x10 + mul v2.8h, v28.8h, v21.8h + sqrdmulh v27.8h, v28.8h, v7.8h + sub x4, x4, #0x1 + cbnz x4, mlkem_mulcache_compute_loop + + mls v2.8h, v27.8h, v6.h[0] + str q2, [x0], #0x10 + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_ntt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_ntt.S new file mode 100644 index 0000000000..fda5504d7c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_ntt.S @@ -0,0 +1,363 @@ +// Copyright (c) 2022 Arm Limited +// Copyright (c) 2022 Hanno Becker
+// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +// Copyright (c) 2024 The mlkem-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +// ---------------------------------------------------------------------------- +// Forward number-theoretic transform from ML-KEM +// Input a[256], z_01234[80], z_56[384] (all signed 16-bit words); output a[256] (signed 16-bit words). +// +// The transform is in-place with input and output a[256], with the output +// in bitreversed order. The two other parameters are expected to point to +// tables of constants whose definitions can be found in the mlkem-native +// repo (mlkem/native/aarch64/src/aarch64_zetas.c) or our "tests/test.c". +// +// extern void mlkem_ntt(int16_t a[256],const int16_t z_01234[80],const int16_t z_56[384]); +// +// Standard ARM ABI: X0 = a, X1 = z_01234, X2 = z_56 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_ntt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_ntt) + .text + .balign 4 + +S2N_BN_SYMBOL(mlkem_ntt): + +// This implementation is generated by SLOTHY, set up to optimize for +// the Neoverse N1 microarchitecture, starting from the clean version +// +// https://github.com/pq-code-package/mlkem-native/blob/main/mlkem/native/aarch64/src/ntt_clean.S +// +// in the mlkem-native repository.
+ + sub sp, sp, #0x40 /* reserve 64 bytes of stack; d8-d15 are saved there and restored before ret */ + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + mov w5, #0xd01 /* v7.h[0] = 3329, the modulus used by every mls below */ + mov v7.h[0], w5 + mov w5, #0x4ebf /* v7.h[1] = 20159 */ + mov v7.h[1], w5 + mov x3, x0 /* remember the start of a; x0 is advanced by the layer123 part and reset from x3 afterwards */ + mov x4, #0x4 /* layer123 part: 4 passes, each working on 16-byte vectors at offsets 0, 64, ..., 448 from x0 */ + ldr q0, [x1], #32 + ldur q1, [x1, #-16] + ldr q26, [x0, #384] + ldr q14, [x0, #256] + ldr q12, [x0, #64] + ldr q4, [x0, #192] + ldr q11, [x0] + ldr q22, [x0, #320] + ldr q10, [x0, #448] + ldr q28, [x0, #128] + sqrdmulh v23.8h, v26.8h, v0.h[1] /* sqrdmulh, mul, mls by 3329: modular multiply of v26 by the constant pair in v0.h[0] and v0.h[1] */ + mul v26.8h, v26.8h, v0.h[0] + sqrdmulh v24.8h, v14.8h, v0.h[1] + mul v9.8h, v14.8h, v0.h[0] + sqrdmulh v14.8h, v22.8h, v0.h[1] + mul v22.8h, v22.8h, v0.h[0] + sqrdmulh v29.8h, v10.8h, v0.h[1] + mls v22.8h, v14.8h, v7.h[0] + mls v26.8h, v23.8h, v7.h[0] + mul v14.8h, v10.8h, v0.h[0] + add v10.8h, v12.8h, v22.8h + sub v12.8h, v12.8h, v22.8h + mls v14.8h, v29.8h, v7.h[0] + sub v22.8h, v28.8h, v26.8h + add v23.8h, v28.8h, v26.8h + mls v9.8h, v24.8h, v7.h[0] + sqrdmulh v26.8h, v22.8h, v0.h[5] + sub v28.8h, v4.8h, v14.8h + add v14.8h, v4.8h, v14.8h + mul v13.8h, v22.8h, v0.h[4] + sub v18.8h, v11.8h, v9.8h + sqrdmulh v4.8h, v28.8h, v0.h[5] + sqrdmulh v22.8h, v14.8h, v0.h[3] + mul v28.8h, v28.8h, v0.h[4] + mls v28.8h, v4.8h, v7.h[0] + mul v14.8h, v14.8h, v0.h[2] + mls v14.8h, v22.8h, v7.h[0] + add v4.8h, v12.8h, v28.8h + sub v12.8h, v12.8h, v28.8h + mls v13.8h, v26.8h, v7.h[0] + sqrdmulh v26.8h, v4.8h, v1.h[3] + sub v22.8h, v10.8h, v14.8h + add v19.8h, v10.8h, v14.8h + mul v3.8h, v4.8h, v1.h[2] + sqrdmulh v14.8h, v22.8h, v1.h[1] + mul v28.8h, v22.8h, v1.h[0] + mls v3.8h, v26.8h, v7.h[0] + mls v28.8h, v14.8h, v7.h[0] + sqrdmulh v14.8h, v12.8h, v1.h[5] + mul v26.8h, v12.8h, v1.h[4] + sqrdmulh v21.8h, v19.8h, v0.h[7] + mls v26.8h, v14.8h, v7.h[0] + sub x4, x4, #0x1 /* 3 loop iterations; the stores for the fourth pass are done after the loop */ + +layer123_start: + ldr q17, [x0, #400] + mul v22.8h, v19.8h, v0.h[6] + ldr q24, [x0, #272] + add v19.8h, v18.8h, v13.8h + mls v22.8h, v21.8h, v7.h[0] + sub v21.8h, v18.8h, v13.8h + ldr q13, [x0, #208] + ldr q18, [x0, #80] + sqrdmulh v14.8h, v17.8h, v0.h[1] +
ldr q30, [x0, #336] + ldr q4, [x0, #464] + add v2.8h, v11.8h, v9.8h + sqrdmulh v27.8h, v23.8h, v0.h[3] + ldr q11, [x0, #16] + sub v16.8h, v21.8h, v26.8h + ldr q6, [x0, #144] + sub v29.8h, v19.8h, v3.8h + mul v15.8h, v23.8h, v0.h[2] + add v12.8h, v21.8h, v26.8h + sqrdmulh v26.8h, v30.8h, v0.h[1] + add v3.8h, v19.8h, v3.8h + str q29, [x0, #320] + mls v15.8h, v27.8h, v7.h[0] + str q3, [x0, #256] + str q12, [x0, #384] + mul v10.8h, v17.8h, v0.h[0] + str q16, [x0, #448] + mls v10.8h, v14.8h, v7.h[0] + sub v17.8h, v2.8h, v15.8h + mul v12.8h, v30.8h, v0.h[0] + sub v16.8h, v17.8h, v28.8h + mls v12.8h, v26.8h, v7.h[0] + sub v25.8h, v6.8h, v10.8h + str q16, [x0, #192] + sqrdmulh v8.8h, v24.8h, v0.h[1] + add v23.8h, v6.8h, v10.8h + sqrdmulh v14.8h, v4.8h, v0.h[1] + add v16.8h, v18.8h, v12.8h + mul v9.8h, v24.8h, v0.h[0] + sub v5.8h, v18.8h, v12.8h + add v26.8h, v17.8h, v28.8h + mul v21.8h, v4.8h, v0.h[0] + add v27.8h, v2.8h, v15.8h + str q26, [x0, #128] + mls v21.8h, v14.8h, v7.h[0] + sub v12.8h, v27.8h, v22.8h + add v29.8h, v27.8h, v22.8h + mls v9.8h, v8.8h, v7.h[0] + str q12, [x0, #64] + str q29, [x0], #16 /* last store of the pass; x0 moves on by 16 bytes to the next group of vectors */ + sqrdmulh v18.8h, v25.8h, v0.h[5] + sub v26.8h, v13.8h, v21.8h + add v21.8h, v13.8h, v21.8h + mul v13.8h, v25.8h, v0.h[4] + sqrdmulh v28.8h, v26.8h, v0.h[5] + sqrdmulh v15.8h, v21.8h, v0.h[3] + mul v31.8h, v26.8h, v0.h[4] + mls v31.8h, v28.8h, v7.h[0] + mul v27.8h, v21.8h, v0.h[2] + mls v27.8h, v15.8h, v7.h[0] + add v10.8h, v5.8h, v31.8h + mls v13.8h, v18.8h, v7.h[0] + sub v18.8h, v11.8h, v9.8h + sub v14.8h, v5.8h, v31.8h + sqrdmulh v25.8h, v10.8h, v1.h[3] + sub v26.8h, v16.8h, v27.8h + add v19.8h, v16.8h, v27.8h + mul v3.8h, v10.8h, v1.h[2] + sqrdmulh v29.8h, v26.8h, v1.h[1] + mul v28.8h, v26.8h, v1.h[0] + mls v3.8h, v25.8h, v7.h[0] + mls v28.8h, v29.8h, v7.h[0] + sqrdmulh v4.8h, v14.8h, v1.h[5] + mul v26.8h, v14.8h, v1.h[4] + sqrdmulh v21.8h, v19.8h, v0.h[7] + mls v26.8h, v4.8h, v7.h[0] + sub x4, x4, #0x1 + cbnz x4, layer123_start + + add v20.8h, v18.8h, v13.8h +
sqrdmulh v16.8h, v23.8h, v0.h[3] + sub v8.8h, v18.8h, v13.8h + sub v5.8h, v20.8h, v3.8h + mul v27.8h, v23.8h, v0.h[2] + add v31.8h, v11.8h, v9.8h + add v14.8h, v8.8h, v26.8h + sub v26.8h, v8.8h, v26.8h + str q5, [x0, #320] + mls v27.8h, v16.8h, v7.h[0] + str q14, [x0, #384] + mul v25.8h, v19.8h, v0.h[6] + str q26, [x0, #448] + mls v25.8h, v21.8h, v7.h[0] + sub v26.8h, v31.8h, v27.8h + add v15.8h, v20.8h, v3.8h + add v29.8h, v31.8h, v27.8h + sub v22.8h, v26.8h, v28.8h + str q15, [x0, #256] + add v26.8h, v26.8h, v28.8h + sub v27.8h, v29.8h, v25.8h + add v20.8h, v29.8h, v25.8h + str q26, [x0, #128] + str q22, [x0, #192] + str q27, [x0, #64] + str q20, [x0], #16 + mov x0, x3 /* rewind x0 to the start of a for the layer4567 part */ + mov x4, #0x8 /* layer4567 part: 8 blocks of 64 bytes */ + ldr q11, [x1], #16 + ldr q9, [x0, #48] + ldr q30, [x0, #32] + ldr q12, [x2], #96 + ldur q5, [x2, #-64] + sqrdmulh v23.8h, v9.8h, v11.h[1] + ldr q15, [x0] + mul v6.8h, v9.8h, v11.h[0] + ldr q28, [x0, #16] + sqrdmulh v10.8h, v30.8h, v11.h[1] + mls v6.8h, v23.8h, v7.h[0] + mul v0.8h, v30.8h, v11.h[0] + mls v0.8h, v10.8h, v7.h[0] + ldur q21, [x2, #-48] + add v27.8h, v28.8h, v6.8h + sub v25.8h, v28.8h, v6.8h + mul v6.8h, v27.8h, v11.h[2] + sqrdmulh v17.8h, v27.8h, v11.h[3] + sqrdmulh v19.8h, v25.8h, v11.h[5] + ldur q18, [x2, #-80] + mul v22.8h, v25.8h, v11.h[4] + mls v6.8h, v17.8h, v7.h[0] + mls v22.8h, v19.8h, v7.h[0] + sub x4, x4, #0x1 /* 7 loop iterations; the eighth block is stored after the loop */ + +layer4567_start: + ldr q27, [x0, #112] + ldr q4, [x1], #16 + sub v1.8h, v15.8h, v0.8h + ldr q30, [x0, #96] + add v23.8h, v15.8h, v0.8h + add v11.8h, v1.8h, v22.8h + ldur q10, [x2, #-16] + sub v24.8h, v23.8h, v6.8h + sqrdmulh v25.8h, v27.8h, v4.h[1] + add v9.8h, v23.8h, v6.8h + sub v16.8h, v1.8h, v22.8h + mul v31.8h, v27.8h, v4.h[0] + trn2 v14.4s, v9.4s, v24.4s + sqrdmulh v1.8h, v30.8h, v4.h[1] + trn2 v8.4s, v11.4s, v16.4s + trn1 v11.4s, v11.4s, v16.4s + trn2 v19.2d, v14.2d, v8.2d + mls v31.8h, v25.8h, v7.h[0] + ldr q15, [x0, #80] + trn1 v2.4s, v9.4s, v24.4s + sqrdmulh v17.8h, v19.8h, v18.8h + trn1 v9.2d, v14.2d, v8.2d + mul v29.8h, v19.8h, v12.8h +
trn2 v27.2d, v2.2d, v11.2d + add v0.8h, v15.8h, v31.8h + sub v16.8h, v15.8h, v31.8h + sqrdmulh v13.8h, v27.8h, v18.8h + mls v29.8h, v17.8h, v7.h[0] + ldur q17, [x2, #-32] + ldr q15, [x0, #64] + mul v24.8h, v27.8h, v12.8h + trn1 v19.2d, v2.2d, v11.2d + ldr q18, [x2, #16] + mul v6.8h, v0.8h, v4.h[2] + ldr q12, [x2], #96 + add v25.8h, v9.8h, v29.8h + mls v24.8h, v13.8h, v7.h[0] + sub v20.8h, v9.8h, v29.8h + sqrdmulh v23.8h, v25.8h, v21.8h + mul v3.8h, v25.8h, v5.8h + add v13.8h, v19.8h, v24.8h + sqrdmulh v27.8h, v20.8h, v10.8h + mls v3.8h, v23.8h, v7.h[0] + sub v29.8h, v19.8h, v24.8h + mul v26.8h, v20.8h, v17.8h + mls v26.8h, v27.8h, v7.h[0] + add v27.8h, v13.8h, v3.8h + sqrdmulh v14.8h, v0.8h, v4.h[3] + sub v19.8h, v13.8h, v3.8h + ldur q5, [x2, #-64] + mul v0.8h, v30.8h, v4.h[0] + trn1 v8.4s, v27.4s, v19.4s + sub v21.8h, v29.8h, v26.8h + add v11.8h, v29.8h, v26.8h + trn2 v30.4s, v27.4s, v19.4s + sqrdmulh v19.8h, v16.8h, v4.h[5] + trn2 v10.4s, v11.4s, v21.4s + mul v22.8h, v16.8h, v4.h[4] + trn1 v28.4s, v11.4s, v21.4s + trn1 v29.2d, v30.2d, v10.2d + ldur q21, [x2, #-48] + mls v0.8h, v1.8h, v7.h[0] + trn2 v20.2d, v8.2d, v28.2d + trn2 v26.2d, v30.2d, v10.2d + str q29, [x0, #16] + trn1 v3.2d, v8.2d, v28.2d + mls v22.8h, v19.8h, v7.h[0] + str q20, [x0, #32] + str q26, [x0, #48] + mls v6.8h, v14.8h, v7.h[0] + str q3, [x0], #64 /* completes the 64-byte block and advances x0 to the next one */ + sub x4, x4, #0x1 + cbnz x4, layer4567_start + + add v4.8h, v15.8h, v0.8h + sub v14.8h, v15.8h, v0.8h + ldur q26, [x2, #-16] + add v8.8h, v14.8h, v22.8h + sub v27.8h, v14.8h, v22.8h + sub v13.8h, v4.8h, v6.8h + add v29.8h, v4.8h, v6.8h + trn2 v11.4s, v8.4s, v27.4s + trn2 v1.4s, v29.4s, v13.4s + trn1 v27.4s, v8.4s, v27.4s + trn1 v17.4s, v29.4s, v13.4s + trn2 v28.2d, v1.2d, v11.2d + trn1 v16.2d, v1.2d, v11.2d + sqrdmulh v0.8h, v28.8h, v18.8h + trn2 v19.2d, v17.2d, v27.2d + trn1 v1.2d, v17.2d, v27.2d + mul v27.8h, v28.8h, v12.8h + sqrdmulh v9.8h, v19.8h, v18.8h + mls v27.8h, v0.8h, v7.h[0] + mul v20.8h, v19.8h, v12.8h + mls v20.8h, v9.8h, v7.h[0] + sub
v30.8h, v16.8h, v27.8h + add v25.8h, v16.8h, v27.8h + sqrdmulh v14.8h, v30.8h, v26.8h + ldur q26, [x2, #-32] + sqrdmulh v31.8h, v25.8h, v21.8h + add v27.8h, v1.8h, v20.8h + mul v15.8h, v30.8h, v26.8h + mls v15.8h, v14.8h, v7.h[0] + sub v26.8h, v1.8h, v20.8h + mul v12.8h, v25.8h, v5.8h + mls v12.8h, v31.8h, v7.h[0] + sub v2.8h, v26.8h, v15.8h + add v0.8h, v26.8h, v15.8h + trn2 v14.4s, v0.4s, v2.4s + sub v26.8h, v27.8h, v12.8h + add v27.8h, v27.8h, v12.8h + trn1 v17.4s, v0.4s, v2.4s + trn1 v5.4s, v27.4s, v26.4s + trn2 v26.4s, v27.4s, v26.4s + trn1 v25.2d, v26.2d, v14.2d + trn1 v0.2d, v5.2d, v17.2d + trn2 v21.2d, v26.2d, v14.2d + trn2 v26.2d, v5.2d, v17.2d + str q25, [x0, #16] + str q0, [x0], #64 + stur q21, [x0, #-16] + stur q26, [x0, #-32] + ldp d8, d9, [sp] /* restore d8-d15 and release the 64-byte stack frame */ + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #0x40 + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_poly_reduce.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_poly_reduce.S new file mode 100644 index 0000000000..704edaa67b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_poly_reduce.S @@ -0,0 +1,104 @@ +// Copyright (c) 2024 The mlkem-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +// ---------------------------------------------------------------------------- +// Canonical reduction of polynomial coefficients for ML-KEM +// Input a[256] (signed 16-bit words); output a[256] (signed 16-bit words) +// +// This reduces each element of the 256-element array of 16-bit signed +// integers modulo 3329 with the result being 0 <= r < 3329, in-place. +// This is intended for use when that array represents polynomial +// coefficients for ML-KEM, but that is not relevant to its operation.
+// +// extern void mlkem_poly_reduce(int16_t a[256]); +// +// Standard ARM ABI: X0 = a +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_poly_reduce) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_poly_reduce) + .text + .balign 4 + +S2N_BN_SYMBOL(mlkem_poly_reduce): + +// This matches the code in the mlkem-native repository +// https://github.com/pq-code-package/mlkem-native/blob/main/mlkem/native/aarch64/src/poly_reduce_asm.S + + mov w2, #0xd01 /* v3 = 3329, the modulus, in every lane */ + dup v3.8h, w2 + mov w2, #0x4ebf /* v4 = 20159 = round(2^26 / 3329), the Barrett multiplier */ + dup v4.8h, w2 + mov x1, #0x8 /* 8 blocks of 64 bytes cover the 256 coefficients */ + ldr q21, [x0, #0x20] + ldr q23, [x0, #0x30] + sqdmulh v7.8h, v21.8h, v4.h[0] /* sqdmulh followed by srshr #11 estimates the quotient x * 20159 / 2^26 */ + sqdmulh v30.8h, v23.8h, v4.h[0] + srshr v7.8h, v7.8h, #0xb + srshr v30.8h, v30.8h, #0xb + mls v21.8h, v7.8h, v3.h[0] /* subtract the estimated quotient times 3329 */ + mls v23.8h, v30.8h, v3.h[0] + ldr q5, [x0, #0x10] + sshr v7.8h, v21.8h, #0xf /* all-ones mask in the lanes that are negative */ + sshr v30.8h, v23.8h, #0xf + and v7.16b, v3.16b, v7.16b + add v21.8h, v21.8h, v7.8h /* add 3329 to the negative lanes only */ + and v7.16b, v3.16b, v30.16b + add v16.8h, v23.8h, v7.8h + sub x1, x1, #0x1 /* 7 loop iterations; the remaining stores are done after the loop */ + +mlkem_poly_reduce_loop: + ldr q6, [x0], #0x40 + ldr q30, [x0, #0x20] + sqdmulh v31.8h, v6.8h, v4.h[0] + sqdmulh v29.8h, v5.8h, v4.h[0] + sqdmulh v22.8h, v30.8h, v4.h[0] + stur q16, [x0, #-0x10] + srshr v20.8h, v31.8h, #0xb + srshr v28.8h, v29.8h, #0xb + stur q21, [x0, #-0x20] + mls v6.8h, v20.8h, v3.h[0] + mls v5.8h, v28.8h, v3.h[0] + ldr q2, [x0, #0x30] + sshr v31.8h, v6.8h, #0xf + srshr v19.8h, v22.8h, #0xb + and v22.16b, v3.16b, v31.16b + add v0.8h, v6.8h, v22.8h + mls v30.8h, v19.8h, v3.h[0] + sshr v26.8h, v5.8h, #0xf + sqdmulh v25.8h, v2.8h, v4.h[0] + and v17.16b, v3.16b, v26.16b + add v1.8h, v5.8h, v17.8h + sshr v31.8h, v30.8h, #0xf + srshr v25.8h, v25.8h, #0xb + stur q1, [x0, #-0x30] + and v18.16b, v3.16b, v31.16b + mls v2.8h, v25.8h, v3.h[0] + add v21.8h, v30.8h, v18.8h + ldr q5, [x0, #0x10] + sshr v18.8h, v2.8h, #0xf + stur q0, [x0, #-0x40] + and v27.16b, v3.16b, v18.16b + add v16.8h, v2.8h, v27.8h + sub x1, x1, #0x1 +
cbnz x1, mlkem_poly_reduce_loop + sqdmulh v20.8h, v5.8h, v4.h[0] + ldr q24, [x0], #0x40 + stur q21, [x0, #-0x20] /* v21 and v16 hold the already reduced upper half of the final block */ + srshr v20.8h, v20.8h, #0xb + sqdmulh v25.8h, v24.8h, v4.h[0] + stur q16, [x0, #-0x10] + mls v5.8h, v20.8h, v3.h[0] + srshr v20.8h, v25.8h, #0xb + sshr v2.8h, v5.8h, #0xf + mls v24.8h, v20.8h, v3.h[0] + and v20.16b, v3.16b, v2.16b + add v31.8h, v5.8h, v20.8h + sshr v20.8h, v24.8h, #0xf + stur q31, [x0, #-0x30] + and v31.16b, v3.16b, v20.16b + add v24.8h, v24.8h, v31.8h + stur q24, [x0, #-0x40] + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_poly_tobytes.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_poly_tobytes.S new file mode 100644 index 0000000000..9335d6367a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_poly_tobytes.S @@ -0,0 +1,115 @@ +// Copyright (c) 2024 The mlkem-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +// ---------------------------------------------------------------------------- +// Pack ML-KEM polynomial coefficients as 12-bit numbers +// Input a[256] (signed 16-bit words); output r[384] (bytes) +// +// This accepts an array of 256 16-bit numbers assumed to be in the range +// 0 <= a[i] < 2^12 (typically they will be < 3329, the ML-KEM prime). +// It packs them into the output array as 12-bit unsigned numbers.
+// +// extern void mlkem_poly_tobytes(uint8_t r[384],const int16_t a[256]); +// +// Standard ARM ABI: X0 = r, X1 = a +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_poly_tobytes) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_poly_tobytes) + .text + .balign 4 + +// This code is essentially a verbatim copy of the mlkem-native version +// https://github.com/pq-code-package/mlkem-native/blob/main/mlkem/native/aarch64/src/poly_tobytes_asm.S + +S2N_BN_SYMBOL(mlkem_poly_tobytes): + mov x2, #0x10 + ldr q6, [x1], #0x20 + ldur q24, [x1, #-0x10] + ldr q30, [x1], #0x20 + ldur q22, [x1, #-0x10] + ldr q5, [x1], #0x20 + ldur q17, [x1, #-0x10] + ldr q19, [x1], #0x20 + ldur q4, [x1, #-0x10] + lsr x2, x2, #2 + sub x2, x2, #0x1 + +mlkem_poly_tobytes_asm_asm_loop_start: + uzp1 v25.8h, v6.8h, v24.8h + uzp2 v6.8h, v6.8h, v24.8h + xtn v24.8b, v25.8h + shrn v25.8b, v25.8h, #0x8 + xtn v18.8b, v6.8h + shrn v26.8b, v6.8h, #0x4 + sli v25.8b, v18.8b, #0x4 + st3 { v24.8b, v25.8b, v26.8b }, [x0], #24 + uzp1 v25.8h, v30.8h, v22.8h + uzp2 v6.8h, v30.8h, v22.8h + xtn v24.8b, v25.8h + xtn v18.8b, v6.8h + uzp1 v30.8h, v5.8h, v17.8h + uzp2 v22.8h, v5.8h, v17.8h + xtn v5.8b, v30.8h + xtn v17.8b, v22.8h + uzp1 v28.8h, v19.8h, v4.8h + uzp2 v19.8h, v19.8h, v4.8h + xtn v4.8b, v28.8h + xtn v20.8b, v19.8h + shrn v25.8b, v25.8h, #0x8 + sli v25.8b, v18.8b, #0x4 + shrn v26.8b, v6.8h, #0x4 + st3 { v24.8b, v25.8b, v26.8b }, [x0], #24 + shrn v6.8b, v30.8h, #0x8 + sli v6.8b, v17.8b, #0x4 + shrn v7.8b, v22.8h, #0x4 + st3 { v5.8b, v6.8b, v7.8b }, [x0], #24 + shrn v5.8b, v28.8h, #0x8 + shrn v6.8b, v19.8h, #0x4 + sli v5.8b, v20.8b, #0x4 + st3 { v4.8b, v5.8b, v6.8b }, [x0], #24 + ldr q6, [x1], #0x20 + ldur q24, [x1, #-0x10] + ldr q30, [x1], #0x20 + ldur q22, [x1, #-0x10] + ldr q5, [x1], #0x20 + ldur q17, [x1, #-0x10] + ldr q19, [x1], #0x20 + ldur q4, [x1, #-0x10] + sub x2, x2, #0x1 + cbnz x2, 
mlkem_poly_tobytes_asm_asm_loop_start + uzp1 v25.8h, v30.8h, v22.8h + uzp2 v18.8h, v30.8h, v22.8h + uzp1 v30.8h, v6.8h, v24.8h + uzp2 v6.8h, v6.8h, v24.8h + uzp1 v24.8h, v5.8h, v17.8h + uzp2 v22.8h, v5.8h, v17.8h + uzp1 v5.8h, v19.8h, v4.8h + uzp2 v17.8h, v19.8h, v4.8h + xtn v19.8b, v25.8h + shrn v20.8b, v25.8h, #0x8 + xtn v25.8b, v18.8h + shrn v21.8b, v18.8h, #0x4 + xtn v28.8b, v30.8h + shrn v29.8b, v30.8h, #0x8 + xtn v18.8b, v6.8h + shrn v30.8b, v6.8h, #0x4 + xtn v1.8b, v24.8h + shrn v2.8b, v24.8h, #0x8 + xtn v6.8b, v22.8h + shrn v3.8b, v22.8h, #0x4 + xtn v22.8b, v5.8h + shrn v23.8b, v5.8h, #0x8 + xtn v5.8b, v17.8h + shrn v24.8b, v17.8h, #0x4 + sli v20.8b, v25.8b, #0x4 + sli v29.8b, v18.8b, #0x4 + st3 { v28.8b, v29.8b, v30.8b }, [x0], #24 + st3 { v19.8b, v20.8b, v21.8b }, [x0], #24 + sli v2.8b, v6.8b, #0x4 + st3 { v1.8b, v2.8b, v3.8b }, [x0], #24 + sli v23.8b, v5.8b, #0x4 + st3 { v22.8b, v23.8b, v24.8b }, [x0], #24 + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_poly_tomont.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_poly_tomont.S new file mode 100644 index 0000000000..d3cd763bc0 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_poly_tomont.S @@ -0,0 +1,85 @@ +// Copyright (c) 2024 The mlkem-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +// ---------------------------------------------------------------------------- +// Conversion of ML-KEM polynomial coefficients to Montgomery form +// Input a[256] (signed 16-bit words); output a[256] (signed 16-bit words) +// +// This converts each element of the 256-element array of 16-bit signed +// integers modulo 3329 into Montgomery form, giving a signed result +// satisfying (output[i] == 2^16 * input[i]) (mod 3329), without full +// modular reduction but with |output[i]| < 3329 guaranteed. 
+// +// extern void mlkem_poly_tomont(int16_t a[256]); +// +// Standard ARM ABI: X0 = a +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_poly_tomont) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_poly_tomont) + .text + .balign 4 + +S2N_BN_SYMBOL(mlkem_poly_tomont): + +// This matches the code in the mlkem-native repository +// https://github.com/pq-code-package/mlkem-native/blob/main/mlkem/native/aarch64/src/poly_tomont_asm.S + + mov w2, #0xd01 + dup v4.8h, w2 + mov w2, #0x4ebf + dup v5.8h, w2 + mov w2, #-0x414 + dup v2.8h, w2 + mov w2, #-0x2824 + dup v3.8h, w2 + mov x1, #0x8 + ldr q26, [x0, #0x30] + ldr q23, [x0, #0x10] + mul v17.8h, v26.8h, v2.8h + sqrdmulh v7.8h, v26.8h, v3.8h + ldr q27, [x0, #0x20] + sub x1, x1, #0x1 + +mlkem_poly_tomont_loop: + mls v17.8h, v7.8h, v4.h[0] + sqrdmulh v5.8h, v23.8h, v3.8h + ldr q7, [x0], #0x40 + stur q17, [x0, #-0x10] + sqrdmulh v29.8h, v27.8h, v3.8h + sqrdmulh v19.8h, v7.8h, v3.8h + mul v25.8h, v23.8h, v2.8h + mul v0.8h, v7.8h, v2.8h + mul v26.8h, v27.8h, v2.8h + ldr q7, [x0, #0x30] + mls v25.8h, v5.8h, v4.h[0] + ldr q23, [x0, #0x10] + mls v26.8h, v29.8h, v4.h[0] + mls v0.8h, v19.8h, v4.h[0] + stur q25, [x0, #-0x30] + mul v17.8h, v7.8h, v2.8h + sqrdmulh v7.8h, v7.8h, v3.8h + stur q0, [x0, #-0x40] + ldr q27, [x0, #0x20] + stur q26, [x0, #-0x20] + sub x1, x1, #0x1 + cbnz x1, mlkem_poly_tomont_loop + + mls v17.8h, v7.8h, v4.h[0] + sqrdmulh v7.8h, v23.8h, v3.8h + mul v26.8h, v23.8h, v2.8h + sqrdmulh v25.8h, v27.8h, v3.8h + ldr q23, [x0], #0x40 + mul v27.8h, v27.8h, v2.8h + mls v26.8h, v7.8h, v4.h[0] + sqrdmulh v7.8h, v23.8h, v3.8h + mul v23.8h, v23.8h, v2.8h + stur q17, [x0, #-0x10] + mls v27.8h, v25.8h, v4.h[0] + stur q26, [x0, #-0x30] + mls v23.8h, v7.8h, v4.h[0] + stur q27, [x0, #-0x20] + stur q23, [x0, #-0x40] + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_rej_uniform_VARIABLE_TIME.S 
b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_rej_uniform_VARIABLE_TIME.S new file mode 100644 index 0000000000..e2ebcf5aee --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/mlkem/mlkem_rej_uniform_VARIABLE_TIME.S @@ -0,0 +1,208 @@ +// Copyright (c) 2024 The mlkem-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +// ---------------------------------------------------------------------------- +// Uniform rejection sampling for ML-KEM +// Inputs *buf (unsigned bytes), buflen, table (unsigned bytes); output r[256] (signed 16-bit words), return +// +// extern uint64_t mlkem_rej_uniform_VARIABLE_TIME +// (int16_t r[S2N_BIGNUM_STATIC 256], +// const uint8_t *buf,uint64_t buflen, +// const uint8_t *table); +// +// Interprets the input buffer as packed 12-bit numbers with a length of +// buflen bytes, assumed to be a multiple of 24. Fills the output array +// with those numbers from the packed buffer that are < 3329, in the order +// of appearance, returning the total number of entries written, with a +// maximum of 256. The table argument is a specific precomputed table of +// constants that is defined in this file (see also our test code): +// +// https://github.com/pq-code-package/mlkem-native/blob/main/mlkem/native/aarch64/src/rej_uniform_table.c +// +// Unique (at the moment) among s2n-bignum functions this is *not* a +// constant-time function. The time taken depends not only on the +// buffer size "buflen", but also how many elements of the buffer are +// needed to provide the 256 entries for the output. 
+// +// Standard ARM ABI: X0 = r, X1 = buf, X2 = buflen, X3 = table +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_rej_uniform_VARIABLE_TIME) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_rej_uniform_VARIABLE_TIME) + .text + .balign 4 + +S2N_BN_SYMBOL(mlkem_rej_uniform_VARIABLE_TIME): + +// This is almost identical to the code from mlkem-native: +// +// https://github.com/pq-code-package/mlkem-native/blob/main/mlkem/native/aarch64/src/rej_uniform_asm.S +// +// The only difference is systematic use of full-length scalar registers +// Xnn instead of the mixed use of 32-bit counterparts Wnn in most +// settings where that is applicable. + + sub sp, sp, #0x240 + mov x7, #0x1 + movk x7, #0x2, lsl #16 + movk x7, #0x4, lsl #32 + movk x7, #0x8, lsl #48 + mov v31.d[0], x7 + mov x7, #0x10 + movk x7, #0x20, lsl #16 + movk x7, #0x40, lsl #32 + movk x7, #0x80, lsl #48 + mov v31.d[1], x7 + mov w11, #0xd01 + dup v30.8h, w11 + mov x8, sp + mov x7, x8 + mov w11, #0x0 + eor v16.16b, v16.16b, v16.16b +mlkem_rej_uniform_initial_zero: + str q16, [x7], #0x40 + stur q16, [x7, #-0x30] + stur q16, [x7, #-0x20] + stur q16, [x7, #-0x10] + add x11, x11, #0x20 + cmp x11, #0x100 + b.lt mlkem_rej_uniform_initial_zero + mov x7, x8 + mov w9, #0x0 + mov w4, #0x100 + cmp x2, #0x30 + b.lo mlkem_rej_uniform_loop48_end + +mlkem_rej_uniform_loop48: + cmp x9, x4 + b.hs mlkem_rej_uniform_memory_copy + sub x2, x2, #0x30 + ld3 { v0.16b, v1.16b, v2.16b }, [x1], #48 + zip1 v4.16b, v0.16b, v1.16b + zip2 v5.16b, v0.16b, v1.16b + zip1 v6.16b, v1.16b, v2.16b + zip2 v7.16b, v1.16b, v2.16b + bic v4.8h, #0xf0, lsl #8 + bic v5.8h, #0xf0, lsl #8 + ushr v6.8h, v6.8h, #0x4 + ushr v7.8h, v7.8h, #0x4 + zip1 v16.8h, v4.8h, v6.8h + zip2 v17.8h, v4.8h, v6.8h + zip1 v18.8h, v5.8h, v7.8h + zip2 v19.8h, v5.8h, v7.8h + cmhi v4.8h, v30.8h, v16.8h + cmhi v5.8h, v30.8h, v17.8h + cmhi v6.8h, v30.8h, v18.8h + cmhi v7.8h, v30.8h,
v19.8h + and v4.16b, v4.16b, v31.16b + and v5.16b, v5.16b, v31.16b + and v6.16b, v6.16b, v31.16b + and v7.16b, v7.16b, v31.16b + uaddlv s20, v4.8h + uaddlv s21, v5.8h + uaddlv s22, v6.8h + uaddlv s23, v7.8h + fmov w12, s20 + fmov w13, s21 + fmov w14, s22 + fmov w15, s23 + ldr q24, [x3, x12, lsl #4] + ldr q25, [x3, x13, lsl #4] + ldr q26, [x3, x14, lsl #4] + ldr q27, [x3, x15, lsl #4] + cnt v4.16b, v4.16b + cnt v5.16b, v5.16b + cnt v6.16b, v6.16b + cnt v7.16b, v7.16b + uaddlv s20, v4.8h + uaddlv s21, v5.8h + uaddlv s22, v6.8h + uaddlv s23, v7.8h + fmov w12, s20 + fmov w13, s21 + fmov w14, s22 + fmov w15, s23 + tbl v16.16b, { v16.16b }, v24.16b + tbl v17.16b, { v17.16b }, v25.16b + tbl v18.16b, { v18.16b }, v26.16b + tbl v19.16b, { v19.16b }, v27.16b + str q16, [x7] + add x7, x7, x12, lsl #1 + str q17, [x7] + add x7, x7, x13, lsl #1 + str q18, [x7] + add x7, x7, x14, lsl #1 + str q19, [x7] + add x7, x7, x15, lsl #1 + add x12, x12, x13 + add x14, x14, x15 + add x9, x9, x12 + add x9, x9, x14 + cmp x2, #0x30 + b.hs mlkem_rej_uniform_loop48 + +mlkem_rej_uniform_loop48_end: + cmp x9, x4 + b.hs mlkem_rej_uniform_memory_copy + cmp x2, #0x18 + b.lo mlkem_rej_uniform_memory_copy + sub x2, x2, #0x18 + ld3 { v0.8b, v1.8b, v2.8b }, [x1], #24 + zip1 v4.16b, v0.16b, v1.16b + zip1 v5.16b, v1.16b, v2.16b + bic v4.8h, #0xf0, lsl #8 + ushr v5.8h, v5.8h, #0x4 + zip1 v16.8h, v4.8h, v5.8h + zip2 v17.8h, v4.8h, v5.8h + cmhi v4.8h, v30.8h, v16.8h + cmhi v5.8h, v30.8h, v17.8h + and v4.16b, v4.16b, v31.16b + and v5.16b, v5.16b, v31.16b + uaddlv s20, v4.8h + uaddlv s21, v5.8h + fmov w12, s20 + fmov w13, s21 + ldr q24, [x3, x12, lsl #4] + ldr q25, [x3, x13, lsl #4] + cnt v4.16b, v4.16b + cnt v5.16b, v5.16b + uaddlv s20, v4.8h + uaddlv s21, v5.8h + fmov w12, s20 + fmov w13, s21 + tbl v16.16b, { v16.16b }, v24.16b + tbl v17.16b, { v17.16b }, v25.16b + str q16, [x7] + add x7, x7, x12, lsl #1 + str q17, [x7] + add x7, x7, x13, lsl #1 + add x9, x9, x12 + add x9, x9, x13 + 
+mlkem_rej_uniform_memory_copy: + cmp x9, x4 + csel x9, x9, x4, lo + mov x11, #0x0 + mov x7, x8 + +mlkem_rej_uniform_final_copy: + ldr q16, [x7], #0x40 + ldur q17, [x7, #-0x30] + ldur q18, [x7, #-0x20] + ldur q19, [x7, #-0x10] + str q16, [x0], #0x40 + stur q17, [x0, #-0x30] + stur q18, [x0, #-0x20] + stur q19, [x0, #-0x10] + add x11, x11, #0x20 + cmp x11, #0x100 + b.lt mlkem_rej_uniform_final_copy + mov x0, x9 + b mlkem_rej_uniform_return + +mlkem_rej_uniform_return: + add sp, sp, #0x240 + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/include/s2n-bignum.h b/third_party/s2n-bignum/s2n-bignum-imported/include/s2n-bignum.h index faecfec52a..2d740073fe 100644 --- a/third_party/s2n-bignum/s2n-bignum-imported/include/s2n-bignum.h +++ b/third_party/s2n-bignum/s2n-bignum-imported/include/s2n-bignum.h @@ -978,6 +978,46 @@ extern void edwards25519_scalarmulbase_alt(uint64_t res[S2N_BIGNUM_STATIC 8],con extern void edwards25519_scalarmuldouble(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4], const uint64_t point[S2N_BIGNUM_STATIC 8],const uint64_t bscalar[S2N_BIGNUM_STATIC 4]); extern void edwards25519_scalarmuldouble_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4], const uint64_t point[S2N_BIGNUM_STATIC 8],const uint64_t bscalar[S2N_BIGNUM_STATIC 4]); +// Scalar product of 2-element polynomial vectors in NTT domain, with mulcache +// Inputs a[512], b[512], bt[256] (signed 16-bit words); output r[256] (signed 16-bit words) +extern void mlkem_basemul_k2(int16_t r[S2N_BIGNUM_STATIC 256],const int16_t a[S2N_BIGNUM_STATIC 512],const int16_t b[S2N_BIGNUM_STATIC 512],const int16_t bt[S2N_BIGNUM_STATIC 256]); + +// Scalar product of 3-element polynomial vectors in NTT domain, with mulcache +// Inputs a[768], b[768], bt[384] (signed 16-bit words); output r[256] (signed 16-bit words) +extern void mlkem_basemul_k3(int16_t r[S2N_BIGNUM_STATIC 256],const int16_t a[S2N_BIGNUM_STATIC 768],const int16_t 
b[S2N_BIGNUM_STATIC 768],const int16_t bt[S2N_BIGNUM_STATIC 384]); + +// Scalar product of 4-element polynomial vectors in NTT domain, with mulcache +// Inputs a[1024], b[1024], bt[512] (signed 16-bit words); output r[256] (signed 16-bit words) +extern void mlkem_basemul_k4(int16_t r[S2N_BIGNUM_STATIC 256],const int16_t a[S2N_BIGNUM_STATIC 1024],const int16_t b[S2N_BIGNUM_STATIC 1024],const int16_t bt[S2N_BIGNUM_STATIC 512]); + +// Inverse number-theoretic transform from ML-KEM +// Input a[256] (signed 16-bit words), z_01234[80] (signed 16-bit words), z_56[384] (signed 16-bit words); output a[256] (signed 16-bit words) +extern void mlkem_intt(int16_t a[S2N_BIGNUM_STATIC 256],const int16_t z_01234[S2N_BIGNUM_STATIC 80],const int16_t z_56[S2N_BIGNUM_STATIC 384]); + +// Precompute the mulcache data for a polynomial in the NTT domain +// Inputs a[256], z[128] and t[128] (signed 16-bit words); output x[128] (signed 16-bit words) +extern void mlkem_mulcache_compute(int16_t x[S2N_BIGNUM_STATIC 128],const int16_t a[S2N_BIGNUM_STATIC 256],const int16_t z[S2N_BIGNUM_STATIC 128],const int16_t t[S2N_BIGNUM_STATIC 128]); + +// Forward number-theoretic transform from ML-KEM +// Input a[256] (signed 16-bit words), z_01234[80] (signed 16-bit words), z_56[384] (signed 16-bit words); output a[256] (signed 16-bit words) +extern void mlkem_ntt(int16_t a[S2N_BIGNUM_STATIC 256],const int16_t z_01234[S2N_BIGNUM_STATIC 80],const int16_t z_56[S2N_BIGNUM_STATIC 384]); + +// Canonical modular reduction of polynomial coefficients for ML-KEM +// Input a[256] (signed 16-bit words); output a[256] (signed 16-bit words) +extern void mlkem_poly_reduce(int16_t a[S2N_BIGNUM_STATIC 256]); + +// Pack ML-KEM polynomial coefficients as 12-bit numbers +// Input a[256] (signed 16-bit words); output r[384] (bytes) +extern void mlkem_poly_tobytes(uint8_t r[S2N_BIGNUM_STATIC 384],const int16_t a[S2N_BIGNUM_STATIC 256]); + +// Conversion of ML-KEM polynomial coefficients to Montgomery form +// Input a[256] 
(signed 16-bit words); output a[256] (signed 16-bit words) +extern void mlkem_poly_tomont(int16_t a[S2N_BIGNUM_STATIC 256]); + +// Uniform rejection sampling for ML-KEM +// Inputs *buf (unsigned bytes), buflen, table (unsigned bytes); output r[256] (signed 16-bit words), return +extern uint64_t mlkem_rej_uniform_VARIABLE_TIME(int16_t r[S2N_BIGNUM_STATIC 256],const uint8_t *buf,uint64_t buflen,const uint8_t *table); + // Point addition on NIST curve P-256 in Montgomery-Jacobian coordinates // Inputs p1[12], p2[12]; output p3[12] extern void p256_montjadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]);