//! Vectorized Carry-less Multiplication (VCLMUL)
//!
//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref] (p. 4-241).
//!
//! [intel64_ref]: http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf

use crate::core_arch::x86::__m256i;
use crate::core_arch::x86::__m512i;

#[cfg(test)]
use stdarch_test::assert_instr;

#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.pclmulqdq.256"]
    fn pclmulqdq_256(a: __m256i, b: __m256i, imm8: u8) -> __m256i;
    #[link_name = "llvm.x86.pclmulqdq.512"]
    fn pclmulqdq_512(a: __m512i, b: __m512i, imm8: u8) -> __m512i;
}

// For some odd reason we generate the full long-name instruction (vpclmulqdq)
// on x86_64, but on i686 we generate the short name plus the imm8 operand, so
// the `assert_instr` checks below only match on the shared `vpclmul` prefix.

/// Performs a carry-less multiplication of two 64-bit polynomials over the
/// finite field GF(2), in each of the 4 128-bit lanes.
///
/// The immediate byte determines which 64-bit half of each 128-bit lane of `a`
/// and `b` is used: bit 0 selects the half of `a` and bit 4 selects the half
/// of `b`; all other immediate bits are ignored. All lanes share the same
/// immediate byte.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_clmulepi64_epi128)
#[inline]
#[target_feature(enable = "avx512vpclmulqdq,avx512f")]
// Technically, according to Intel's documentation, avx512f is not needed here;
// however, LLVM gets confused without it.
#[cfg_attr(test, assert_instr(vpclmul, imm8 = 0))]
#[rustc_args_required_const(2)]
pub unsafe fn _mm512_clmulepi64_epi128(a: __m512i, b: __m512i, imm8: i32) -> __m512i {
    macro_rules! call {
        ($imm8:expr) => {
            pclmulqdq_512(a, b, $imm8)
        };
    }
    constify_imm8!(imm8, call)
}
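
// A minimal usage sketch (hypothetical helper, for illustration only): imm8
// selects one 64-bit half per operand, so 0x10 multiplies the low qword of
// each 128-bit lane of `a` (imm8 bit 0 = 0) by the high qword of the
// corresponding lane of `b` (imm8 bit 4 = 1):
//
//     #[target_feature(enable = "avx512vpclmulqdq,avx512f")]
//     unsafe fn clmul_lo_hi(a: __m512i, b: __m512i) -> __m512i {
//         _mm512_clmulepi64_epi128(a, b, 0x10)
//     }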

/// Performs a carry-less multiplication of two 64-bit polynomials over the
/// finite field GF(2), in each of the 2 128-bit lanes.
///
/// The immediate byte determines which 64-bit half of each 128-bit lane of `a`
/// and `b` is used: bit 0 selects the half of `a` and bit 4 selects the half
/// of `b`; all other immediate bits are ignored. Both lanes share the same
/// immediate byte.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_clmulepi64_epi128)
#[inline]
#[target_feature(enable = "avx512vpclmulqdq,avx512vl")]
#[cfg_attr(test, assert_instr(vpclmul, imm8 = 0))]
#[rustc_args_required_const(2)]
pub unsafe fn _mm256_clmulepi64_epi128(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
    macro_rules! call {
        ($imm8:expr) => {
            pclmulqdq_256(a, b, $imm8)
        };
    }
    constify_imm8!(imm8, call)
}

#[cfg(test)]
mod tests {
    // The constants in the tests below are just bit patterns; they should not
    // be interpreted as integers. Signedness does not make sense for them, but
    // __mXXXi happens to be defined in terms of signed integers.
    #![allow(overflowing_literals)]

    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    macro_rules! verify_kat_pclmul {
        ($broadcast:ident, $clmul:ident, $assert:ident) => {
            // Known-answer test constants taken from
            // https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
            let a = _mm_set_epi64x(0x7b5b546573745665, 0x63746f725d53475d);
            let a = $broadcast(a);
            let b = _mm_set_epi64x(0x4869285368617929, 0x5b477565726f6e5d);
            let b = $broadcast(b);
            let r00 = _mm_set_epi64x(0x1d4d84c85c3440c0, 0x929633d5d36f0451);
            let r00 = $broadcast(r00);
            let r01 = _mm_set_epi64x(0x1bd17c8d556ab5a1, 0x7fa540ac2a281315);
            let r01 = $broadcast(r01);
            let r10 = _mm_set_epi64x(0x1a2bf6db3a30862f, 0xbabf262df4b7d5c9);
            let r10 = $broadcast(r10);
            let r11 = _mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed);
            let r11 = $broadcast(r11);

            $assert($clmul(a, b, 0x00), r00);
            $assert($clmul(a, b, 0x10), r01);
            $assert($clmul(a, b, 0x01), r10);
            $assert($clmul(a, b, 0x11), r11);

            // A single high bit squared: x^63 * x^63 = x^126, i.e. bit 62 of
            // the upper qword, demonstrating that no carries are propagated.
            let a0 = _mm_set_epi64x(0x0000000000000000, 0x8000000000000000);
            let a0 = $broadcast(a0);
            let r = _mm_set_epi64x(0x4000000000000000, 0x0000000000000000);
            let r = $broadcast(r);
            $assert($clmul(a0, a0, 0x00), r);
        }
    }
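
    // For reference: carry-less multiplication is ordinary long multiplication
    // with XOR in place of addition, so no carries propagate between columns
    // (e.g. 0b11 * 0b11 = 0b101, not 0b1001). Below is a minimal scalar sketch
    // of the per-lane operation; this hypothetical helper is for illustration
    // only and is not used by the tests.
    #[allow(dead_code)]
    fn clmul_u64_reference(a: u64, b: u64) -> u128 {
        let mut r: u128 = 0;
        for i in 0..64 {
            // XOR in a shifted copy of `a` for every set bit of `b`.
            if (b >> i) & 1 == 1 {
                r ^= (a as u128) << i;
            }
        }
        r
    }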

    // Manually unrolls array-assignment/assertion loops over lane indices,
    // since the extract intrinsics require a constant lane index. The `[4]`
    // arms handle lanes 3 and 2, then recurse into the `[2]` arms, which
    // handle lanes 1 and 0.
    macro_rules! unroll {
        ($target:ident[4] = $op:ident($source:ident,4);) => {
            $target[3] = $op($source, 3);
            $target[2] = $op($source, 2);
            unroll! {$target[2] = $op($source,2);}
        };
        ($target:ident[2] = $op:ident($source:ident,2);) => {
            $target[1] = $op($source, 1);
            $target[0] = $op($source, 0);
        };
        (assert_eq_m128i($op:ident($vec_res:ident,4),$lin_res:ident[4]);) => {
            assert_eq_m128i($op($vec_res, 3), $lin_res[3]);
            assert_eq_m128i($op($vec_res, 2), $lin_res[2]);
            unroll! {assert_eq_m128i($op($vec_res,2),$lin_res[2]);}
        };
        (assert_eq_m128i($op:ident($vec_res:ident,2),$lin_res:ident[2]);) => {
            assert_eq_m128i($op($vec_res, 1), $lin_res[1]);
            assert_eq_m128i($op($vec_res, 0), $lin_res[0]);
        }
    }

    // Tests one of the four possible imm8 variants, comparing each 128-bit
    // lane of the vectorized result against the scalar 128-bit intrinsic,
    // with different inputs across lanes.
    #[target_feature(enable = "avx512vpclmulqdq,avx512f")]
    unsafe fn verify_512_helper(
        linear: unsafe fn(__m128i, __m128i) -> __m128i,
        vectorized: unsafe fn(__m512i, __m512i) -> __m512i,
    ) {
        let a = _mm512_set_epi64(
            0xDCB4DB3657BF0B7D, 0x18DB0601068EDD9F, 0xB76B908233200DC5, 0xE478235FA8E22D5E,
            0xAB05CFFA2621154C, 0x1171B47A186174C9, 0x8C6B6C0E7595CEC9, 0xBE3E7D4934E961BD,
        );
        let b = _mm512_set_epi64(
            0x672F6F105A94CEA7, 0x8298B8FFCA5F829C, 0xA3927047B3FB61D8, 0x978093862CDE7187,
            0xB1927AB22F31D0EC, 0xA9A5DA619BE4D7AF, 0xCA2590F56884FDC6, 0x19BE9F660038BDB5,
        );

        // Split the 512-bit inputs into four 128-bit lanes.
        let mut a_decomp = [_mm_setzero_si128(); 4];
        unroll! {a_decomp[4] = _mm512_extracti32x4_epi32(a,4);}
        let mut b_decomp = [_mm_setzero_si128(); 4];
        unroll! {b_decomp[4] = _mm512_extracti32x4_epi32(b,4);}

        let r = vectorized(a, b);
        // Compute the expected result lane-by-lane with the scalar intrinsic.
        let mut e_decomp = [_mm_setzero_si128(); 4];
        for i in 0..4 {
            e_decomp[i] = linear(a_decomp[i], b_decomp[i]);
        }
        unroll! {assert_eq_m128i(_mm512_extracti32x4_epi32(r,4),e_decomp[4]);}
    }

    // Tests one of the four possible imm8 variants for the 256-bit (VL)
    // version, comparing both 128-bit lanes of the vectorized result against
    // the scalar 128-bit intrinsic, with different inputs across lanes. Only
    // the lower 256 bits of the 512-bit test inputs are used.
    #[target_feature(enable = "avx512vpclmulqdq,avx512vl")]
    unsafe fn verify_256_helper(
        linear: unsafe fn(__m128i, __m128i) -> __m128i,
        vectorized: unsafe fn(__m256i, __m256i) -> __m256i,
    ) {
        let a = _mm512_set_epi64(
            0xDCB4DB3657BF0B7D, 0x18DB0601068EDD9F, 0xB76B908233200DC5, 0xE478235FA8E22D5E,
            0xAB05CFFA2621154C, 0x1171B47A186174C9, 0x8C6B6C0E7595CEC9, 0xBE3E7D4934E961BD,
        );
        let b = _mm512_set_epi64(
            0x672F6F105A94CEA7, 0x8298B8FFCA5F829C, 0xA3927047B3FB61D8, 0x978093862CDE7187,
            0xB1927AB22F31D0EC, 0xA9A5DA619BE4D7AF, 0xCA2590F56884FDC6, 0x19BE9F660038BDB5,
        );

        // Split the lower 256 bits of the inputs into two 128-bit lanes.
        let mut a_decomp = [_mm_setzero_si128(); 2];
        unroll! {a_decomp[2] = _mm512_extracti32x4_epi32(a,2);}
        let mut b_decomp = [_mm_setzero_si128(); 2];
        unroll! {b_decomp[2] = _mm512_extracti32x4_epi32(b,2);}

        let r = vectorized(
            _mm512_extracti64x4_epi64(a, 0),
            _mm512_extracti64x4_epi64(b, 0),
        );
        // Compute the expected result lane-by-lane with the scalar intrinsic.
        let mut e_decomp = [_mm_setzero_si128(); 2];
        for i in 0..2 {
            e_decomp[i] = linear(a_decomp[i], b_decomp[i]);
        }
        unroll! {assert_eq_m128i(_mm256_extracti128_si256(r,2),e_decomp[2]);}
    }

    #[simd_test(enable = "avx512vpclmulqdq,avx512f")]
    unsafe fn test_mm512_clmulepi64_epi128() {
        verify_kat_pclmul!(_mm512_broadcast_i32x4, _mm512_clmulepi64_epi128, assert_eq_m512i);

        verify_512_helper(|a, b| _mm_clmulepi64_si128(a, b, 0x00), |a, b| _mm512_clmulepi64_epi128(a, b, 0x00));
        verify_512_helper(|a, b| _mm_clmulepi64_si128(a, b, 0x01), |a, b| _mm512_clmulepi64_epi128(a, b, 0x01));
        verify_512_helper(|a, b| _mm_clmulepi64_si128(a, b, 0x10), |a, b| _mm512_clmulepi64_epi128(a, b, 0x10));
        verify_512_helper(|a, b| _mm_clmulepi64_si128(a, b, 0x11), |a, b| _mm512_clmulepi64_epi128(a, b, 0x11));
    }

    #[simd_test(enable = "avx512vpclmulqdq,avx512vl")]
    unsafe fn test_mm256_clmulepi64_epi128() {
        verify_kat_pclmul!(_mm256_broadcastsi128_si256, _mm256_clmulepi64_epi128, assert_eq_m256i);

        verify_256_helper(|a, b| _mm_clmulepi64_si128(a, b, 0x00), |a, b| _mm256_clmulepi64_epi128(a, b, 0x00));
        verify_256_helper(|a, b| _mm_clmulepi64_si128(a, b, 0x01), |a, b| _mm256_clmulepi64_epi128(a, b, 0x01));
        verify_256_helper(|a, b| _mm_clmulepi64_si128(a, b, 0x10), |a, b| _mm256_clmulepi64_epi128(a, b, 0x10));
        verify_256_helper(|a, b| _mm_clmulepi64_si128(a, b, 0x11), |a, b| _mm256_clmulepi64_epi128(a, b, 0x11));
    }
}