Skip to content

Commit 4bbb61a

Browse files
committed
Add VPCLMULQDQ Instructions
This adds the vectorized PCLMULQDQ extensions. Test vectors are the same as for the old version or compare outputs.
1 parent 9b9a7d5 commit 4bbb61a

File tree

2 files changed

+184
-0
lines changed

2 files changed

+184
-0
lines changed
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
//! Vectorized Carry-less Multiplication (VCLMUL)
2+
//!
3+
//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
4+
//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref] (p. 4-241).
5+
//!
6+
//! [intel64_ref]: http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
7+
8+
use crate::core_arch::x86::__m256i;
9+
use crate::core_arch::x86::__m512i;
10+
11+
#[cfg(test)]
12+
use crate::stdarch_test::assert_instr;
13+
14+
#[allow(improper_ctypes)]
15+
extern "C" {
16+
#[link_name = "llvm.x86.pclmulqdq.256"]
17+
fn pclmulqdq_256(a: __m256i, round_key: __m256i, imm8: u8) -> __m256i;
18+
#[link_name = "llvm.x86.pclmulqdq.512"]
19+
fn pclmulqdq_512(a: __m512i, round_key: __m512i, imm8: u8) -> __m512i;
20+
}
21+
22+
// for some odd reason on x86_64 we generate the correct long name instructions
23+
// but on i686 we generate the short name + imm8
24+
// so we need to special-case on that...
25+
26+
/// Performs a carry-less multiplication of two 64-bit polynomials over the
27+
/// finite field GF(2^k) - in each of the 4 128-bit lanes.
28+
///
29+
/// The immediate byte is used for determining which halves of each lane `a` and `b`
30+
/// should be used. Immediate bits other than 0 and 4 are ignored.
31+
/// All lanes share immediate byte.
32+
///
33+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_clmulepi64_epi128)
34+
#[inline]
35+
#[target_feature(enable = "avx512vpclmulqdq,avx512f")]
36+
// technically according to Intel's documentation we don't need avx512f here, however LLVM gets confused otherwise
37+
#[cfg_attr(test, assert_instr(vpclmul, imm8 = 0))]
38+
#[rustc_args_required_const(2)]
39+
pub unsafe fn _mm512_clmulepi64_epi128(a: __m512i, b: __m512i, imm8: i32) -> __m512i {
40+
macro_rules! call {
41+
($imm8:expr) => {
42+
pclmulqdq_512(a, b, $imm8)
43+
};
44+
}
45+
constify_imm8!(imm8, call)
46+
}
47+
48+
/// Performs a carry-less multiplication of two 64-bit polynomials over the
49+
/// finite field GF(2^k) - in each of the 2 128-bit lanes.
50+
///
51+
/// The immediate byte is used for determining which halves of each lane `a` and `b`
52+
/// should be used. Immediate bits other than 0 and 4 are ignored.
53+
/// All lanes share immediate byte.
54+
///
55+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_clmulepi64_epi128)
56+
#[inline]
57+
#[target_feature(enable = "avx512vpclmulqdq,avx512vl")]
58+
#[cfg_attr(test, assert_instr(vpclmul, imm8 = 0))]
59+
#[rustc_args_required_const(2)]
60+
macro_rules! verify_kat_pclmul {
61+
($broadcast:ident, $clmul:ident, $assert:ident) => {
62+
// Constants taken from https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
63+
let a = _mm_set_epi64x(0x7b5b546573745665, 0x63746f725d53475d);
64+
let a = $broadcast(a);
65+
let b = _mm_set_epi64x(0x4869285368617929, 0x5b477565726f6e5d);
66+
let b = $broadcast(b);
67+
let r00 = _mm_set_epi64x(0x1d4d84c85c3440c0, 0x929633d5d36f0451);
68+
let r00 = $broadcast(r00);
69+
let r01 = _mm_set_epi64x(0x1bd17c8d556ab5a1, 0x7fa540ac2a281315);
70+
let r01 = $broadcast(r01);
71+
let r10 = _mm_set_epi64x(0x1a2bf6db3a30862f, 0xbabf262df4b7d5c9);
72+
let r10 = $broadcast(r10);
73+
let r11 = _mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed);
74+
let r11 = $broadcast(r11);
75+
76+
$assert($clmul(a, b, 0x00), r00);
77+
$assert($clmul(a, b, 0x10), r01);
78+
$assert($clmul(a, b, 0x01), r10);
79+
$assert($clmul(a, b, 0x11), r11);
80+
81+
let a0 = _mm_set_epi64x(0x0000000000000000, 0x8000000000000000);
82+
let a0 = $broadcast(a0);
83+
let r = _mm_set_epi64x(0x4000000000000000, 0x0000000000000000);
84+
let r = $broadcast(r);
85+
$assert($clmul(a0, a0, 0x00), r);
86+
}
87+
}
88+
89+
macro_rules! unroll {
90+
($target:ident[4] = $op:ident($source:ident,4);) => {
91+
$target[3] = $op($source,3);
92+
$target[2] = $op($source,2);
93+
unroll!{$target[2] = $op($source,2);}
94+
};
95+
($target:ident[2] = $op:ident($source:ident,2);) => {
96+
$target[1] = $op($source,1);
97+
$target[0] = $op($source,0);
98+
};
99+
(assert_eq_m128i($op:ident($vec_res:ident,4),$lin_res:ident[4]);) => {
100+
assert_eq_m128i($op($vec_res,3),$lin_res[3]);
101+
assert_eq_m128i($op($vec_res,2),$lin_res[2]);
102+
unroll!{assert_eq_m128i($op($vec_res,2),$lin_res[2]);}
103+
};
104+
(assert_eq_m128i($op:ident($vec_res:ident,2),$lin_res:ident[2]);) => {
105+
assert_eq_m128i($op($vec_res,1),$lin_res[1]);
106+
assert_eq_m128i($op($vec_res,0),$lin_res[0]);
107+
}
108+
}
109+
110+
// this function tests one of the possible 4 instances
111+
// with different inputs across lanes
112+
#[target_feature(enable = "avx512vpclmulqdq,avx512f")]
113+
unsafe fn verify_512_helper(linear : unsafe fn(__m128i,__m128i)->__m128i, vectorized : unsafe fn(__m512i,__m512i)->__m512i) {
114+
let a = _mm512_set_epi64(
115+
0xDCB4DB3657BF0B7D, 0x18DB0601068EDD9F, 0xB76B908233200DC5, 0xE478235FA8E22D5E,
116+
0xAB05CFFA2621154C, 0x1171B47A186174C9, 0x8C6B6C0E7595CEC9, 0xBE3E7D4934E961BD
117+
);
118+
let b = _mm512_set_epi64(
119+
0x672F6F105A94CEA7, 0x8298B8FFCA5F829C, 0xA3927047B3FB61D8, 0x978093862CDE7187,
120+
0xB1927AB22F31D0EC, 0xA9A5DA619BE4D7AF, 0xCA2590F56884FDC6, 0x19BE9F660038BDB5
121+
);
122+
123+
let mut a_decomp = [_mm_setzero_si128();4];
124+
unroll! {a_decomp[4] = _mm512_extracti32x4_epi32(a,4);}
125+
let mut b_decomp = [_mm_setzero_si128();4];
126+
unroll! {b_decomp[4] = _mm512_extracti32x4_epi32(b,4);}
127+
128+
let r = vectorized(a, b);
129+
let mut e_decomp = [_mm_setzero_si128();4];
130+
for i in 0..4 {
131+
e_decomp[i] = linear(a_decomp[i],b_decomp[i]);
132+
}
133+
unroll!{assert_eq_m128i(_mm512_extracti32x4_epi32(r,4),e_decomp[4]);}
134+
}
135+
136+
// this function tests one of the possible 4 instances
137+
// with different inputs across lanes for the VL version
138+
#[target_feature(enable = "avx512vpclmulqdq,avx512vl")]
139+
unsafe fn verify_256_helper(linear : unsafe fn(__m128i,__m128i)->__m128i, vectorized : unsafe fn(__m256i,__m256i)->__m256i) {
140+
let a = _mm512_set_epi64(
141+
0xDCB4DB3657BF0B7D, 0x18DB0601068EDD9F, 0xB76B908233200DC5, 0xE478235FA8E22D5E,
142+
0xAB05CFFA2621154C, 0x1171B47A186174C9, 0x8C6B6C0E7595CEC9, 0xBE3E7D4934E961BD
143+
);
144+
let b = _mm512_set_epi64(
145+
0x672F6F105A94CEA7, 0x8298B8FFCA5F829C, 0xA3927047B3FB61D8, 0x978093862CDE7187,
146+
0xB1927AB22F31D0EC, 0xA9A5DA619BE4D7AF, 0xCA2590F56884FDC6, 0x19BE9F660038BDB5
147+
);
148+
149+
let mut a_decomp = [_mm_setzero_si128();2];
150+
unroll! {a_decomp[2] = _mm512_extracti32x4_epi32(a,2);}
151+
let mut b_decomp = [_mm_setzero_si128();2];
152+
unroll! {b_decomp[2] = _mm512_extracti32x4_epi32(b,2);}
153+
154+
let r = vectorized(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b, 0));
155+
let mut e_decomp = [_mm_setzero_si128();2];
156+
for i in 0..2 {
157+
e_decomp[i] = linear(a_decomp[i],b_decomp[i]);
158+
}
159+
unroll!{assert_eq_m128i(_mm256_extracti128_si256(r,2),e_decomp[2]);}
160+
}
161+
162+
#[simd_test(enable = "avx512vpclmulqdq,avx512f")]
163+
unsafe fn test_mm512_clmulepi64_epi128() {
164+
verify_kat_pclmul!(_mm512_broadcast_i32x4,_mm512_clmulepi64_epi128,assert_eq_m512i);
165+
166+
verify_512_helper(|a,b|_mm_clmulepi64_si128(a, b, 0x00),|a,b|_mm512_clmulepi64_epi128(a, b, 0x00));
167+
verify_512_helper(|a,b|_mm_clmulepi64_si128(a, b, 0x01),|a,b|_mm512_clmulepi64_epi128(a, b, 0x01));
168+
verify_512_helper(|a,b|_mm_clmulepi64_si128(a, b, 0x10),|a,b|_mm512_clmulepi64_epi128(a, b, 0x10));
169+
verify_512_helper(|a,b|_mm_clmulepi64_si128(a, b, 0x11),|a,b|_mm512_clmulepi64_epi128(a, b, 0x11));
170+
}
171+
172+
#[simd_test(enable = "avx512vpclmulqdq,avx512vl")]
173+
unsafe fn test_mm256_clmulepi64_epi128() {
174+
verify_kat_pclmul!(_mm256_broadcastsi128_si256,_mm256_clmulepi64_epi128,assert_eq_m256i);
175+
176+
verify_256_helper(|a,b|_mm_clmulepi64_si128(a, b, 0x00),|a,b|_mm256_clmulepi64_epi128(a, b, 0x00));
177+
verify_256_helper(|a,b|_mm_clmulepi64_si128(a, b, 0x01),|a,b|_mm256_clmulepi64_epi128(a, b, 0x01));
178+
verify_256_helper(|a,b|_mm_clmulepi64_si128(a, b, 0x10),|a,b|_mm256_clmulepi64_epi128(a, b, 0x10));
179+
verify_256_helper(|a,b|_mm_clmulepi64_si128(a, b, 0x11),|a,b|_mm256_clmulepi64_epi128(a, b, 0x11));
180+
}
181+
}

crates/core_arch/src/x86/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -651,6 +651,9 @@ pub use self::avx512f::*;
651651
mod avx512ifma;
652652
pub use self::avx512ifma::*;
653653

654+
mod avx512vpclmulqdq;
655+
pub use self::avx512vpclmulqdq::*;
656+
654657
mod bt;
655658
pub use self::bt::*;
656659

0 commit comments

Comments
 (0)