@@ -1502,7 +1502,9 @@ mono_metadata_decode_row_col_raw (const MonoTableInfo *t, int idx, guint col)
15021502 return 0 ;
15031503}
15041504
1505- static MONO_ALWAYS_INLINE guint32
1505+ // This will inline into mono_metadata_decode_value_simd on targets that can't use
1506+ // the simd version of the algorithm.
1507+ MONO_ALWAYS_INLINE static guint32
15061508decode_value_scalar (const guint8 * ptr , const guint8 * * new_ptr )
15071509{
15081510 guint32 result ;
@@ -1526,9 +1528,13 @@ decode_value_scalar (const guint8 *ptr, const guint8 **new_ptr)
15261528 ptr += 5 ;
15271529 }
15281530
1531+ if (new_ptr )
1532+ * new_ptr = ptr ;
1533+
15291534 return result ;
15301535}
15311536
1537+ // This is meant to be wrapped by our existing decode_value functions, and inline into them.
15321538MONO_ALWAYS_INLINE guint32
15331539mono_metadata_decode_value_simd (const guint8 * ptr , const guint8 * * new_ptr )
15341540{
@@ -1538,7 +1544,7 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
15381544 typedef guint8 v64_u1 __attribute__ ((vector_size (8 )));
15391545 typedef guint32 v64_u4 __attribute__ ((vector_size (8 )));
15401546
1541- // this will generate vectorized code on x64 and wasm as long as SIMD is enabled
1547+ // this will generate vectorized code on x64 and wasm as long as SIMD is enabled at build time.
15421548 // if simd isn't enabled, it generates fairly adequate scalar code.
15431549 // *(bytes *)ptr and *(guint32 *)ptr by themselves don't force an i32 load of
15441550 // ptr in either x64 or wasm clang, so this is the only way to prefetch all the bytes
@@ -1547,13 +1553,15 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
15471553 // we could overrun the source buffer by up to 7 bytes, but doing that on wasm is
15481554 // safe unless we're decoding from the absolute end of memory.
15491555 // we pad all buffers by 16 bytes in mono_wasm_load_bytes_into_heap, so we're fine
1550- // clang happily upsizes these 8-byte vectors into 16-byte ones for wasm, and uses 8-byte
1556+ // clang happily upsizes these 8-byte vectors into 16-byte ones for wasm, and uses 8-byte-wide
15511557 // insns on x64 as appropriate. armv8 looks fine too, albeit a little weird
15521558 union {
15531559 v64_u1 b ;
15541560 v64_u4 i ;
15551561 } v ;
1556- v .b = * (v64_u1 * )ptr ;
1562+ // ideally we would load 5 bytes, but it won't use a vector load then
1563+ // memcpy instead of a regular pointer dereference, to say 'this is unaligned'
1564+ memcpy (& v , ptr , sizeof (v ));
15571565 // mask and shift two bits so we can have a 4-element jump table in wasm
15581566 guint8 flags = (v .b [0 ] & (0x80u | 0x40u )) >> 6 ;
15591567 switch (flags ) {
@@ -1578,8 +1586,13 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
15781586 // i don't know why the default case is necessary here, but without it the jump table has 5 entries.
15791587 default :
15801588 // (b * 0x80) != 0, and (b & 0x40) != 0
1589+ // for some reason on wasm the 'v.b[0]' load generates an '& 255',
1590+ // even if we cache it in a guint8 local
15811591 if (v .b [0 ] == 0xFFu ) {
15821592 // v.b = { ptr[4], ptr[3], ptr[2], ptr[1] }
1593+ // on x64 this generates kind of gross code, i.e.
1594+ // pshufd, pshufhw, pshufd, pshuflw, packuswb
1595+ // but on wasm it's fine
15831596 v .b = __builtin_shufflevector (
15841597 v .b , v .b ,
15851598 4 , 3 , 2 , 1 , -1 , -1 , -1 , -1
@@ -1589,12 +1602,19 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
15891602 ptr += 5 ;
15901603 } else {
15911604 // v.b = { ptr[3], ptr[2], ptr[1], ptr[0] }
1605+ #ifdef USE_BSWAP
1606+ // generates much smaller x64 assembly, but terrible wasm
1607+ // interestingly, clang for arm automatically does this to the shuffle below
1608+ result = __builtin_bswap32 (v .i [0 ]) & 0x1FFFFFFFu ;
1609+ #else
1610+ // on x64 unlike the above this generates a single pshuflw + pack
15921611 v .b = __builtin_shufflevector (
15931612 v .b , v .b ,
15941613 3 , 2 , 1 , 0 , -1 , -1 , -1 , -1
15951614 );
15961615 // result = v.b[0..3] where v.b[0] &= 0x1F
15971616 result = v .i [0 ] & 0x1FFFFFFFu ;
1617+ #endif
15981618 ptr += 4 ;
15991619 }
16001620 break ;
@@ -1605,7 +1625,7 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
16051625
16061626 return result ;
16071627#else
1608- return decode_value_scalar (ptr , rptr );
1628+ return decode_value_scalar (ptr , new_ptr );
16091629#endif
16101630}
16111631
0 commit comments