@@ -1495,7 +1495,9 @@ mono_metadata_decode_row_col_raw (const MonoTableInfo *t, int idx, guint col)
14951495 return 0 ;
14961496}
14971497
1498- static MONO_ALWAYS_INLINE guint32
1498+ // This will inline into mono_metadata_decode_value_simd on targets that can't use
1499+ // the simd version of the algorithm.
1500+ MONO_ALWAYS_INLINE static guint32
14991501decode_value_scalar (const guint8 * ptr , const guint8 * * new_ptr )
15001502{
15011503 guint32 result ;
@@ -1519,9 +1521,13 @@ decode_value_scalar (const guint8 *ptr, const guint8 **new_ptr)
15191521 ptr += 5 ;
15201522 }
15211523
1524+ if (new_ptr )
1525+ * new_ptr = ptr ;
1526+
15221527 return result ;
15231528}
15241529
1530+ // This is meant to be wrapped by our existing decode_value functions, and inline into them.
15251531MONO_ALWAYS_INLINE guint32
15261532mono_metadata_decode_value_simd (const guint8 * ptr , const guint8 * * new_ptr )
15271533{
@@ -1531,7 +1537,7 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
15311537 typedef guint8 v64_u1 __attribute__ ((vector_size (8 )));
15321538 typedef guint32 v64_u4 __attribute__ ((vector_size (8 )));
15331539
1534- // this will generate vectorized code on x64 and wasm as long as SIMD is enabled
1540+ // this will generate vectorized code on x64 and wasm as long as SIMD is enabled at build time.
15351541 // if simd isn't enabled, it generates fairly adequate scalar code.
15361542 // *(bytes *)ptr and *(guint32 *)ptr by themselves don't force an i32 load of
15371543 // ptr in either x64 or wasm clang, so this is the only way to prefetch all the bytes
@@ -1540,13 +1546,15 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
15401546 // we could overrun the source buffer by up to 7 bytes, but doing that on wasm is
15411547 // safe unless we're decoding from the absolute end of memory.
15421548 // we pad all buffers by 16 bytes in mono_wasm_load_bytes_into_heap, so we're fine
1543- // clang happily upsizes these 8-byte vectors into 16-byte ones for wasm, and uses 8-byte
1549+ // clang happily upsizes these 8-byte vectors into 16-byte ones for wasm, and uses 8-byte-wide
15441550 // insns on x64 as appropriate. armv8 looks fine too, albeit a little weird
15451551 union {
15461552 v64_u1 b ;
15471553 v64_u4 i ;
15481554 } v ;
1549- v .b = * (v64_u1 * )ptr ;
1555+ // ideally we would load 5 bytes, but it won't use a vector load then
1556+ // memcpy instead of a regular pointer dereference, to say 'this is unaligned'
1557+ memcpy (& v , ptr , sizeof (v ));
15501558 // mask and shift two bits so we can have a 4-element jump table in wasm
15511559 guint8 flags = (v .b [0 ] & (0x80u | 0x40u )) >> 6 ;
15521560 switch (flags ) {
@@ -1571,8 +1579,13 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
15711579 // i don't know why the default case is necessary here, but without it the jump table has 5 entries.
15721580 default :
15731581 // (b * 0x80) != 0, and (b & 0x40) != 0
1582+ // for some reason on wasm the 'v.b[0]' load generates an '& 255',
1583+ // even if we cache it in a guint8 local
15741584 if (v .b [0 ] == 0xFFu ) {
15751585 // v.b = { ptr[4], ptr[3], ptr[2], ptr[1] }
1586+ // on x64 this generates kind of gross code, i.e.
1587+ // pshufd, pshufhw, pshufd, pshuflw, packuswb
1588+ // but on wasm it's fine
15761589 v .b = __builtin_shufflevector (
15771590 v .b , v .b ,
15781591 4 , 3 , 2 , 1 , -1 , -1 , -1 , -1
@@ -1582,12 +1595,19 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
15821595 ptr += 5 ;
15831596 } else {
15841597 // v.b = { ptr[3], ptr[2], ptr[1], ptr[0] }
1598+ #ifdef USE_BSWAP
1599+ // generates much smaller x64 assembly, but terrible wasm
1600+ // interestingly, clang for arm automatically does this to the shuffle below
1601+ result = __builtin_bswap32 (v .i [0 ]) & 0x1FFFFFFFu ;
1602+ #else
1603+ // on x64 unlike the above this generates a single pshuflw + pack
15851604 v .b = __builtin_shufflevector (
15861605 v .b , v .b ,
15871606 3 , 2 , 1 , 0 , -1 , -1 , -1 , -1
15881607 );
15891608 // result = v.b[0..3] where v.b[0] &= 0x1F
15901609 result = v .i [0 ] & 0x1FFFFFFFu ;
1610+ #endif
15911611 ptr += 4 ;
15921612 }
15931613 break ;
@@ -1598,7 +1618,7 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
15981618
15991619 return result ;
16001620#else
1601- return decode_value_scalar (ptr , rptr );
1621+ return decode_value_scalar (ptr , new_ptr );
16021622#endif
16031623}
16041624
0 commit comments