Skip to content

Commit f0a1a04

Browse files
committed
Fix non-clang build (probably)
Fix some mistakes
1 parent 89e4aee commit f0a1a04

File tree

1 file changed

+25
-5
lines changed

1 file changed

+25
-5
lines changed

src/mono/mono/metadata/metadata.c

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1495,7 +1495,9 @@ mono_metadata_decode_row_col_raw (const MonoTableInfo *t, int idx, guint col)
14951495
return 0;
14961496
}
14971497

1498-
static MONO_ALWAYS_INLINE guint32
1498+
// This will inline into mono_metadata_decode_value_simd on targets that can't use
1499+
// the simd version of the algorithm.
1500+
MONO_ALWAYS_INLINE static guint32
14991501
decode_value_scalar (const guint8 *ptr, const guint8 **new_ptr)
15001502
{
15011503
guint32 result;
@@ -1519,9 +1521,13 @@ decode_value_scalar (const guint8 *ptr, const guint8 **new_ptr)
15191521
ptr += 5;
15201522
}
15211523

1524+
if (new_ptr)
1525+
*new_ptr = ptr;
1526+
15221527
return result;
15231528
}
15241529

1530+
// This is meant to be wrapped by our existing decode_value functions, and inline into them.
15251531
MONO_ALWAYS_INLINE guint32
15261532
mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
15271533
{
@@ -1531,7 +1537,7 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
15311537
typedef guint8 v64_u1 __attribute__ ((vector_size (8)));
15321538
typedef guint32 v64_u4 __attribute__ ((vector_size (8)));
15331539

1534-
// this will generate vectorized code on x64 and wasm as long as SIMD is enabled
1540+
// this will generate vectorized code on x64 and wasm as long as SIMD is enabled at build time.
15351541
// if simd isn't enabled, it generates fairly adequate scalar code.
15361542
// *(bytes *)ptr and *(guint32 *)ptr by themselves don't force an i32 load of
15371543
// ptr in either x64 or wasm clang, so this is the only way to prefetch all the bytes
@@ -1540,13 +1546,15 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
15401546
// we could overrun the source buffer by up to 7 bytes, but doing that on wasm is
15411547
// safe unless we're decoding from the absolute end of memory.
15421548
// we pad all buffers by 16 bytes in mono_wasm_load_bytes_into_heap, so we're fine
1543-
// clang happily upsizes these 8-byte vectors into 16-byte ones for wasm, and uses 8-byte
1549+
// clang happily upsizes these 8-byte vectors into 16-byte ones for wasm, and uses 8-byte-wide
15441550
// insns on x64 as appropriate. armv8 looks fine too, albeit a little weird
15451551
union {
15461552
v64_u1 b;
15471553
v64_u4 i;
15481554
} v;
1549-
v.b = *(v64_u1 *)ptr;
1555+
// ideally we would load 5 bytes, but it won't use a vector load then
1556+
// memcpy instead of a regular pointer dereference, to say 'this is unaligned'
1557+
memcpy(&v, ptr, sizeof(v));
15501558
// mask and shift two bits so we can have a 4-element jump table in wasm
15511559
guint8 flags = (v.b[0] & (0x80u | 0x40u)) >> 6;
15521560
switch (flags) {
@@ -1571,8 +1579,13 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
15711579
// i don't know why the default case is necessary here, but without it the jump table has 5 entries.
15721580
default:
15731581
// (b * 0x80) != 0, and (b & 0x40) != 0
1582+
// for some reason on wasm the 'v.b[0]' load generates an '& 255',
1583+
// even if we cache it in a guint8 local
15741584
if (v.b[0] == 0xFFu) {
15751585
// v.b = { ptr[4], ptr[3], ptr[2], ptr[1] }
1586+
// on x64 this generates kind of gross code, i.e.
1587+
// pshufd, pshufhw, pshufd, pshuflw, packuswb
1588+
// but on wasm it's fine
15761589
v.b = __builtin_shufflevector(
15771590
v.b, v.b,
15781591
4, 3, 2, 1, -1, -1, -1, -1
@@ -1582,12 +1595,19 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
15821595
ptr += 5;
15831596
} else {
15841597
// v.b = { ptr[3], ptr[2], ptr[1], ptr[0] }
1598+
#ifdef USE_BSWAP
1599+
// generates much smaller x64 assembly, but terrible wasm
1600+
// interestingly, clang for arm automatically does this to the shuffle below
1601+
result = __builtin_bswap32(v.i[0]) & 0x1FFFFFFFu;
1602+
#else
1603+
// on x64 unlike the above this generates a single pshuflw + pack
15851604
v.b = __builtin_shufflevector(
15861605
v.b, v.b,
15871606
3, 2, 1, 0, -1, -1, -1, -1
15881607
);
15891608
// result = v.b[0..3] where v.b[0] &= 0x1F
15901609
result = v.i[0] & 0x1FFFFFFFu;
1610+
#endif
15911611
ptr += 4;
15921612
}
15931613
break;
@@ -1598,7 +1618,7 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
15981618

15991619
return result;
16001620
#else
1601-
return decode_value_scalar (ptr, rptr);
1621+
return decode_value_scalar (ptr, new_ptr);
16021622
#endif
16031623
}
16041624

0 commit comments

Comments
 (0)