Skip to content

Commit d26b37a

Browse files
committed
Fix non-clang build (probably)
Fix some mistakes
1 parent c467360 commit d26b37a

File tree

1 file changed

+25
-5
lines changed

1 file changed

+25
-5
lines changed

src/mono/mono/metadata/metadata.c

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1502,7 +1502,9 @@ mono_metadata_decode_row_col_raw (const MonoTableInfo *t, int idx, guint col)
15021502
return 0;
15031503
}
15041504

1505-
static MONO_ALWAYS_INLINE guint32
1505+
// This will inline into mono_metadata_decode_value_simd on targets that can't use
1506+
// the simd version of the algorithm.
1507+
MONO_ALWAYS_INLINE static guint32
15061508
decode_value_scalar (const guint8 *ptr, const guint8 **new_ptr)
15071509
{
15081510
guint32 result;
@@ -1526,9 +1528,13 @@ decode_value_scalar (const guint8 *ptr, const guint8 **new_ptr)
15261528
ptr += 5;
15271529
}
15281530

1531+
if (new_ptr)
1532+
*new_ptr = ptr;
1533+
15291534
return result;
15301535
}
15311536

1537+
// This is meant to be wrapped by our existing decode_value functions, and inline into them.
15321538
MONO_ALWAYS_INLINE guint32
15331539
mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
15341540
{
@@ -1538,7 +1544,7 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
15381544
typedef guint8 v64_u1 __attribute__ ((vector_size (8)));
15391545
typedef guint32 v64_u4 __attribute__ ((vector_size (8)));
15401546

1541-
// this will generate vectorized code on x64 and wasm as long as SIMD is enabled
1547+
// this will generate vectorized code on x64 and wasm as long as SIMD is enabled at build time.
15421548
// if simd isn't enabled, it generates fairly adequate scalar code.
15431549
// *(bytes *)ptr and *(guint32 *)ptr by themselves don't force an i32 load of
15441550
// ptr in either x64 or wasm clang, so this is the only way to prefetch all the bytes
@@ -1547,13 +1553,15 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
15471553
// we could overrun the source buffer by up to 7 bytes, but doing that on wasm is
15481554
// safe unless we're decoding from the absolute end of memory.
15491555
// we pad all buffers by 16 bytes in mono_wasm_load_bytes_into_heap, so we're fine
1550-
// clang happily upsizes these 8-byte vectors into 16-byte ones for wasm, and uses 8-byte
1556+
// clang happily upsizes these 8-byte vectors into 16-byte ones for wasm, and uses 8-byte-wide
15511557
// insns on x64 as appropriate. armv8 looks fine too, albeit a little weird
15521558
union {
15531559
v64_u1 b;
15541560
v64_u4 i;
15551561
} v;
1556-
v.b = *(v64_u1 *)ptr;
1562+
// ideally we would load 5 bytes, but it won't use a vector load then
1563+
// memcpy instead of a regular pointer dereference, to say 'this is unaligned'
1564+
memcpy(&v, ptr, sizeof(v));
15571565
// mask and shift two bits so we can have a 4-element jump table in wasm
15581566
guint8 flags = (v.b[0] & (0x80u | 0x40u)) >> 6;
15591567
switch (flags) {
@@ -1578,8 +1586,13 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
15781586
// i don't know why the default case is necessary here, but without it the jump table has 5 entries.
15791587
default:
15801588
// (b * 0x80) != 0, and (b & 0x40) != 0
1589+
// for some reason on wasm the 'v.b[0]' load generates an '& 255',
1590+
// even if we cache it in a guint8 local
15811591
if (v.b[0] == 0xFFu) {
15821592
// v.b = { ptr[4], ptr[3], ptr[2], ptr[1] }
1593+
// on x64 this generates kind of gross code, i.e.
1594+
// pshufd, pshufhw, pshufd, pshuflw, packuswb
1595+
// but on wasm it's fine
15831596
v.b = __builtin_shufflevector(
15841597
v.b, v.b,
15851598
4, 3, 2, 1, -1, -1, -1, -1
@@ -1589,12 +1602,19 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
15891602
ptr += 5;
15901603
} else {
15911604
// v.b = { ptr[3], ptr[2], ptr[1], ptr[0] }
1605+
#ifdef USE_BSWAP
1606+
// generates much smaller x64 assembly, but terrible wasm
1607+
// interestingly, clang for arm automatically does this to the shuffle below
1608+
result = __builtin_bswap32(v.i[0]) & 0x1FFFFFFFu;
1609+
#else
1610+
// on x64 unlike the above this generates a single pshuflw + pack
15921611
v.b = __builtin_shufflevector(
15931612
v.b, v.b,
15941613
3, 2, 1, 0, -1, -1, -1, -1
15951614
);
15961615
// result = v.b[0..3] where v.b[0] &= 0x1F
15971616
result = v.i[0] & 0x1FFFFFFFu;
1617+
#endif
15981618
ptr += 4;
15991619
}
16001620
break;
@@ -1605,7 +1625,7 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
16051625

16061626
return result;
16071627
#else
1608-
return decode_value_scalar (ptr, rptr);
1628+
return decode_value_scalar (ptr, new_ptr);
16091629
#endif
16101630
}
16111631

0 commit comments

Comments
 (0)