Fix non-clang build (probably)

kg · kg · commit f0a1a0411054 · 2024-03-21T10:04:37.000-07:00
Fix some mistakes
diff --git a/src/mono/mono/metadata/metadata.c b/src/mono/mono/metadata/metadata.c
@@ -1495,7 +1495,9 @@ mono_metadata_decode_row_col_raw (const MonoTableInfo *t, int idx, guint col)
 	return 0;
 }
 
-static MONO_ALWAYS_INLINE guint32
+// This will inline into mono_metadata_decode_value_simd on targets that can't use
+//  the simd version of the algorithm.
+MONO_ALWAYS_INLINE static guint32
 decode_value_scalar (const guint8 *ptr, const guint8 **new_ptr)
 {
 	guint32 result;
@@ -1519,9 +1521,13 @@ decode_value_scalar (const guint8 *ptr, const guint8 **new_ptr)
 		ptr += 5;
 	}
 
+	if (new_ptr)
+		*new_ptr = ptr;
+
 	return result;
 }
 
+// This is meant to be wrapped by our existing decode_value functions, and inline into them.
 MONO_ALWAYS_INLINE guint32
 mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
 {
@@ -1531,7 +1537,7 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
 	typedef guint8 v64_u1 __attribute__ ((vector_size (8)));
 	typedef guint32 v64_u4 __attribute__ ((vector_size (8)));
 
-	// this will generate vectorized code on x64 and wasm as long as SIMD is enabled
+	// this will generate vectorized code on x64 and wasm as long as SIMD is enabled at build time.
 	// if simd isn't enabled, it generates fairly adequate scalar code.
 	// *(bytes *)ptr and *(guint32 *)ptr by themselves don't force an i32 load of
 	//  ptr in either x64 or wasm clang, so this is the only way to prefetch all the bytes
@@ -1540,13 +1546,15 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
 	// we could overrun the source buffer by up to 7 bytes, but doing that on wasm is
 	//  safe unless we're decoding from the absolute end of memory.
 	// we pad all buffers by 16 bytes in mono_wasm_load_bytes_into_heap, so we're fine
-	// clang happily upsizes these 8-byte vectors into 16-byte ones for wasm, and uses 8-byte
+	// clang happily upsizes these 8-byte vectors into 16-byte ones for wasm, and uses 8-byte-wide
 	//  insns on x64 as appropriate. armv8 looks fine too, albeit a little weird
 	union {
 		v64_u1 b;
 		v64_u4 i;
 	} v;
-	v.b = *(v64_u1 *)ptr;
+	// ideally we would load 5 bytes, but it won't use a vector load then
+	// memcpy instead of a regular pointer dereference, to say 'this is unaligned'
+	memcpy(&v, ptr, sizeof(v));
 	// mask and shift two bits so we can have a 4-element jump table in wasm
 	guint8 flags = (v.b[0] & (0x80u | 0x40u)) >> 6;
 	switch (flags) {
@@ -1571,8 +1579,13 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
 		// i don't know why the default case is necessary here, but without it the jump table has 5 entries.
 		default:
 			// (b * 0x80) != 0, and (b & 0x40) != 0
+			// for some reason on wasm the 'v.b[0]' load generates an '& 255',
+			//  even if we cache it in a guint8 local
 			if (v.b[0] == 0xFFu) {
 				// v.b = { ptr[4], ptr[3], ptr[2], ptr[1] }
+				// on x64 this generates kind of gross code, i.e.
+				// pshufd, pshufhw, pshufd, pshuflw, packuswb
+				// but on wasm it's fine
 				v.b = __builtin_shufflevector(
 					v.b, v.b,
 					4, 3, 2, 1, -1, -1, -1, -1
@@ -1582,12 +1595,19 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
 				ptr += 5;
 			} else {
 				// v.b = { ptr[3], ptr[2], ptr[1], ptr[0] }
+#ifdef USE_BSWAP
+				// generates much smaller x64 assembly, but terrible wasm
+				// interestingly, clang for arm automatically does this to the shuffle below
+				result = __builtin_bswap32(v.i[0]) & 0x1FFFFFFFu;
+#else
+				// on x64 unlike the above this generates a single pshuflw + pack
 				v.b = __builtin_shufflevector(
 					v.b, v.b,
 					3, 2, 1, 0, -1, -1, -1, -1
 				);
 				// result = v.b[0..3] where v.b[0] &= 0x1F
 				result = v.i[0] & 0x1FFFFFFFu;
+#endif
 				ptr += 4;
 			}
 			break;
@@ -1598,7 +1618,7 @@ mono_metadata_decode_value_simd (const guint8 *ptr, const guint8 **new_ptr)
 
 	return result;
 #else
-	return decode_value_scalar (ptr, rptr);
+	return decode_value_scalar (ptr, new_ptr);
 #endif
 }