Skip to content

Commit 3406cba

Browse files
ngzhianCommit Bot
authored andcommitted
[wasm-simd][arm64] Bitmask instructions
Implement i8x16.bitmask, i16x8.bitmask, i32x4.bitmask on interpreter and arm64. These operations are behind wasm_simd_post_mvp flag, as we are only prototyping to evaluate performance. The codegen is based on guidance at WebAssembly/simd#201. Bug: v8:10308 Change-Id: I835aa8a23e677a00ee7897c1c31a028850e238a9 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2099451 Reviewed-by: Tobias Tebbi <[email protected]> Reviewed-by: Deepti Gandluri <[email protected]> Commit-Queue: Zhi An Ng <[email protected]> Cr-Commit-Position: refs/heads/master@{#66793}
1 parent ca5ee9d commit 3406cba

13 files changed

+193
-0
lines changed

src/compiler/backend/arm64/code-generator-arm64.cc

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2128,6 +2128,21 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
21282128
SIMD_BINOP_CASE(kArm64I32x4GtU, Cmhi, 4S);
21292129
SIMD_BINOP_CASE(kArm64I32x4GeU, Cmhs, 4S);
21302130
SIMD_UNOP_CASE(kArm64I32x4Abs, Abs, 4S);
2131+
case kArm64I32x4BitMask: {
2132+
Register dst = i.OutputRegister32();
2133+
VRegister src = i.InputSimd128Register(0);
2134+
VRegister tmp = i.TempSimd128Register(0);
2135+
VRegister mask = i.TempSimd128Register(1);
2136+
2137+
__ Sshr(tmp.V4S(), src.V4S(), 31);
2138+
// Set i-th bit of each lane i. When ANDed with tmp, the lanes whose
2139+
// sign bit is set keep their i-th bit; all other lanes become 0.
2140+
__ Movi(mask.V2D(), 0x0000'0008'0000'0004, 0x0000'0002'0000'0001);
2141+
__ And(tmp.V16B(), mask.V16B(), tmp.V16B());
2142+
__ Addv(tmp.S(), tmp.V4S());
2143+
__ Mov(dst.W(), tmp.V4S(), 0);
2144+
break;
2145+
}
21312146
case kArm64I16x8Splat: {
21322147
__ Dup(i.OutputSimd128Register().V8H(), i.InputRegister32(0));
21332148
break;
@@ -2229,6 +2244,21 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
22292244
SIMD_BINOP_CASE(kArm64I16x8GeU, Cmhs, 8H);
22302245
SIMD_BINOP_CASE(kArm64I16x8RoundingAverageU, Urhadd, 8H);
22312246
SIMD_UNOP_CASE(kArm64I16x8Abs, Abs, 8H);
2247+
case kArm64I16x8BitMask: {
2248+
Register dst = i.OutputRegister32();
2249+
VRegister src = i.InputSimd128Register(0);
2250+
VRegister tmp = i.TempSimd128Register(0);
2251+
VRegister mask = i.TempSimd128Register(1);
2252+
2253+
__ Sshr(tmp.V8H(), src.V8H(), 15);
2254+
// Set i-th bit of each lane i. When ANDed with tmp, the lanes whose
2255+
// sign bit is set keep their i-th bit; all other lanes become 0.
2256+
__ Movi(mask.V2D(), 0x0080'0040'0020'0010, 0x0008'0004'0002'0001);
2257+
__ And(tmp.V16B(), mask.V16B(), tmp.V16B());
2258+
__ Addv(tmp.H(), tmp.V8H());
2259+
__ Mov(dst.W(), tmp.V8H(), 0);
2260+
break;
2261+
}
22322262
case kArm64I8x16Splat: {
22332263
__ Dup(i.OutputSimd128Register().V16B(), i.InputRegister32(0));
22342264
break;
@@ -2318,6 +2348,23 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
23182348
SIMD_BINOP_CASE(kArm64I8x16GeU, Cmhs, 16B);
23192349
SIMD_BINOP_CASE(kArm64I8x16RoundingAverageU, Urhadd, 16B);
23202350
SIMD_UNOP_CASE(kArm64I8x16Abs, Abs, 16B);
2351+
case kArm64I8x16BitMask: {
2352+
Register dst = i.OutputRegister32();
2353+
VRegister src = i.InputSimd128Register(0);
2354+
VRegister tmp = i.TempSimd128Register(0);
2355+
VRegister mask = i.TempSimd128Register(1);
2356+
2357+
// Set i-th bit of each lane i. When ANDed with tmp, the lanes whose
2358+
// sign bit is set keep their i-th bit; all other lanes become 0.
2359+
__ Sshr(tmp.V16B(), src.V16B(), 7);
2360+
__ Movi(mask.V2D(), 0x8040'2010'0804'0201);
2361+
__ And(tmp.V16B(), mask.V16B(), tmp.V16B());
2362+
__ Ext(mask.V16B(), tmp.V16B(), tmp.V16B(), 8);
2363+
__ Zip1(tmp.V16B(), tmp.V16B(), mask.V16B());
2364+
__ Addv(tmp.H(), tmp.V8H());
2365+
__ Mov(dst.W(), tmp.V8H(), 0);
2366+
break;
2367+
}
23212368
case kArm64S128Zero: {
23222369
__ Movi(i.OutputSimd128Register().V16B(), 0);
23232370
break;

src/compiler/backend/arm64/instruction-codes-arm64.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,7 @@ namespace compiler {
253253
V(Arm64I32x4GtU) \
254254
V(Arm64I32x4GeU) \
255255
V(Arm64I32x4Abs) \
256+
V(Arm64I32x4BitMask) \
256257
V(Arm64I16x8Splat) \
257258
V(Arm64I16x8ExtractLaneU) \
258259
V(Arm64I16x8ExtractLaneS) \
@@ -287,6 +288,7 @@ namespace compiler {
287288
V(Arm64I16x8GeU) \
288289
V(Arm64I16x8RoundingAverageU) \
289290
V(Arm64I16x8Abs) \
291+
V(Arm64I16x8BitMask) \
290292
V(Arm64I8x16Splat) \
291293
V(Arm64I8x16ExtractLaneU) \
292294
V(Arm64I8x16ExtractLaneS) \
@@ -316,6 +318,7 @@ namespace compiler {
316318
V(Arm64I8x16GeU) \
317319
V(Arm64I8x16RoundingAverageU) \
318320
V(Arm64I8x16Abs) \
321+
V(Arm64I8x16BitMask) \
319322
V(Arm64S128Zero) \
320323
V(Arm64S128Dup) \
321324
V(Arm64S128And) \

src/compiler/backend/arm64/instruction-scheduler-arm64.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
223223
case kArm64I32x4GtU:
224224
case kArm64I32x4GeU:
225225
case kArm64I32x4Abs:
226+
case kArm64I32x4BitMask:
226227
case kArm64I16x8Splat:
227228
case kArm64I16x8ExtractLaneU:
228229
case kArm64I16x8ExtractLaneS:
@@ -257,6 +258,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
257258
case kArm64I16x8GeU:
258259
case kArm64I16x8RoundingAverageU:
259260
case kArm64I16x8Abs:
261+
case kArm64I16x8BitMask:
260262
case kArm64I8x16Splat:
261263
case kArm64I8x16ExtractLaneU:
262264
case kArm64I8x16ExtractLaneS:
@@ -286,6 +288,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
286288
case kArm64I8x16GeU:
287289
case kArm64I8x16RoundingAverageU:
288290
case kArm64I8x16Abs:
291+
case kArm64I8x16BitMask:
289292
case kArm64S128Zero:
290293
case kArm64S128Dup:
291294
case kArm64S128And:

src/compiler/backend/arm64/instruction-selector-arm64.cc

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3367,6 +3367,29 @@ VISIT_SIMD_QFMOP(F32x4Qfma)
33673367
VISIT_SIMD_QFMOP(F32x4Qfms)
33683368
#undef VISIT_SIMD_QFMOP
33693369

3370+
namespace {
3371+
template <ArchOpcode opcode>
3372+
void VisitBitMask(InstructionSelector* selector, Node* node) {
3373+
Arm64OperandGenerator g(selector);
3374+
InstructionOperand temps[] = {g.TempSimd128Register(),
3375+
g.TempSimd128Register()};
3376+
selector->Emit(opcode, g.DefineAsRegister(node),
3377+
g.UseRegister(node->InputAt(0)), arraysize(temps), temps);
3378+
}
3379+
} // namespace
3380+
3381+
void InstructionSelector::VisitI8x16BitMask(Node* node) {
3382+
VisitBitMask<kArm64I8x16BitMask>(this, node);
3383+
}
3384+
3385+
void InstructionSelector::VisitI16x8BitMask(Node* node) {
3386+
VisitBitMask<kArm64I16x8BitMask>(this, node);
3387+
}
3388+
3389+
void InstructionSelector::VisitI32x4BitMask(Node* node) {
3390+
VisitBitMask<kArm64I32x4BitMask>(this, node);
3391+
}
3392+
33703393
namespace {
33713394

33723395
struct ShuffleEntry {

src/compiler/backend/instruction-selector.cc

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2025,6 +2025,8 @@ void InstructionSelector::VisitNode(Node* node) {
20252025
return MarkAsSimd128(node), VisitI32x4GeU(node);
20262026
case IrOpcode::kI32x4Abs:
20272027
return MarkAsSimd128(node), VisitI32x4Abs(node);
2028+
case IrOpcode::kI32x4BitMask:
2029+
return MarkAsWord32(node), VisitI32x4BitMask(node);
20282030
case IrOpcode::kI16x8Splat:
20292031
return MarkAsSimd128(node), VisitI16x8Splat(node);
20302032
case IrOpcode::kI16x8ExtractLaneU:
@@ -2093,6 +2095,8 @@ void InstructionSelector::VisitNode(Node* node) {
20932095
return MarkAsSimd128(node), VisitI16x8RoundingAverageU(node);
20942096
case IrOpcode::kI16x8Abs:
20952097
return MarkAsSimd128(node), VisitI16x8Abs(node);
2098+
case IrOpcode::kI16x8BitMask:
2099+
return MarkAsWord32(node), VisitI16x8BitMask(node);
20962100
case IrOpcode::kI8x16Splat:
20972101
return MarkAsSimd128(node), VisitI8x16Splat(node);
20982102
case IrOpcode::kI8x16ExtractLaneU:
@@ -2151,6 +2155,8 @@ void InstructionSelector::VisitNode(Node* node) {
21512155
return MarkAsSimd128(node), VisitI8x16RoundingAverageU(node);
21522156
case IrOpcode::kI8x16Abs:
21532157
return MarkAsSimd128(node), VisitI8x16Abs(node);
2158+
case IrOpcode::kI8x16BitMask:
2159+
return MarkAsWord32(node), VisitI8x16BitMask(node);
21542160
case IrOpcode::kS128Zero:
21552161
return MarkAsSimd128(node), VisitS128Zero(node);
21562162
case IrOpcode::kS128And:
@@ -2628,6 +2634,12 @@ void InstructionSelector::VisitI64x2MinU(Node* node) { UNIMPLEMENTED(); }
26282634
void InstructionSelector::VisitI64x2MaxU(Node* node) { UNIMPLEMENTED(); }
26292635
#endif // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_S390X
26302636

2637+
#if !V8_TARGET_ARCH_ARM64
2638+
void InstructionSelector::VisitI8x16BitMask(Node* node) { UNIMPLEMENTED(); }
2639+
void InstructionSelector::VisitI16x8BitMask(Node* node) { UNIMPLEMENTED(); }
2640+
void InstructionSelector::VisitI32x4BitMask(Node* node) { UNIMPLEMENTED(); }
2641+
#endif // !V8_TARGET_ARCH_ARM64
2642+
26312643
void InstructionSelector::VisitFinishRegion(Node* node) { EmitIdentity(node); }
26322644

26332645
void InstructionSelector::VisitParameter(Node* node) {

src/compiler/machine-operator.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,7 @@ MachineType AtomicOpType(Operator const* op) {
390390
V(I32x4GtU, Operator::kNoProperties, 2, 0, 1) \
391391
V(I32x4GeU, Operator::kNoProperties, 2, 0, 1) \
392392
V(I32x4Abs, Operator::kNoProperties, 1, 0, 1) \
393+
V(I32x4BitMask, Operator::kNoProperties, 1, 0, 1) \
393394
V(I16x8Splat, Operator::kNoProperties, 1, 0, 1) \
394395
V(I16x8SConvertI8x16Low, Operator::kNoProperties, 1, 0, 1) \
395396
V(I16x8SConvertI8x16High, Operator::kNoProperties, 1, 0, 1) \
@@ -421,6 +422,7 @@ MachineType AtomicOpType(Operator const* op) {
421422
V(I16x8GeU, Operator::kNoProperties, 2, 0, 1) \
422423
V(I16x8RoundingAverageU, Operator::kCommutative, 2, 0, 1) \
423424
V(I16x8Abs, Operator::kNoProperties, 1, 0, 1) \
425+
V(I16x8BitMask, Operator::kNoProperties, 1, 0, 1) \
424426
V(I8x16Splat, Operator::kNoProperties, 1, 0, 1) \
425427
V(I8x16Neg, Operator::kNoProperties, 1, 0, 1) \
426428
V(I8x16Shl, Operator::kNoProperties, 2, 0, 1) \
@@ -447,6 +449,7 @@ MachineType AtomicOpType(Operator const* op) {
447449
V(I8x16GeU, Operator::kNoProperties, 2, 0, 1) \
448450
V(I8x16RoundingAverageU, Operator::kCommutative, 2, 0, 1) \
449451
V(I8x16Abs, Operator::kNoProperties, 1, 0, 1) \
452+
V(I8x16BitMask, Operator::kNoProperties, 1, 0, 1) \
450453
V(S128Load, Operator::kNoProperties, 2, 0, 1) \
451454
V(S128Store, Operator::kNoProperties, 3, 0, 1) \
452455
V(S128Zero, Operator::kNoProperties, 0, 0, 1) \

src/compiler/machine-operator.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -630,6 +630,7 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
630630
const Operator* I32x4GtU();
631631
const Operator* I32x4GeU();
632632
const Operator* I32x4Abs();
633+
const Operator* I32x4BitMask();
633634

634635
const Operator* I16x8Splat();
635636
const Operator* I16x8ExtractLaneU(int32_t);
@@ -666,6 +667,7 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
666667
const Operator* I16x8GeU();
667668
const Operator* I16x8RoundingAverageU();
668669
const Operator* I16x8Abs();
670+
const Operator* I16x8BitMask();
669671

670672
const Operator* I8x16Splat();
671673
const Operator* I8x16ExtractLaneU(int32_t);
@@ -697,6 +699,7 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
697699
const Operator* I8x16GeU();
698700
const Operator* I8x16RoundingAverageU();
699701
const Operator* I8x16Abs();
702+
const Operator* I8x16BitMask();
700703

701704
const Operator* S128Load();
702705
const Operator* S128Store();

src/compiler/opcodes.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -840,6 +840,7 @@
840840
V(I32x4GtU) \
841841
V(I32x4GeU) \
842842
V(I32x4Abs) \
843+
V(I32x4BitMask) \
843844
V(I16x8Splat) \
844845
V(I16x8ExtractLaneU) \
845846
V(I16x8ExtractLaneS) \
@@ -878,6 +879,7 @@
878879
V(I16x8GeU) \
879880
V(I16x8RoundingAverageU) \
880881
V(I16x8Abs) \
882+
V(I16x8BitMask) \
881883
V(I8x16Splat) \
882884
V(I8x16ExtractLaneU) \
883885
V(I8x16ExtractLaneS) \
@@ -911,6 +913,7 @@
911913
V(I8x16GeU) \
912914
V(I8x16RoundingAverageU) \
913915
V(I8x16Abs) \
916+
V(I8x16BitMask) \
914917
V(S128Load) \
915918
V(S128Store) \
916919
V(S128Zero) \

src/compiler/wasm-compiler.cc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4368,6 +4368,8 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
43684368
inputs[1]);
43694369
case wasm::kExprI32x4Abs:
43704370
return graph()->NewNode(mcgraph()->machine()->I32x4Abs(), inputs[0]);
4371+
case wasm::kExprI32x4BitMask:
4372+
return graph()->NewNode(mcgraph()->machine()->I32x4BitMask(), inputs[0]);
43714373
case wasm::kExprI16x8Splat:
43724374
return graph()->NewNode(mcgraph()->machine()->I16x8Splat(), inputs[0]);
43734375
case wasm::kExprI16x8SConvertI8x16Low:
@@ -4470,6 +4472,8 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
44704472
inputs[0], inputs[1]);
44714473
case wasm::kExprI16x8Abs:
44724474
return graph()->NewNode(mcgraph()->machine()->I16x8Abs(), inputs[0]);
4475+
case wasm::kExprI16x8BitMask:
4476+
return graph()->NewNode(mcgraph()->machine()->I16x8BitMask(), inputs[0]);
44734477
case wasm::kExprI8x16Splat:
44744478
return graph()->NewNode(mcgraph()->machine()->I8x16Splat(), inputs[0]);
44754479
case wasm::kExprI8x16Neg:
@@ -4557,6 +4561,8 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
45574561
inputs[0], inputs[1]);
45584562
case wasm::kExprI8x16Abs:
45594563
return graph()->NewNode(mcgraph()->machine()->I8x16Abs(), inputs[0]);
4564+
case wasm::kExprI8x16BitMask:
4565+
return graph()->NewNode(mcgraph()->machine()->I8x16BitMask(), inputs[0]);
45604566
case wasm::kExprS128And:
45614567
return graph()->NewNode(mcgraph()->machine()->S128And(), inputs[0],
45624568
inputs[1]);

src/wasm/wasm-interpreter.cc

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include "src/wasm/wasm-limits.h"
2727
#include "src/wasm/wasm-module.h"
2828
#include "src/wasm/wasm-objects-inl.h"
29+
#include "src/wasm/wasm-opcodes.h"
2930
#include "src/zone/accounting-allocator.h"
3031
#include "src/zone/zone-containers.h"
3132

@@ -2379,6 +2380,26 @@ class ThreadImpl {
23792380
UNOP_CASE(I8x16Neg, i8x16, int16, 16, base::NegateWithWraparound(a))
23802381
UNOP_CASE(I8x16Abs, i8x16, int16, 16, std::abs(a))
23812382
#undef UNOP_CASE
2383+
2384+
// Cast to double in call to signbit is due to an MSVC issue, see
2385+
// https://github.com/microsoft/STL/issues/519.
2386+
#define BITMASK_CASE(op, name, stype, count) \
2387+
case kExpr##op: { \
2388+
WasmValue v = Pop(); \
2389+
stype s = v.to_s128().to_##name(); \
2390+
int32_t res = 0; \
2391+
for (size_t i = 0; i < count; ++i) { \
2392+
bool sign = std::signbit(static_cast<double>(s.val[i])); \
2393+
res |= (sign << i); \
2394+
} \
2395+
Push(WasmValue(res)); \
2396+
return true; \
2397+
}
2398+
BITMASK_CASE(I8x16BitMask, i8x16, int16, 16)
2399+
BITMASK_CASE(I16x8BitMask, i16x8, int8, 8)
2400+
BITMASK_CASE(I32x4BitMask, i32x4, int4, 4)
2401+
#undef BITMASK_CASE
2402+
23822403
#define CMPOP_CASE(op, name, stype, out_stype, count, expr) \
23832404
case kExpr##op: { \
23842405
WasmValue v2 = Pop(); \

0 commit comments

Comments
 (0)