Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions src/coreclr/jit/instrsxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -480,13 +480,13 @@ INST3(aeskeygenassist, "vaeskeygenassist", IUM_WR, BAD_CODE, BAD_CODE,
INST3(pclmulqdq, "vpclmulqdq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x44), 7C, 1C, INS_TT_FULL_MEM, KMask_Base1 | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform a carry-less multiplication of two quadwords

// Instructions for SHA
INST3(sha1msg1, "sha1msg1", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xC9), ILLEGAL, ILLEGAL, INS_TT_FULL, REX_WIG) // Perform an Intermediate Calculation for the Next Four SHA1 Message Dwords
INST3(sha1msg2, "sha1msg2", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCA), ILLEGAL, ILLEGAL, INS_TT_FULL, REX_WIG) // Perform a Final Calculation for the Next Four SHA1 Message Dwords
INST3(sha1nexte, "sha1nexte", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xC8), ILLEGAL, ILLEGAL, INS_TT_FULL, REX_WIG) // Calculate SHA1 State Variable E After Four Rounds
INST3(sha1rnds4, "sha1rnds4", IUM_RW, BAD_CODE, BAD_CODE, SSE3A(0xCC), ILLEGAL, ILLEGAL, INS_TT_FULL, REX_WIG) // Perform Four Rounds of SHA1 Operation
INST3(sha256msg1, "sha256msg1", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCC), ILLEGAL, ILLEGAL, INS_TT_FULL, REX_WIG) // Perform an Intermediate Calculation for the Next Four SHA256 Message Dwords
INST3(sha256msg2, "sha256msg2", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCD), ILLEGAL, ILLEGAL, INS_TT_FULL, REX_WIG) // Perform a Final Calculation for the Next Four SHA256 Message Dwords
INST3(sha256rnds2, "sha256rnds2", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCB), ILLEGAL, ILLEGAL, INS_TT_FULL, REX_WIG) // Perform Two Rounds of SHA256 Operation
INST3(sha1msg1, "sha1msg1", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xC9), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG) // Perform an Intermediate Calculation for the Next Four SHA1 Message Dwords
INST3(sha1msg2, "sha1msg2", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCA), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG) // Perform a Final Calculation for the Next Four SHA1 Message Dwords
INST3(sha1nexte, "sha1nexte", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xC8), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG) // Calculate SHA1 State Variable E After Four Rounds
INST3(sha1rnds4, "sha1rnds4", IUM_RW, BAD_CODE, BAD_CODE, SSE3A(0xCC), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG) // Perform Four Rounds of SHA1 Operation
INST3(sha256msg1, "sha256msg1", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCC), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG) // Perform an Intermediate Calculation for the Next Four SHA256 Message Dwords
INST3(sha256msg2, "sha256msg2", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCD), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG) // Perform a Final Calculation for the Next Four SHA256 Message Dwords
INST3(sha256rnds2, "sha256rnds2", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCB), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG) // Perform Two Rounds of SHA256 Operation

// Instructions for GFNI
INST3(gf2p8affineinvqb, "vgf2p8affineinvqb",IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xCF), 5C, 2X, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Galois Field Affine Transformation Inverse
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/jit/lower.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ class Lowering final : public Phase
void ContainCheckHWIntrinsicAddr(GenTreeHWIntrinsic* node, GenTree* addr, unsigned size);
void ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node);
#ifdef TARGET_XARCH
void TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, GenTreeVecCon* childNode);
void TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, GenTreeVecCon* cnsVec);
#endif // TARGET_XARCH
#endif // FEATURE_HW_INTRINSICS

Expand Down
217 changes: 119 additions & 98 deletions src/coreclr/jit/lowerxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2659,19 +2659,6 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
return LowerHWIntrinsicTernaryLogic(node);
}

case NI_GFNI_GaloisFieldAffineTransform:
case NI_GFNI_GaloisFieldAffineTransformInverse:
case NI_GFNI_V256_GaloisFieldAffineTransform:
case NI_GFNI_V256_GaloisFieldAffineTransformInverse:
case NI_GFNI_V512_GaloisFieldAffineTransform:
case NI_GFNI_V512_GaloisFieldAffineTransformInverse:
{
// Managed API surfaces these with only UBYTE operands.
// We retype in order to support EVEX embedded broadcast of op2
node->SetSimdBaseJitType(CORINFO_TYPE_ULONG);
break;
}

default:
break;
}
Expand Down Expand Up @@ -3225,7 +3212,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm

GenTree* vecCns =
comp->gtNewSimdCreateBroadcastNode(simdType, broadcastOp,
op1Intrinsic->GetSimdBaseJitType(), simdSize);
nestedIntrin->GetSimdBaseJitType(), simdSize);

assert(vecCns->IsCnsVec());
BlockRange().InsertAfter(broadcastOp, vecCns);
Expand Down Expand Up @@ -9356,126 +9343,160 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
//
// Arguments:
// parentNode - The hardware intrinsic node
// childNode - The operand node to try to contain
// cnsVec - The constant vector to contain
//
void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, GenTreeVecCon* childNode)
void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, GenTreeVecCon* cnsVec)
{
assert(!childNode->IsAllBitsSet());
assert(!childNode->IsZero());
assert(!cnsVec->IsAllBitsSet());
assert(!cnsVec->IsZero());

if (!comp->canUseEmbeddedBroadcast())
{
MakeSrcContained(parentNode, childNode);
MakeSrcContained(parentNode, cnsVec);
return;
}

// We use the child node's size for the broadcast node, because the parent may consume more than its own size.
// The containment check has already validated that the child is sufficiently large.
// Regardless of how the constant vector was created, we can convert it to an embedded broadcast if it
// repeats correctly for the instruction's broadcast element size.
//
// We use the parent node's base type, because we must ensure that the constant repeats correctly for that size,
// regardless of how the constant vector was created.
// Likewise, we do not have to match the intrinsic's base type as long as the broadcast size is correct.

var_types simdType = childNode->TypeGet();
var_types simdBaseType = parentNode->GetSimdBaseType();
CorInfoType simdBaseJitType = parentNode->GetSimdBaseJitType();
bool isCreatedFromScalar = true;
var_types simdBaseType = parentNode->GetSimdBaseType();
CorInfoType simdBaseJitType = parentNode->GetSimdBaseJitType();
instruction ins = HWIntrinsicInfo::lookupIns(parentNode->GetHWIntrinsicId(), simdBaseType, comp);
unsigned broadcastSize = CodeGenInterface::instInputSize(ins);

if (varTypeIsSmall(simdBaseType))
if (broadcastSize > genTypeSize(simdBaseType))
{
isCreatedFromScalar = false;
}
else
{
isCreatedFromScalar = childNode->IsBroadcast(simdBaseType);
if (broadcastSize == 4)
{
simdBaseType = TYP_INT;
simdBaseJitType = CORINFO_TYPE_INT;
}
else
{
assert(broadcastSize == 8);
simdBaseType = TYP_LONG;
simdBaseJitType = CORINFO_TYPE_LONG;
}
}

if (isCreatedFromScalar)
if (!cnsVec->IsBroadcast(simdBaseType))
{
NamedIntrinsic broadcastName = NI_AVX2_BroadcastScalarToVector128;
if (simdType == TYP_SIMD32)
bool canUse8ByteBroadcast = false;

if (broadcastSize == 4)
{
broadcastName = NI_AVX2_BroadcastScalarToVector256;
// Some bit-wise instructions have both 4-byte and 8-byte broadcast variants. We prefer the smallest
// possible broadcast because that makes the data section smaller, but if the constant isn't a match
// at 4 bytes, it might be at 8.

switch (ins)
{
case INS_andps:
case INS_andnps:
case INS_orps:
case INS_xorps:
case INS_pandd:
case INS_pandnd:
case INS_pord:
case INS_pxord:
case INS_vpternlogd:
case INS_vshuff32x4:
case INS_vshufi32x4:
canUse8ByteBroadcast = cnsVec->IsBroadcast(TYP_LONG);
break;

default:
break;
}
}
else if (simdType == TYP_SIMD64)

if (canUse8ByteBroadcast)
{
broadcastName = NI_AVX512_BroadcastScalarToVector512;
simdBaseType = TYP_LONG;
simdBaseJitType = CORINFO_TYPE_LONG;
}
else
{
assert(simdType == TYP_SIMD16);
MakeSrcContained(parentNode, cnsVec);
return;
}
}

GenTree* constScalar = nullptr;
switch (simdBaseType)
// We use the original constant vector's size for the broadcast node, because the parent node may consume
// more than its own size. The containment check has already validated that the constant is sufficiently
// large, but that check will be asserted again at codegen, so the replacement must satisfy it as well.

var_types simdType = cnsVec->TypeGet();
NamedIntrinsic broadcastName = NI_AVX2_BroadcastScalarToVector128;
GenTree* constScalar = nullptr;

if (simdType == TYP_SIMD32)
{
broadcastName = NI_AVX2_BroadcastScalarToVector256;
}
else if (simdType == TYP_SIMD64)
{
broadcastName = NI_AVX512_BroadcastScalarToVector512;
}
else
{
assert(simdType == TYP_SIMD16);
}

switch (simdBaseType)
{
case TYP_FLOAT:
{
case TYP_FLOAT:
{
float scalar = childNode->gtSimdVal.f32[0];
constScalar = comp->gtNewDconNodeF(scalar);
break;
}
case TYP_DOUBLE:
{
double scalar = childNode->gtSimdVal.f64[0];
constScalar = comp->gtNewDconNodeD(scalar);
break;
}
case TYP_INT:
{
int32_t scalar = childNode->gtSimdVal.i32[0];
constScalar = comp->gtNewIconNode(scalar, simdBaseType);
break;
}
case TYP_UINT:
{
uint32_t scalar = childNode->gtSimdVal.u32[0];
constScalar = comp->gtNewIconNode(scalar, TYP_INT);
break;
}
case TYP_LONG:
case TYP_ULONG:
{
int64_t scalar = childNode->gtSimdVal.i64[0];
constScalar = comp->gtNewLconNode(scalar);
break;
}
default:
unreached();
float scalar = cnsVec->gtSimdVal.f32[0];
constScalar = comp->gtNewDconNodeF(scalar);
break;
}

GenTreeHWIntrinsic* createScalar =
comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, constScalar, NI_Vector128_CreateScalarUnsafe, simdBaseJitType,
16);
GenTreeHWIntrinsic* broadcastNode = comp->gtNewSimdHWIntrinsicNode(simdType, createScalar, broadcastName,
simdBaseJitType, genTypeSize(simdType));
BlockRange().InsertBefore(childNode, broadcastNode);
BlockRange().InsertBefore(broadcastNode, createScalar);
BlockRange().InsertBefore(createScalar, constScalar);
LIR::Use use;
if (BlockRange().TryGetUse(childNode, &use))
case TYP_DOUBLE:
{
use.ReplaceWith(broadcastNode);
double scalar = cnsVec->gtSimdVal.f64[0];
constScalar = comp->gtNewDconNodeD(scalar);
break;
}
else

case TYP_INT:
case TYP_UINT:
{
broadcastNode->SetUnusedValue();
int32_t scalar = cnsVec->gtSimdVal.i32[0];
constScalar = comp->gtNewIconNode(scalar);
break;
}

BlockRange().Remove(childNode);
LowerNode(createScalar);
LowerNode(broadcastNode);
if (varTypeIsFloating(simdBaseType))
case TYP_LONG:
case TYP_ULONG:
{
MakeSrcContained(broadcastNode, createScalar);
int64_t scalar = cnsVec->gtSimdVal.i64[0];
constScalar = comp->gtNewLconNode(scalar);
break;
}
else if (constScalar->TypeIs(TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG))

default:
{
MakeSrcContained(broadcastNode, constScalar);
unreached();
}
MakeSrcContained(parentNode, broadcastNode);
return;
}
MakeSrcContained(parentNode, childNode);

GenTreeHWIntrinsic* broadcastNode =
comp->gtNewSimdHWIntrinsicNode(simdType, constScalar, broadcastName, simdBaseJitType, genTypeSize(simdType));

BlockRange().InsertBefore(parentNode, constScalar, broadcastNode);
BlockRange().Remove(cnsVec);

GenTree** use = nullptr;
bool useFound = parentNode->TryGetUse(cnsVec, &use);
assert(useFound);

parentNode->ReplaceOperand(use, broadcastNode);

MakeSrcContained(broadcastNode, constScalar);
MakeSrcContained(parentNode, broadcastNode);
}

//------------------------------------------------------------------------
Expand Down
Loading