@@ -32083,10 +32083,8 @@ bool GenTree::CanDivOrModPossiblyOverflow(Compiler* comp) const
 #if defined(FEATURE_HW_INTRINSICS)
 GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
 {
-    if (!opts.Tier0OptimizationEnabled())
-    {
-        return tree;
-    }
+    assert(!optValnumCSE_phase);
+    assert(opts.Tier0OptimizationEnabled());
 
     NamedIntrinsic ni      = tree->GetHWIntrinsicId();
     var_types      retType = tree->TypeGet();
@@ -32225,6 +32223,133 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
     // We shouldn't find AND_NOT nodes since they should only be produced in lowering
     assert(oper != GT_AND_NOT);
 
+#if defined(FEATURE_MASKED_HW_INTRINSICS) && defined(TARGET_XARCH)
+    if (GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper))
+    {
+        // Comparisons that produce masks lead to more verbose trees than necessary
+        // in many scenarios, since a CvtMaskToVector node has to be inserted over
+        // them, and that can block various opts which depend on tree height and
+        // similar heuristics. So we want to fold the unnecessary back-and-forth
+        // conversions away where possible.
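+        //
+        // The shape of the fold is, for example:
+        //     AND(CvtMaskToVector(m1), CvtMaskToVector(m2))
+        // becomes:
+        //     CvtMaskToVector(AndMask(m1, m2))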
+
+        genTreeOps effectiveOper = oper;
+        GenTree*   actualOp2     = op2;
+
+        if (oper == GT_NOT)
+        {
+            assert(op2 == nullptr);
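+
+            // NOT is unary, so alias op2 to op1; this lets the checks below
+            // validate both operand slots without special-casing the unary shape.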
+            op2 = op1;
+        }
+
+        // We need both operands to be ConvertMaskToVector in
+        // order to optimize this to a direct mask operation
+
+        if (!op1->OperIsConvertMaskToVector())
+        {
+            return tree;
+        }
+
+        if (!op2->OperIsHWIntrinsic())
+        {
+            if ((oper != GT_XOR) || !op2->IsVectorAllBitsSet())
+            {
+                return tree;
+            }
+
+            // We want to explicitly recognize op1 ^ AllBitsSet as
+            // some platforms don't have direct support for ~op1
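+            // Treating it as NOT lets us emit a single NotMask over the
+            // underlying mask instead of materializing an all-bits-set mask.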
+
+            effectiveOper = GT_NOT;
+            op2           = op1;
+        }
+
+        GenTreeHWIntrinsic* cvtOp1 = op1->AsHWIntrinsic();
+        GenTreeHWIntrinsic* cvtOp2 = op2->AsHWIntrinsic();
+
+        if (!cvtOp2->OperIsConvertMaskToVector())
+        {
+            return tree;
+        }
+
+        unsigned simdBaseTypeSize = genTypeSize(simdBaseType);
+
+        if ((genTypeSize(cvtOp1->GetSimdBaseType()) != simdBaseTypeSize) ||
+            (genTypeSize(cvtOp2->GetSimdBaseType()) != simdBaseTypeSize))
+        {
+            // We need both operands to be the same kind of mask; otherwise
+            // the bitwise operation can behave differently than expected
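+            // (on xarch, each mask bit covers one element, so masks for different
+            // element sizes use a different number of significant bits)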
+            return tree;
+        }
+
+        NamedIntrinsic maskIntrinsicId = NI_Illegal;
+
+        switch (effectiveOper)
+        {
+            case GT_AND:
+            {
+                maskIntrinsicId = NI_AVX512_AndMask;
+                break;
+            }
+
+            case GT_NOT:
+            {
+                maskIntrinsicId = NI_AVX512_NotMask;
+                break;
+            }
+
+            case GT_OR:
+            {
+                maskIntrinsicId = NI_AVX512_OrMask;
+                break;
+            }
+
+            case GT_XOR:
+            {
+                maskIntrinsicId = NI_AVX512_XorMask;
+                break;
+            }
+
+            default:
+            {
+                unreached();
+            }
+        }
+
+        assert(maskIntrinsicId != NI_Illegal);
+
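+        // Update the node in place: when the operation is unchanged we just swap
+        // in the mask intrinsic id, while the synthesized NOT case rebuilds the
+        // node as a unary intrinsic over the underlying mask.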
+        if (effectiveOper == oper)
+        {
+            tree->ChangeHWIntrinsicId(maskIntrinsicId);
+            tree->Op(1) = cvtOp1->Op(1);
+        }
+        else
+        {
+            assert(effectiveOper == GT_NOT);
+            tree->ResetHWIntrinsicId(maskIntrinsicId, this, cvtOp1->Op(1));
+            tree->gtFlags &= ~GTF_REVERSE_OPS;
+        }
+
+        tree->gtType = TYP_MASK;
+        DEBUG_DESTROY_NODE(op1);
+
+        if (effectiveOper != GT_NOT)
+        {
+            tree->Op(2) = cvtOp2->Op(1);
+        }
+
+        if (actualOp2 != nullptr)
+        {
+            DEBUG_DESTROY_NODE(actualOp2);
+        }
+        tree->SetMorphed(this);
+
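+        // Consumers of this node still expect the original vector type, so wrap
+        // the new mask-typed node back in a ConvertMaskToVector.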
+        tree = gtNewSimdCvtMaskToVectorNode(retType, tree, simdBaseJitType, simdSize)->AsHWIntrinsic();
+        tree->SetMorphed(this);
+
+        return tree;
+    }
+#endif // FEATURE_MASKED_HW_INTRINSICS && TARGET_XARCH
+
     GenTree* cnsNode   = nullptr;
     GenTree* otherNode = nullptr;
 
@@ -32762,10 +32887,28 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
         oper = GT_NONE;
     }
 
+    // For mask nodes in particular, the foldings below are done under the presumption
+    // that we only produce something like `AddMask(op1, op2)` if op1 and op2 are
+    // compatible masks. On xarch, for example, this means we'd only ever add two 8-bit,
+    // 16-bit, 32-bit, or 64-bit masks of the same size; we'd never encounter something
+    // like an 8-bit and a 16-bit mask being added. This ensures folding can't produce
+    // a different result, such as by leaving upper bits set that should have been zeroed.
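+    // For example, `OrMask(x, AllBitsSet)` is folded below to the AllBitsSet constant;
+    // if `x` could be a mask for a smaller element size, with more significant bits,
+    // that fold would silently drop the extra bits.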
+
     switch (oper)
     {
         case GT_ADD:
         {
+            if (varTypeIsMask(retType))
+            {
+                // Handle `x + 0 == x` and `0 + x == x`
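+                // (on xarch, AddMask maps to the kadd instructions, which do plain
+                // integer addition of mask registers, so adding zero is an identity)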
+                if (cnsNode->IsMaskZero())
+                {
+                    resultNode = otherNode;
+                }
+                break;
+            }
+
             if (varTypeIsFloating(simdBaseType))
             {
                 // Handle `x + NaN == NaN` and `NaN + x == NaN`
@@ -32799,6 +32942,23 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
 
         case GT_AND:
         {
+            if (varTypeIsMask(retType))
+            {
+                // Handle `x & 0 == 0` and `0 & x == 0`
+                if (cnsNode->IsMaskZero())
+                {
+                    resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
+                    break;
+                }
+
+                // Handle `x & AllBitsSet == x` and `AllBitsSet & x == x`
+                if (cnsNode->IsMaskAllBitsSet())
+                {
+                    resultNode = otherNode;
+                }
+                break;
+            }
+
             // Handle `x & 0 == 0` and `0 & x == 0`
             if (cnsNode->IsVectorZero())
             {
@@ -33032,6 +33192,23 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
 
         case GT_OR:
         {
+            if (varTypeIsMask(retType))
+            {
+                // Handle `x | 0 == x` and `0 | x == x`
+                if (cnsNode->IsMaskZero())
+                {
+                    resultNode = otherNode;
+                    break;
+                }
+
+                // Handle `x | AllBitsSet == AllBitsSet` and `AllBitsSet | x == AllBitsSet`
+                if (cnsNode->IsMaskAllBitsSet())
+                {
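+                    // The result is the AllBitsSet constant itself, but any side
+                    // effects of the discarded operand must be preserved.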
+                    resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
+                }
+                break;
+            }
+
             // Handle `x | 0 == x` and `0 | x == x`
             if (cnsNode->IsVectorZero())
             {
@@ -33059,6 +33236,27 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
             // Handle `x >> 0 == x` and `0 >> x == 0`
             // Handle `x >>> 0 == x` and `0 >>> x == 0`
 
+            if (varTypeIsMask(retType))
+            {
+                if (cnsNode->IsMaskZero())
+                {
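+                    // A zero mask as op2 means `x >> 0`, which folds to `x`; a zero
+                    // mask as op1 means `0 >> x`, which folds to zero but must keep
+                    // the shift amount's side effects.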
+                    if (cnsNode == op2)
+                    {
+                        resultNode = otherNode;
+                    }
+                    else
+                    {
+                        resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
+                    }
+                }
+                else if (cnsNode->IsIntegralConst(0))
+                {
+                    assert(cnsNode == op2);
+                    resultNode = otherNode;
+                }
+                break;
+            }
+
             if (cnsNode->IsVectorZero())
             {
                 if (cnsNode == op2)
@@ -33104,7 +33302,17 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
 
         case GT_XOR:
         {
-            // Handle `x | 0 == x` and `0 | x == x`
+            if (varTypeIsMask(retType))
+            {
+                // Handle `x ^ 0 == x` and `0 ^ x == x`
+                if (cnsNode->IsMaskZero())
+                {
+                    resultNode = otherNode;
+                }
+                break;
+            }
+
+            // Handle `x ^ 0 == x` and `0 ^ x == x`
             if (cnsNode->IsVectorZero())
             {
                 resultNode = otherNode;
@@ -33273,7 +33481,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
     }
     else
     {
-        assert(!op1->IsTrueMask(simdBaseType) && !op1->IsFalseMask());
+        assert(!op1->IsTrueMask(simdBaseType) && !op1->IsMaskZero());
     }
 #endif
 
@@ -33291,7 +33499,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
         return op2;
     }
 
-    if (op1->IsVectorZero() || op1->IsFalseMask())
+    if (op1->IsVectorZero() || op1->IsMaskZero())
     {
         return gtWrapWithSideEffects(op3, op2, GTF_ALL_EFFECT);
     }