
Commit e2a72fa

[VPlan] Introduce recipes for VP loads and stores. (#87816)
Introduce new subclasses of VPWidenMemoryRecipe for VP (vector-predicated) loads and stores, addressing multiple TODOs from #76172. Note that the introduction of the new recipes also improves code-gen for VP gathers/scatters by removing the redundant header mask. With the new approach, it is not sufficient to look at users of the widened canonical IV to find all uses of the header mask: in some cases, a widened IV is used instead of separately widening the canonical IV. To handle that, first collect all VPValues representing header masks (by looking at users of both the canonical IV and widened inductions that are canonical), then check all users (recursively) of those header masks. Depends on #87411. PR: #87816
1 parent 851462f · commit e2a72fa
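
The header-mask handling described in the commit message can be sketched as follows. This is an illustration only, not the code this commit adds to VPlanTransforms.cpp; the helper name forEachHeaderMaskUser is hypothetical, and the sketch assumes the usual VPlan accessors (VPValue::users(), VPDef::definedValues(), and casting a VPUser to VPRecipeBase).

// Illustrative sketch (hypothetical helper, not part of this commit): visit
// every recipe reachable, transitively, from the collected header-mask
// VPValues, so uses that are fed through widened IVs are found as well.
static void forEachHeaderMaskUser(ArrayRef<VPValue *> HeaderMasks,
                                  function_ref<void(VPRecipeBase &)> Fn) {
  SetVector<VPRecipeBase *> Worklist;
  // Seed the worklist with the direct users of each header mask.
  for (VPValue *Mask : HeaderMasks)
    for (VPUser *U : Mask->users())
      if (auto *R = dyn_cast<VPRecipeBase>(U))
        Worklist.insert(R);
  // Visit recipes in insertion order; values defined by a visited recipe may
  // forward the mask, so their users are queued as well.
  for (unsigned I = 0; I != Worklist.size(); ++I) {
    VPRecipeBase *R = Worklist[I];
    Fn(*R);
    for (VPValue *Def : R->definedValues())
      for (VPUser *U : Def->users())
        if (auto *NextR = dyn_cast<VPRecipeBase>(U))
          Worklist.insert(NextR);
  }
}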

File tree

8 files changed: 283 additions & 181 deletions

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 96 additions & 111 deletions
@@ -9324,52 +9324,6 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
       State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
 }
 
-/// Creates either vp_store or vp_scatter intrinsics calls to represent
-/// predicated store/scatter.
-static Instruction *
-lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr,
-                                Value *StoredVal, bool IsScatter, Value *Mask,
-                                Value *EVL, const Align &Alignment) {
-  CallInst *Call;
-  if (IsScatter) {
-    Call = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
-                                   Intrinsic::vp_scatter,
-                                   {StoredVal, Addr, Mask, EVL});
-  } else {
-    VectorBuilder VBuilder(Builder);
-    VBuilder.setEVL(EVL).setMask(Mask);
-    Call = cast<CallInst>(VBuilder.createVectorInstruction(
-        Instruction::Store, Type::getVoidTy(EVL->getContext()),
-        {StoredVal, Addr}));
-  }
-  Call->addParamAttr(
-      1, Attribute::getWithAlignment(Call->getContext(), Alignment));
-  return Call;
-}
-
-/// Creates either vp_load or vp_gather intrinsics calls to represent
-/// predicated load/gather.
-static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder,
-                                                   VectorType *DataTy,
-                                                   Value *Addr, bool IsGather,
-                                                   Value *Mask, Value *EVL,
-                                                   const Align &Alignment) {
-  CallInst *Call;
-  if (IsGather) {
-    Call =
-        Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
-                                nullptr, "wide.masked.gather");
-  } else {
-    VectorBuilder VBuilder(Builder);
-    VBuilder.setEVL(EVL).setMask(Mask);
-    Call = cast<CallInst>(VBuilder.createVectorInstruction(
-        Instruction::Load, DataTy, Addr, "vp.op.load"));
-  }
-  Call->addParamAttr(
-      0, Attribute::getWithAlignment(Call->getContext(), Alignment));
-  return Call;
-}
-
 void VPWidenLoadRecipe::execute(VPTransformState &State) {
   auto *LI = cast<LoadInst>(&Ingredient);
 
@@ -9391,48 +9345,62 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
         Mask = Builder.CreateVectorReverse(Mask, "reverse");
     }
 
-    // TODO: split this into several classes for better design.
-    if (State.EVL) {
-      assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
-                              "explicit vector length.");
-      assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
-                 VPInstruction::ExplicitVectorLength &&
-             "EVL must be VPInstruction::ExplicitVectorLength.");
-      Value *EVL = State.get(State.EVL, VPIteration(0, 0));
-      // If EVL is not nullptr, then EVL must be a valid value set during plan
-      // creation, possibly default value = whole vector register length. EVL
-      // is created only if TTI prefers predicated vectorization, thus if EVL
-      // is not nullptr it also implies preference for predicated
-      // vectorization.
-      // FIXME: Support reverse loading after vp_reverse is added.
-      NewLI = lowerLoadUsingVectorIntrinsics(
-          Builder, DataTy, State.get(getAddr(), Part, !CreateGather),
-          CreateGather, Mask, EVL, Alignment);
-    } else if (CreateGather) {
-      Value *VectorGep = State.get(getAddr(), Part);
-      NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, Mask,
-                                         nullptr, "wide.masked.gather");
-      State.addMetadata(NewLI, LI);
+    Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather);
+    if (CreateGather) {
+      NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
+                                         "wide.masked.gather");
+    } else if (Mask) {
+      NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
+                                       PoisonValue::get(DataTy),
+                                       "wide.masked.load");
     } else {
-      auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true);
-      if (Mask)
-        NewLI = Builder.CreateMaskedLoad(DataTy, VecPtr, Alignment, Mask,
-                                         PoisonValue::get(DataTy),
-                                         "wide.masked.load");
-      else
-        NewLI =
-            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
-
-      // Add metadata to the load, but setVectorValue to the reverse shuffle.
-      State.addMetadata(NewLI, LI);
-      if (Reverse)
-        NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
+      NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
     }
-
+    // Add metadata to the load, but setVectorValue to the reverse shuffle.
+    State.addMetadata(NewLI, LI);
+    if (Reverse)
+      NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
     State.set(this, NewLI, Part);
   }
 }
 
+void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
+  assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+                          "explicit vector length.");
+  // FIXME: Support reverse loading after vp_reverse is added.
+  assert(!isReverse() && "Reverse loads are not implemented yet.");
+
+  auto *LI = cast<LoadInst>(&Ingredient);
+
+  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
+  const Align Alignment = getLoadStoreAlignment(&Ingredient);
+  bool CreateGather = !isConsecutive();
+
+  auto &Builder = State.Builder;
+  State.setDebugLocFrom(getDebugLoc());
+  CallInst *NewLI;
+  Value *EVL = State.get(getEVL(), VPIteration(0, 0));
+  Value *Addr = State.get(getAddr(), 0, !CreateGather);
+  Value *Mask =
+      getMask() ? State.get(getMask(), 0)
+                : Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+  if (CreateGather) {
+    NewLI =
+        Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
+                                nullptr, "wide.masked.gather");
+  } else {
+    VectorBuilder VBuilder(Builder);
+    VBuilder.setEVL(EVL).setMask(Mask);
+    NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
+        Instruction::Load, DataTy, Addr, "vp.op.load"));
+  }
+  NewLI->addParamAttr(
+      0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
+  State.addMetadata(NewLI, LI);
+  State.set(this, NewLI, 0);
+}
+
 void VPWidenStoreRecipe::execute(VPTransformState &State) {
   auto *SI = cast<StoreInst>(&Ingredient);
 
@@ -9456,45 +9424,62 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
 
     Value *StoredVal = State.get(StoredVPValue, Part);
     if (isReverse()) {
-      assert(!State.EVL && "reversing not yet implemented with EVL");
       // If we store to reverse consecutive memory locations, then we need
       // to reverse the order of elements in the stored value.
       StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
       // We don't want to update the value in the map as it might be used in
       // another expression. So don't call resetVectorValue(StoredVal).
     }
-    // TODO: split this into several classes for better design.
-    if (State.EVL) {
-      assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
-                              "explicit vector length.");
-      assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
-                 VPInstruction::ExplicitVectorLength &&
-             "EVL must be VPInstruction::ExplicitVectorLength.");
-      Value *EVL = State.get(State.EVL, VPIteration(0, 0));
-      // If EVL is not nullptr, then EVL must be a valid value set during plan
-      // creation, possibly default value = whole vector register length. EVL
-      // is created only if TTI prefers predicated vectorization, thus if EVL
-      // is not nullptr it also implies preference for predicated
-      // vectorization.
-      // FIXME: Support reverse store after vp_reverse is added.
-      NewSI = lowerStoreUsingVectorIntrinsics(
-          Builder, State.get(getAddr(), Part, !CreateScatter), StoredVal,
-          CreateScatter, Mask, EVL, Alignment);
-    } else if (CreateScatter) {
-      Value *VectorGep = State.get(getAddr(), Part);
-      NewSI =
-          Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, Mask);
-    } else {
-      auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true);
-      if (Mask)
-        NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, Mask);
-      else
-        NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
-    }
+    Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter);
+    if (CreateScatter)
+      NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
+    else if (Mask)
+      NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
+    else
+      NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
     State.addMetadata(NewSI, SI);
   }
 }
 
+void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
+  assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+                          "explicit vector length.");
+  // FIXME: Support reverse loading after vp_reverse is added.
+  assert(!isReverse() && "Reverse store are not implemented yet.");
+
+  auto *SI = cast<StoreInst>(&Ingredient);
+
+  VPValue *StoredValue = getStoredValue();
+  bool CreateScatter = !isConsecutive();
+  const Align Alignment = getLoadStoreAlignment(&Ingredient);
+
+  auto &Builder = State.Builder;
+  State.setDebugLocFrom(getDebugLoc());
+
+  CallInst *NewSI = nullptr;
+  Value *StoredVal = State.get(StoredValue, 0);
+  Value *EVL = State.get(getEVL(), VPIteration(0, 0));
+  // FIXME: Support reverse store after vp_reverse is added.
+  Value *Mask =
+      getMask() ? State.get(getMask(), 0)
+                : Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+  Value *Addr = State.get(getAddr(), 0, !CreateScatter);
+  if (CreateScatter) {
+    NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
+                                    Intrinsic::vp_scatter,
+                                    {StoredVal, Addr, Mask, EVL});
+  } else {
+    VectorBuilder VBuilder(Builder);
+    VBuilder.setEVL(EVL).setMask(Mask);
+    NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
+        Instruction::Store, Type::getVoidTy(EVL->getContext()),
+        {StoredVal, Addr}));
+  }
+  NewSI->addParamAttr(
+      1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
+  State.addMetadata(NewSI, SI);
+}
+
 // Determine how to lower the scalar epilogue, which depends on 1) optimising
 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
 // predication, and 4) a TTI hook that analyses whether the loop is suitable

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 89 additions & 13 deletions
@@ -242,15 +242,6 @@ struct VPTransformState {
   ElementCount VF;
   unsigned UF;
 
-  /// If EVL (Explicit Vector Length) is not nullptr, then EVL must be a valid
-  /// value set during plan transformation, possibly a default value = whole
-  /// vector register length. EVL is created only if TTI prefers predicated
-  /// vectorization, thus if EVL is not nullptr it also implies preference for
-  /// predicated vectorization.
-  /// TODO: this is a temporarily solution, the EVL must be explicitly used by
-  /// the recipes and must be removed here.
-  VPValue *EVL = nullptr;
-
   /// Hold the indices to generate specific scalar instructions. Null indicates
   /// that all instances are to be generated, using either scalar or vector
   /// instructions.
@@ -875,7 +866,9 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
       return true;
     case VPRecipeBase::VPInterleaveSC:
     case VPRecipeBase::VPBranchOnMaskSC:
+    case VPRecipeBase::VPWidenLoadEVLSC:
     case VPRecipeBase::VPWidenLoadSC:
+    case VPRecipeBase::VPWidenStoreEVLSC:
     case VPRecipeBase::VPWidenStoreSC:
       // TODO: Widened stores don't define a value, but widened loads do. Split
       // the recipes to be able to make widened loads VPSingleDefRecipes.
@@ -2318,11 +2311,15 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
   }
 
 public:
-  VPWidenMemoryRecipe *clone() override = 0;
+  VPWidenMemoryRecipe *clone() override {
+    llvm_unreachable("cloning not supported");
+  }
 
   static inline bool classof(const VPRecipeBase *R) {
-    return R->getVPDefID() == VPDef::VPWidenLoadSC ||
-           R->getVPDefID() == VPDef::VPWidenStoreSC;
+    return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
+           R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
+           R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC ||
+           R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC;
   }
 
   static inline bool classof(const VPUser *U) {
@@ -2390,13 +2387,48 @@ struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
   bool onlyFirstLaneUsed(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
-
     // Widened, consecutive loads operations only demand the first lane of
     // their address.
     return Op == getAddr() && isConsecutive();
   }
 };
 
+/// A recipe for widening load operations with vector-predication intrinsics,
+/// using the address to load from, the explicit vector length and an optional
+/// mask.
+struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
+  VPWidenLoadEVLRecipe(VPWidenLoadRecipe *L, VPValue *EVL, VPValue *Mask)
+      : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L->getIngredient(),
+                            {L->getAddr(), EVL}, L->isConsecutive(), false,
+                            L->getDebugLoc()),
+        VPValue(this, &getIngredient()) {
+    setMask(Mask);
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPWidenLoadEVLSC)
+
+  /// Return the EVL operand.
+  VPValue *getEVL() const { return getOperand(1); }
+
+  /// Generate the wide load or gather.
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    // Widened loads only demand the first lane of EVL and consecutive loads
+    // only demand the first lane of their address.
+    return Op == getEVL() || (Op == getAddr() && isConsecutive());
+  }
+};
+
 /// A recipe for widening store operations, using the stored value, the address
 /// to store to and an optional mask.
 struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
@@ -2436,6 +2468,50 @@ struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
     return Op == getAddr() && isConsecutive() && Op != getStoredValue();
   }
 };
+
+/// A recipe for widening store operations with vector-predication intrinsics,
+/// using the value to store, the address to store to, the explicit vector
+/// length and an optional mask.
+struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
+  VPWidenStoreEVLRecipe(VPWidenStoreRecipe *S, VPValue *EVL, VPValue *Mask)
+      : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S->getIngredient(),
+                            {S->getAddr(), S->getStoredValue(), EVL},
+                            S->isConsecutive(), false, S->getDebugLoc()) {
+    setMask(Mask);
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPWidenStoreEVLSC)
+
+  /// Return the address accessed by this recipe.
+  VPValue *getStoredValue() const { return getOperand(1); }
+
+  /// Return the EVL operand.
+  VPValue *getEVL() const { return getOperand(2); }
+
+  /// Generate the wide store or scatter.
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    if (Op == getEVL()) {
+      assert(getStoredValue() != Op && "unexpected store of EVL");
+      return true;
+    }
+    // Widened, consecutive memory operations only demand the first lane of
+    // their address, unless the same operand is also stored. That latter can
+    // happen with opaque pointers.
+    return Op == getAddr() && isConsecutive() && Op != getStoredValue();
+  }
+};
+
 /// Recipe to expand a SCEV expression.
 class VPExpandSCEVRecipe : public VPSingleDefRecipe {
   const SCEV *Expr;
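
As a usage note, the EVL-based transform (in VPlanTransforms.cpp, not shown in this excerpt) is expected to swap the plain widen recipes for the EVL variants roughly as in the hedged sketch below; LoadR, EVL and NewMask are illustrative names, with NewMask possibly nullptr.

// Rough sketch only, using the VPWidenLoadEVLRecipe constructor added above.
auto *N = new VPWidenLoadEVLRecipe(LoadR, EVL, NewMask); // LoadR: VPWidenLoadRecipe *
N->insertBefore(LoadR);        // place the new recipe next to the old one
LoadR->replaceAllUsesWith(N);  // a widened load also acts as a VPValue
LoadR->eraseFromParent();      // drop the now-dead original recipe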

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 1 addition & 1 deletion
@@ -109,7 +109,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
 }
 
 Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
-  assert(isa<VPWidenLoadRecipe>(R) &&
+  assert((isa<VPWidenLoadRecipe>(R) || isa<VPWidenLoadEVLRecipe>(R)) &&
          "Store recipes should not define any values");
   return cast<LoadInst>(&R->getIngredient())->getType();
 }
