diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index f715e0ec8dbb4..5e15d8d56703e 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -180,9 +180,10 @@ class MemoryDepChecker {
   MemoryDepChecker(PredicatedScalarEvolution &PSE, const Loop *L,
                    const DenseMap<Value *, const SCEV *> &SymbolicStrides,
-                   unsigned MaxTargetVectorWidthInBits)
+                   unsigned MaxTargetVectorWidthInBits, bool AllowNonPow2Deps)
       : PSE(PSE), InnermostLoop(L), SymbolicStrides(SymbolicStrides),
-        MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits) {}
+        MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits),
+        AllowNonPow2Deps(AllowNonPow2Deps) {}
 
   /// Register the location (instructions are given increasing numbers)
   /// of a write access.
@@ -218,17 +219,28 @@ class MemoryDepChecker {
 
   /// Return true if there are no store-load forwarding dependencies.
   bool isSafeForAnyStoreLoadForwardDistances() const {
-    return MaxStoreLoadForwardSafeDistanceInBits ==
-           std::numeric_limits<uint64_t>::max();
+    return MaxPowerOf2StoreLoadForwardSafeDistanceInBits ==
+               std::numeric_limits<uint64_t>::max() &&
+           MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits ==
+               std::numeric_limits<uint64_t>::max();
   }
 
-  /// Return safe power-of-2 number of elements, which do not prevent store-load
-  /// forwarding, multiplied by the size of the elements in bits.
-  uint64_t getStoreLoadForwardSafeDistanceInBits() const {
+  /// Return the safe power-of-2 number of elements, which do not prevent
+  /// store-load forwarding, multiplied by the size of the elements in bits.
+  uint64_t getPowerOf2StoreLoadForwardSafeDistanceInBits() const {
     assert(!isSafeForAnyStoreLoadForwardDistances() &&
            "Expected the distance, that prevent store-load forwarding, to be "
            "set.");
-    return MaxStoreLoadForwardSafeDistanceInBits;
+    return MaxPowerOf2StoreLoadForwardSafeDistanceInBits;
+  }
+
+  /// Return the safe non-power-of-2 number of elements, which do not prevent
+  /// store-load forwarding, multiplied by the size of the elements in bits.
+  uint64_t getNonPowerOf2StoreLoadForwardSafeDistanceInBits() const {
+    assert(!isSafeForAnyStoreLoadForwardDistances() &&
+           "Expected the distance, that prevent store-load forwarding, to be "
+           "set.");
+    return MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits;
   }
 
   /// In same cases when the dependency check fails we can still
@@ -319,9 +331,14 @@ class MemoryDepChecker {
   /// restrictive.
   uint64_t MaxSafeVectorWidthInBits = -1U;
 
-  /// Maximum power-of-2 number of elements, which do not prevent store-load
-  /// forwarding, multiplied by the size of the elements in bits.
-  uint64_t MaxStoreLoadForwardSafeDistanceInBits =
+  /// Maximum number of elements, which do not prevent store-load forwarding,
+  /// multiplied by the size of the elements in bits (power-of-2).
+  uint64_t MaxPowerOf2StoreLoadForwardSafeDistanceInBits =
+      std::numeric_limits<uint64_t>::max();
+
+  /// Maximum number of elements, which do not prevent store-load forwarding,
+  /// multiplied by the size of the elements in bits (non-power-of-2).
+  uint64_t MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits =
       std::numeric_limits<uint64_t>::max();
 
   /// If we see a non-constant dependence distance we can still try to
@@ -348,6 +365,9 @@ class MemoryDepChecker {
   /// backwards-vectorizable or unknown (triggering a runtime check).
   unsigned MaxTargetVectorWidthInBits = 0;
 
+  /// True if the current target supports non-power-of-2 dependence distances.
+  bool AllowNonPow2Deps = false;
+
   /// Mapping of SCEV expressions to their expanded pointer bounds (pair of
   /// start and end pointer expressions).
   DenseMap<std::pair<const SCEV *, Type *>,
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index d654ac3ec9273..465a3904c3f74 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -382,8 +382,7 @@ class LoopVectorizationLegality {
   const LoopAccessInfo *getLAI() const { return LAI; }
 
   bool isSafeForAnyVectorWidth() const {
-    return LAI->getDepChecker().isSafeForAnyVectorWidth() &&
-           LAI->getDepChecker().isSafeForAnyStoreLoadForwardDistances();
+    return LAI->getDepChecker().isSafeForAnyVectorWidth();
   }
 
   uint64_t getMaxSafeVectorWidthInBits() const {
@@ -414,8 +413,15 @@ class LoopVectorizationLegality {
 
   /// Return safe power-of-2 number of elements, which do not prevent store-load
   /// forwarding and safe to operate simultaneously.
-  uint64_t getMaxStoreLoadForwardSafeDistanceInBits() const {
-    return LAI->getDepChecker().getStoreLoadForwardSafeDistanceInBits();
+  uint64_t getPowerOf2MaxStoreLoadForwardSafeDistanceInBits() const {
+    return LAI->getDepChecker().getPowerOf2StoreLoadForwardSafeDistanceInBits();
+  }
+
+  /// Return safe non-power-of-2 number of elements, which do not prevent
+  /// store-load forwarding and safe to operate simultaneously.
+  uint64_t getNonPowerOf2MaxStoreLoadForwardSafeDistanceInBits() const {
+    return LAI->getDepChecker()
+        .getNonPowerOf2StoreLoadForwardSafeDistanceInBits();
   }
 
   /// Returns true if vector representation of the instruction \p I
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 9a7d361b5b512..33f5f75c81d67 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1756,7 +1756,8 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
   // Maximum vector factor.
   uint64_t MaxVFWithoutSLForwardIssuesPowerOf2 =
       std::min(VectorizerParams::MaxVectorWidth * TypeByteSize,
-               MaxStoreLoadForwardSafeDistanceInBits);
+               MaxPowerOf2StoreLoadForwardSafeDistanceInBits);
+  uint64_t MaxVFWithoutSLForwardIssuesNonPowerOf2 = 0;
 
   // Compute the smallest VF at which the store and load would be misaligned.
   for (uint64_t VF = 2 * TypeByteSize;
@@ -1768,24 +1769,61 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
       break;
     }
   }
+  // RISC-V VLA supports non-power-of-2 vector factors, so iterate backwards to
+  // find the largest VF that either keeps the stores and loads aligned or
+  // leaves at least NumItersForStoreLoadThroughMemory (8) iterations between
+  // the conflicting memory addresses.
+  if (AllowNonPow2Deps) {
+    MaxVFWithoutSLForwardIssuesNonPowerOf2 =
+        std::min(VectorizerParams::MaxVectorWidth * TypeByteSize,
+                 MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits);
+
+    for (uint64_t VF = MaxVFWithoutSLForwardIssuesNonPowerOf2;
+         VF > MaxVFWithoutSLForwardIssuesPowerOf2; VF -= TypeByteSize) {
+      if (Distance % VF == 0 ||
+          Distance / VF >= NumItersForStoreLoadThroughMemory) {
+        uint64_t GCD =
+            isSafeForAnyStoreLoadForwardDistances()
+                ? VF
+                : std::gcd(MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits,
+                           VF);
+        MaxVFWithoutSLForwardIssuesNonPowerOf2 = GCD;
+        break;
+      }
+    }
+  }
 
-  if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize) {
+  if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize &&
+      MaxVFWithoutSLForwardIssuesNonPowerOf2 < 2 * TypeByteSize) {
     LLVM_DEBUG(
         dbgs() << "LAA: Distance " << Distance
                << " that could cause a store-load forwarding conflict\n");
     return true;
   }
 
+  // Handle the non-power-of-2 store-load forwarding distance; the power-of-2
+  // distance is handled separately below.
+  if (AllowNonPow2Deps && CommonStride &&
+      MaxVFWithoutSLForwardIssuesNonPowerOf2 <
+          MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits &&
+      MaxVFWithoutSLForwardIssuesNonPowerOf2 !=
+          VectorizerParams::MaxVectorWidth * TypeByteSize) {
+    uint64_t MaxVF = MaxVFWithoutSLForwardIssuesNonPowerOf2 / CommonStride;
+    uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
+    MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits =
+        std::min(MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits, MaxVFInBits);
+  }
+
   if (CommonStride &&
       MaxVFWithoutSLForwardIssuesPowerOf2 <
-          MaxStoreLoadForwardSafeDistanceInBits &&
+          MaxPowerOf2StoreLoadForwardSafeDistanceInBits &&
       MaxVFWithoutSLForwardIssuesPowerOf2 !=
           VectorizerParams::MaxVectorWidth * TypeByteSize) {
     uint64_t MaxVF =
         bit_floor(MaxVFWithoutSLForwardIssuesPowerOf2 / CommonStride);
     uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
-    MaxStoreLoadForwardSafeDistanceInBits =
-        std::min(MaxStoreLoadForwardSafeDistanceInBits, MaxVFInBits);
+    MaxPowerOf2StoreLoadForwardSafeDistanceInBits =
+        std::min(MaxPowerOf2StoreLoadForwardSafeDistanceInBits, MaxVFInBits);
   }
   return false;
 }
@@ -2250,7 +2288,8 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
     return Dependence::Unknown;
   }
 
-  MaxSafeVectorWidthInBits = std::min(MaxSafeVectorWidthInBits, MaxVFInBits);
+  if (!AllowNonPow2Deps)
+    MaxSafeVectorWidthInBits = std::min(MaxSafeVectorWidthInBits, MaxVFInBits);
   return Dependence::BackwardVectorizable;
 }
 
@@ -2984,8 +3023,9 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
     MaxTargetVectorWidthInBits =
        TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) * 2;
 
-  DepChecker = std::make_unique<MemoryDepChecker>(*PSE, L, SymbolicStrides,
-                                                  MaxTargetVectorWidthInBits);
+  DepChecker = std::make_unique<MemoryDepChecker>(
+      *PSE, L, SymbolicStrides, MaxTargetVectorWidthInBits,
+      TTI && TTI->hasActiveVectorLength(0, nullptr, Align()));
   PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
   if (canAnalyzeLoop())
     CanVecMem = analyzeLoop(AA, LI, TLI, DT);
@@ -2999,7 +3039,9 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
       OS << " with a maximum safe vector width of "
          << DC.getMaxSafeVectorWidthInBits() << " bits";
     if (!DC.isSafeForAnyStoreLoadForwardDistances()) {
-      uint64_t SLDist = DC.getStoreLoadForwardSafeDistanceInBits();
+      uint64_t SLDist = DC.getNonPowerOf2StoreLoadForwardSafeDistanceInBits();
+      if (SLDist == std::numeric_limits<uint64_t>::max())
+        SLDist = DC.getPowerOf2StoreLoadForwardSafeDistanceInBits();
       OS << ", with a maximum safe store-load forward width of " << SLDist
          << " bits";
     }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 36e14bb27a029..b5689cd38cb3d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1559,9 +1559,10 @@ class LoopVectorizationCostModel {
   /// elements is a power-of-2 larger than zero. If scalable vectorization is
   /// disabled or unsupported, then the scalable part will be equal to
   /// ElementCount::getScalable(0).
-  FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
-                                           ElementCount UserVF,
-                                           bool FoldTailByMasking);
+  FixedScalableVFPair
+  computeFeasibleMaxVF(unsigned MaxTripCount, ElementCount UserVF,
+                       bool FoldTailByMasking,
+                       bool AllowNonPowerOf2SafeDist = false);
 
   /// \return the maximized element count based on the targets vector
   /// registers and the loop trip-count, but limited to a maximum safe VF.
@@ -3751,7 +3752,9 @@ bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
     return false;
   }
 
-  if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
+  if ((!Legal->isSafeForAnyVectorWidth() ||
+       !Legal->isSafeForAnyStoreLoadForwardDistances()) &&
+      !getMaxVScale(*TheFunction, TTI)) {
     reportVectorizationInfo("The target does not provide maximum vscale value "
                             "for safe distance analysis.",
                             "ScalableVFUnfeasible", ORE, TheLoop);
@@ -3769,7 +3772,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
   auto MaxScalableVF = ElementCount::getScalable(
      std::numeric_limits<ElementCount::ScalarTy>::max());
-  if (Legal->isSafeForAnyVectorWidth())
+  if (Legal->isSafeForAnyVectorWidth() &&
+      Legal->isSafeForAnyStoreLoadForwardDistances())
     return MaxScalableVF;
 
   std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
@@ -3786,7 +3790,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
 }
 
 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
-    unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
+    unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking,
+    bool AllowNonPowerOf2SafeDist) {
   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
   unsigned SmallestType, WidestType;
   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -3795,18 +3800,32 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
   // the memory accesses that is most restrictive (involved in the smallest
   // dependence distance).
-  unsigned MaxSafeElementsPowerOf2 =
-      bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
+  unsigned MaxSafeElementsNonPowerOf2 =
+      Legal->getMaxSafeVectorWidthInBits() / WidestType;
+  unsigned MaxSafeElementsPowerOf2 = bit_floor(MaxSafeElementsNonPowerOf2);
   if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
-    unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
-    MaxSafeElementsPowerOf2 =
-        std::min(MaxSafeElementsPowerOf2, SLDist / WidestType);
+    uint64_t SLDist = Legal->getPowerOf2MaxStoreLoadForwardSafeDistanceInBits();
+    if (SLDist != std::numeric_limits<uint64_t>::max()) {
+      unsigned SLVF = SLDist / WidestType;
+      MaxSafeElementsPowerOf2 = std::min(MaxSafeElementsPowerOf2, SLVF);
+    }
+    if (FoldTailByMasking && AllowNonPowerOf2SafeDist) {
+      uint64_t SLDist =
+          Legal->getNonPowerOf2MaxStoreLoadForwardSafeDistanceInBits();
+      if (SLDist != std::numeric_limits<uint64_t>::max()) {
+        unsigned SLVF = SLDist / WidestType;
+        MaxSafeElements = Legal->isSafeForAnyVectorWidth()
+                              ? SLVF
+                              : std::gcd(MaxSafeElementsNonPowerOf2, SLVF);
+      }
+    } else {
+      MaxSafeElements = MaxSafeElementsPowerOf2;
+    }
   }
   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElementsPowerOf2);
-  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElementsPowerOf2);
-
-  if (!Legal->isSafeForAnyVectorWidth())
-    this->MaxSafeElements = MaxSafeElementsPowerOf2;
+  auto MaxSafeScalableVF = getMaxLegalScalableVF(
+      FoldTailByMasking && AllowNonPowerOf2SafeDist ? MaxSafeElementsNonPowerOf2
+                                                    : MaxSafeElementsPowerOf2);
 
   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                     << ".\n");
@@ -4018,7 +4037,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     return Rem->isZero();
   };
 
-  if (MaxPowerOf2RuntimeVF > 0u) {
+  FixedScalableVFPair FoldTailMaxFactors =
+      computeFeasibleMaxVF(MaxTC, UserVF, /*FoldTailByMasking=*/true,
+                           /*AllowNonPowerOf2SafeDist=*/true);
+  if ((Legal->isSafeForAnyStoreLoadForwardDistances() ||
+       has_single_bit(*MaxSafeElements)) &&
+      MaxPowerOf2RuntimeVF) {
    assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
           "MaxFixedVF must be a power of 2");
    if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
@@ -4030,7 +4054,14 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
   if (ExpectedTC &&
       ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
-    if (MaxPowerOf2RuntimeVF > 0u) {
+    if (MaxPowerOf2RuntimeVF) {
+      assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
+             "MaxFixedVF must be a power of 2");
+      if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
+        // Accept MaxFixedVF if we do not have a tail.
+        LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+        return MaxFactors;
+      }
       // If we have a low-trip-count, and the fixed-width VF is known to divide
       // the trip count but the scalable factor does not, use the fixed-width
       // factor in preference to allow the generation of a non-predicated loop.
@@ -4054,10 +4085,11 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   // found modulo the vectorization factor is not zero, try to fold the tail
   // by masking.
   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
-  bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
+  bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero() ||
+                            FoldTailMaxFactors.ScalableVF.isNonZero();
   setTailFoldingStyles(ContainsScalableVF, UserIC);
   if (foldTailByMasking()) {
-    if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
+    if (foldTailWithEVL()) {
       LLVM_DEBUG(
           dbgs()
          << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
@@ -4069,6 +4101,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     assert(ContainsScalableVF && "Expected scalable vector factor.");
     MaxFactors.FixedVF = ElementCount::getFixed(1);
+    MaxFactors.ScalableVF = FoldTailMaxFactors.ScalableVF;
   }
   return MaxFactors;
 }
@@ -5137,7 +5170,8 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
   }
 
   // We used the distance for the interleave count.
- if (!Legal->isSafeForAnyVectorWidth()) + if (!Legal->isSafeForAnyVectorWidth() || + !Legal->isSafeForAnyStoreLoadForwardDistances()) return 1; // We don't attempt to perform interleaving for loops with uncountable early diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll index fb2ec8c61d745..8886cd6c12892 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll @@ -102,23 +102,38 @@ define void @test_may_clobber1(ptr %p) { ; IF-EVL-NEXT: entry: ; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 200, [[TMP9]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP8]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[INDEX]] -; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 -; IF-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32 -; IF-EVL-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 100 -; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] +; IF-EVL-NEXT: [[TMP3:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 200, [[TMP3]] +; IF-EVL-NEXT: [[TMP12:%.*]] = icmp ult i64 [[AVL]], 50 +; IF-EVL-NEXT: [[SAFE_AVL:%.*]] = select i1 [[TMP12]], i64 [[AVL]], i64 50 +; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[SAFE_AVL]], i32 2, i1 true) +; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP3]] ; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 -; IF-EVL-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32 -; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 -; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP5]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = add i64 [[TMP3]], 100 +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP15]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i64, ptr [[TMP16]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[VP_OP_LOAD]], ptr align 32 [[TMP17]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP6]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP13]], [[TMP3]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 
true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[EXIT:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; IF-EVL-NEXT: br label [[LOOP:%.*]] ; IF-EVL: loop: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -135,36 +150,17 @@ define void @test_may_clobber1(ptr %p) { ; ; NO-VP-LABEL: @test_may_clobber1( ; NO-VP-NEXT: entry: -; NO-VP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; NO-VP: vector.ph: -; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] -; NO-VP: vector.body: -; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; NO-VP-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[INDEX]] -; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 -; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32 -; NO-VP-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 100 -; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] -; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 -; NO-VP-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32 -; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; NO-VP-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 -; NO-VP-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; NO-VP: middle.block: -; NO-VP-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] -; NO-VP: scalar.ph: -; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; NO-VP-NEXT: br label [[LOOP:%.*]] ; NO-VP: loop: -; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; NO-VP-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; NO-VP-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[IV]] ; NO-VP-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 ; NO-VP-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 100 ; NO-VP-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]] ; NO-VP-NEXT: store i64 [[V]], ptr [[A2]], align 32 ; NO-VP-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; NO-VP-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 -; NO-VP-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; NO-VP-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]] ; NO-VP: exit: ; NO-VP-NEXT: ret void ; @@ -189,17 +185,51 @@ exit: define void @test_may_clobber2(ptr %p) { ; IF-EVL-LABEL: @test_may_clobber2( ; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 200, [[TMP2]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 ; IF-EVL-NEXT: br label [[LOOP:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 
[[INDEX_EVL_NEXT:%.*]], [[LOOP]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 200, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[AVL]], 9 +; IF-EVL-NEXT: [[SAFE_AVL:%.*]] = select i1 [[TMP5]], i64 [[AVL]], i64 9 +; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[SAFE_AVL]], i32 2, i1 true) +; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP9]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP10:%.*]] = add i64 [[EVL_BASED_IV]], 9 +; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP10]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP11]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[VP_OP_LOAD]], ptr align 32 [[TMP12]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP6]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP13]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP4]] +; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br label [[EXIT:%.*]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ] +; IF-EVL-NEXT: br label [[LOOP1:%.*]] ; IF-EVL: loop: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; IF-EVL-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[LOOP1]] ] +; IF-EVL-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV1]] ; IF-EVL-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 -; IF-EVL-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 9 +; IF-EVL-NEXT: [[OFFSET:%.*]] = add i64 [[IV1]], 9 ; IF-EVL-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]] ; IF-EVL-NEXT: store i64 [[V]], ptr [[A2]], align 32 -; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; IF-EVL-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 -; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]] +; IF-EVL-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1 +; IF-EVL-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV1]], 199 +; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP1]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] ; IF-EVL: exit: ; IF-EVL-NEXT: ret void ; @@ -242,23 +272,38 @@ define void @test_may_clobber3(ptr %p) { ; IF-EVL-NEXT: entry: ; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 200, [[TMP9]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP8]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[INDEX]] -; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 -; IF-EVL-NEXT: 
[[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 32 -; IF-EVL-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 10 -; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] +; IF-EVL-NEXT: [[TMP3:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 200, [[TMP3]] +; IF-EVL-NEXT: [[TMP12:%.*]] = icmp ult i64 [[AVL]], 10 +; IF-EVL-NEXT: [[SAFE_AVL:%.*]] = select i1 [[TMP12]], i64 [[AVL]], i64 10 +; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[SAFE_AVL]], i32 2, i1 true) +; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP3]] ; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 -; IF-EVL-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32 -; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 -; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP5]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = add i64 [[TMP3]], 10 +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP15]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i64, ptr [[TMP16]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[VP_OP_LOAD]], ptr align 32 [[TMP17]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP6]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP13]], [[TMP3]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[EXIT:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; IF-EVL-NEXT: br label [[LOOP:%.*]] ; IF-EVL: loop: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -269,42 +314,23 @@ define void @test_may_clobber3(ptr %p) { ; IF-EVL-NEXT: store i64 [[V]], ptr [[A2]], align 32 ; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; IF-EVL-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 -; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] +; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]] ; IF-EVL: exit: ; IF-EVL-NEXT: ret void ; ; NO-VP-LABEL: @test_may_clobber3( ; NO-VP-NEXT: entry: -; NO-VP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; NO-VP: vector.ph: -; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] -; NO-VP: vector.body: -; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; NO-VP-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[INDEX]] -; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 -; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 32 -; NO-VP-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 10 -; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] -; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 -; NO-VP-NEXT: store <2 x i64> 
[[WIDE_LOAD]], ptr [[TMP5]], align 32 -; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; NO-VP-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 -; NO-VP-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; NO-VP: middle.block: -; NO-VP-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] -; NO-VP: scalar.ph: -; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; NO-VP-NEXT: br label [[LOOP:%.*]] ; NO-VP: loop: -; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; NO-VP-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; NO-VP-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[IV]] ; NO-VP-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 ; NO-VP-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 10 ; NO-VP-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]] ; NO-VP-NEXT: store i64 [[V]], ptr [[A2]], align 32 ; NO-VP-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; NO-VP-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 -; NO-VP-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP5:![0-9]+]] +; NO-VP-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]] ; NO-VP: exit: ; NO-VP-NEXT: ret void ; @@ -357,7 +383,7 @@ define void @trivial_due_max_vscale(ptr %p) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP13]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br label [[EXIT:%.*]] ; IF-EVL: scalar.ph: @@ -372,7 +398,7 @@ define void @trivial_due_max_vscale(ptr %p) { ; IF-EVL-NEXT: store i64 [[V]], ptr [[A2]], align 32 ; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; IF-EVL-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 -; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]] +; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]] ; IF-EVL: exit: ; IF-EVL-NEXT: ret void ; @@ -417,31 +443,31 @@ define void @no_high_lmul_or_interleave(ptr %p) { ; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IF-EVL: vector.ph: ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP1:%.*]] = sub i64 [[TMP7]], 1 +; IF-EVL-NEXT: [[TMP13:%.*]] = mul i64 [[TMP7]], 2 +; IF-EVL-NEXT: [[TMP1:%.*]] = sub i64 [[TMP13]], 1 ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 3002, [[TMP1]] -; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP13]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 [[TMP8]], 2 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 3002, [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP9:%.*]] = icmp ult i64 
[[AVL]], 1024 -; IF-EVL-NEXT: [[SAFE_AVL:%.*]] = select i1 [[TMP9]], i64 [[AVL]], i64 1024 -; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[SAFE_AVL]], i32 1, i1 true) -; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv1i64.p0(ptr align 32 [[TMP3]], splat (i1 true), i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP4:%.*]] = add i64 [[EVL_BASED_IV]], 1024 -; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP4]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0 -; IF-EVL-NEXT: call void @llvm.vp.store.nxv1i64.p0( [[VP_OP_LOAD]], ptr align 32 [[TMP6]], splat (i1 true), i32 [[TMP10]]) +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP6]], splat (i1 true), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = add i64 [[EVL_BASED_IV]], 1024 +; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP15]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[VP_OP_LOAD]], ptr align 32 [[TMP16]], splat (i1 true), i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] ; IF-EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br label [[EXIT:%.*]] ; IF-EVL: scalar.ph: @@ -456,7 +482,7 @@ define void @no_high_lmul_or_interleave(ptr %p) { ; IF-EVL-NEXT: store i64 [[V]], ptr [[A2]], align 32 ; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; IF-EVL-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 3001 -; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]] +; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]] ; IF-EVL: exit: ; IF-EVL-NEXT: ret void ; @@ -497,11 +523,51 @@ exit: define void @non-power-2-storeloadforward(ptr %A) { ; IF-EVL-LABEL: @non-power-2-storeloadforward( ; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4 +; IF-EVL-NEXT: [[TMP20:%.*]] = sub i64 [[TMP19]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 112, [[TMP20]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP19]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP21]], 4 ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY]] 
] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 112, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[AVL]], 3 +; IF-EVL-NEXT: [[SAFE_AVL:%.*]] = select i1 [[TMP5]], i64 [[AVL]], i64 3 +; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[SAFE_AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = add i64 16, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -3 +; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP8]] +; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP10]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP11:%.*]] = add nsw i64 [[OFFSET_IDX]], 4 +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[VP_OP:%.*]] = add nsw [[VP_OP_LOAD1]], [[VP_OP_LOAD]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OFFSET_IDX]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP15]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP6]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP16]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br label [[FOR_END:%.*]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 16, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY1]] ] ; IF-EVL-NEXT: [[TMP0:%.*]] = add nsw i64 [[IV]], -3 -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP0]] ; IF-EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[TMP2:%.*]] = add nsw i64 [[IV]], 4 ; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]] @@ -512,7 +578,7 @@ define void @non-power-2-storeloadforward(ptr %A) { ; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; IF-EVL-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[IV_NEXT]] to i32 ; IF-EVL-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[LFTR_WIDEIV]], 128 -; IF-EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY1]], label [[FOR_END]], !llvm.loop [[LOOP15:![0-9]+]] ; IF-EVL: for.end: ; IF-EVL-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll index b0b69c74a2299..6170144163441 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll @@ -3,12 +3,12 @@ ; RUN: opt -passes=loop-vectorize 
-debug-only=loop-vectorize \ ; RUN: -force-tail-folding-style=data-with-evl \ ; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ -; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=IF-EVL,CHECK %s +; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=IF-EVL %s ; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ ; RUN: -force-tail-folding-style=none \ ; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ -; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP,CHECK %s +; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP %s define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' @@ -96,30 +96,62 @@ for.cond.cleanup: } define void @safe_dep(ptr %p) { -; CHECK: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF -; CHECK-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; CHECK-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; CHECK-NEXT: Live-in ir<512> = original trip-count -; CHECK-EMPTY: -; CHECK: vector.ph: -; CHECK-NEXT: Successor(s): vector loop -; CHECK-EMPTY: -; CHECK-NEXT: vector loop: { -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; CHECK-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>, vp<[[VF]]> -; CHECK-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr ir<%p>, vp<[[ST]]> -; CHECK-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; CHECK-NEXT: WIDEN ir<[[V:%.+]]> = load vp<[[PTR1]]> -; CHECK-NEXT: CLONE ir<[[OFFSET:.+]]> = add vp<[[ST]]>, ir<100> -; CHECK-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr ir<%p>, ir<[[OFFSET]]> -; CHECK-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; CHECK-NEXT: WIDEN store vp<[[PTR2]]>, ir<[[V]]> -; CHECK-NEXT: EMIT vp<[[IV_NEXT:%.+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> -; CHECK-NEXT: No successors -; CHECK-NEXT: } +; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF={1}' { +; IF-EVL-NEXT: Live-in vp<[[VF:%[0-9]+]]> = VF +; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF +; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; IF-EVL-NEXT: Live-in ir<512> = original trip-count +; IF-EVL-EMPTY: +; IF-EVL: vector.ph: +; IF-EVL-NEXT: Successor(s): vector loop +; IF-EVL-EMPTY: +; IF-EVL-NEXT: vector loop: { +; IF-EVL-NEXT: vector.body: +; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> +; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[EVL_NEXT:%.+]]> +; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<512>, vp<[[EVL_PHI]]> +; IF-EVL-NEXT: EMIT vp<[[CMP:%[0-9]+]]> = icmp ult vp<[[AVL]]>, ir<50> +; IF-EVL-NEXT: EMIT vp<[[SAFE_AVL:%.+]]> = select vp<[[CMP]]>, vp<[[AVL]]>, ir<50> +; IF-EVL-NEXT: EMIT vp<[[EVL:%[0-9]+]]> = EXPLICIT-VECTOR-LENGTH vp<[[SAFE_AVL]]> +; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[VF]]> +; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr ir<%p>, vp<[[ST]]> +; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> +; IF-EVL-NEXT: WIDEN ir<[[V:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> +; IF-EVL-NEXT: CLONE 
ir<[[OFFSET:.+]]> = add vp<[[ST]]>, ir<100> +; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr ir<%p>, ir<[[OFFSET]]> +; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> +; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[V]]>, vp<[[EVL]]> +; IF-EVL-NEXT: EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 +; IF-EVL-NEXT: EMIT vp<[[EVL_NEXT]]> = add nuw vp<[[CAST]]>, vp<[[EVL_PHI]]> +; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> +; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> +; IF-EVL-NEXT: No successors +; IF-EVL-NEXT: } +; +; NO-VP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF>=1' { +; NO-VP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF +; NO-VP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF +; NO-VP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; NO-VP-NEXT: Live-in ir<512> = original trip-count +; NO-VP-EMPTY: +; NO-VP: vector.ph: +; NO-VP-NEXT: Successor(s): vector loop +; NO-VP-EMPTY: +; NO-VP-NEXT: vector loop: { +; NO-VP-NEXT: vector.body: +; NO-VP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; NO-VP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>, vp<[[VF]]> +; NO-VP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr ir<%p>, vp<[[ST]]> +; NO-VP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> +; NO-VP-NEXT: WIDEN ir<[[V:%.+]]> = load vp<[[PTR1]]> +; NO-VP-NEXT: CLONE ir<[[OFFSET:.+]]> = add vp<[[ST]]>, ir<100> +; NO-VP-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr ir<%p>, ir<[[OFFSET]]> +; NO-VP-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> +; NO-VP-NEXT: WIDEN store vp<[[PTR2]]>, ir<[[V]]> +; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%.+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> +; NO-VP-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> +; NO-VP-NEXT: No successors +; NO-VP-NEXT: } entry: br label %loop
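
Note (illustrative, not part of the patch): the new non-power-of-2 path in MemoryDepChecker::couldPreventStoreLoadForward() scans candidate vector factors downwards and, when more than one dependence contributes a limit, folds the per-dependence results together with std::gcd. The standalone C++ sketch below models that arithmetic under my reading of the hunks above; largestSafeVFBytes and the concrete distances are invented for the example, and only the value 8 (NumItersForStoreLoadThroughMemory) and the "Distance % VF == 0 || Distance / VF >= 8" condition are taken from the patch.

// Simplified scalar model of the non-power-of-2 store-load forwarding clamp.
// Not the LAA implementation; an illustration of the selection arithmetic.
#include <cstdint>
#include <iostream>
#include <numeric> // std::gcd

constexpr uint64_t NumItersForStoreLoadThroughMemory = 8;

// Largest VF in bytes (a multiple of TypeByteSize, at most MaxBytes) for which
// a store->load dependence at 'Distance' bytes either stays aligned
// (Distance % VF == 0) or leaves enough iterations for the store to drain
// through memory before it is re-loaded.
uint64_t largestSafeVFBytes(uint64_t Distance, uint64_t TypeByteSize,
                            uint64_t MaxBytes) {
  for (uint64_t VF = MaxBytes; VF >= 2 * TypeByteSize; VF -= TypeByteSize)
    if (Distance % VF == 0 ||
        Distance / VF >= NumItersForStoreLoadThroughMemory)
      return VF;
  return 0; // No forwarding-friendly factor found.
}

int main() {
  const uint64_t TypeByteSize = 8; // i64 elements, as in the tests.
  const uint64_t MaxBytes = 16 * TypeByteSize;

  // Two hypothetical dependences with distances of 72 and 120 bytes.
  uint64_t SafeA = largestSafeVFBytes(72, TypeByteSize, MaxBytes);
  uint64_t SafeB = largestSafeVFBytes(120, TypeByteSize, MaxBytes);

  // Folding the per-dependence results with gcd keeps a clamp that divides
  // both distances, mirroring the std::gcd call in the patch.
  uint64_t Combined = std::gcd(SafeA, SafeB);
  std::cout << "SafeA=" << SafeA << " SafeB=" << SafeB
            << " Combined=" << Combined << " bytes\n";
  return 0;
}

Running this prints SafeA=72 SafeB=120 Combined=24 bytes, i.e. a 3-element, non-power-of-2 clamp that is compatible with both example distances.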
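A second note on the new IF-EVL check lines (the "icmp ult AVL, 9" / select / @llvm.experimental.get.vector.length sequence in test_may_clobber2): as I read them, the store-load forwarding distance is enforced at run time by clamping the available element count to the safe distance before requesting a vector length. The sketch below is a scalar C++ simulation of that idea, not RVV code; N, Dist and VLMax are example values (Dist mirrors the 9-element distance in the test), and the assert checks that the clamped loop reproduces the scalar loop.

// Scalar simulation of an EVL tail-folded loop with a safe-distance clamp.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  constexpr uint64_t N = 200;    // trip count, as in the test
  constexpr uint64_t Dist = 9;   // the store is 9 elements ahead of the load
  constexpr uint64_t VLMax = 16; // pretend hardware maximum, larger than Dist

  std::vector<int64_t> Ref(N + Dist), Vec;
  for (uint64_t I = 0; I < Ref.size(); ++I)
    Ref[I] = static_cast<int64_t>(I);
  Vec = Ref;

  // Scalar reference: for (i = 0; i < N; ++i) p[i + Dist] = p[i];
  for (uint64_t I = 0; I < N; ++I)
    Ref[I + Dist] = Ref[I];

  // EVL-style loop: clamp the requested length to the safe distance, then do
  // the whole "vector" load before the "vector" store.
  for (uint64_t I = 0; I < N;) {
    uint64_t AVL = N - I;
    uint64_t SafeAVL = std::min(AVL, Dist); // mirrors the icmp/select lines
    uint64_t VL = std::min(SafeAVL, VLMax); // what get.vector.length may grant
    int64_t Tmp[Dist];                      // VL <= Dist, so Dist slots suffice
    for (uint64_t L = 0; L < VL; ++L)
      Tmp[L] = Vec[I + L];
    for (uint64_t S = 0; S < VL; ++S)
      Vec[I + Dist + S] = Tmp[S];
    I += VL;
  }

  assert(Ref == Vec && "clamped vector loop must match the scalar loop");
  std::cout << "ok\n";
  return 0;
}

Because VL never exceeds Dist, the elements loaded in one vector step never overlap the elements stored in the same step, which is why the clamp keeps the backward dependence intact even for a non-power-of-2 distance.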