path: root/contrib/llvm-project/llvm/lib/Transforms/Vectorize
author     Dimitry Andric <dim@FreeBSD.org>   2022-07-04 19:20:19 +0000
committer  Dimitry Andric <dim@FreeBSD.org>   2023-02-08 19:02:26 +0000
commit     81ad626541db97eb356e2c1d4a20eb2a26a766ab (patch)
tree       311b6a8987c32b1e1dcbab65c54cfac3fdb56175 /contrib/llvm-project/llvm/lib/Transforms/Vectorize
parent     5fff09660e06a66bed6482da9c70df328e16bbb6 (diff)
parent     145449b1e420787bb99721a429341fa6be3adfb6 (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize')
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp            19
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp     133
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h         22
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp                2091
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp                4331
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h                  12
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp                        1161
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h                           592
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp              135
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h                 10
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanLoopInfo.h                    44
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp               248
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.h                  74
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp                  840
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp                       15
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp               114
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h                  16
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h                       24
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp                  55
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp                 371
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/Vectorize.cpp                       1
21 files changed, 6532 insertions, 3776 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 97c2acb7d4c7..f59fc3a6dd60 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -62,14 +62,13 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
@@ -497,7 +496,7 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
if (PtrDelta.urem(Stride) != 0)
return false;
unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
- APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
+ APInt IdxDiff = PtrDelta.udiv(Stride).zext(IdxBitWidth);
// Only look through a ZExt/SExt.
if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
@@ -1298,10 +1297,16 @@ bool Vectorizer::vectorizeLoadChain(
CV->replaceAllUsesWith(V);
}
- // Bitcast might not be an Instruction, if the value being loaded is a
- // constant. In that case, no need to reorder anything.
- if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
- reorder(BitcastInst);
+ // Since we might have opaque pointers we might end up using the pointer
+ // operand of the first load (wrt. memory loaded) for the vector load. Since
+ // this first load might not be the first in the block we potentially need to
+ // reorder the pointer operand (and its operands). If we have a bitcast though
+ // it might be before the load and should be the reorder start instruction.
+ // "Might" because for opaque pointers the "bitcast" is just the first loads
+ // pointer operand, as oppposed to something we inserted at the right position
+ // ourselves.
+ Instruction *BCInst = dyn_cast<Instruction>(Bitcast);
+ reorder((BCInst && BCInst != L0->getPointerOperand()) ? BCInst : LI);
eraseInstructions(Chain);
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 81e5aa223c07..6242d9a93fc1 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -17,7 +17,9 @@
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -31,8 +33,6 @@ using namespace PatternMatch;
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
-extern cl::opt<bool> EnableVPlanPredication;
-
static cl::opt<bool>
EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
cl::desc("Enable if-conversion during vectorization."));
@@ -439,6 +439,26 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
return false;
}
+/// Returns true if A and B have the same pointer operands or the same SCEV addresses
+static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A,
+ StoreInst *B) {
+ // Compare store
+ if (A == B)
+ return true;
+
+ // Otherwise compare pointers
+ Value *APtr = A->getPointerOperand();
+ Value *BPtr = B->getPointerOperand();
+ if (APtr == BPtr)
+ return true;
+
+ // Otherwise compare address SCEVs
+ if (SE->getSCEV(APtr) == SE->getSCEV(BPtr))
+ return true;
+
+ return false;
+}
+
int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
Value *Ptr) const {
const ValueToValueMap &Strides =
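The storeToSameAddress() helper added above treats two stores as writing the same location either when their pointer operands are the identical Value or when their SCEV expressions are equal. As a rough source-level illustration (a hypothetical example, not taken from the patch), the two stores below may end up with distinct pointer operands in unoptimized IR (one through a zero-index getelementptr) while their SCEVs are equal, so the helper would consider them stores to the same address:

void same_address(int *p, int a, int b) {
  *p = a;   // store through the raw pointer
  p[0] = b; // store through an equivalent zero-offset element access
}
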
@@ -487,7 +507,7 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() {
// FIXME: We skip these checks when VPlan predication is enabled as we
// want to allow divergent branches. This whole check will be removed
// once VPlan predication is on by default.
- if (!EnableVPlanPredication && Br && Br->isConditional() &&
+ if (Br && Br->isConditional() &&
!TheLoop->isLoopInvariant(Br->getCondition()) &&
!LI->isLoopHeader(Br->getSuccessor(0)) &&
!LI->isLoopHeader(Br->getSuccessor(1))) {
@@ -572,7 +592,7 @@ void LoopVectorizationLegality::addInductionPhi(
// on predicates that only hold within the loop, since allowing the exit
// currently means re-using this SCEV outside the loop (see PR33706 for more
// details).
- if (PSE.getUnionPredicate().isAlwaysTrue()) {
+ if (PSE.getPredicate().isAlwaysTrue()) {
AllowedExit.insert(Phi);
AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
}
@@ -676,7 +696,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
RecurrenceDescriptor RedDes;
if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
- DT)) {
+ DT, PSE.getSE())) {
Requirements->addExactFPMathInst(RedDes.getExactFPMathInst());
AllowedExit.insert(RedDes.getLoopExitInstr());
Reductions[Phi] = RedDes;
@@ -770,7 +790,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
auto *SE = PSE.getSE();
Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
for (unsigned i = 0, e = CI->arg_size(); i != e; ++i)
- if (hasVectorInstrinsicScalarOpd(IntrinID, i)) {
+ if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, i)) {
if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) {
reportVectorizationFailure("Found unvectorizable intrinsic",
"intrinsic instruction cannot be vectorized",
@@ -849,7 +869,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// used outside the loop only if the SCEV predicates within the loop is
// same as outside the loop. Allowing the exit means reusing the SCEV
// outside the loop.
- if (PSE.getUnionPredicate().isAlwaysTrue()) {
+ if (PSE.getPredicate().isAlwaysTrue()) {
AllowedExit.insert(&I);
continue;
}
@@ -911,15 +931,70 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
if (!LAI->canVectorizeMemory())
return false;
- if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
- reportVectorizationFailure("Stores to a uniform address",
- "write to a loop invariant address could not be vectorized",
- "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
- return false;
+ // We can vectorize stores to an invariant address when the final reduction
+ // value is guaranteed to be stored at the end of the loop. Also, if the
+ // decision to vectorize the loop is made, runtime checks are added to make
+ // sure that the invariant address won't alias with any other objects.
+ if (!LAI->getStoresToInvariantAddresses().empty()) {
+ // For each invariant address, check its last stored value is unconditional.
+ for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
+ if (isInvariantStoreOfReduction(SI) &&
+ blockNeedsPredication(SI->getParent())) {
+ reportVectorizationFailure(
+ "We don't allow storing to uniform addresses",
+ "write of conditional recurring variant value to a loop "
+ "invariant address could not be vectorized",
+ "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
+ return false;
+ }
+ }
+
+ if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
+ // For each invariant address, check its last stored value is the result
+ // of one of our reductions.
+ //
+ // We do not check if dependence with loads exists because they are
+ // currently rejected earlier in LoopAccessInfo::analyzeLoop. In case this
+ // behaviour changes we have to modify this code.
+ ScalarEvolution *SE = PSE.getSE();
+ SmallVector<StoreInst *, 4> UnhandledStores;
+ for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
+ if (isInvariantStoreOfReduction(SI)) {
+ // Earlier stores to this address are effectively dead code.
+ // With opaque pointers it is possible for one pointer to be used with
+ // different sizes of stored values:
+ // store i32 0, ptr %x
+ // store i8 0, ptr %x
+ // The latest store doesn't completely overwrite the first one in the
+ // example. That is why we have to make sure that the types of the
+ // stored values are the same.
+ // TODO: Check that the bitwidth of an unhandled store is smaller than the
+ // one that overwrites it and add a test.
+ erase_if(UnhandledStores, [SE, SI](StoreInst *I) {
+ return storeToSameAddress(SE, SI, I) &&
+ I->getValueOperand()->getType() ==
+ SI->getValueOperand()->getType();
+ });
+ continue;
+ }
+ UnhandledStores.push_back(SI);
+ }
+
+ bool IsOK = UnhandledStores.empty();
+ // TODO: we should also validate against InvariantMemSets.
+ if (!IsOK) {
+ reportVectorizationFailure(
+ "We don't allow storing to uniform addresses",
+ "write to a loop invariant address could not "
+ "be vectorized",
+ "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
+ return false;
+ }
+ }
}
Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
- PSE.addPredicate(LAI->getPSE().getUnionPredicate());
+ PSE.addPredicate(LAI->getPSE().getPredicate());
return true;
}
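
The change above allows vectorizing loops whose reduction result is stored, unconditionally, to a loop-invariant address on every iteration; only the final value is observable, so the store can be sunk out of the vector loop, and runtime checks guard against the invariant address aliasing other accesses. A minimal source-level sketch of the pattern being legalized (illustrative names, not from the patch):

void reduce_into(int *out, const int *a, int n) {
  int sum = 0;
  for (int i = 0; i < n; ++i) {
    sum += a[i];
    *out = sum; // unconditional store of the running reduction to an invariant address
  }
}

A conditional store of the recurring value (for example, storing only when a[i] is positive) would still be rejected by the predication check added at the top of this hunk.
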
@@ -949,6 +1024,26 @@ bool LoopVectorizationLegality::canVectorizeFPMath(
}));
}
+bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) {
+ return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
+ const RecurrenceDescriptor &RdxDesc = Reduction.second;
+ return RdxDesc.IntermediateStore == SI;
+ });
+}
+
+bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value *V) {
+ return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
+ const RecurrenceDescriptor &RdxDesc = Reduction.second;
+ if (!RdxDesc.IntermediateStore)
+ return false;
+
+ ScalarEvolution *SE = PSE.getSE();
+ Value *InvariantAddress = RdxDesc.IntermediateStore->getPointerOperand();
+ return V == InvariantAddress ||
+ SE->getSCEV(V) == SE->getSCEV(InvariantAddress);
+ });
+}
+
bool LoopVectorizationLegality::isInductionPhi(const Value *V) const {
Value *In0 = const_cast<Value *>(V);
PHINode *PN = dyn_cast_or_null<PHINode>(In0);
@@ -969,6 +1064,16 @@ LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode *Phi) const {
return nullptr;
}
+const InductionDescriptor *
+LoopVectorizationLegality::getPointerInductionDescriptor(PHINode *Phi) const {
+ if (!isInductionPhi(Phi))
+ return nullptr;
+ auto &ID = getInductionVars().find(Phi)->second;
+ if (ID.getKind() == InductionDescriptor::IK_PtrInduction)
+ return &ID;
+ return nullptr;
+}
+
bool LoopVectorizationLegality::isCastedInductionVariable(
const Value *V) const {
auto *Inst = dyn_cast<Instruction>(V);
@@ -1266,7 +1371,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
- if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
+ if (PSE.getPredicate().getComplexity() > SCEVThreshold) {
reportVectorizationFailure("Too many SCEV checks needed",
"Too many SCEV assumptions need to be made and checked at runtime",
"TooManySCEVRunTimeChecks", ORE, TheLoop);
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 71eb39a18d2f..0cb2032fa45a 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -25,6 +25,7 @@
#define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
#include "VPlan.h"
+#include "llvm/Support/InstructionCost.h"
namespace llvm {
@@ -59,7 +60,7 @@ class VPBuilder {
}
public:
- VPBuilder() {}
+ VPBuilder() = default;
/// Clear the insertion point: created instructions will not be inserted into
/// a block.
@@ -187,12 +188,16 @@ struct VectorizationFactor {
/// Cost of the loop with that width.
InstructionCost Cost;
- VectorizationFactor(ElementCount Width, InstructionCost Cost)
- : Width(Width), Cost(Cost) {}
+ /// Cost of the scalar loop.
+ InstructionCost ScalarCost;
+
+ VectorizationFactor(ElementCount Width, InstructionCost Cost,
+ InstructionCost ScalarCost)
+ : Width(Width), Cost(Cost), ScalarCost(ScalarCost) {}
/// Width 1 means no vectorization, cost 0 means uncomputed cost.
static VectorizationFactor Disabled() {
- return {ElementCount::getFixed(1), 0};
+ return {ElementCount::getFixed(1), 0, 0};
}
bool operator==(const VectorizationFactor &rhs) const {
@@ -298,8 +303,12 @@ public:
/// Generate the IR code for the body of the vectorized loop according to the
/// best selected \p VF, \p UF and VPlan \p BestPlan.
+ /// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue
+ /// vectorization re-using plans for both the main and epilogue vector loops.
+ /// It should be removed once the re-use issue has been fixed.
void executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan,
- InnerLoopVectorizer &LB, DominatorTree *DT);
+ InnerLoopVectorizer &LB, DominatorTree *DT,
+ bool IsEpilogueVectorization);
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void printPlans(raw_ostream &O);
@@ -319,6 +328,9 @@ public:
getDecisionAndClampRange(const std::function<bool(ElementCount)> &Predicate,
VFRange &Range);
+ /// Check if the number of runtime checks exceeds the threshold.
+ bool requiresTooManyRuntimeChecks() const;
+
protected:
/// Collect the instructions from the original loop that would be trivially
/// dead in the vectorized loop if generated.
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 51d2c6237af1..b637b2d5ddae 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -58,7 +58,6 @@
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
-#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
@@ -112,7 +111,6 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
@@ -144,10 +142,10 @@
#include <algorithm>
#include <cassert>
#include <cstdint>
-#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
+#include <map>
#include <memory>
#include <string>
#include <tuple>
@@ -346,13 +344,6 @@ cl::opt<bool> EnableVPlanNativePath(
cl::desc("Enable VPlan-native vectorization path with "
"support for outer loop vectorization."));
-// FIXME: Remove this switch once we have divergence analysis. Currently we
-// assume divergent non-backedge branches when this switch is true.
-cl::opt<bool> EnableVPlanPredication(
- "enable-vplan-predication", cl::init(false), cl::Hidden,
- cl::desc("Enable VPlan-native vectorization path predicator with "
- "support for outer loop vectorization."));
-
// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
@@ -481,7 +472,7 @@ public:
VPTransformState &State);
/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
- void fixVectorizedLoop(VPTransformState &State);
+ void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
// Return true if any runtime check is added.
bool areSafetyChecksAdded() { return AddedSafetyChecks; }
@@ -491,12 +482,6 @@ public:
/// new unrolled loop, where UF is the unroll factor.
using VectorParts = SmallVector<Value *, 2>;
- /// Vectorize a single first-order recurrence or pointer induction PHINode in
- /// a block. This method handles the induction variable canonicalization. It
- /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
- void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
- VPTransformState &State);
-
/// A helper function to scalarize a single Instruction in the innermost loop.
/// Generates a sequence of scalar instances for each lane between \p MinLane
/// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
@@ -506,13 +491,6 @@ public:
const VPIteration &Instance, bool IfPredicateInstr,
VPTransformState &State);
- /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
- /// is provided, the integer induction variable will first be truncated to
- /// the corresponding type. \p CanonicalIV is the scalar value generated for
- /// the canonical induction variable.
- void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def,
- VPTransformState &State, Value *CanonicalIV);
-
/// Construct the vector value of a scalarized value \p V one lane at a time.
void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
VPTransformState &State);
@@ -527,13 +505,8 @@ public:
ArrayRef<VPValue *> StoredValues,
VPValue *BlockInMask = nullptr);
- /// Set the debug location in the builder \p Ptr using the debug location in
- /// \p V. If \p Ptr is None then it uses the class member's Builder.
- void setDebugLocFromInst(const Value *V,
- Optional<IRBuilder<> *> CustomBuilder = None);
-
- /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
- void fixNonInductionPHIs(VPTransformState &State);
+ /// Fix the non-induction PHIs in \p Plan.
+ void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
/// Returns true if the reordering of FP operations is not allowed, but we are
/// able to vectorize with strict in-order reductions for the given RdxDesc.
@@ -546,17 +519,6 @@ public:
/// element.
virtual Value *getBroadcastInstrs(Value *V);
- /// Add metadata from one instruction to another.
- ///
- /// This includes both the original MDs from \p From and additional ones (\see
- /// addNewMetadata). Use this for *newly created* instructions in the vector
- /// loop.
- void addMetadata(Instruction *To, Instruction *From);
-
- /// Similar to the previous function but it adds the metadata to a
- /// vector of instructions.
- void addMetadata(ArrayRef<Value *> To, Instruction *From);
-
// Returns the resume value (bc.merge.rdx) for a reduction as
// generated by fixReduction.
PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
@@ -575,13 +537,9 @@ protected:
/// Set up the values of the IVs correctly when exiting the vector loop.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
- Value *CountRoundDown, Value *EndValue,
- BasicBlock *MiddleBlock);
-
- /// Introduce a conditional branch (on true, condition to be set later) at the
- /// end of the header=latch connecting it to itself (across the backedge) and
- /// to the exit block of \p L.
- void createHeaderBranch(Loop *L);
+ Value *VectorTripCount, Value *EndValue,
+ BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
+ VPlan &Plan);
/// Handle all cross-iteration phis in the header.
void fixCrossIterationPHIs(VPTransformState &State);
@@ -595,16 +553,9 @@ protected:
void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
/// Clear NSW/NUW flags from reduction instructions if necessary.
- void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
+ void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
VPTransformState &State);
- /// Fixup the LCSSA phi nodes in the unique exit block. This simply
- /// means we need to add the appropriate incoming value from the middle
- /// block as exiting edges from the scalar epilogue loop (if present) are
- /// already in place, and we exit the vector loop exclusively to the middle
- /// block.
- void fixLCSSAPHIs(VPTransformState &State);
-
/// Iteratively sink the scalarized operands of a predicated instruction into
/// the block that was created for it.
void sinkScalarOperands(Instruction *PredInst);
@@ -613,30 +564,11 @@ protected:
/// represented as.
void truncateToMinimalBitwidths(VPTransformState &State);
- /// Compute scalar induction steps. \p ScalarIV is the scalar induction
- /// variable on which to base the steps, \p Step is the size of the step, and
- /// \p EntryVal is the value from the original loop that maps to the steps.
- /// Note that \p EntryVal doesn't have to be an induction variable - it
- /// can also be a truncate instruction.
- void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
- const InductionDescriptor &ID, VPValue *Def,
- VPTransformState &State);
-
- /// Create a vector induction phi node based on an existing scalar one. \p
- /// EntryVal is the value from the original loop that maps to the vector phi
- /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
- /// truncate instruction, instead of widening the original IV, we widen a
- /// version of the IV truncated to \p EntryVal's type.
- void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
- Value *Step, Value *Start,
- Instruction *EntryVal, VPValue *Def,
- VPTransformState &State);
-
/// Returns (and creates if needed) the original loop trip count.
- Value *getOrCreateTripCount(Loop *NewLoop);
+ Value *getOrCreateTripCount(BasicBlock *InsertBlock);
/// Returns (and creates if needed) the trip count of the widened loop.
- Value *getOrCreateVectorTripCount(Loop *NewLoop);
+ Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
/// Returns a bitcasted value to the requested vector type.
/// Also handles bitcasts of vector<float> <-> vector<pointer> types.
@@ -645,33 +577,21 @@ protected:
/// Emit a bypass check to see if the vector trip count is zero, including if
/// it overflows.
- void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
+ void emitIterationCountCheck(BasicBlock *Bypass);
/// Emit a bypass check to see if all of the SCEV assumptions we've
/// had to make are correct. Returns the block containing the checks or
/// nullptr if no checks have been added.
- BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
+ BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
/// Emit bypass checks to check any memory assumptions we may have made.
/// Returns the block containing the checks or nullptr if no checks have been
/// added.
- BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
-
- /// Compute the transformed value of Index at offset StartValue using step
- /// StepValue.
- /// For integer induction, returns StartValue + Index * StepValue.
- /// For pointer induction, returns StartValue[Index * StepValue].
- /// FIXME: The newly created binary instructions should contain nsw/nuw
- /// flags, which can be found from the original scalar operations.
- Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
- const DataLayout &DL,
- const InductionDescriptor &ID,
- BasicBlock *VectorHeader) const;
+ BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
/// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
- /// vector loop preheader, middle block and scalar preheader. Also
- /// allocate a loop object for the new vector loop and return it.
- Loop *createVectorLoopSkeleton(StringRef Prefix);
+ /// vector loop preheader, middle block and scalar preheader.
+ void createVectorLoopSkeleton(StringRef Prefix);
/// Create new phi nodes for the induction variables to resume iteration count
/// in the scalar epilogue, from where the vectorized loop left off.
@@ -680,21 +600,12 @@ protected:
/// block, the \p AdditionalBypass pair provides information about the bypass
/// block and the end value on the edge from bypass to this loop.
void createInductionResumeValues(
- Loop *L,
std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
/// Complete the loop skeleton by adding debug MDs, creating appropriate
/// conditional branches in the middle block, preparing the builder and
- /// running the verifier. Take in the vector loop \p L as argument, and return
- /// the preheader of the completed vector loop.
- BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
-
- /// Add additional metadata to \p To that was not present on \p Orig.
- ///
- /// Currently this is used to add the noalias annotations based on the
- /// inserted memchecks. Use this for instructions that are *cloned* into the
- /// vector loop.
- void addNewMetadata(Instruction *To, const Instruction *Orig);
+ /// running the verifier. Return the preheader of the completed vector loop.
+ BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID);
/// Collect poison-generating recipes that may generate a poison value that is
/// used after vectorization, even when their operands are not poison. Those
@@ -741,13 +652,6 @@ protected:
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;
- /// LoopVersioning. It's only set up (non-null) if memchecks were
- /// used.
- ///
- /// This is currently only used to add no-alias metadata based on the
- /// memchecks. The actually versioning is performed manually.
- std::unique_ptr<LoopVersioning> LVer;
-
/// The vectorization SIMD factor to use. Each vector will have this many
/// vector elements.
ElementCount VF;
@@ -774,9 +678,6 @@ protected:
/// there can be multiple exiting edges reaching this block.
BasicBlock *LoopExitBlock;
- /// The vector loop body.
- BasicBlock *LoopVectorBody;
-
/// The scalar loop body.
BasicBlock *LoopScalarBody;
@@ -805,10 +706,6 @@ protected:
// so we can later fix-up the external users of the induction variables.
DenseMap<PHINode *, Value *> IVEndValues;
- // Vector of original scalar PHIs whose corresponding widened PHIs need to be
- // fixed up at the end of vector code generation.
- SmallVector<PHINode *, 8> OrigPHIsToFix;
-
/// BFI and PSI are used to check for profile guided size optimizations.
BlockFrequencyInfo *BFI;
ProfileSummaryInfo *PSI;
@@ -936,8 +833,7 @@ protected:
/// Emits an iteration count bypass check once for the main loop (when \p
/// ForEpilogue is false) and once for the epilogue loop (when \p
/// ForEpilogue is true).
- BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
- bool ForEpilogue);
+ BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
void printDebugTracesAtStart() override;
void printDebugTracesAtEnd() override;
};
@@ -956,7 +852,9 @@ public:
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
GeneratedRTChecks &Checks)
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI, LVL, CM, BFI, PSI, Checks) {}
+ EPI, LVL, CM, BFI, PSI, Checks) {
+ TripCount = EPI.TripCount;
+ }
/// Implements the interface for creating a vectorized skeleton using the
/// *epilogue loop* strategy (ie the second pass of vplan execution).
std::pair<BasicBlock *, Value *>
@@ -966,7 +864,7 @@ protected:
/// Emits an iteration count bypass check after the main vector loop has
/// finished to see if there are any iterations left to execute by either
/// the vector epilogue or the scalar epilogue.
- BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
+ BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
BasicBlock *Bypass,
BasicBlock *Insert);
void printDebugTracesAtStart() override;
@@ -993,31 +891,6 @@ static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
return I;
}
-void InnerLoopVectorizer::setDebugLocFromInst(
- const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
- IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
- if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
- const DILocation *DIL = Inst->getDebugLoc();
-
- // When a FSDiscriminator is enabled, we don't need to add the multiply
- // factors to the discriminators.
- if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
- !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
- // FIXME: For scalable vectors, assume vscale=1.
- auto NewDIL =
- DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
- if (NewDIL)
- B->SetCurrentDebugLocation(NewDIL.getValue());
- else
- LLVM_DEBUG(dbgs()
- << "Failed to create new discriminator: "
- << DIL->getFilename() << " Line: " << DIL->getLine());
- } else
- B->SetCurrentDebugLocation(DIL);
- } else
- B->SetCurrentDebugLocation(DebugLoc());
-}
-
/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
@@ -1059,7 +932,7 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
namespace llvm {
/// Return a value for Step multiplied by VF.
-Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
+Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
int64_t Step) {
assert(Ty->isIntegerTy() && "Expected an integer step");
Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
@@ -1067,12 +940,13 @@ Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
}
/// Return the runtime value for VF.
-Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
+Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
return VF.isScalable() ? B.CreateVScale(EC) : EC;
}
-static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
+static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
+ ElementCount VF) {
assert(FTy->isFloatingPointTy() && "Expected floating point type!");
Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
@@ -1119,14 +993,6 @@ static std::string getDebugLocString(const Loop *L) {
}
#endif
-void InnerLoopVectorizer::addNewMetadata(Instruction *To,
- const Instruction *Orig) {
- // If the loop was versioned with memchecks, add the corresponding no-alias
- // metadata.
- if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
- LVer->annotateInstWithNoAlias(To, Orig);
-}
-
void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
VPTransformState &State) {
@@ -1151,6 +1017,7 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
// handled.
if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
isa<VPInterleaveRecipe>(CurRec) ||
+ isa<VPScalarIVStepsRecipe>(CurRec) ||
isa<VPCanonicalIVPHIRecipe>(CurRec))
continue;
@@ -1176,10 +1043,10 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
for (VPRecipeBase &Recipe : *VPBB) {
if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
- Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
+ Instruction &UnderlyingInstr = WidenRec->getIngredient();
VPDef *AddrDef = WidenRec->getAddr()->getDef();
- if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
- Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
+ if (AddrDef && WidenRec->isConsecutive() &&
+ Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
collectPoisonGeneratingInstrsInBackwardSlice(
cast<VPRecipeBase>(AddrDef));
} else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
@@ -1206,20 +1073,6 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
}
}
-void InnerLoopVectorizer::addMetadata(Instruction *To,
- Instruction *From) {
- propagateMetadata(To, From);
- addNewMetadata(To, From);
-}
-
-void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
- Instruction *From) {
- for (Value *V : To) {
- if (Instruction *I = dyn_cast<Instruction>(V))
- addMetadata(I, From);
- }
-}
-
PHINode *InnerLoopVectorizer::getReductionResumeValue(
const RecurrenceDescriptor &RdxDesc) {
auto It = ReductionResumeValues.find(&RdxDesc);
@@ -1363,7 +1216,7 @@ public:
/// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
/// the IsOrdered flag of RdxDesc is set and we do not allow reordering
/// of FP operations.
- bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
+ bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
return !Hints->allowReordering() && RdxDesc.isOrdered();
}
@@ -1718,15 +1571,10 @@ private:
/// \return the maximized element count based on the targets vector
/// registers and the loop trip-count, but limited to a maximum safe VF.
/// This is a helper function of computeFeasibleMaxVF.
- /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
- /// issue that occurred on one of the buildbots which cannot be reproduced
- /// without having access to the properietary compiler (see comments on
- /// D98509). The issue is currently under investigation and this workaround
- /// will be removed as soon as possible.
ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
unsigned SmallestType,
unsigned WidestType,
- const ElementCount &MaxSafeVF,
+ ElementCount MaxSafeVF,
bool FoldTailByMasking);
/// \return the maximum legal scalable VF, based on the safe max number
@@ -2017,7 +1865,7 @@ public:
/// there is no vector code generation, the check blocks are removed
/// completely.
void Create(Loop *L, const LoopAccessInfo &LAI,
- const SCEVUnionPredicate &UnionPred) {
+ const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
BasicBlock *LoopHeader = L->getHeader();
BasicBlock *Preheader = L->getLoopPreheader();
@@ -2040,9 +1888,19 @@ public:
MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
"vector.memcheck");
- MemRuntimeCheckCond =
- addRuntimeChecks(MemCheckBlock->getTerminator(), L,
- RtPtrChecking.getChecks(), MemCheckExp);
+ auto DiffChecks = RtPtrChecking.getDiffChecks();
+ if (DiffChecks) {
+ MemRuntimeCheckCond = addDiffRuntimeChecks(
+ MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp,
+ [VF](IRBuilderBase &B, unsigned Bits) {
+ return getRuntimeVF(B, B.getIntNTy(Bits), VF);
+ },
+ IC);
+ } else {
+ MemRuntimeCheckCond =
+ addRuntimeChecks(MemCheckBlock->getTerminator(), L,
+ RtPtrChecking.getChecks(), MemCheckExp);
+ }
assert(MemRuntimeCheckCond &&
"no RT checks generated although RtPtrChecking "
"claimed checks are required");
@@ -2114,12 +1972,16 @@ public:
/// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
/// adjusts the branches to branch to the vector preheader or \p Bypass,
/// depending on the generated condition.
- BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
+ BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
BasicBlock *LoopVectorPreHeader,
BasicBlock *LoopExitBlock) {
if (!SCEVCheckCond)
return nullptr;
- if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
+
+ Value *Cond = SCEVCheckCond;
+ // Mark the check as used, to prevent it from being removed during cleanup.
+ SCEVCheckCond = nullptr;
+ if (auto *C = dyn_cast<ConstantInt>(Cond))
if (C->isZero())
return nullptr;
@@ -2138,18 +2000,15 @@ public:
DT->addNewBlock(SCEVCheckBlock, Pred);
DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
- ReplaceInstWithInst(
- SCEVCheckBlock->getTerminator(),
- BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
- // Mark the check as used, to prevent it from being removed during cleanup.
- SCEVCheckCond = nullptr;
+ ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
+ BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
return SCEVCheckBlock;
}
/// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
/// the branches to branch to the vector preheader or \p Bypass, depending on
/// the generated condition.
- BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
+ BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
BasicBlock *LoopVectorPreHeader) {
// Check if we generated code that checks in runtime if arrays overlap.
if (!MemRuntimeCheckCond)
@@ -2346,7 +2205,7 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
/// \p Opcode is relevant for FP induction variable.
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
Instruction::BinaryOps BinOp, ElementCount VF,
- IRBuilder<> &Builder) {
+ IRBuilderBase &Builder) {
assert(VF.isVector() && "only vector VFs are supported");
// Create and check the types.
@@ -2362,9 +2221,8 @@ static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
// Create a vector of consecutive numbers from zero to VF.
VectorType *InitVecValVTy = ValVTy;
- Type *InitVecValSTy = STy;
if (STy->isFloatingPointTy()) {
- InitVecValSTy =
+ Type *InitVecValSTy =
IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
}
@@ -2394,198 +2252,12 @@ static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}
-void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
- const InductionDescriptor &II, Value *Step, Value *Start,
- Instruction *EntryVal, VPValue *Def, VPTransformState &State) {
- IRBuilder<> &Builder = State.Builder;
- assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
- "Expected either an induction phi-node or a truncate of it!");
-
- // Construct the initial value of the vector IV in the vector loop preheader
- auto CurrIP = Builder.saveIP();
- Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
- if (isa<TruncInst>(EntryVal)) {
- assert(Start->getType()->isIntegerTy() &&
- "Truncation requires an integer type");
- auto *TruncType = cast<IntegerType>(EntryVal->getType());
- Step = Builder.CreateTrunc(Step, TruncType);
- Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
- }
-
- Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
- Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
- Value *SteppedStart = getStepVector(
- SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder);
-
- // We create vector phi nodes for both integer and floating-point induction
- // variables. Here, we determine the kind of arithmetic we will perform.
- Instruction::BinaryOps AddOp;
- Instruction::BinaryOps MulOp;
- if (Step->getType()->isIntegerTy()) {
- AddOp = Instruction::Add;
- MulOp = Instruction::Mul;
- } else {
- AddOp = II.getInductionOpcode();
- MulOp = Instruction::FMul;
- }
-
- // Multiply the vectorization factor by the step using integer or
- // floating-point arithmetic as appropriate.
- Type *StepType = Step->getType();
- Value *RuntimeVF;
- if (Step->getType()->isFloatingPointTy())
- RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
- else
- RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
- Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
-
- // Create a vector splat to use in the induction update.
- //
- // FIXME: If the step is non-constant, we create the vector splat with
- // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
- // handle a constant vector splat.
- Value *SplatVF = isa<Constant>(Mul)
- ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
- : Builder.CreateVectorSplat(State.VF, Mul);
- Builder.restoreIP(CurrIP);
-
- // We may need to add the step a number of times, depending on the unroll
- // factor. The last of those goes into the PHI.
- PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
- &*LoopVectorBody->getFirstInsertionPt());
- VecInd->setDebugLoc(EntryVal->getDebugLoc());
- Instruction *LastInduction = VecInd;
- for (unsigned Part = 0; Part < UF; ++Part) {
- State.set(Def, LastInduction, Part);
-
- if (isa<TruncInst>(EntryVal))
- addMetadata(LastInduction, EntryVal);
-
- LastInduction = cast<Instruction>(
- Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
- LastInduction->setDebugLoc(EntryVal->getDebugLoc());
- }
-
- // Move the last step to the end of the latch block. This ensures consistent
- // placement of all induction updates.
- auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
- auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
- LastInduction->moveBefore(Br);
- LastInduction->setName("vec.ind.next");
-
- VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
- VecInd->addIncoming(LastInduction, LoopVectorLatch);
-}
-
-void InnerLoopVectorizer::widenIntOrFpInduction(
- PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State,
- Value *CanonicalIV) {
- Value *Start = Def->getStartValue()->getLiveInIRValue();
- const InductionDescriptor &ID = Def->getInductionDescriptor();
- TruncInst *Trunc = Def->getTruncInst();
- IRBuilder<> &Builder = State.Builder;
- assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
- assert(!State.VF.isZero() && "VF must be non-zero");
-
- // The value from the original loop to which we are mapping the new induction
- // variable.
- Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
-
- auto &DL = EntryVal->getModule()->getDataLayout();
-
- // Generate code for the induction step. Note that induction steps are
- // required to be loop-invariant
- auto CreateStepValue = [&](const SCEV *Step) -> Value * {
- assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
- "Induction step should be loop invariant");
- if (PSE.getSE()->isSCEVable(IV->getType())) {
- SCEVExpander Exp(*PSE.getSE(), DL, "induction");
- return Exp.expandCodeFor(Step, Step->getType(),
- State.CFG.VectorPreHeader->getTerminator());
- }
- return cast<SCEVUnknown>(Step)->getValue();
- };
-
- // The scalar value to broadcast. This is derived from the canonical
- // induction variable. If a truncation type is given, truncate the canonical
- // induction variable and step. Otherwise, derive these values from the
- // induction descriptor.
- auto CreateScalarIV = [&](Value *&Step) -> Value * {
- Value *ScalarIV = CanonicalIV;
- Type *NeededType = IV->getType();
- if (!Def->isCanonical() || ScalarIV->getType() != NeededType) {
- ScalarIV =
- NeededType->isIntegerTy()
- ? Builder.CreateSExtOrTrunc(ScalarIV, NeededType)
- : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType);
- ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID,
- State.CFG.PrevBB);
- ScalarIV->setName("offset.idx");
- }
- if (Trunc) {
- auto *TruncType = cast<IntegerType>(Trunc->getType());
- assert(Step->getType()->isIntegerTy() &&
- "Truncation requires an integer step");
- ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
- Step = Builder.CreateTrunc(Step, TruncType);
- }
- return ScalarIV;
- };
-
- // Fast-math-flags propagate from the original induction instruction.
- IRBuilder<>::FastMathFlagGuard FMFG(Builder);
- if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
- Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
-
- // Now do the actual transformations, and start with creating the step value.
- Value *Step = CreateStepValue(ID.getStep());
- if (State.VF.isScalar()) {
- Value *ScalarIV = CreateScalarIV(Step);
- Type *ScalarTy = IntegerType::get(ScalarIV->getContext(),
- Step->getType()->getScalarSizeInBits());
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *StartIdx = ConstantInt::get(ScalarTy, Part);
- Value *EntryPart;
- if (Step->getType()->isFloatingPointTy()) {
- StartIdx = Builder.CreateUIToFP(StartIdx, Step->getType());
- Value *MulOp = Builder.CreateFMul(StartIdx, Step);
- EntryPart = Builder.CreateBinOp(ID.getInductionOpcode(), ScalarIV,
- MulOp, "induction");
- } else {
- EntryPart = Builder.CreateAdd(
- ScalarIV, Builder.CreateMul(StartIdx, Step), "induction");
- }
- State.set(Def, EntryPart, Part);
- if (Trunc) {
- assert(!Step->getType()->isFloatingPointTy() &&
- "fp inductions shouldn't be truncated");
- addMetadata(EntryPart, Trunc);
- }
- }
- return;
- }
-
- // Create a new independent vector induction variable, if one is needed.
- if (Def->needsVectorIV())
- createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
-
- if (Def->needsScalarIV()) {
- // Create scalar steps that can be used by instructions we will later
- // scalarize. Note that the addition of the scalar steps will not increase
- // the number of instructions in the loop in the common case prior to
- // InstCombine. We will be trading one vector extract for each scalar step.
- Value *ScalarIV = CreateScalarIV(Step);
- buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
- }
-}
-
-void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
- Instruction *EntryVal,
- const InductionDescriptor &ID,
- VPValue *Def,
- VPTransformState &State) {
- IRBuilder<> &Builder = State.Builder;
+/// Compute scalar induction steps. \p ScalarIV is the scalar induction
+/// variable on which to base the steps, \p Step is the size of the step.
+static void buildScalarSteps(Value *ScalarIV, Value *Step,
+ const InductionDescriptor &ID, VPValue *Def,
+ VPTransformState &State) {
+ IRBuilderBase &Builder = State.Builder;
// We shouldn't have to build scalar steps if we aren't vectorizing.
assert(State.VF.isVector() && "VF should be greater than one");
// Get the value type and ensure it and the step have the same integer type.
@@ -2656,6 +2328,103 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
}
}
+// Generate code for the induction step. Note that induction steps are
+// required to be loop-invariant
+static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
+ Instruction *InsertBefore,
+ Loop *OrigLoop = nullptr) {
+ const DataLayout &DL = SE.getDataLayout();
+ assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
+ "Induction step should be loop invariant");
+ if (auto *E = dyn_cast<SCEVUnknown>(Step))
+ return E->getValue();
+
+ SCEVExpander Exp(SE, DL, "induction");
+ return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
+}
+
+/// Compute the transformed value of Index at offset StartValue using step
+/// StepValue.
+/// For integer induction, returns StartValue + Index * StepValue.
+/// For pointer induction, returns StartValue[Index * StepValue].
+/// FIXME: The newly created binary instructions should contain nsw/nuw
+/// flags, which can be found from the original scalar operations.
+static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
+ Value *StartValue, Value *Step,
+ const InductionDescriptor &ID) {
+ assert(Index->getType()->getScalarType() == Step->getType() &&
+ "Index scalar type does not match StepValue type");
+
+ // Note: the IR at this point is broken. We cannot use SE to create any new
+ // SCEV and then expand it, hoping that SCEV's simplification will give us
+ // more optimal code. Unfortunately, attempting to do so on invalid IR may
+ // lead to various SCEV crashes. So all we can do is to use builder and rely
+ // on InstCombine for future simplifications. Here we handle some trivial
+ // cases only.
+ auto CreateAdd = [&B](Value *X, Value *Y) {
+ assert(X->getType() == Y->getType() && "Types don't match!");
+ if (auto *CX = dyn_cast<ConstantInt>(X))
+ if (CX->isZero())
+ return Y;
+ if (auto *CY = dyn_cast<ConstantInt>(Y))
+ if (CY->isZero())
+ return X;
+ return B.CreateAdd(X, Y);
+ };
+
+ // We allow X to be a vector type, in which case Y will potentially be
+ // splatted into a vector with the same element count.
+ auto CreateMul = [&B](Value *X, Value *Y) {
+ assert(X->getType()->getScalarType() == Y->getType() &&
+ "Types don't match!");
+ if (auto *CX = dyn_cast<ConstantInt>(X))
+ if (CX->isOne())
+ return Y;
+ if (auto *CY = dyn_cast<ConstantInt>(Y))
+ if (CY->isOne())
+ return X;
+ VectorType *XVTy = dyn_cast<VectorType>(X->getType());
+ if (XVTy && !isa<VectorType>(Y->getType()))
+ Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
+ return B.CreateMul(X, Y);
+ };
+
+ switch (ID.getKind()) {
+ case InductionDescriptor::IK_IntInduction: {
+ assert(!isa<VectorType>(Index->getType()) &&
+ "Vector indices not supported for integer inductions yet");
+ assert(Index->getType() == StartValue->getType() &&
+ "Index type does not match StartValue type");
+ if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
+ return B.CreateSub(StartValue, Index);
+ auto *Offset = CreateMul(Index, Step);
+ return CreateAdd(StartValue, Offset);
+ }
+ case InductionDescriptor::IK_PtrInduction: {
+ assert(isa<Constant>(Step) &&
+ "Expected constant step for pointer induction");
+ return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
+ }
+ case InductionDescriptor::IK_FpInduction: {
+ assert(!isa<VectorType>(Index->getType()) &&
+ "Vector indices not supported for FP inductions yet");
+ assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
+ auto InductionBinOp = ID.getInductionBinOp();
+ assert(InductionBinOp &&
+ (InductionBinOp->getOpcode() == Instruction::FAdd ||
+ InductionBinOp->getOpcode() == Instruction::FSub) &&
+ "Original bin op should be defined for FP induction");
+
+ Value *MulExp = B.CreateFMul(Step, Index);
+ return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
+ "induction");
+ }
+ case InductionDescriptor::IK_NoInduction:
+ return nullptr;
+ }
+ llvm_unreachable("invalid enum");
+}
+
void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
const VPIteration &Instance,
VPTransformState &State) {
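
The emitTransformedIndex() helper introduced in the hunk above maps a canonical index to the value of a given induction at that index. As plain scalar code, and ignoring the FP case, the two common kinds reduce to the following (illustrative functions, not from the patch):

// Integer induction: StartValue + Index * Step.
int transformedIntIndex(int StartValue, int Index, int Step) {
  return StartValue + Index * Step;
}

// Pointer induction: &StartValue[Index * Step], i.e. a GEP by Index * Step elements.
int *transformedPtrIndex(int *StartValue, int Index, int Step) {
  return StartValue + Index * Step;
}
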
@@ -2738,7 +2507,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
for (unsigned Part = 0; Part < UF; Part++) {
Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
- setDebugLocFromInst(AddrPart);
+ State.setDebugLocFromInst(AddrPart);
// Notice current instruction could be any index. Need to adjust the address
// to the member of index 0.
@@ -2764,7 +2533,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
}
- setDebugLocFromInst(Instr);
+ State.setDebugLocFromInst(Instr);
Value *PoisonVec = PoisonValue::get(VecTy);
Value *MaskForGaps = nullptr;
@@ -2919,8 +2688,6 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
if (!Instance.isFirstIteration())
return;
- setDebugLocFromInst(Instr);
-
// Does this instruction return a value ?
bool IsVoidRetTy = Instr->getType()->isVoidTy();
@@ -2937,21 +2704,23 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
Cloned->dropPoisonGeneratingFlags();
- State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
- Builder.GetInsertPoint());
+ if (Instr->getDebugLoc())
+ State.setDebugLocFromInst(Instr);
+
// Replace the operands of the cloned instructions with their scalar
// equivalents in the new loop.
for (auto &I : enumerate(RepRecipe->operands())) {
auto InputInstance = Instance;
VPValue *Operand = I.value();
- if (State.Plan->isUniformAfterVectorization(Operand))
+ VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand);
+ if (OperandR && OperandR->isUniform())
InputInstance.Lane = VPLane::getFirstLane();
Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
}
- addNewMetadata(Cloned, Instr);
+ State.addNewMetadata(Cloned, Instr);
// Place the cloned scalar in the new loop.
- Builder.Insert(Cloned);
+ State.Builder.Insert(Cloned);
State.set(RepRecipe, Cloned, Instance);
@@ -2964,29 +2733,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
PredicatedInstructions.push_back(Cloned);
}
-void InnerLoopVectorizer::createHeaderBranch(Loop *L) {
- BasicBlock *Header = L->getHeader();
- assert(!L->getLoopLatch() && "loop should not have a latch at this point");
-
- IRBuilder<> B(Header->getTerminator());
- Instruction *OldInst =
- getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
- setDebugLocFromInst(OldInst, &B);
-
- // Connect the header to the exit and header blocks and replace the old
- // terminator.
- B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header);
-
- // Now we have two terminators. Remove the old one from the block.
- Header->getTerminator()->eraseFromParent();
-}
-
-Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
+Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
if (TripCount)
return TripCount;
- assert(L && "Create Trip Count for null loop.");
- IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
+ assert(InsertBlock);
+ IRBuilder<> Builder(InsertBlock->getTerminator());
// Find the loop boundaries.
ScalarEvolution *SE = PSE.getSE();
const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
@@ -3010,7 +2762,7 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
const SCEV *ExitCount = SE->getAddExpr(
BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
// Expand the trip count and place the new instructions in the preheader.
// Notice that the pre-header does not change, only the loop body.
@@ -3018,22 +2770,23 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
// Count holds the overall loop count (N).
TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
- L->getLoopPreheader()->getTerminator());
+ InsertBlock->getTerminator());
if (TripCount->getType()->isPointerTy())
TripCount =
CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
- L->getLoopPreheader()->getTerminator());
+ InsertBlock->getTerminator());
return TripCount;
}
-Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
+Value *
+InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
if (VectorTripCount)
return VectorTripCount;
- Value *TC = getOrCreateTripCount(L);
- IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
+ Value *TC = getOrCreateTripCount(InsertBlock);
+ IRBuilder<> Builder(InsertBlock->getTerminator());
Type *Ty = TC->getType();
// This is where we can make the step a runtime constant.
@@ -3045,6 +2798,8 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
// overflows: the vector induction variable will eventually wrap to zero given
// that it starts at zero and its Step is a power of two; the loop will then
// exit, with the last early-exit vector comparison also producing all-true.
+ // For scalable vectors the VF is not guaranteed to be a power of 2, but this
+ // is accounted for in emitIterationCountCheck that adds an overflow check.
if (Cost->foldTailByMasking()) {
assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
"VF*UF must be a power of 2 when folding tail by masking");
@@ -3107,9 +2862,8 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
}
-void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
- BasicBlock *Bypass) {
- Value *Count = getOrCreateTripCount(L);
+void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
+ Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
// Reuse existing vector loop preheader for TC checks.
// Note that new preheader block is generated for vector loop.
BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
@@ -3124,10 +2878,23 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
: ICmpInst::ICMP_ULT;
// If tail is to be folded, vector loop takes care of all iterations.
+ Type *CountTy = Count->getType();
Value *CheckMinIters = Builder.getFalse();
- if (!Cost->foldTailByMasking()) {
- Value *Step = createStepForVF(Builder, Count->getType(), VF, UF);
+ Value *Step = createStepForVF(Builder, CountTy, VF, UF);
+ if (!Cost->foldTailByMasking())
CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
+ else if (VF.isScalable()) {
+ // vscale is not necessarily a power-of-2, which means we cannot guarantee
+ // an overflow to zero when updating induction variables and so an
+ // additional overflow check is required before entering the vector loop.
+
+ // Get the maximum unsigned value for the type.
+ Value *MaxUIntTripCount =
+ ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
+ Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
+
+ // Don't execute the vector loop if (UMax - n) < (VF * UF).
+ CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step);
}
// Create new preheader for vector loop.
LoopVectorPreHeader =
@@ -3152,10 +2919,10 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
LoopBypassBlocks.push_back(TCCheckBlock);
}
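A hedged sketch of the overflow guard added above for scalable VFs; uint32_t stands in for the induction variable type, and the example trip counts are assumptions, not values from the patch.

#include <cassert>
#include <cstdint>
#include <limits>

// Returns true when the vector loop must be bypassed because rounding the
// trip count up to a multiple of Step = VF * UF could wrap the counter type.
bool mustBypassVectorLoop(uint32_t Count, uint32_t Step) {
  uint32_t MaxUInt = std::numeric_limits<uint32_t>::max();
  // Don't execute the vector loop if (UMax - n) < (VF * UF).
  return (MaxUInt - Count) < Step;
}

int main() {
  assert(!mustBypassVectorLoop(1000u, 12u));      // plenty of headroom
  assert(mustBypassVectorLoop(0xFFFFFFF8u, 12u)); // n + Step would wrap i32
  return 0;
}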
-BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
+BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
BasicBlock *const SCEVCheckBlock =
- RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
+ RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
if (!SCEVCheckBlock)
return nullptr;
@@ -3180,14 +2947,13 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
return SCEVCheckBlock;
}
-BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
- BasicBlock *Bypass) {
+BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
// VPlan-native path does not do any analysis for runtime checks currently.
if (EnableVPlanNativePath)
return nullptr;
BasicBlock *const MemCheckBlock =
- RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
+ RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
// Check if we generated code that checks in runtime if arrays overlap. We put
// the checks into a separate block to make the more common case of few
@@ -3201,7 +2967,8 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
"to vectorize.");
ORE->emit([&]() {
return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
- L->getStartLoc(), L->getHeader())
+ OrigLoop->getStartLoc(),
+ OrigLoop->getHeader())
<< "Code-size may be reduced by not forcing "
"vectorization, or by source-code modifications "
"eliminating the need for runtime checks "
@@ -3213,116 +2980,10 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
AddedSafetyChecks = true;
- // We currently don't use LoopVersioning for the actual loop cloning but we
- // still use it to add the noalias metadata.
- LVer = std::make_unique<LoopVersioning>(
- *Legal->getLAI(),
- Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
- DT, PSE.getSE());
- LVer->prepareNoAliasMetadata();
return MemCheckBlock;
}
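For context, the general shape of the pairwise overlap test that such runtime checks encode; the real checks are produced by LoopAccessInfo's RuntimePointerChecking, so this is only a simplified sketch with assumed byte ranges.

#include <cassert>
#include <cstdint>

// Two accessed byte ranges [StartA, EndA) and [StartB, EndB) may be
// vectorized against each other only if they are disjoint.
bool noOverlap(uintptr_t StartA, uintptr_t EndA,
               uintptr_t StartB, uintptr_t EndB) {
  return EndA <= StartB || EndB <= StartA;
}

int main() {
  assert(noOverlap(0x1000, 0x1100, 0x1100, 0x1200));  // adjacent, disjoint
  assert(!noOverlap(0x1000, 0x1100, 0x10F0, 0x1200)); // overlapping
  return 0;
}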
-Value *InnerLoopVectorizer::emitTransformedIndex(
- IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
- const InductionDescriptor &ID, BasicBlock *VectorHeader) const {
-
- SCEVExpander Exp(*SE, DL, "induction");
- auto Step = ID.getStep();
- auto StartValue = ID.getStartValue();
- assert(Index->getType()->getScalarType() == Step->getType() &&
- "Index scalar type does not match StepValue type");
-
- // Note: the IR at this point is broken. We cannot use SE to create any new
- // SCEV and then expand it, hoping that SCEV's simplification will give us
- // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
- // lead to various SCEV crashes. So all we can do is to use builder and rely
- // on InstCombine for future simplifications. Here we handle some trivial
- // cases only.
- auto CreateAdd = [&B](Value *X, Value *Y) {
- assert(X->getType() == Y->getType() && "Types don't match!");
- if (auto *CX = dyn_cast<ConstantInt>(X))
- if (CX->isZero())
- return Y;
- if (auto *CY = dyn_cast<ConstantInt>(Y))
- if (CY->isZero())
- return X;
- return B.CreateAdd(X, Y);
- };
-
- // We allow X to be a vector type, in which case Y will potentially be
- // splatted into a vector with the same element count.
- auto CreateMul = [&B](Value *X, Value *Y) {
- assert(X->getType()->getScalarType() == Y->getType() &&
- "Types don't match!");
- if (auto *CX = dyn_cast<ConstantInt>(X))
- if (CX->isOne())
- return Y;
- if (auto *CY = dyn_cast<ConstantInt>(Y))
- if (CY->isOne())
- return X;
- VectorType *XVTy = dyn_cast<VectorType>(X->getType());
- if (XVTy && !isa<VectorType>(Y->getType()))
- Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
- return B.CreateMul(X, Y);
- };
-
- // Get a suitable insert point for SCEV expansion. For blocks in the vector
- // loop, choose the end of the vector loop header (=VectorHeader), because
- // the DomTree is not kept up-to-date for additional blocks generated in the
- // vector loop. By using the header as insertion point, we guarantee that the
- // expanded instructions dominate all their uses.
- auto GetInsertPoint = [this, &B, VectorHeader]() {
- BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
- if (InsertBB != LoopVectorBody &&
- LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB))
- return VectorHeader->getTerminator();
- return &*B.GetInsertPoint();
- };
-
- switch (ID.getKind()) {
- case InductionDescriptor::IK_IntInduction: {
- assert(!isa<VectorType>(Index->getType()) &&
- "Vector indices not supported for integer inductions yet");
- assert(Index->getType() == StartValue->getType() &&
- "Index type does not match StartValue type");
- if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
- return B.CreateSub(StartValue, Index);
- auto *Offset = CreateMul(
- Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
- return CreateAdd(StartValue, Offset);
- }
- case InductionDescriptor::IK_PtrInduction: {
- assert(isa<SCEVConstant>(Step) &&
- "Expected constant step for pointer induction");
- return B.CreateGEP(
- ID.getElementType(), StartValue,
- CreateMul(Index,
- Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
- GetInsertPoint())));
- }
- case InductionDescriptor::IK_FpInduction: {
- assert(!isa<VectorType>(Index->getType()) &&
- "Vector indices not supported for FP inductions yet");
- assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
- auto InductionBinOp = ID.getInductionBinOp();
- assert(InductionBinOp &&
- (InductionBinOp->getOpcode() == Instruction::FAdd ||
- InductionBinOp->getOpcode() == Instruction::FSub) &&
- "Original bin op should be defined for FP induction");
-
- Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
- Value *MulExp = B.CreateFMul(StepValue, Index);
- return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
- "induction");
- }
- case InductionDescriptor::IK_NoInduction:
- return nullptr;
- }
- llvm_unreachable("invalid enum");
-}
-
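A minimal sketch, under assumed scalar types, of the index transformation the removed helper computed (and which its replacement still computes): the induction value after Index steps of stride Step from Start. Function names here are illustrative, not LLVM API.

#include <cassert>

// Value of an integer induction after Index steps of stride Step from Start
// (the Step == -1 case mirrors the StartValue - Index special case above).
long transformedIntIndex(long Start, long Index, long Step) {
  if (Step == -1)
    return Start - Index;
  return Start + Index * Step;
}

// Pointer inductions do the same arithmetic via a GEP over the element type.
int *transformedPtrIndex(int *Start, long Index, long Step) {
  return Start + Index * Step;
}

// FP inductions reuse the original FAdd/FSub; shown here for FAdd only.
double transformedFpIndex(double Start, double Index, double Step) {
  return Start + Index * Step;
}

int main() {
  assert(transformedIntIndex(5, 3, 2) == 11);
  assert(transformedIntIndex(10, 4, -1) == 6);
  int Buf[16];
  assert(transformedPtrIndex(Buf, 4, 2) == &Buf[8]);
  assert(transformedFpIndex(1.0, 4.0, 0.5) == 3.0);
  return 0;
}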
-Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
+void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
LoopScalarBody = OrigLoop->getHeader();
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
assert(LoopVectorPreHeader && "Invalid loop structure");
@@ -3354,43 +3015,24 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
- // We intentionally don't let SplitBlock to update LoopInfo since
- // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
- // LoopVectorBody is explicitly added to the correct place few lines later.
- LoopVectorBody =
- SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
- nullptr, nullptr, Twine(Prefix) + "vector.body");
-
- // Update dominator for loop exit.
+ // Update dominator for loop exit. During skeleton creation, only the vector
+ // pre-header and the middle block are created. The vector loop is entirely
+ // created during VPlan execution.
if (!Cost->requiresScalarEpilogue(VF))
// If there is an epilogue which must run, there's no edge from the
// middle block to exit blocks and thus no need to update the immediate
// dominator of the exit blocks.
DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
-
- // Create and register the new vector loop.
- Loop *Lp = LI->AllocateLoop();
- Loop *ParentLoop = OrigLoop->getParentLoop();
-
- // Insert the new loop into the loop nest and register the new basic blocks
- // before calling any utilities such as SCEV that require valid LoopInfo.
- if (ParentLoop) {
- ParentLoop->addChildLoop(Lp);
- } else {
- LI->addTopLevelLoop(Lp);
- }
- Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
- return Lp;
}
void InnerLoopVectorizer::createInductionResumeValues(
- Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) {
+ std::pair<BasicBlock *, Value *> AdditionalBypass) {
assert(((AdditionalBypass.first && AdditionalBypass.second) ||
(!AdditionalBypass.first && !AdditionalBypass.second)) &&
"Inconsistent information about additional bypass.");
- Value *VectorTripCount = getOrCreateVectorTripCount(L);
- assert(VectorTripCount && L && "Expected valid arguments");
+ Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
+ assert(VectorTripCount && "Expected valid arguments");
// We are going to resume the execution of the scalar loop.
// Go over all of the induction variables that we found and fix the
// PHIs that are left in the scalar version of the loop.
@@ -3403,19 +3045,13 @@ void InnerLoopVectorizer::createInductionResumeValues(
PHINode *OrigPhi = InductionEntry.first;
InductionDescriptor II = InductionEntry.second;
- // Create phi nodes to merge from the backedge-taken check block.
- PHINode *BCResumeVal =
- PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
- LoopScalarPreHeader->getTerminator());
- // Copy original phi DL over to the new one.
- BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
Value *&EndValue = IVEndValues[OrigPhi];
Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
if (OrigPhi == OldInduction) {
// We know what the end value is.
EndValue = VectorTripCount;
} else {
- IRBuilder<> B(L->getLoopPreheader()->getTerminator());
+ IRBuilder<> B(LoopVectorPreHeader->getTerminator());
// Fast-math-flags propagate from the original induction instruction.
if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
@@ -3424,10 +3060,10 @@ void InnerLoopVectorizer::createInductionResumeValues(
Type *StepType = II.getStep()->getType();
Instruction::CastOps CastOp =
CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
- Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
- const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
- EndValue =
- emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
+ Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc");
+ Value *Step =
+ CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
+ EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
EndValue->setName("ind.end");
// Compute the end value for the additional bypass (if applicable).
@@ -3435,13 +3071,23 @@ void InnerLoopVectorizer::createInductionResumeValues(
B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
StepType, true);
- CRD =
- B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
+ Value *Step =
+ CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
+ VTC =
+ B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc");
EndValueFromAdditionalBypass =
- emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
+ emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
EndValueFromAdditionalBypass->setName("ind.end");
}
}
+
+ // Create phi nodes to merge from the backedge-taken check block.
+ PHINode *BCResumeVal =
+ PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
+ LoopScalarPreHeader->getTerminator());
+ // Copy original phi DL over to the new one.
+ BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
+
// The new PHI merges the original incoming value, in case of a bypass,
// or the value at the end of the vectorized loop.
BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
@@ -3460,13 +3106,10 @@ void InnerLoopVectorizer::createInductionResumeValues(
}
}
-BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
- MDNode *OrigLoopID) {
- assert(L && "Expected valid loop.");
-
+BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) {
// The trip counts should be cached by now.
- Value *Count = getOrCreateTripCount(L);
- Value *VectorTripCount = getOrCreateVectorTripCount(L);
+ Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
+ Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
@@ -3491,14 +3134,8 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
}
- // Get ready to start creating new instructions into the vectorized body.
- assert(LoopVectorPreHeader == L->getLoopPreheader() &&
- "Inconsistent vector loop preheader");
- Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
-
#ifdef EXPENSIVE_CHECKS
assert(DT->verify(DominatorTree::VerificationLevel::Fast));
- LI->verify(*DT);
#endif
return LoopVectorPreHeader;
@@ -3521,7 +3158,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() {
|/ |
| v
| [ ] \
- | [ ]_| <-- vector loop.
+ | [ ]_| <-- vector loop (created during VPlan execution).
| |
| v
\ -[ ] <--- middle-block.
@@ -3548,34 +3185,32 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// simply happens to be prone to hitting this in practice. In theory, we
// can hit the same issue for any SCEV, or ValueTracking query done during
// mutation. See PR49900.
- getOrCreateTripCount(OrigLoop);
+ getOrCreateTripCount(OrigLoop->getLoopPreheader());
// Create an empty vector loop, and prepare basic blocks for the runtime
// checks.
- Loop *Lp = createVectorLoopSkeleton("");
+ createVectorLoopSkeleton("");
// Now, compare the new count to zero. If it is zero skip the vector loop and
// jump to the scalar loop. This check also covers the case where the
// backedge-taken count is uint##_max: adding one to it will overflow leading
// to an incorrect trip count of zero. In this (rare) case we will also jump
// to the scalar loop.
- emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
+ emitIterationCountCheck(LoopScalarPreHeader);
// Generate the code to check any assumptions that we've made for SCEV
// expressions.
- emitSCEVChecks(Lp, LoopScalarPreHeader);
+ emitSCEVChecks(LoopScalarPreHeader);
// Generate the code that checks in runtime if arrays overlap. We put the
// checks into a separate block to make the more common case of few elements
// faster.
- emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
-
- createHeaderBranch(Lp);
+ emitMemRuntimeChecks(LoopScalarPreHeader);
// Emit phis for the new starting index of the scalar loop.
- createInductionResumeValues(Lp);
+ createInductionResumeValues();
- return {completeLoopSkeleton(Lp, OrigLoopID), nullptr};
+ return {completeLoopSkeleton(OrigLoopID), nullptr};
}
// Fix up external users of the induction variable. At this point, we are
@@ -3584,8 +3219,9 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
const InductionDescriptor &II,
- Value *CountRoundDown, Value *EndValue,
- BasicBlock *MiddleBlock) {
+ Value *VectorTripCount, Value *EndValue,
+ BasicBlock *MiddleBlock,
+ BasicBlock *VectorHeader, VPlan &Plan) {
// There are two kinds of external IV usages - those that use the value
// computed in the last iteration (the PHI) and those that use the penultimate
// value (the value that feeds into the phi from the loop latch).
@@ -3612,8 +3248,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
for (User *U : OrigPhi->users()) {
auto *UI = cast<Instruction>(U);
if (!OrigLoop->contains(UI)) {
- const DataLayout &DL =
- OrigLoop->getHeader()->getModule()->getDataLayout();
assert(isa<PHINode>(UI) && "Expected LCSSA form");
IRBuilder<> B(MiddleBlock->getTerminator());
@@ -3623,15 +3257,18 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
Value *CountMinusOne = B.CreateSub(
- CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
+ VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
Value *CMO =
!II.getStep()->getType()->isIntegerTy()
? B.CreateCast(Instruction::SIToFP, CountMinusOne,
II.getStep()->getType())
: B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
CMO->setName("cast.cmo");
+
+ Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
+ VectorHeader->getTerminator());
Value *Escape =
- emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody);
+ emitTransformedIndex(B, CMO, II.getStartValue(), Step, II);
Escape->setName("ind.escape");
MissingVals[UI] = Escape;
}
@@ -3644,8 +3281,10 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
// In this case, if IV1 has an external use, we need to avoid adding both
// "last value of IV1" and "penultimate value of IV2". So, verify that we
// don't already have an incoming value for the middle block.
- if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
+ if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
PHI->addIncoming(I.second, MiddleBlock);
+ Plan.removeLiveOut(PHI);
+ }
}
}
@@ -3924,18 +3563,16 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
}
}
-void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
+void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
+ VPlan &Plan) {
// Insert truncates and extends for any truncated instructions as hints to
// InstCombine.
if (VF.isVector())
truncateToMinimalBitwidths(State);
// Fix widened non-induction PHIs by setting up the PHI operands.
- if (OrigPHIsToFix.size()) {
- assert(EnableVPlanNativePath &&
- "Unexpected non-induction PHIs for fixup in non VPlan-native path");
- fixNonInductionPHIs(State);
- }
+ if (EnableVPlanNativePath)
+ fixNonInductionPHIs(Plan, State);
// At this point every instruction in the original loop is widened to a
// vector form. Now we need to fix the recurrences in the loop. These PHI
@@ -3946,24 +3583,37 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
// Forget the original basic block.
PSE.getSE()->forgetLoop(OrigLoop);
- // If we inserted an edge from the middle block to the unique exit block,
- // update uses outside the loop (phis) to account for the newly inserted
- // edge.
- if (!Cost->requiresScalarEpilogue(VF)) {
+ VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
+ Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
+ if (Cost->requiresScalarEpilogue(VF)) {
+ // No edge from the middle block to the unique exit block has been inserted
+ // and there is nothing to fix from the vector loop; phis should only have
+ // incoming values from the scalar loop.
+ Plan.clearLiveOuts();
+ } else {
+ // If we inserted an edge from the middle block to the unique exit block,
+ // update uses outside the loop (phis) to account for the newly inserted
+ // edge.
+
// Fix-up external users of the induction variables.
for (auto &Entry : Legal->getInductionVars())
fixupIVUsers(Entry.first, Entry.second,
- getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
- IVEndValues[Entry.first], LoopMiddleBlock);
-
- fixLCSSAPHIs(State);
+ getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
+ IVEndValues[Entry.first], LoopMiddleBlock,
+ VectorLoop->getHeader(), Plan);
}
+ // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
+ // in the exit block, so update the builder.
+ State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
+ for (auto &KV : Plan.getLiveOuts())
+ KV.second->fixPhi(Plan, State);
+
for (Instruction *PI : PredicatedInstructions)
sinkScalarOperands(&*PI);
// Remove redundant induction instructions.
- cse(LoopVectorBody);
+ cse(VectorLoop->getHeader());
// Set/update profile weights for the vector and remainder loops as original
// loop iterations are now distributed among them. Note that original loop
@@ -3978,9 +3628,9 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
// For scalable vectorization we can't know at compile time how many iterations
// of the loop are handled in one vector iteration, so instead assume a pessimistic
// vscale of '1'.
- setProfileInfoAfterUnrolling(
- LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
- LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
+ setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
+ LI->getLoopFor(LoopScalarBody),
+ VF.getKnownMinValue() * UF);
}
void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
@@ -3990,7 +3640,8 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
// the currently empty PHI nodes. At this point every instruction in the
// original loop is widened to a vector form so we can use them to construct
// the incoming edges.
- VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
+ VPBasicBlock *Header =
+ State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
for (VPRecipeBase &R : Header->phis()) {
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
fixReduction(ReductionPhi, State);
@@ -4106,8 +3757,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(
// and thus no phis which needed updated.
if (!Cost->requiresScalarEpilogue(VF))
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
- if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
+ if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
+ State.Plan->removeLiveOut(&LCSSAPhi);
+ }
}
void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
@@ -4121,14 +3774,14 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
RecurKind RK = RdxDesc.getRecurrenceKind();
TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
- setDebugLocFromInst(ReductionStartValue);
+ State.setDebugLocFromInst(ReductionStartValue);
VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
// This is the vector-clone of the value that leaves the loop.
Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
// Wrap flags are in general invalid after vectorization, clear them.
- clearReductionWrapFlags(RdxDesc, State);
+ clearReductionWrapFlags(PhiR, State);
// Before each round, move the insertion point right between
// the PHIs and the values we are going to write.
@@ -4136,9 +3789,13 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// instructions.
Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
- setDebugLocFromInst(LoopExitInst);
+ State.setDebugLocFromInst(LoopExitInst);
Type *PhiTy = OrigPhi->getType();
+
+ VPBasicBlock *LatchVPBB =
+ PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
+ BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
// If tail is folded by masking, the vector value to leave the loop should be
// a Select choosing between the vectorized LoopExitInst and vectorized Phi,
// instead of the former. For an inloop reduction the reduction will already
@@ -4146,17 +3803,20 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
for (unsigned Part = 0; Part < UF; ++Part) {
Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
- Value *Sel = nullptr;
+ SelectInst *Sel = nullptr;
for (User *U : VecLoopExitInst->users()) {
if (isa<SelectInst>(U)) {
assert(!Sel && "Reduction exit feeding two selects");
- Sel = U;
+ Sel = cast<SelectInst>(U);
} else
assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
}
assert(Sel && "Reduction exit feeds no select");
State.reset(LoopExitInstDef, Sel, Part);
+ if (isa<FPMathOperator>(Sel))
+ Sel->setFastMathFlags(RdxDesc.getFastMathFlags());
+
// If the target can create a predicated operator for the reduction at no
// extra cost in the loop (for example a predicated vadd), it can be
// cheaper for the select to remain in the loop than be sunk out of it,
@@ -4168,8 +3828,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
TargetTransformInfo::ReductionFlags())) {
auto *VecRdxPhi =
cast<PHINode>(State.get(PhiR, Part));
- VecRdxPhi->setIncomingValueForBlock(
- LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
+ VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
}
}
}
@@ -4180,8 +3839,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
- Builder.SetInsertPoint(
- LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
+ Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
VectorParts RdxParts(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
RdxParts[Part] = State.get(LoopExitInstDef, Part);
@@ -4212,7 +3870,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// conditional branch, and (c) other passes may add new predecessors which
// terminate on this line. This is the easiest way to ensure we don't
// accidentally cause an extra step back into the loop while debugging.
- setDebugLocFromInst(LoopMiddleBlock->getTerminator());
+ State.setDebugLocFromInst(LoopMiddleBlock->getTerminator());
if (PhiR->isOrdered())
ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
else {
@@ -4269,6 +3927,17 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// Set the resume value for this reduction
ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
+ // If there were stores of the reduction value to a uniform memory address
+ // inside the loop, create the final store here.
+ if (StoreInst *SI = RdxDesc.IntermediateStore) {
+ StoreInst *NewSI =
+ Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
+ propagateMetadata(NewSI, SI);
+
+ // If the reduction value is used in other places,
+ // then let the code below create PHI's for that.
+ }
+
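An example (my own, not from the patch) of the source pattern the IntermediateStore handling above targets: the running reduction is stored to a loop-invariant address on every iteration, so after vectorization a single final store of the reduced value suffices.

#include <cassert>

void sumInto(const int *A, int N, int *Dst) {
  int Sum = 0;
  for (int I = 0; I < N; ++I) {
    Sum += A[I];
    *Dst = Sum; // invariant address: sunk out of the loop and stored once
  }
}

int main() {
  int A[4] = {1, 2, 3, 4}, Out = 0;
  sumInto(A, 4, &Out);
  assert(Out == 10);
  return 0;
}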
// Now, we need to fix the users of the reduction variable
// inside and outside of the scalar remainder loop.
@@ -4277,8 +3946,10 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// fixFirstOrderRecurrence for a more complete explanation of the logic.
if (!Cost->requiresScalarEpilogue(VF))
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
- if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
+ if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
+ State.Plan->removeLiveOut(&LCSSAPhi);
+ }
// Fix the scalar loop reduction variable with the incoming reduction sum
// from the vector body and from the backedge value.
@@ -4291,63 +3962,35 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
}
-void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
+void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
VPTransformState &State) {
+ const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
RecurKind RK = RdxDesc.getRecurrenceKind();
if (RK != RecurKind::Add && RK != RecurKind::Mul)
return;
- Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
- assert(LoopExitInstr && "null loop exit instruction");
- SmallVector<Instruction *, 8> Worklist;
- SmallPtrSet<Instruction *, 8> Visited;
- Worklist.push_back(LoopExitInstr);
- Visited.insert(LoopExitInstr);
+ SmallVector<VPValue *, 8> Worklist;
+ SmallPtrSet<VPValue *, 8> Visited;
+ Worklist.push_back(PhiR);
+ Visited.insert(PhiR);
while (!Worklist.empty()) {
- Instruction *Cur = Worklist.pop_back_val();
- if (isa<OverflowingBinaryOperator>(Cur))
- for (unsigned Part = 0; Part < UF; ++Part) {
- // FIXME: Should not rely on getVPValue at this point.
- Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
- cast<Instruction>(V)->dropPoisonGeneratingFlags();
+ VPValue *Cur = Worklist.pop_back_val();
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *V = State.get(Cur, Part);
+ if (!isa<OverflowingBinaryOperator>(V))
+ break;
+ cast<Instruction>(V)->dropPoisonGeneratingFlags();
}
- for (User *U : Cur->users()) {
- Instruction *UI = cast<Instruction>(U);
- if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
- Visited.insert(UI).second)
- Worklist.push_back(UI);
- }
- }
-}
-
-void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
- for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
- if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
- // Some phis were already hand updated by the reduction and recurrence
- // code above, leave them alone.
- continue;
-
- auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
- // Non-instruction incoming values will have only one value.
-
- VPLane Lane = VPLane::getFirstLane();
- if (isa<Instruction>(IncomingValue) &&
- !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
- VF))
- Lane = VPLane::getLastLaneForVF(VF);
-
- // Can be a loop invariant incoming value or the last scalar value to be
- // extracted from the vectorized loop.
- // FIXME: Should not rely on getVPValue at this point.
- Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
- Value *lastIncomingValue =
- OrigLoop->isLoopInvariant(IncomingValue)
- ? IncomingValue
- : State.get(State.Plan->getVPValue(IncomingValue, true),
- VPIteration(UF - 1, Lane));
- LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
+ for (VPUser *U : Cur->users()) {
+ auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
+ if (!UserRecipe)
+ continue;
+ for (VPValue *V : UserRecipe->definedValues())
+ if (Visited.insert(V).second)
+ Worklist.push_back(V);
+ }
}
}
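A small worked example (assumed values, not from the patch) of why the nsw/nuw flags must be dropped on vectorized add/mul reductions: lane-wise partial sums can wrap a narrow type even when the scalar running sum never does.

#include <cassert>
#include <cstdint>

int main() {
  int8_t A[4] = {100, -100, 100, -100};

  // Scalar reduction: the running sum stays within [-128, 127] at every step,
  // so 'add nsw i8' would be valid in the original loop.
  int Running = 0;
  for (int8_t V : A) {
    Running += V;
    assert(Running >= INT8_MIN && Running <= INT8_MAX);
  }

  // VF=2 reduction: lane 0 accumulates A[0] + A[2] = 200 and lane 1
  // accumulates -200, neither of which fits in i8, so keeping the
  // poison-generating flags on the lane-wise partial sums would be wrong.
  int Lane0 = A[0] + A[2], Lane1 = A[1] + A[3];
  assert(Lane0 > INT8_MAX && Lane1 < INT8_MIN);
  assert(Running == 0 && Lane0 + Lane1 == 0); // same final result
  return 0;
}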
@@ -4425,17 +4068,23 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
} while (Changed);
}
-void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
- for (PHINode *OrigPhi : OrigPHIsToFix) {
- VPWidenPHIRecipe *VPPhi =
- cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
- PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
- // Make sure the builder has a valid insert point.
- Builder.SetInsertPoint(NewPhi);
- for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
- VPValue *Inc = VPPhi->getIncomingValue(i);
- VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
- NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
+void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
+ VPTransformState &State) {
+ auto Iter = depth_first(
+ VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry()));
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
+ for (VPRecipeBase &P : VPBB->phis()) {
+ VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
+ if (!VPPhi)
+ continue;
+ PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
+ // Make sure the builder has a valid insert point.
+ Builder.SetInsertPoint(NewPhi);
+ for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
+ VPValue *Inc = VPPhi->getIncomingValue(i);
+ VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
+ NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
+ }
}
}
}
@@ -4445,139 +4094,6 @@ bool InnerLoopVectorizer::useOrderedReductions(
return Cost->useOrderedReductions(RdxDesc);
}
-void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
- VPWidenPHIRecipe *PhiR,
- VPTransformState &State) {
- PHINode *P = cast<PHINode>(PN);
- if (EnableVPlanNativePath) {
- // Currently we enter here in the VPlan-native path for non-induction
- // PHIs where all control flow is uniform. We simply widen these PHIs.
- // Create a vector phi with no operands - the vector phi operands will be
- // set at the end of vector code generation.
- Type *VecTy = (State.VF.isScalar())
- ? PN->getType()
- : VectorType::get(PN->getType(), State.VF);
- Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
- State.set(PhiR, VecPhi, 0);
- OrigPHIsToFix.push_back(P);
-
- return;
- }
-
- assert(PN->getParent() == OrigLoop->getHeader() &&
- "Non-header phis should have been handled elsewhere");
-
- // In order to support recurrences we need to be able to vectorize Phi nodes.
- // Phi nodes have cycles, so we need to vectorize them in two stages. This is
- // stage #1: We create a new vector PHI node with no incoming edges. We'll use
- // this value when we vectorize all of the instructions that use the PHI.
-
- assert(!Legal->isReductionVariable(P) &&
- "reductions should be handled elsewhere");
-
- setDebugLocFromInst(P);
-
- // This PHINode must be an induction variable.
- // Make sure that we know about it.
- assert(Legal->getInductionVars().count(P) && "Not an induction variable");
-
- InductionDescriptor II = Legal->getInductionVars().lookup(P);
- const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
-
- auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV();
- PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
-
- // FIXME: The newly created binary instructions should contain nsw/nuw flags,
- // which can be found from the original scalar operations.
- switch (II.getKind()) {
- case InductionDescriptor::IK_NoInduction:
- llvm_unreachable("Unknown induction");
- case InductionDescriptor::IK_IntInduction:
- case InductionDescriptor::IK_FpInduction:
- llvm_unreachable("Integer/fp induction is handled elsewhere.");
- case InductionDescriptor::IK_PtrInduction: {
- // Handle the pointer induction variable case.
- assert(P->getType()->isPointerTy() && "Unexpected type.");
-
- if (Cost->isScalarAfterVectorization(P, State.VF)) {
- // This is the normalized GEP that starts counting at zero.
- Value *PtrInd =
- Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType());
- // Determine the number of scalars we need to generate for each unroll
- // iteration. If the instruction is uniform, we only need to generate the
- // first lane. Otherwise, we generate all VF values.
- bool IsUniform = vputils::onlyFirstLaneUsed(PhiR);
- assert((IsUniform || !State.VF.isScalable()) &&
- "Cannot scalarize a scalable VF");
- unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *PartStart =
- createStepForVF(Builder, PtrInd->getType(), VF, Part);
-
- for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
- Value *Idx = Builder.CreateAdd(
- PartStart, ConstantInt::get(PtrInd->getType(), Lane));
- Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
- Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(),
- DL, II, State.CFG.PrevBB);
- SclrGep->setName("next.gep");
- State.set(PhiR, SclrGep, VPIteration(Part, Lane));
- }
- }
- return;
- }
- assert(isa<SCEVConstant>(II.getStep()) &&
- "Induction step not a SCEV constant!");
- Type *PhiType = II.getStep()->getType();
-
- // Build a pointer phi
- Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue();
- Type *ScStValueType = ScalarStartValue->getType();
- PHINode *NewPointerPhi =
- PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
- NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
-
- // A pointer induction, performed by using a gep
- BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
- Instruction *InductionLoc = LoopLatch->getTerminator();
- const SCEV *ScalarStep = II.getStep();
- SCEVExpander Exp(*PSE.getSE(), DL, "induction");
- Value *ScalarStepValue =
- Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
- Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
- Value *NumUnrolledElems =
- Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
- Value *InductionGEP = GetElementPtrInst::Create(
- II.getElementType(), NewPointerPhi,
- Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
- InductionLoc);
- NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
-
- // Create UF many actual address geps that use the pointer
- // phi as base and a vectorized version of the step value
- // (<step*0, ..., step*N>) as offset.
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- Type *VecPhiType = VectorType::get(PhiType, State.VF);
- Value *StartOffsetScalar =
- Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
- Value *StartOffset =
- Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
- // Create a vector of consecutive numbers from zero to VF.
- StartOffset =
- Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
-
- Value *GEP = Builder.CreateGEP(
- II.getElementType(), NewPointerPhi,
- Builder.CreateMul(
- StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
- "vector.gep"));
- State.set(PhiR, GEP, Part);
- }
- }
- }
-}
-
/// A helper function for checking whether an integer division-related
/// instruction may divide by zero (in which case it must be predicated if
/// executed conditionally in the scalar code).
@@ -4601,7 +4117,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
VPTransformState &State) {
assert(!isa<DbgInfoIntrinsic>(I) &&
"DbgInfoIntrinsic should have been dropped during VPlan construction");
- setDebugLocFromInst(&I);
+ State.setDebugLocFromInst(&I);
Module *M = I.getParent()->getParent()->getParent();
auto *CI = cast<CallInst>(&I);
@@ -4631,13 +4147,13 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
// Some intrinsics have a scalar argument - don't replace it with a
// vector.
Value *Arg;
- if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
+ if (!UseVectorIntrinsic ||
+ !isVectorIntrinsicWithScalarOpAtArg(ID, I.index()))
Arg = State.get(I.value(), Part);
- else {
+ else
Arg = State.get(I.value(), VPIteration(0, 0));
- if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
- TysForDecl.push_back(Arg->getType());
- }
+ if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index()))
+ TysForDecl.push_back(Arg->getType());
Args.push_back(Arg);
}
@@ -4665,7 +4181,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
V->copyFastMathFlags(CI);
State.set(Def, V, Part);
- addMetadata(V, &I);
+ State.addMetadata(V, &I);
}
}
@@ -4676,6 +4192,14 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
"This function should not be visited twice for the same VF");
+ // This avoids any chances of creating a REPLICATE recipe during planning
+ // since that would result in generation of scalarized code during execution,
+ // which is not supported for scalable vectors.
+ if (VF.isScalable()) {
+ Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
+ return;
+ }
+
SmallSetVector<Instruction *, 8> Worklist;
// These sets are used to seed the analysis with pointers used by memory
@@ -4765,7 +4289,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
}
// Insert the forced scalars.
- // FIXME: Currently widenPHIInstruction() often creates a dead vector
+ // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
// induction variable when the PHI user is scalarized.
auto ForcedScalar = ForcedScalars.find(VF);
if (ForcedScalar != ForcedScalars.end())
@@ -4892,6 +4416,27 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
if (hasIrregularType(ScalarTy, DL))
return false;
+ // If the group involves a non-integral pointer, we may not be able to
+ // losslessly cast all values to a common type.
+ unsigned InterleaveFactor = Group->getFactor();
+ bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
+ for (unsigned i = 0; i < InterleaveFactor; i++) {
+ Instruction *Member = Group->getMember(i);
+ if (!Member)
+ continue;
+ auto *MemberTy = getLoadStoreType(Member);
+ bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
+ // Don't coerce non-integral pointers to integers or vice versa.
+ if (MemberNI != ScalarNI) {
+ // TODO: Consider adding special nullptr value case here
+ return false;
+ } else if (MemberNI && ScalarNI &&
+ ScalarTy->getPointerAddressSpace() !=
+ MemberTy->getPointerAddressSpace()) {
+ return false;
+ }
+ }
+
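A standalone restatement of the new legality check above, using a simplified struct in place of the LLVM types (an assumption for illustration): every member of an interleave group must be losslessly castable to a common type, which rules out mixing non-integral and integral pointers, or non-integral pointers from different address spaces.

#include <cassert>
#include <vector>

struct MemberTypeInfo {
  bool NonIntegralPtr;   // DL.isNonIntegralPointerType(MemberTy)
  unsigned AddressSpace; // MemberTy->getPointerAddressSpace()
};

bool groupCanBeWidened(const MemberTypeInfo &Scalar,
                       const std::vector<MemberTypeInfo> &Members) {
  for (const MemberTypeInfo &M : Members) {
    if (M.NonIntegralPtr != Scalar.NonIntegralPtr)
      return false; // don't coerce non-integral pointers to integers or back
    if (M.NonIntegralPtr && Scalar.NonIntegralPtr &&
        M.AddressSpace != Scalar.AddressSpace)
      return false; // even two non-integral pointers must share an addrspace
  }
  return true;
}

int main() {
  MemberTypeInfo Scalar{true, 1};
  assert(groupCanBeWidened(Scalar, {{true, 1}, {true, 1}}));
  assert(!groupCanBeWidened(Scalar, {{false, 0}}));
  assert(!groupCanBeWidened(Scalar, {{true, 2}}));
  return 0;
}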
// Check if masking is required.
// A Group may need masking for one of two reasons: it resides in a block that
// needs predication, or it was decided to use masking to deal with gaps
@@ -5174,7 +4719,7 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() {
return true;
}
- if (!PSE.getUnionPredicate().getPredicates().empty()) {
+ if (!PSE.getPredicate().isAlwaysTrue()) {
reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
"runtime SCEV checks needed. Enable vectorization of this "
"loop with '#pragma clang loop vectorize(enable)' when "
@@ -5465,14 +5010,6 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
}
}
- // For scalable vectors don't use tail folding for low trip counts or
- // optimizing for code size. We only permit this if the user has explicitly
- // requested it.
- if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate &&
- ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate &&
- MaxFactors.ScalableVF.isVector())
- MaxFactors.ScalableVF = ElementCount::getScalable(0);
-
// If we don't know the precise trip count, or if the trip count that we
// found modulo the vectorization factor is not zero, try to fold the tail
// by masking.
@@ -5515,7 +5052,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
- const ElementCount &MaxSafeVF, bool FoldTailByMasking) {
+ ElementCount MaxSafeVF, bool FoldTailByMasking) {
bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
TypeSize WidestRegister = TTI.getRegisterBitWidth(
ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
@@ -5560,9 +5097,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
return ElementCount::getFixed(ClampedConstTripCount);
}
+ TargetTransformInfo::RegisterKind RegKind =
+ ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
+ : TargetTransformInfo::RGK_FixedWidthVector;
ElementCount MaxVF = MaxVectorElementCount;
- if (TTI.shouldMaximizeVectorBandwidth() ||
- (MaximizeBandwidth && isScalarEpilogueAllowed())) {
+ if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
+ TTI.shouldMaximizeVectorBandwidth(RegKind))) {
auto MaxVectorElementCountMaxBW = ElementCount::get(
PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
ComputeScalableMaxVF);
@@ -5600,6 +5140,11 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
MaxVF = MinVF;
}
}
+
+ // Invalidate any widening decisions we might have made, in case the loop
+ // requires prediction (decided later), but we have already made some
+ // load/store widening decisions.
+ invalidateCostModelingDecisions();
}
return MaxVF;
}
@@ -5667,7 +5212,8 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
assert(VFCandidates.count(ElementCount::getFixed(1)) &&
"Expected Scalar VF to be a candidate");
- const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
+ const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
+ ExpectedCost);
VectorizationFactor ChosenFactor = ScalarCost;
bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
@@ -5685,12 +5231,12 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
continue;
VectorizationCostTy C = expectedCost(i, &InvalidCosts);
- VectorizationFactor Candidate(i, C.first);
+ VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
#ifndef NDEBUG
unsigned AssumedMinimumVscale = 1;
if (Optional<unsigned> VScale = getVScaleForTuning())
- AssumedMinimumVscale = VScale.getValue();
+ AssumedMinimumVscale = *VScale;
unsigned Width =
Candidate.Width.isScalable()
? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
@@ -5878,7 +5424,7 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
if (LVP.hasPlanWithVF(ForcedEC))
- return {ForcedEC, 0};
+ return {ForcedEC, 0, 0};
else {
LLVM_DEBUG(
dbgs()
@@ -5908,7 +5454,7 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
if (MainLoopVF.isScalable()) {
EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
if (Optional<unsigned> VScale = getVScaleForTuning())
- EstimatedRuntimeVF *= VScale.getValue();
+ EstimatedRuntimeVF *= *VScale;
}
for (auto &NextVF : ProfitableVFs)
@@ -6144,9 +5690,15 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
return IC;
}
- // Note that if we've already vectorized the loop we will have done the
- // runtime check and so interleaving won't require further checks.
- bool InterleavingRequiresRuntimePointerCheck =
+ // For any scalar loop that either requires runtime checks or predication we
+ // are better off leaving this to the unroller. Note that if we've already
+ // vectorized the loop we will have done the runtime check and so interleaving
+ // won't require further checks.
+ bool ScalarInterleavingRequiresPredication =
+ (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
+ return Legal->blockNeedsPredication(BB);
+ }));
+ bool ScalarInterleavingRequiresRuntimePointerCheck =
(VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
// We want to interleave small loops in order to reduce the loop overhead and
@@ -6156,7 +5708,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
<< "LV: VF is " << VF << '\n');
const bool AggressivelyInterleaveReductions =
TTI.enableAggressiveInterleaving(HasReductions);
- if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
+ if (!ScalarInterleavingRequiresRuntimePointerCheck &&
+ !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
// We assume that the cost overhead is 1 and we use the cost model
// to estimate the cost of the loop and interleave until the cost of the
// loop overhead is about 5% of the cost of the loop.
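A condensed sketch of the gate introduced above, with the surrounding state passed in explicitly (a simplification for illustration, not the full heuristic): small scalar loops that need predication or runtime pointer checks are now left to the unroller instead of being interleaved here.

#include <cassert>

bool mayInterleaveSmallLoop(bool VFIsScalar, bool NeedsRuntimePtrCheck,
                            bool AnyBlockNeedsPredication, unsigned LoopCost,
                            unsigned SmallLoopCost) {
  bool RequiresRuntimeCheck = VFIsScalar && NeedsRuntimePtrCheck;
  bool RequiresPredication = VFIsScalar && AnyBlockNeedsPredication;
  return !RequiresRuntimeCheck && !RequiresPredication &&
         LoopCost < SmallLoopCost;
}

int main() {
  // A scalar loop that needs predication is now left to the unroller.
  assert(!mayInterleaveSmallLoop(true, false, true, 10, 20));
  assert(mayInterleaveSmallLoop(true, false, false, 10, 20));
  return 0;
}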
@@ -6319,16 +5872,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
- // A lambda that gets the register usage for the given type and VF.
- const auto &TTICapture = TTI;
- auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
+ auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned {
if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
return 0;
- InstructionCost::CostType RegUsage =
- *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
- assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
- "Nonsensical values for register usage.");
- return RegUsage;
+ return TTI.getRegUsageForType(VectorType::get(Ty, VF));
};
for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
@@ -7079,10 +6626,17 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
bool TypeNotScalarized = false;
if (VF.isVector() && VectorTy->isVectorTy()) {
- unsigned NumParts = TTI.getNumberOfParts(VectorTy);
- if (NumParts)
- TypeNotScalarized = NumParts < VF.getKnownMinValue();
- else
+ if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
+ if (VF.isScalable())
+ // <vscale x 1 x iN> is assumed to be profitable over iN because
+ // scalable registers are a distinct register class from scalar ones.
+ // If we ever find a target which wants to lower scalable vectors
+ // back to scalars, we'll need to update this code to explicitly
+ // ask TTI about the register class uses for each part.
+ TypeNotScalarized = NumParts <= VF.getKnownMinValue();
+ else
+ TypeNotScalarized = NumParts < VF.getKnownMinValue();
+ } else
C = InstructionCost::getInvalid();
}
return VectorizationCostTy(C, TypeNotScalarized);
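A sketch of the updated profitability test in standalone form (an assumed simplification): a vector type counts as "not scalarized" when the target legalizes it into fewer parts than there are known scalar elements, with equality allowed for scalable types since one scalable register still beats per-element scalar code.

#include <cassert>

bool typeNotScalarized(unsigned NumParts, unsigned KnownMinVF, bool Scalable) {
  if (NumParts == 0)
    return false; // the real code returns an invalid cost in this case
  return Scalable ? NumParts <= KnownMinVF : NumParts < KnownMinVF;
}

int main() {
  assert(typeNotScalarized(1, 4, false));  // <4 x i32> fits one register
  assert(!typeNotScalarized(4, 4, false)); // split into 4 parts == scalarized
  assert(typeNotScalarized(1, 1, true));   // <vscale x 1 x iN> still profitable
  return 0;
}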
@@ -7158,8 +6712,6 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
Cost = getGatherScatterCost(&I, VF);
setWideningDecision(&I, VF, CM_GatherScatter, Cost);
} else {
- assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
- "Cannot yet scalarize uniform stores");
Cost = getUniformMemOpCost(&I, VF);
setWideningDecision(&I, VF, CM_Scalarize, Cost);
}
@@ -7517,8 +7069,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
InstWidening Decision = getWideningDecision(I, Width);
assert(Decision != CM_Unknown &&
"CM decision should be taken at this point");
- if (Decision == CM_Scalarize)
+ if (Decision == CM_Scalarize) {
+ if (VF.isScalable() && isa<StoreInst>(I))
+ // We can't scalarize a scalable vector store (even a uniform one
+ // currently), return an invalid cost so as to prevent vectorization.
+ return InstructionCost::getInvalid();
Width = ElementCount::getFixed(1);
+ }
}
VectorTy = ToVectorTy(getLoadStoreType(I), Width);
return getMemoryInstructionCost(I, VF);
@@ -7686,6 +7243,16 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
// Ignore ephemeral values.
CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
+ // Find all stores to invariant variables. Since they are going to be sunk
+ // outside the loop, we do not need to calculate their cost.
+ for (BasicBlock *BB : TheLoop->blocks())
+ for (Instruction &I : *BB) {
+ StoreInst *SI;
+ if ((SI = dyn_cast<StoreInst>(&I)) &&
+ Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
+ ValuesToIgnore.insert(&I);
+ }
+
// Ignore type-promoting instructions we identified during reduction
// detection.
for (auto &Reduction : Legal->getReductionVars()) {
@@ -7787,7 +7354,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
if (VPlanBuildStressTest)
return VectorizationFactor::Disabled();
- return {VF, 0 /*Cost*/};
+ return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
}
LLVM_DEBUG(
@@ -7796,6 +7363,14 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
return VectorizationFactor::Disabled();
}
+bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const {
+ unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
+ return (NumRuntimePointerChecks >
+ VectorizerParams::RuntimeMemoryCheckThreshold &&
+ !Hints.allowReordering()) ||
+ NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
+}
+
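A standalone restatement of requiresTooManyRuntimeChecks() above; the threshold values in main are illustrative assumptions, not the real cl::opt defaults.

#include <cassert>

bool tooManyRuntimeChecks(unsigned NumRuntimePointerChecks,
                          bool HintsAllowReordering,
                          unsigned RuntimeMemoryCheckThreshold,
                          unsigned PragmaVectorizeMemoryCheckThreshold) {
  return (NumRuntimePointerChecks > RuntimeMemoryCheckThreshold &&
          !HintsAllowReordering) ||
         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
}

int main() {
  // Assumed thresholds of 8 (default) and 128 (pragma-forced vectorization).
  assert(tooManyRuntimeChecks(9, /*AllowReordering=*/false, 8, 128));
  assert(!tooManyRuntimeChecks(9, /*AllowReordering=*/true, 8, 128));
  assert(tooManyRuntimeChecks(200, /*AllowReordering=*/true, 8, 128));
  return 0;
}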
Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -7830,7 +7405,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
CM.collectInLoopReductions();
buildVPlansWithVPRecipes(UserVF, UserVF);
LLVM_DEBUG(printPlans(dbgs()));
- return {{UserVF, 0}};
+ return {{UserVF, 0, 0}};
} else
reportVectorizationInfo("UserVF ignored because of invalid costs.",
"InvalidCost", ORE, OrigLoop);
@@ -7864,30 +7439,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
return VectorizationFactor::Disabled();
// Select the optimal vectorization factor.
- auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
-
- // Check if it is profitable to vectorize with runtime checks.
- unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
- if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
- bool PragmaThresholdReached =
- NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
- bool ThresholdReached =
- NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
- if ((ThresholdReached && !Hints.allowReordering()) ||
- PragmaThresholdReached) {
- ORE->emit([&]() {
- return OptimizationRemarkAnalysisAliasing(
- DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
- OrigLoop->getHeader())
- << "loop not vectorized: cannot prove it is safe to reorder "
- "memory operations";
- });
- LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
- Hints.emitRemarkWithHints();
- return VectorizationFactor::Disabled();
- }
- }
- return SelectedVF;
+ return CM.selectVectorizationFactor(VFCandidates);
}
VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
@@ -7940,17 +7492,36 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
VPlan &BestVPlan,
InnerLoopVectorizer &ILV,
- DominatorTree *DT) {
+ DominatorTree *DT,
+ bool IsEpilogueVectorization) {
LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
<< '\n');
// Perform the actual loop transformation.
- // 1. Create a new empty loop. Unlink the old loop and connect the new one.
+ // 1. Set up the skeleton for vectorization, including vector pre-header and
+ // middle block. The vector loop is created during VPlan execution.
VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
Value *CanonicalIVStartValue;
std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
ILV.createVectorizedLoopSkeleton();
+
+ // Only use noalias metadata when using memory checks guaranteeing no overlap
+ // across all iterations.
+ const LoopAccessInfo *LAI = ILV.Legal->getLAI();
+ if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
+ !LAI->getRuntimePointerChecking()->getDiffChecks()) {
+
+ // We currently don't use LoopVersioning for the actual loop cloning but we
+ // still use it to add the noalias metadata.
+ // TODO: Find a better way to re-use LoopVersioning functionality to add
+ // metadata.
+ State.LVer = std::make_unique<LoopVersioning>(
+ *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
+ PSE.getSE());
+ State.LVer->prepareNoAliasMetadata();
+ }
+
ILV.collectPoisonGeneratingRecipes(State);
ILV.printDebugTracesAtStart();
@@ -7966,7 +7537,9 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
// 2. Copy and widen instructions from the old loop into the new loop.
BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
ILV.getOrCreateVectorTripCount(nullptr),
- CanonicalIVStartValue, State);
+ CanonicalIVStartValue, State,
+ IsEpilogueVectorization);
+
BestVPlan.execute(&State);
// Keep all loop hints from the original loop on the vector loop (we'll
@@ -7977,8 +7550,10 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
LLVMLoopVectorizeFollowupVectorized});
- Loop *L = LI->getLoopFor(State.CFG.PrevBB);
- if (VectorizedLoopID.hasValue())
+ VPBasicBlock *HeaderVPBB =
+ BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
+ Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
+ if (VectorizedLoopID)
L->setLoopID(VectorizedLoopID.getValue());
else {
// Keep all loop hints from the original loop on the vector loop (we'll
@@ -7995,7 +7570,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
- ILV.fixVectorizedLoop(State);
+ ILV.fixVectorizedLoop(State, BestVPlan);
ILV.printDebugTracesAtEnd();
}
@@ -8066,22 +7641,31 @@ Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
std::pair<BasicBlock *, Value *>
EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
MDNode *OrigLoopID = OrigLoop->getLoopID();
- Loop *Lp = createVectorLoopSkeleton("");
+
+ // Workaround! Compute the trip count of the original loop and cache it
+ // before we start modifying the CFG. This code has a systemic problem
+ // wherein it tries to run analysis over partially constructed IR; this is
+ // wrong, and not simply for SCEV. The trip count of the original loop
+ // simply happens to be prone to hitting this in practice. In theory, we
+ // can hit the same issue for any SCEV, or ValueTracking query done during
+ // mutation. See PR49900.
+ getOrCreateTripCount(OrigLoop->getLoopPreheader());
+ createVectorLoopSkeleton("");
// Generate the code to check the minimum iteration count of the vector
// epilogue (see below).
EPI.EpilogueIterationCountCheck =
- emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
+ emitIterationCountCheck(LoopScalarPreHeader, true);
EPI.EpilogueIterationCountCheck->setName("iter.check");
// Generate the code to check any assumptions that we've made for SCEV
// expressions.
- EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);
+ EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
// Generate the code that checks at runtime if arrays overlap. We put the
// checks into a separate block to make the more common case of few elements
// faster.
- EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
+ EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
// Generate the iteration count check for the main loop, *after* the check
// for the epilogue loop, so that the path-length is shorter for the case
@@ -8090,19 +7674,17 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
// trip count. Note: the branch will get updated later on when we vectorize
// the epilogue.
EPI.MainLoopIterationCountCheck =
- emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
+ emitIterationCountCheck(LoopScalarPreHeader, false);
// Generate the induction variable.
- Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
- EPI.VectorTripCount = CountRoundDown;
- createHeaderBranch(Lp);
+ EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
// Skip induction resume value creation here because they will be created in
// the second pass. If we created them here, they wouldn't be used anyway,
// because the vplan in the second pass still contains the inductions from the
// original loop.
- return {completeLoopSkeleton(Lp, OrigLoopID), nullptr};
+ return {completeLoopSkeleton(OrigLoopID), nullptr};
}
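
As a rough reading aid for the skeleton built above, here is a stand-alone sketch of the branch structure the two-pass epilogue vectorization aims for. The names (TripCount, MainVFxUF, EpilogueVFxUF, RuntimeChecksPass) are placeholders rather than values or APIs from this patch, and note that the main-loop check's failure target is only rewired to the epilogue vector preheader during the second pass.

#include <cstdint>

enum class LoopPath { Scalar, EpilogueVectorOnly, MainVector };

// Hedged sketch of the intended final branch structure; 'RuntimeChecksPass'
// stands in for the combined SCEV and memory runtime checks.
LoopPath chooseLoopPath(uint64_t TripCount, uint64_t MainVFxUF,
                        uint64_t EpilogueVFxUF, bool RuntimeChecksPass) {
  if (TripCount < EpilogueVFxUF) // iter.check: too few iterations even for
    return LoopPath::Scalar;     // the vector epilogue.
  if (!RuntimeChecksPass)        // SCEV / memory safety checks.
    return LoopPath::Scalar;
  if (TripCount < MainVFxUF)     // main-loop iteration count check; retargeted
    return LoopPath::EpilogueVectorOnly; // to vec.epilog.ph in the second pass.
  return LoopPath::MainVector;
}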
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
@@ -8122,13 +7704,13 @@ void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
});
}
-BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
- Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
- assert(L && "Expected valid Loop.");
+BasicBlock *
+EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
+ bool ForEpilogue) {
assert(Bypass && "Expected valid bypass basic block.");
ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
- Value *Count = getOrCreateTripCount(L);
+ Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
// Reuse existing vector loop preheader for TC checks.
// Note that new preheader block is generated for vector loop.
BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
@@ -8187,7 +7769,7 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
std::pair<BasicBlock *, Value *>
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
MDNode *OrigLoopID = OrigLoop->getLoopID();
- Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
+ createVectorLoopSkeleton("vec.epilog.");
// Now, compare the remaining count and if there aren't enough iterations to
// execute the vectorized epilogue skip to the scalar part.
@@ -8196,7 +7778,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
LoopVectorPreHeader =
SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
LI, nullptr, "vec.epilog.ph");
- emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
+ emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
VecEpilogueIterationCountCheck);
// Adjust the control flow taking the state info from the main loop
@@ -8268,9 +7850,6 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
EPI.MainLoopIterationCountCheck);
- // Generate the induction variable.
- createHeaderBranch(Lp);
-
// Generate induction resume values. These variables save the new starting
// indexes for the scalar loop. They are used to test if there are any tail
// iterations left once the vector loop has completed.
@@ -8278,15 +7857,15 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
// check, then the resume value for the induction variable comes from
// the trip count of the main vector loop, hence passing the AdditionalBypass
// argument.
- createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck,
- EPI.VectorTripCount} /* AdditionalBypass */);
+ createInductionResumeValues({VecEpilogueIterationCountCheck,
+ EPI.VectorTripCount} /* AdditionalBypass */);
- return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal};
+ return {completeLoopSkeleton(OrigLoopID), EPResumeVal};
}
BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
- Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
+ BasicBlock *Bypass, BasicBlock *Insert) {
assert(EPI.TripCount &&
"Expected trip count to have been safed in the first pass.");
@@ -8427,7 +8006,8 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
// constructing the desired canonical IV in the header block as its first
// non-phi instructions.
assert(CM.foldTailByMasking() && "must fold the tail");
- VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock();
+ VPBasicBlock *HeaderVPBB =
+ Plan->getVectorLoopRegion()->getEntryBasicBlock();
auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
@@ -8469,8 +8049,6 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
"Must be called with either a load or store");
auto willWiden = [&](ElementCount VF) -> bool {
- if (VF.isScalar())
- return false;
LoopVectorizationCostModel::InstWidening Decision =
CM.getWideningDecision(I, VF);
assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
@@ -8507,11 +8085,12 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
Mask, Consecutive, Reverse);
}
-static VPWidenIntOrFpInductionRecipe *
-createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc,
- VPValue *Start, const InductionDescriptor &IndDesc,
- LoopVectorizationCostModel &CM, Loop &OrigLoop,
- VFRange &Range) {
+/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
+/// insert a recipe to expand the step for the induction recipe.
+static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes(
+ PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start,
+ const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM,
+ VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) {
// Returns true if an instruction \p I should be scalarized instead of
// vectorized for the chosen vectorization factor.
auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
@@ -8519,18 +8098,6 @@ createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc,
CM.isProfitableToScalarize(I, VF);
};
- bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](ElementCount VF) {
- // Returns true if we should generate a scalar version of \p IV.
- if (ShouldScalarizeInstruction(PhiOrTrunc, VF))
- return true;
- auto isScalarInst = [&](User *U) -> bool {
- auto *I = cast<Instruction>(U);
- return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF);
- };
- return any_of(PhiOrTrunc->users(), isScalarInst);
- },
- Range);
bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) {
return ShouldScalarizeInstruction(PhiOrTrunc, VF);
@@ -8538,30 +8105,38 @@ createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc,
Range);
assert(IndDesc.getStartValue() ==
Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
+ assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
+ "step must be loop invariant");
+
+ VPValue *Step =
+ vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
- return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, TruncI,
- NeedsScalarIV, !NeedsScalarIVOnly);
+ return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI,
+ !NeedsScalarIVOnly);
}
assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
- return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, NeedsScalarIV,
+ return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc,
!NeedsScalarIVOnly);
}
-VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
- PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) const {
+VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
+ PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
// Check if this is an integer or fp induction. If so, build the recipe that
// produces its scalar and vector values.
if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
- return createWidenInductionRecipe(Phi, Phi, Operands[0], *II, CM, *OrigLoop,
- Range);
+ return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan,
+ *PSE.getSE(), *OrigLoop, Range);
+ // Check if this is pointer induction. If so, build the recipe for it.
+ if (auto *II = Legal->getPointerInductionDescriptor(Phi))
+ return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II,
+ *PSE.getSE());
return nullptr;
}
VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
- TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
- VPlan &Plan) const {
+ TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
// Optimize the special case where the source is a constant integer
// induction variable. Notice that we can only optimize the 'trunc' case
// because (a) FP conversions lose precision, (b) sext/zext may wrap, and
@@ -8582,7 +8157,8 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
auto *Phi = cast<PHINode>(I->getOperand(0));
const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
- return createWidenInductionRecipe(Phi, I, Start, II, CM, *OrigLoop, Range);
+ return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan,
+ *PSE.getSE(), *OrigLoop, Range);
}
return nullptr;
}
@@ -8599,13 +8175,30 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
return Operands[0];
}
+ unsigned NumIncoming = Phi->getNumIncomingValues();
+ // For in-loop reductions, we do not need to create an additional select.
+ VPValue *InLoopVal = nullptr;
+ for (unsigned In = 0; In < NumIncoming; In++) {
+ PHINode *PhiOp =
+ dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
+ if (PhiOp && CM.isInLoopReduction(PhiOp)) {
+ assert(!InLoopVal && "Found more than one in-loop reduction!");
+ InLoopVal = Operands[In];
+ }
+ }
+
+ assert((!InLoopVal || NumIncoming == 2) &&
+ "Found an in-loop reduction for PHI with unexpected number of "
+ "incoming values");
+ if (InLoopVal)
+ return Operands[Operands[0] == InLoopVal ? 1 : 0];
+
// We know that all PHIs in non-header blocks are converted into selects, so
// we don't have to worry about the insertion order and we can just use the
// builder. At this point we generate the predication tree. There may be
// duplications since this is a simple recursive scan, but future
// optimizations will clean it up.
SmallVector<VPValue *, 2> OperandsWithMask;
- unsigned NumIncoming = Phi->getNumIncomingValues();
for (unsigned In = 0; In < NumIncoming; In++) {
VPValue *EdgeMask =
@@ -8711,6 +8304,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
case Instruction::URem:
case Instruction::Xor:
case Instruction::ZExt:
+ case Instruction::Freeze:
return true;
}
return false;
@@ -8836,14 +8430,14 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
Plan->removeVPValueFor(Instr);
Plan->addVPValue(Instr, PHIRecipe);
}
- auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
+ auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
- VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
+ VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
// Note: first set Entry as region entry and then connect successors starting
// from it in order, to propagate the "parent" of each VPBasicBlock.
- VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
- VPBlockUtils::connectBlocks(Pred, Exit);
+ VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
+ VPBlockUtils::connectBlocks(Pred, Exiting);
return Region;
}
@@ -8852,52 +8446,37 @@ VPRecipeOrVPValueTy
VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
ArrayRef<VPValue *> Operands,
VFRange &Range, VPlanPtr &Plan) {
- // First, check for specific widening recipes that deal with calls, memory
- // operations, inductions and Phi nodes.
- if (auto *CI = dyn_cast<CallInst>(Instr))
- return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
-
- if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
- return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
-
+ // First, check for specific widening recipes that deal with inductions, Phi
+ // nodes, calls and memory operations.
VPRecipeBase *Recipe;
if (auto Phi = dyn_cast<PHINode>(Instr)) {
if (Phi->getParent() != OrigLoop->getHeader())
return tryToBlend(Phi, Operands, Plan);
- if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
+ if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
return toVPRecipeResult(Recipe);
VPHeaderPHIRecipe *PhiRecipe = nullptr;
- if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
- VPValue *StartV = Operands[0];
- if (Legal->isReductionVariable(Phi)) {
- const RecurrenceDescriptor &RdxDesc =
- Legal->getReductionVars().find(Phi)->second;
- assert(RdxDesc.getRecurrenceStartValue() ==
- Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
- PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
- CM.isInLoopReduction(Phi),
- CM.useOrderedReductions(RdxDesc));
- } else {
- PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
- }
-
- // Record the incoming value from the backedge, so we can add the incoming
- // value from the backedge after all recipes have been created.
- recordRecipeOf(cast<Instruction>(
- Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
- PhisToFix.push_back(PhiRecipe);
+ assert((Legal->isReductionVariable(Phi) ||
+ Legal->isFirstOrderRecurrence(Phi)) &&
+ "can only widen reductions and first-order recurrences here");
+ VPValue *StartV = Operands[0];
+ if (Legal->isReductionVariable(Phi)) {
+ const RecurrenceDescriptor &RdxDesc =
+ Legal->getReductionVars().find(Phi)->second;
+ assert(RdxDesc.getRecurrenceStartValue() ==
+ Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
+ PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
+ CM.isInLoopReduction(Phi),
+ CM.useOrderedReductions(RdxDesc));
} else {
- // TODO: record backedge value for remaining pointer induction phis.
- assert(Phi->getType()->isPointerTy() &&
- "only pointer phis should be handled here");
- assert(Legal->getInductionVars().count(Phi) &&
- "Not an induction variable");
- InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
- VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
- PhiRecipe = new VPWidenPHIRecipe(Phi, Start);
+ PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
}
+ // Record the incoming value from the backedge, so we can add the incoming
+ // value from the backedge after all recipes have been created.
+ recordRecipeOf(cast<Instruction>(
+ Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
+ PhisToFix.push_back(PhiRecipe);
return toVPRecipeResult(PhiRecipe);
}
@@ -8906,6 +8485,17 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
Range, *Plan)))
return toVPRecipeResult(Recipe);
+ // All widen recipes below deal only with VF > 1.
+ if (LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) { return VF.isScalar(); }, Range))
+ return nullptr;
+
+ if (auto *CI = dyn_cast<CallInst>(Instr))
+ return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
+
+ if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
+ return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
+
if (!shouldWiden(Instr, Range))
return nullptr;
@@ -8979,15 +8569,13 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
// CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a
// BranchOnCount VPInstruction to the latch.
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
- bool HasNUW, bool IsVPlanNative) {
+ bool HasNUW) {
Value *StartIdx = ConstantInt::get(IdxTy, 0);
auto *StartV = Plan.getOrAddVPValue(StartIdx);
auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
- if (IsVPlanNative)
- Header = cast<VPBasicBlock>(Header->getSingleSuccessor());
Header->insert(CanonicalIVPHI, Header->begin());
auto *CanonicalIVIncrement =
@@ -8996,11 +8584,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
{CanonicalIVPHI}, DL);
CanonicalIVPHI->addOperand(CanonicalIVIncrement);
- VPBasicBlock *EB = TopRegion->getExitBasicBlock();
- if (IsVPlanNative) {
- EB = cast<VPBasicBlock>(EB->getSinglePredecessor());
- EB->setCondBit(nullptr);
- }
+ VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
EB->appendRecipe(CanonicalIVIncrement);
auto *BranchOnCount =
@@ -9009,6 +8593,26 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
EB->appendRecipe(BranchOnCount);
}
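
For orientation only, the following is a minimal scalar model of the loop control that the canonical-IV recipes above describe; it is an illustration, not code from the patch, and VF, UF and the vector trip count are assumed inputs.

#include <cstdint>

// Scalar model of the canonical induction variable: start at 0, bump by
// VF * UF per vector iteration, exit via BranchOnCount against the vector
// trip count (which is a multiple of VF * UF).
void runVectorLoopModel(uint64_t VectorTripCount, uint64_t VF, uint64_t UF,
                        void (*Body)(uint64_t FirstScalarIteration)) {
  uint64_t CanonicalIV = 0;                // VPCanonicalIVPHIRecipe.
  do {
    Body(CanonicalIV);                     // One vector iteration.
    CanonicalIV += VF * UF;                // CanonicalIVIncrement{NUW}.
  } while (CanonicalIV < VectorTripCount); // BranchOnCount.
}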
+// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
+// original exit block.
+static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB,
+ VPBasicBlock *MiddleVPBB, Loop *OrigLoop,
+ VPlan &Plan) {
+ BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
+ BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
+ // Only handle single-exit loops with unique exit blocks for now.
+ if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
+ return;
+
+ // Introduce VPUsers modeling the exit values.
+ for (PHINode &ExitPhi : ExitBB->phis()) {
+ Value *IncomingValue =
+ ExitPhi.getIncomingValueForBlock(ExitingBB);
+ VPValue *V = Plan.getOrAddVPValue(IncomingValue, true);
+ Plan.addLiveOut(&ExitPhi, V);
+ }
+}
+
VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
const MapVector<Instruction *, Instruction *> &SinkAfter) {
@@ -9037,7 +8641,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
RecipeBuilder.recordRecipeOf(Phi);
for (auto &R : ReductionOperations) {
RecipeBuilder.recordRecipeOf(R);
- // For min/max reducitons, where we have a pair of icmp/select, we also
+ // For min/max reductions, where we have a pair of icmp/select, we also
// need to record the ICmp recipe, so it can be removed later.
assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
"Only min/max recurrences allowed for inloop reductions");
@@ -9069,18 +8673,25 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// visit each basic block after having visited its predecessor basic blocks.
// ---------------------------------------------------------------------------
- // Create initial VPlan skeleton, with separate header and latch blocks.
- VPBasicBlock *HeaderVPBB = new VPBasicBlock();
+ // Create initial VPlan skeleton, starting with a block for the pre-header,
+ // followed by a region for the vector loop, followed by the middle block. The
+ // skeleton vector loop region contains a header and latch block.
+ VPBasicBlock *Preheader = new VPBasicBlock("vector.ph");
+ auto Plan = std::make_unique<VPlan>(Preheader);
+
+ VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
- auto Plan = std::make_unique<VPlan>(TopRegion);
+ VPBlockUtils::insertBlockAfter(TopRegion, Preheader);
+ VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
+ VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
Instruction *DLInst =
getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
DLInst ? DLInst->getDebugLoc() : DebugLoc(),
- !CM.foldTailByMasking(), false);
+ !CM.foldTailByMasking());
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
@@ -9093,11 +8704,12 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// Relevant instructions from basic block BB will be grouped into VPRecipe
// ingredients and fill a new VPBasicBlock.
unsigned VPBBsForBB = 0;
- VPBB->setName(BB->getName());
+ if (VPBB != HeaderVPBB)
+ VPBB->setName(BB->getName());
Builder.setInsertPoint(VPBB);
// Introduce each ingredient into VPlan.
- // TODO: Model and preserve debug instrinsics in VPlan.
+ // TODO: Model and preserve debug intrinsics in VPlan.
for (Instruction &I : BB->instructionsWithoutDebug()) {
Instruction *Instr = &I;
@@ -9115,6 +8727,14 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
auto OpRange = Plan->mapToVPValues(Instr->operands());
Operands = {OpRange.begin(), OpRange.end()};
}
+
+      // Invariant stores inside the loop will be deleted and a single store
+      // with the final reduction value will be added to the exit block.
+ StoreInst *SI;
+ if ((SI = dyn_cast<StoreInst>(&I)) &&
+ Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
+ continue;
+
if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
Instr, Operands, Range, Plan)) {
// If Instr can be simplified to an existing VPValue, use it.
@@ -9165,14 +8785,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
}
+ HeaderVPBB->setName("vector.body");
+
// Fold the last, empty block into its predecessor.
VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB);
assert(VPBB && "expected to fold last (empty) block");
// After here, VPBB should not be used.
VPBB = nullptr;
- assert(isa<VPRegionBlock>(Plan->getEntry()) &&
- !Plan->getEntry()->getEntryBasicBlock()->empty() &&
+ addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan);
+
+ assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
+ !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
"entry block must be set to a VPRegionBlock having a non-empty entry "
"VPBasicBlock");
RecipeBuilder.fixHeaderPhis();
@@ -9252,12 +8876,13 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
// Adjust the recipes for any inloop reductions.
- adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan,
+ adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan,
RecipeBuilder, Range.Start);
// Introduce a recipe to combine the incoming and previous values of a
// first-order recurrence.
- for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
+ for (VPRecipeBase &R :
+ Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
if (!RecurPhi)
continue;
@@ -9317,13 +8942,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
}
- // From this point onwards, VPlan-to-VPlan transformations may change the plan
- // in ways that accessing values using original IR values is incorrect.
- Plan->disableValue2VPValue();
-
- VPlanTransforms::sinkScalarOperands(*Plan);
- VPlanTransforms::mergeReplicateRegions(*Plan);
-
std::string PlanName;
raw_string_ostream RSO(PlanName);
ElementCount VF = Range.Start;
@@ -9337,10 +8955,20 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
RSO.flush();
Plan->setName(PlanName);
+ // From this point onwards, VPlan-to-VPlan transformations may change the plan
+ // in ways that accessing values using original IR values is incorrect.
+ Plan->disableValue2VPValue();
+
+ VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
+ VPlanTransforms::sinkScalarOperands(*Plan);
+ VPlanTransforms::mergeReplicateRegions(*Plan);
+ VPlanTransforms::removeDeadRecipes(*Plan);
+ VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
+
// Fold Exit block into its predecessor if possible.
// TODO: Fold block earlier once all VPlan transforms properly maintain a
// VPBasicBlock as exit.
- VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit());
+ VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting());
assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
@@ -9365,23 +8993,20 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
VF *= 2)
Plan->addVF(VF);
- if (EnableVPlanPredication) {
- VPlanPredicator VPP(*Plan);
- VPP.predicate();
-
- // Avoid running transformation to recipes until masked code generation in
- // VPlan-native path is in place.
- return Plan;
- }
-
SmallPtrSet<Instruction *, 1> DeadInstructions;
VPlanTransforms::VPInstructionsToVPRecipes(
OrigLoop, Plan,
[this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
DeadInstructions, *PSE.getSE());
+ // Remove the existing terminator of the exiting block of the top-most region.
+ // A BranchOnCount will be added instead when adding the canonical IV recipes.
+ auto *Term =
+ Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
+ Term->eraseFromParent();
+
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
- true, true);
+ true);
return Plan;
}
@@ -9433,7 +9058,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
- auto *CondOp = CM.foldTailByMasking()
+ auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
: nullptr;
@@ -9453,9 +9078,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
Plan->removeVPValueFor(R);
Plan->addVPValue(R, RedRecipe);
- // Append the recipe to the end of the VPBasicBlock because we need to
- // ensure that it comes after all of it's inputs, including CondOp.
- WidenRecipe->getParent()->appendRecipe(RedRecipe);
+ WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
WidenRecipe->eraseFromParent();
@@ -9477,7 +9100,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
// dedicated latch block.
if (CM.foldTailByMasking()) {
Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
- for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
+ for (VPRecipeBase &R :
+ Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
if (!PhiR || PhiR->isInLoop())
continue;
@@ -9529,7 +9153,7 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
void VPWidenSelectRecipe::execute(VPTransformState &State) {
auto &I = *cast<SelectInst>(getUnderlyingInstr());
- State.ILV->setDebugLocFromInst(&I);
+ State.setDebugLocFromInst(&I);
// The condition can be loop invariant but still defined inside the
// loop. This means that we can't just use the original 'cond' value.
@@ -9544,7 +9168,7 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) {
Value *Op1 = State.get(getOperand(2), Part);
Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
State.set(this, Sel, Part);
- State.ILV->addMetadata(Sel, &I);
+ State.addMetadata(Sel, &I);
}
}
@@ -9578,7 +9202,7 @@ void VPWidenRecipe::execute(VPTransformState &State) {
case Instruction::Or:
case Instruction::Xor: {
// Just widen unops and binops.
- State.ILV->setDebugLocFromInst(&I);
+ State.setDebugLocFromInst(&I);
for (unsigned Part = 0; Part < State.UF; ++Part) {
SmallVector<Value *, 2> Ops;
@@ -9601,17 +9225,28 @@ void VPWidenRecipe::execute(VPTransformState &State) {
// Use this vector value for all users of the original instruction.
State.set(this, V, Part);
- State.ILV->addMetadata(V, &I);
+ State.addMetadata(V, &I);
}
break;
}
+ case Instruction::Freeze: {
+ State.setDebugLocFromInst(&I);
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *Op = State.get(getOperand(0), Part);
+
+ Value *Freeze = Builder.CreateFreeze(Op);
+ State.set(this, Freeze, Part);
+ }
+ break;
+ }
case Instruction::ICmp:
case Instruction::FCmp: {
// Widen compares. Generate vector compares.
bool FCmp = (I.getOpcode() == Instruction::FCmp);
auto *Cmp = cast<CmpInst>(&I);
- State.ILV->setDebugLocFromInst(Cmp);
+ State.setDebugLocFromInst(Cmp);
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *A = State.get(getOperand(0), Part);
Value *B = State.get(getOperand(1), Part);
@@ -9625,7 +9260,7 @@ void VPWidenRecipe::execute(VPTransformState &State) {
C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
}
State.set(this, C, Part);
- State.ILV->addMetadata(C, &I);
+ State.addMetadata(C, &I);
}
break;
@@ -9644,7 +9279,7 @@ void VPWidenRecipe::execute(VPTransformState &State) {
case Instruction::FPTrunc:
case Instruction::BitCast: {
auto *CI = cast<CastInst>(&I);
- State.ILV->setDebugLocFromInst(CI);
+ State.setDebugLocFromInst(CI);
/// Vectorize casts.
Type *DestTy = (State.VF.isScalar())
@@ -9655,7 +9290,7 @@ void VPWidenRecipe::execute(VPTransformState &State) {
Value *A = State.get(getOperand(0), Part);
Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
State.set(this, Cast, Part);
- State.ILV->addMetadata(Cast, &I);
+ State.addMetadata(Cast, &I);
}
break;
}
@@ -9691,7 +9326,7 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
State.set(this, EntryPart, Part);
- State.ILV->addMetadata(EntryPart, GEP);
+ State.addMetadata(EntryPart, GEP);
}
} else {
// If the GEP has at least one loop-varying operand, we are sure to
@@ -9729,32 +9364,276 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
// Create the new GEP. Note that this GEP may be a scalar if VF == 1,
// but it should be a vector, otherwise.
- auto *NewGEP = IsInBounds
- ? State.Builder.CreateInBoundsGEP(
- GEP->getSourceElementType(), Ptr, Indices)
- : State.Builder.CreateGEP(GEP->getSourceElementType(),
- Ptr, Indices);
+ auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
+ Indices, "", IsInBounds);
assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
"NewGEP is not a pointer vector");
State.set(this, NewGEP, Part);
- State.ILV->addMetadata(NewGEP, GEP);
+ State.addMetadata(NewGEP, GEP);
}
}
}
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Int or FP induction being replicated.");
- auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
- State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV);
+
+ Value *Start = getStartValue()->getLiveInIRValue();
+ const InductionDescriptor &ID = getInductionDescriptor();
+ TruncInst *Trunc = getTruncInst();
+ IRBuilderBase &Builder = State.Builder;
+ assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
+ assert(State.VF.isVector() && "must have vector VF");
+
+ // The value from the original loop to which we are mapping the new induction
+ // variable.
+ Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
+
+ // Fast-math-flags propagate from the original induction instruction.
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
+ Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
+
+ // Now do the actual transformations, and start with fetching the step value.
+ Value *Step = State.get(getStepValue(), VPIteration(0, 0));
+
+ assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
+ "Expected either an induction phi-node or a truncate of it!");
+
+ // Construct the initial value of the vector IV in the vector loop preheader
+ auto CurrIP = Builder.saveIP();
+ BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+ Builder.SetInsertPoint(VectorPH->getTerminator());
+ if (isa<TruncInst>(EntryVal)) {
+ assert(Start->getType()->isIntegerTy() &&
+ "Truncation requires an integer type");
+ auto *TruncType = cast<IntegerType>(EntryVal->getType());
+ Step = Builder.CreateTrunc(Step, TruncType);
+ Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
+ }
+
+ Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
+ Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
+ Value *SteppedStart = getStepVector(
+ SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
+
+ // We create vector phi nodes for both integer and floating-point induction
+ // variables. Here, we determine the kind of arithmetic we will perform.
+ Instruction::BinaryOps AddOp;
+ Instruction::BinaryOps MulOp;
+ if (Step->getType()->isIntegerTy()) {
+ AddOp = Instruction::Add;
+ MulOp = Instruction::Mul;
+ } else {
+ AddOp = ID.getInductionOpcode();
+ MulOp = Instruction::FMul;
+ }
+
+ // Multiply the vectorization factor by the step using integer or
+ // floating-point arithmetic as appropriate.
+ Type *StepType = Step->getType();
+ Value *RuntimeVF;
+ if (Step->getType()->isFloatingPointTy())
+ RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
+ else
+ RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
+ Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
+
+ // Create a vector splat to use in the induction update.
+ //
+ // FIXME: If the step is non-constant, we create the vector splat with
+ // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
+ // handle a constant vector splat.
+ Value *SplatVF = isa<Constant>(Mul)
+ ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
+ : Builder.CreateVectorSplat(State.VF, Mul);
+ Builder.restoreIP(CurrIP);
+
+ // We may need to add the step a number of times, depending on the unroll
+ // factor. The last of those goes into the PHI.
+ PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
+ &*State.CFG.PrevBB->getFirstInsertionPt());
+ VecInd->setDebugLoc(EntryVal->getDebugLoc());
+ Instruction *LastInduction = VecInd;
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ State.set(this, LastInduction, Part);
+
+ if (isa<TruncInst>(EntryVal))
+ State.addMetadata(LastInduction, EntryVal);
+
+ LastInduction = cast<Instruction>(
+ Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
+ LastInduction->setDebugLoc(EntryVal->getDebugLoc());
+ }
+
+ LastInduction->setName("vec.ind.next");
+ VecInd->addIncoming(SteppedStart, VectorPH);
+ // Add induction update using an incorrect block temporarily. The phi node
+ // will be fixed after VPlan execution. Note that at this point the latch
+ // block cannot be used, as it does not exist yet.
+ // TODO: Model increment value in VPlan, by turning the recipe into a
+ // multi-def and a subclass of VPHeaderPHIRecipe.
+ VecInd->addIncoming(LastInduction, VectorPH);
+}
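+
+To make the stepped-start and per-part update above easier to follow, here is a hedged arithmetic sketch of the value each vector lane ends up holding for an integer induction; it is illustrative only and ignores truncation and floating-point inductions.
+
+#include <cstdint>
+
+// Lane 'Lane' of unrolled part 'Part' of the widened IV built above holds
+// Start + (Part * VF + Lane) * Step: SteppedStart supplies the Lane * Step
+// offsets and each "step.add" adds another SplatVF = VF * Step.
+int64_t widenedIVLane(int64_t Start, int64_t Step, uint64_t VF, unsigned Part,
+                      unsigned Lane) {
+  return Start + static_cast<int64_t>(Part * VF + Lane) * Step;
+}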
+
+void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
+ assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
+ "Not a pointer induction according to InductionDescriptor!");
+ assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
+ "Unexpected type.");
+
+ auto *IVR = getParent()->getPlan()->getCanonicalIV();
+ PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
+
+ if (onlyScalarsGenerated(State.VF)) {
+ // This is the normalized GEP that starts counting at zero.
+ Value *PtrInd = State.Builder.CreateSExtOrTrunc(
+ CanonicalIV, IndDesc.getStep()->getType());
+ // Determine the number of scalars we need to generate for each unroll
+ // iteration. If the instruction is uniform, we only need to generate the
+ // first lane. Otherwise, we generate all VF values.
+ bool IsUniform = vputils::onlyFirstLaneUsed(this);
+ assert((IsUniform || !State.VF.isScalable()) &&
+ "Cannot scalarize a scalable VF");
+ unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *PartStart =
+ createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
+
+ for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+ Value *Idx = State.Builder.CreateAdd(
+ PartStart, ConstantInt::get(PtrInd->getType(), Lane));
+ Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
+
+ Value *Step = CreateStepValue(IndDesc.getStep(), SE,
+ State.CFG.PrevBB->getTerminator());
+ Value *SclrGep = emitTransformedIndex(
+ State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc);
+ SclrGep->setName("next.gep");
+ State.set(this, SclrGep, VPIteration(Part, Lane));
+ }
+ }
+ return;
+ }
+
+ assert(isa<SCEVConstant>(IndDesc.getStep()) &&
+ "Induction step not a SCEV constant!");
+ Type *PhiType = IndDesc.getStep()->getType();
+
+ // Build a pointer phi
+ Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
+ Type *ScStValueType = ScalarStartValue->getType();
+ PHINode *NewPointerPhi =
+ PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
+
+ BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+ NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
+
+ // A pointer induction, performed by using a gep
+ const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout();
+ Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
+
+ const SCEV *ScalarStep = IndDesc.getStep();
+ SCEVExpander Exp(SE, DL, "induction");
+ Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
+ Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
+ Value *NumUnrolledElems =
+ State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
+ Value *InductionGEP = GetElementPtrInst::Create(
+ IndDesc.getElementType(), NewPointerPhi,
+ State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
+ InductionLoc);
+ // Add induction update using an incorrect block temporarily. The phi node
+ // will be fixed after VPlan execution. Note that at this point the latch
+ // block cannot be used, as it does not exist yet.
+ // TODO: Model increment value in VPlan, by turning the recipe into a
+ // multi-def and a subclass of VPHeaderPHIRecipe.
+ NewPointerPhi->addIncoming(InductionGEP, VectorPH);
+
+ // Create UF many actual address geps that use the pointer
+ // phi as base and a vectorized version of the step value
+ // (<step*0, ..., step*N>) as offset.
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Type *VecPhiType = VectorType::get(PhiType, State.VF);
+ Value *StartOffsetScalar =
+ State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
+ Value *StartOffset =
+ State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
+ // Create a vector of consecutive numbers from zero to VF.
+ StartOffset = State.Builder.CreateAdd(
+ StartOffset, State.Builder.CreateStepVector(VecPhiType));
+
+ Value *GEP = State.Builder.CreateGEP(
+ IndDesc.getElementType(), NewPointerPhi,
+ State.Builder.CreateMul(
+ StartOffset,
+ State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
+ "vector.gep"));
+ State.set(this, GEP, Part);
+ }
}
-void VPWidenPHIRecipe::execute(VPTransformState &State) {
- State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
- State);
+void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "VPScalarIVStepsRecipe being replicated.");
+
+ // Fast-math-flags propagate from the original induction instruction.
+ IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
+ if (IndDesc.getInductionBinOp() &&
+ isa<FPMathOperator>(IndDesc.getInductionBinOp()))
+ State.Builder.setFastMathFlags(
+ IndDesc.getInductionBinOp()->getFastMathFlags());
+
+ Value *Step = State.get(getStepValue(), VPIteration(0, 0));
+ auto CreateScalarIV = [&](Value *&Step) -> Value * {
+ Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0));
+ auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
+ if (!isCanonical() || CanonicalIV->getType() != Ty) {
+ ScalarIV =
+ Ty->isIntegerTy()
+ ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty)
+ : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty);
+ ScalarIV = emitTransformedIndex(State.Builder, ScalarIV,
+ getStartValue()->getLiveInIRValue(), Step,
+ IndDesc);
+ ScalarIV->setName("offset.idx");
+ }
+ if (TruncToTy) {
+ assert(Step->getType()->isIntegerTy() &&
+ "Truncation requires an integer step");
+ ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy);
+ Step = State.Builder.CreateTrunc(Step, TruncToTy);
+ }
+ return ScalarIV;
+ };
+
+ Value *ScalarIV = CreateScalarIV(Step);
+ if (State.VF.isVector()) {
+ buildScalarSteps(ScalarIV, Step, IndDesc, this, State);
+ return;
+ }
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
+ Value *EntryPart;
+ if (Step->getType()->isFloatingPointTy()) {
+ Value *StartIdx =
+ getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part);
+ // Floating-point operations inherit FMF via the builder's flags.
+ Value *MulOp = State.Builder.CreateFMul(StartIdx, Step);
+ EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(),
+ ScalarIV, MulOp);
+ } else {
+ Value *StartIdx =
+ getRuntimeVF(State.Builder, Step->getType(), State.VF * Part);
+ EntryPart = State.Builder.CreateAdd(
+ ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction");
+ }
+ State.set(this, EntryPart, Part);
+ }
}
void VPBlendRecipe::execute(VPTransformState &State) {
- State.ILV->setDebugLocFromInst(Phi, &State.Builder);
+ State.setDebugLocFromInst(Phi);
// We know that all PHIs in non-header blocks are converted into
// selects, so we don't have to worry about the insertion order and we
// can just use the builder.
@@ -10015,7 +9894,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
// Handle Stores:
if (SI) {
- State.ILV->setDebugLocFromInst(SI);
+ State.setDebugLocFromInst(SI);
for (unsigned Part = 0; Part < State.UF; ++Part) {
Instruction *NewSI = nullptr;
@@ -10041,14 +9920,14 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
else
NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
}
- State.ILV->addMetadata(NewSI, SI);
+ State.addMetadata(NewSI, SI);
}
return;
}
// Handle loads.
assert(LI && "Must have a load instruction");
- State.ILV->setDebugLocFromInst(LI);
+ State.setDebugLocFromInst(LI);
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *NewLI;
if (CreateGatherScatter) {
@@ -10056,7 +9935,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
Value *VectorGep = State.get(getAddr(), Part);
NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
nullptr, "wide.masked.gather");
- State.ILV->addMetadata(NewLI, LI);
+ State.addMetadata(NewLI, LI);
} else {
auto *VecPtr =
CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
@@ -10069,12 +9948,12 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
// Add metadata to the load, but setVectorValue to the reverse shuffle.
- State.ILV->addMetadata(NewLI, LI);
+ State.addMetadata(NewLI, LI);
if (Reverse)
NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
}
- State.set(this, NewLI, Part);
+ State.set(getVPSingleValue(), NewLI, Part);
}
}
@@ -10155,7 +10034,8 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) {
// Check if there is a scalar value for the selected lane.
if (!hasScalarValue(Def, {Part, LastLane})) {
// At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
- assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
+ assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) ||
+ isa<VPScalarIVStepsRecipe>(Def->getDef())) &&
"unexpected recipe found to be invariant");
IsUniform = true;
LastLane = 0;
@@ -10237,8 +10117,7 @@ static bool processLoopInVPlanNativePath(
// If we are stress testing VPlan builds, do not attempt to generate vector
// code. Masked vector code generation support will follow soon.
// Also, do not attempt to vectorize if no vector code will be produced.
- if (VPlanBuildStressTest || EnableVPlanPredication ||
- VectorizationFactor::Disabled() == VF)
+ if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
return false;
VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
@@ -10250,7 +10129,7 @@ static bool processLoopInVPlanNativePath(
&CM, BFI, PSI, Checks);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
<< L->getHeader()->getParent()->getName() << "\"\n");
- LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
+ LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
}
// Mark the loop as already vectorized to avoid vectorizing again.
@@ -10318,8 +10197,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */
- LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
- << L->getHeader()->getParent()->getName() << "\" from "
+ LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
+ << L->getHeader()->getParent()->getName() << "' from "
<< DebugLocStr << "\n");
LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
@@ -10474,10 +10353,30 @@ bool LoopVectorizePass::processLoop(Loop *L) {
VectorizationFactor VF = VectorizationFactor::Disabled();
unsigned IC = 1;
+ GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
+ F->getParent()->getDataLayout());
if (MaybeVF) {
+ if (LVP.requiresTooManyRuntimeChecks()) {
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysisAliasing(
+ DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
+ L->getHeader())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "memory operations";
+ });
+ LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
+ Hints.emitRemarkWithHints();
+ return false;
+ }
VF = *MaybeVF;
// Select the interleave count.
IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
+
+ unsigned SelectedIC = std::max(IC, UserIC);
+ // Optimistically generate runtime checks if they are needed. Drop them if
+ // they turn out to not be profitable.
+ if (VF.Width.isVector() || SelectedIC > 1)
+ Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
}
// Identify the diagnostic messages that should be produced.
@@ -10565,14 +10464,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool DisableRuntimeUnroll = false;
MDNode *OrigLoopID = L->getLoopID();
{
- // Optimistically generate runtime checks. Drop them if they turn out to not
- // be profitable. Limit the scope of Checks, so the cleanup happens
- // immediately after vector codegeneration is done.
- GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
- F->getParent()->getDataLayout());
- if (!VF.Width.isScalar() || IC > 1)
- Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
-
using namespace ore;
if (!VectorizeLoop) {
assert(IC > 1 && "interleave count should not be 1 or 0");
@@ -10582,7 +10473,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
&CM, BFI, PSI, Checks);
VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
- LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
+ LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
ORE->emit([&]() {
return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
@@ -10607,12 +10498,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
- DT);
+ DT, true);
++LoopsVectorized;
- simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
- formLCSSARecursively(*L, *DT, LI, SE);
-
// Second pass vectorizes the epilogue and adjusts the control flow
// edges from the first pass.
EPI.MainLoopVF = EPI.EpilogueVF;
@@ -10622,23 +10510,24 @@ bool LoopVectorizePass::processLoop(Loop *L) {
Checks);
VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
+ VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
+ VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
+ Header->setName("vec.epilog.vector.body");
// Ensure that the start values for any VPReductionPHIRecipes are
// updated before vectorising the epilogue loop.
- VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock();
for (VPRecipeBase &R : Header->phis()) {
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
if (auto *Resume = MainILV.getReductionResumeValue(
ReductionPhi->getRecurrenceDescriptor())) {
- VPValue *StartVal = new VPValue(Resume);
- BestEpiPlan.addExternalDef(StartVal);
+ VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume);
ReductionPhi->setOperand(0, StartVal);
}
}
}
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
- DT);
+ DT, true);
++LoopsEpilogueVectorized;
if (!MainILV.areSafetyChecksAdded())
@@ -10648,7 +10537,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
&LVL, &CM, BFI, PSI, Checks);
VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
- LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
+ LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
++LoopsVectorized;
// Add metadata to disable runtime unrolling a scalar loop when there
@@ -10674,7 +10563,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
Optional<MDNode *> RemainderLoopID =
makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
LLVMLoopVectorizeFollowupEpilogue});
- if (RemainderLoopID.hasValue()) {
+ if (RemainderLoopID) {
L->setLoopID(RemainderLoopID.getValue());
} else {
if (DisableRuntimeUnroll)
@@ -10756,8 +10645,12 @@ LoopVectorizeResult LoopVectorizePass::runImpl(
PreservedAnalyses LoopVectorizePass::run(Function &F,
FunctionAnalysisManager &AM) {
- auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
auto &LI = AM.getResult<LoopAnalysis>(F);
+ // There are no loops in the function. Return before computing other expensive
+ // analyses.
+ if (LI.empty())
+ return PreservedAnalyses::all();
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 644372483edd..019a09665a67 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -53,7 +53,6 @@
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
@@ -64,7 +63,6 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
-#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
@@ -72,8 +70,9 @@
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
+#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Verifier.h"
-#include "llvm/InitializePasses.h"
+#endif
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
@@ -87,6 +86,7 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
@@ -164,13 +164,14 @@ static cl::opt<int> LookAheadMaxDepth(
"slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
cl::desc("The maximum look-ahead depth for operand reordering scores"));
-// The Look-ahead heuristic goes through the users of the bundle to calculate
-// the users cost in getExternalUsesCost(). To avoid compilation time increase
-// we limit the number of users visited to this value.
-static cl::opt<unsigned> LookAheadUsersBudget(
- "slp-look-ahead-users-budget", cl::init(2), cl::Hidden,
- cl::desc("The maximum number of users to visit while visiting the "
- "predecessors. This prevents compilation time increase."));
+// The maximum depth that the look-ahead score heuristic will explore
+// when probing among candidates for vectorization tree roots.
+// The higher this value, the higher the compilation time overhead, but unlike
+// the similar limit for operand reordering this is used less frequently, so
+// the impact of a higher value is less noticeable.
+static cl::opt<int> RootLookAheadMaxDepth(
+ "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
+ cl::desc("The maximum look-ahead depth for searching best rooting option"));
static cl::opt<bool>
ViewSLPTree("view-slp-tree", cl::Hidden,
@@ -471,17 +472,36 @@ static bool isValidForAlternation(unsigned Opcode) {
return true;
}
+static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
+ unsigned BaseIndex = 0);
+
+/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
+/// compatible instructions or constants, or just some other regular values.
+static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
+ Value *Op1) {
+ return (isConstant(BaseOp0) && isConstant(Op0)) ||
+ (isConstant(BaseOp1) && isConstant(Op1)) ||
+ (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
+ !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
+ getSameOpcode({BaseOp0, Op0}).getOpcode() ||
+ getSameOpcode({BaseOp1, Op1}).getOpcode();
+}
+
/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState, the Opcode that we suppose the whole list
/// could be vectorized even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
- unsigned BaseIndex = 0) {
+ unsigned BaseIndex) {
// Make sure these are all Instructions.
if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
+ bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
+ CmpInst::Predicate BasePred =
+ IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
+ : CmpInst::BAD_ICMP_PREDICATE;
unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
unsigned AltOpcode = Opcode;
unsigned AltIndex = BaseIndex;
@@ -514,6 +534,57 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
continue;
}
}
+ } else if (IsCmpOp && isa<CmpInst>(VL[Cnt])) {
+ auto *BaseInst = cast<Instruction>(VL[BaseIndex]);
+ auto *Inst = cast<Instruction>(VL[Cnt]);
+ Type *Ty0 = BaseInst->getOperand(0)->getType();
+ Type *Ty1 = Inst->getOperand(0)->getType();
+ if (Ty0 == Ty1) {
+ Value *BaseOp0 = BaseInst->getOperand(0);
+ Value *BaseOp1 = BaseInst->getOperand(1);
+ Value *Op0 = Inst->getOperand(0);
+ Value *Op1 = Inst->getOperand(1);
+ CmpInst::Predicate CurrentPred =
+ cast<CmpInst>(VL[Cnt])->getPredicate();
+ CmpInst::Predicate SwappedCurrentPred =
+ CmpInst::getSwappedPredicate(CurrentPred);
+        // Check for compatible operands. If the corresponding operands are not
+        // compatible, alternate vectorization needs to be performed.
+ if (InstOpcode == Opcode) {
+ if (BasePred == CurrentPred &&
+ areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1))
+ continue;
+ if (BasePred == SwappedCurrentPred &&
+ areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0))
+ continue;
+ if (E == 2 &&
+ (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
+ continue;
+ auto *AltInst = cast<CmpInst>(VL[AltIndex]);
+ CmpInst::Predicate AltPred = AltInst->getPredicate();
+ Value *AltOp0 = AltInst->getOperand(0);
+ Value *AltOp1 = AltInst->getOperand(1);
+ // Check if operands are compatible with alternate operands.
+ if (AltPred == CurrentPred &&
+ areCompatibleCmpOps(AltOp0, AltOp1, Op0, Op1))
+ continue;
+ if (AltPred == SwappedCurrentPred &&
+ areCompatibleCmpOps(AltOp0, AltOp1, Op1, Op0))
+ continue;
+ }
+ if (BaseIndex == AltIndex && BasePred != CurrentPred) {
+ assert(isValidForAlternation(Opcode) &&
+ isValidForAlternation(InstOpcode) &&
+                 "Cmp isn't safe for alternation, logic needs to be updated!");
+ AltIndex = Cnt;
+ continue;
+ }
+ auto *AltInst = cast<CmpInst>(VL[AltIndex]);
+ CmpInst::Predicate AltPred = AltInst->getPredicate();
+ if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
+ AltPred == CurrentPred || AltPred == SwappedCurrentPred)
+ continue;
+ }
} else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
@@ -570,7 +641,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
CallInst *CI = cast<CallInst>(UserInst);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
- if (hasVectorInstrinsicScalarOpd(ID, i))
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, i))
return (CI->getArgOperand(i) == Scalar);
}
LLVM_FALLTHROUGH;
@@ -666,11 +737,11 @@ static void inversePermutation(ArrayRef<unsigned> Indices,
/// \returns inserting index of InsertElement or InsertValue instruction,
/// using Offset as base offset for index.
-static Optional<unsigned> getInsertIndex(Value *InsertInst,
+static Optional<unsigned> getInsertIndex(const Value *InsertInst,
unsigned Offset = 0) {
int Index = Offset;
- if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
- if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
+ if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
+ if (const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
auto *VT = cast<FixedVectorType>(IE->getType());
if (CI->getValue().uge(VT->getNumElements()))
return None;
@@ -681,13 +752,13 @@ static Optional<unsigned> getInsertIndex(Value *InsertInst,
return None;
}
- auto *IV = cast<InsertValueInst>(InsertInst);
+ const auto *IV = cast<InsertValueInst>(InsertInst);
Type *CurrentType = IV->getType();
for (unsigned I : IV->indices()) {
- if (auto *ST = dyn_cast<StructType>(CurrentType)) {
+ if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
Index *= ST->getNumElements();
CurrentType = ST->getElementType(I);
- } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
+ } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
Index *= AT->getNumElements();
CurrentType = AT->getElementType();
} else {
@@ -698,11 +769,7 @@ static Optional<unsigned> getInsertIndex(Value *InsertInst,
return Index;
}
-/// Reorders the list of scalars in accordance with the given \p Order and then
-/// the \p Mask. \p Order - is the original order of the scalars, need to
-/// reorder scalars into an unordered state at first according to the given
-/// order. Then the ordered scalars are shuffled once again in accordance with
-/// the provided mask.
+/// Reorders the list of scalars in accordance with the given \p Mask.
static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
ArrayRef<int> Mask) {
assert(!Mask.empty() && "Expected non-empty mask.");
@@ -714,6 +781,58 @@ static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
Scalars[Mask[I]] = Prev[I];
}
+/// Checks if the provided value does not require scheduling. It does not
+/// require scheduling if this is not an instruction or it is an instruction
+/// that does not read/write memory and all of its operands are either not
+/// instructions, or phi nodes, or instructions from different blocks.
+static bool areAllOperandsNonInsts(Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return true;
+ return !mayHaveNonDefUseDependency(*I) &&
+ all_of(I->operands(), [I](Value *V) {
+ auto *IO = dyn_cast<Instruction>(V);
+ if (!IO)
+ return true;
+ return isa<PHINode>(IO) || IO->getParent() != I->getParent();
+ });
+}
+
+/// Checks if the provided value does not require scheduling. It does not
+/// require scheduling if this is not an instruction or it is an instruction
+/// that does not read/write memory and all users are phi nodes or instructions
+/// from other blocks.
+static bool isUsedOutsideBlock(Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return true;
+ // Limits the number of uses to save compile time.
+ constexpr int UsesLimit = 8;
+ return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
+ all_of(I->users(), [I](User *U) {
+ auto *IU = dyn_cast<Instruction>(U);
+ if (!IU)
+ return true;
+ return IU->getParent() != I->getParent() || isa<PHINode>(IU);
+ });
+}
+
+/// Checks if the specified value does not require scheduling. It does not
+/// require scheduling if all operands and all users do not need to be scheduled
+/// in the current basic block.
+static bool doesNotNeedToBeScheduled(Value *V) {
+ return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
+}
+
+/// Checks if the specified array of instructions does not require scheduling.
+/// This is the case if either all instructions have operands that do not
+/// require scheduling, or all their users do not require scheduling because
+/// they are phis or reside in other basic blocks.
+static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
+ return !VL.empty() &&
+ (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
+}
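
A simplified, LLVM-free model of the three helpers above may make the intent clearer. The Inst struct and its field names below are invented for the sketch, and a single TouchesMemory flag stands in for the memory-effect queries used in the real code.

#include <algorithm>
#include <cassert>
#include <vector>

struct Inst {
  int Block = 0;               // basic block id
  bool TouchesMemory = false;  // collapses the memory-effect queries
  bool IsPhi = false;
  std::vector<const Inst *> Operands;
  std::vector<const Inst *> Users;
};

// ~ areAllOperandsNonInsts: nothing in this block has to run before it.
static bool operandsNeedNoScheduling(const Inst &I) {
  return !I.TouchesMemory &&
         std::all_of(I.Operands.begin(), I.Operands.end(), [&](const Inst *Op) {
           return Op->IsPhi || Op->Block != I.Block;
         });
}

// ~ isUsedOutsideBlock: nothing in this block waits for it.
static bool usersNeedNoScheduling(const Inst &I) {
  return !I.TouchesMemory &&
         std::all_of(I.Users.begin(), I.Users.end(), [&](const Inst *U) {
           return U->IsPhi || U->Block != I.Block;
         });
}

// ~ doesNotNeedToBeScheduled.
static bool needsNoScheduling(const Inst &I) {
  return operandsNeedNoScheduling(I) && usersNeedNoScheduling(I);
}

int main() {
  Inst Producer, Consumer;
  Producer.Block = 1;
  Consumer.Block = 2;                   // user lives in another block
  Producer.Users.push_back(&Consumer);
  assert(needsNoScheduling(Producer));  // can sit at either end of its block
  Consumer.Block = 1;                   // same-block, non-phi user
  assert(!usersNeedNoScheduling(Producer));
  return 0;
}
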
+
namespace slpvectorizer {
/// Bottom Up SLP Vectorizer.
@@ -734,8 +853,8 @@ public:
TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
const DataLayout *DL, OptimizationRemarkEmitter *ORE)
- : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
- DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
+ : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li),
+ DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
// Use the vector register size specified by the target unless overridden
// by a command-line option.
@@ -776,7 +895,10 @@ public:
/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
void buildTree(ArrayRef<Value *> Roots,
- ArrayRef<Value *> UserIgnoreLst = None);
+ const SmallDenseSet<Value *> &UserIgnoreLst);
+
+ /// Construct a vectorizable tree that starts at \p Roots.
+ void buildTree(ArrayRef<Value *> Roots);
/// Builds external uses of the vectorized scalars, i.e. the list of
/// vectorized scalars to be extracted, their lanes and their scalar users. \p
@@ -797,6 +919,7 @@ public:
}
MinBWs.clear();
InstrElementSize.clear();
+ UserIgnoreList = nullptr;
}
unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -810,6 +933,9 @@ public:
/// ExtractElement, ExtractValue), which can be part of the graph.
Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
+ /// Sort loads into increasing pointers offsets to allow greater clustering.
+ Optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
+
/// Gets reordering data for the given tree entry. If the entry is vectorized
/// - just return ReorderIndices, otherwise check if the scalars can be
/// reordered and return the most optimal order.
@@ -924,96 +1050,18 @@ public:
#endif
};
- /// A helper data structure to hold the operands of a vector of instructions.
- /// This supports a fixed vector length for all operand vectors.
- class VLOperands {
- /// For each operand we need (i) the value, and (ii) the opcode that it
- /// would be attached to if the expression was in a left-linearized form.
- /// This is required to avoid illegal operand reordering.
- /// For example:
- /// \verbatim
- /// 0 Op1
- /// |/
- /// Op1 Op2 Linearized + Op2
- /// \ / ----------> |/
- /// - -
- ///
- /// Op1 - Op2 (0 + Op1) - Op2
- /// \endverbatim
- ///
- /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
- ///
- /// Another way to think of this is to track all the operations across the
- /// path from the operand all the way to the root of the tree and to
- /// calculate the operation that corresponds to this path. For example, the
- /// path from Op2 to the root crosses the RHS of the '-', therefore the
- /// corresponding operation is a '-' (which matches the one in the
- /// linearized tree, as shown above).
- ///
- /// For lack of a better term, we refer to this operation as Accumulated
- /// Path Operation (APO).
- struct OperandData {
- OperandData() = default;
- OperandData(Value *V, bool APO, bool IsUsed)
- : V(V), APO(APO), IsUsed(IsUsed) {}
- /// The operand value.
- Value *V = nullptr;
- /// TreeEntries only allow a single opcode, or an alternate sequence of
- /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
- /// APO. It is set to 'true' if 'V' is attached to an inverse operation
- /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
- /// (e.g., Add/Mul)
- bool APO = false;
- /// Helper data for the reordering function.
- bool IsUsed = false;
- };
-
- /// During operand reordering, we are trying to select the operand at lane
- /// that matches best with the operand at the neighboring lane. Our
- /// selection is based on the type of value we are looking for. For example,
- /// if the neighboring lane has a load, we need to look for a load that is
- /// accessing a consecutive address. These strategies are summarized in the
- /// 'ReorderingMode' enumerator.
- enum class ReorderingMode {
- Load, ///< Matching loads to consecutive memory addresses
- Opcode, ///< Matching instructions based on opcode (same or alternate)
- Constant, ///< Matching constants
- Splat, ///< Matching the same instruction multiple times (broadcast)
- Failed, ///< We failed to create a vectorizable group
- };
-
- using OperandDataVec = SmallVector<OperandData, 2>;
-
- /// A vector of operand vectors.
- SmallVector<OperandDataVec, 4> OpsVec;
-
+ /// A helper class used for scoring candidates for two consecutive lanes.
+ class LookAheadHeuristics {
const DataLayout &DL;
ScalarEvolution &SE;
const BoUpSLP &R;
+ int NumLanes; // Total number of lanes (aka vectorization factor).
+ int MaxLevel; // The maximum recursion depth for accumulating score.
- /// \returns the operand data at \p OpIdx and \p Lane.
- OperandData &getData(unsigned OpIdx, unsigned Lane) {
- return OpsVec[OpIdx][Lane];
- }
-
- /// \returns the operand data at \p OpIdx and \p Lane. Const version.
- const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
- return OpsVec[OpIdx][Lane];
- }
-
- /// Clears the used flag for all entries.
- void clearUsed() {
- for (unsigned OpIdx = 0, NumOperands = getNumOperands();
- OpIdx != NumOperands; ++OpIdx)
- for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
- ++Lane)
- OpsVec[OpIdx][Lane].IsUsed = false;
- }
-
- /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
- void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
- std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
- }
+ public:
+ LookAheadHeuristics(const DataLayout &DL, ScalarEvolution &SE,
+ const BoUpSLP &R, int NumLanes, int MaxLevel)
+ : DL(DL), SE(SE), R(R), NumLanes(NumLanes), MaxLevel(MaxLevel) {}
// The hard-coded scores listed here are not very important, though it shall
// be higher for better matches to improve the resulting cost. When
@@ -1028,6 +1076,11 @@ public:
/// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreConsecutiveLoads = 4;
+ /// The same load multiple times. This should have a better score than
+  /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
+  /// with `movddup (%reg), xmm0`, which has a throughput of 0.5 versus 0.5 for
+  /// a vector load plus 1.0 for a broadcast.
+ static const int ScoreSplatLoads = 3;
/// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreReversedLoads = 3;
/// ExtractElementInst from same vector and consecutive indexes.
@@ -1046,43 +1099,67 @@ public:
static const int ScoreUndef = 1;
/// Score for failing to find a decent match.
static const int ScoreFail = 0;
- /// User exteranl to the vectorized code.
- static const int ExternalUseCost = 1;
- /// The user is internal but in a different lane.
- static const int UserInDiffLaneCost = ExternalUseCost;
+ /// Score if all users are vectorized.
+ static const int ScoreAllUserVectorized = 1;
/// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
- static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
- ScalarEvolution &SE, int NumLanes) {
- if (V1 == V2)
- return VLOperands::ScoreSplat;
+ /// \p U1 and \p U2 are the users of \p V1 and \p V2.
+ /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
+ /// MainAltOps.
+ int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
+ ArrayRef<Value *> MainAltOps) const {
+ if (V1 == V2) {
+ if (isa<LoadInst>(V1)) {
+          // Returns true if the users of V1 and V2 won't need to be extracted.
+ auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
+ // Bail out if we have too many uses to save compilation time.
+ static constexpr unsigned Limit = 8;
+ if (V1->hasNUsesOrMore(Limit) || V2->hasNUsesOrMore(Limit))
+ return false;
+
+ auto AllUsersVectorized = [U1, U2, this](Value *V) {
+ return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
+ return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
+ });
+ };
+ return AllUsersVectorized(V1) && AllUsersVectorized(V2);
+ };
+ // A broadcast of a load can be cheaper on some targets.
+ if (R.TTI->isLegalBroadcastLoad(V1->getType(),
+ ElementCount::getFixed(NumLanes)) &&
+ ((int)V1->getNumUses() == NumLanes ||
+ AllUsersAreInternal(V1, V2)))
+ return LookAheadHeuristics::ScoreSplatLoads;
+ }
+ return LookAheadHeuristics::ScoreSplat;
+ }
auto *LI1 = dyn_cast<LoadInst>(V1);
auto *LI2 = dyn_cast<LoadInst>(V2);
if (LI1 && LI2) {
if (LI1->getParent() != LI2->getParent())
- return VLOperands::ScoreFail;
+ return LookAheadHeuristics::ScoreFail;
Optional<int> Dist = getPointersDiff(
LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
- if (!Dist)
- return VLOperands::ScoreFail;
+ if (!Dist || *Dist == 0)
+ return LookAheadHeuristics::ScoreFail;
// The distance is too large - still may be profitable to use masked
// loads/gathers.
if (std::abs(*Dist) > NumLanes / 2)
- return VLOperands::ScoreAltOpcodes;
+ return LookAheadHeuristics::ScoreAltOpcodes;
// This still will detect consecutive loads, but we might have "holes"
// in some cases. It is ok for non-power-2 vectorization and may produce
// better results. It should not affect current vectorization.
- return (*Dist > 0) ? VLOperands::ScoreConsecutiveLoads
- : VLOperands::ScoreReversedLoads;
+ return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
+ : LookAheadHeuristics::ScoreReversedLoads;
}
auto *C1 = dyn_cast<Constant>(V1);
auto *C2 = dyn_cast<Constant>(V2);
if (C1 && C2)
- return VLOperands::ScoreConstants;
+ return LookAheadHeuristics::ScoreConstants;
// Extracts from consecutive indexes of the same vector better score as
// the extracts could be optimized away.
@@ -1091,7 +1168,7 @@ public:
if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
// Undefs are always profitable for extractelements.
if (isa<UndefValue>(V2))
- return VLOperands::ScoreConsecutiveExtracts;
+ return LookAheadHeuristics::ScoreConsecutiveExtracts;
Value *EV2 = nullptr;
ConstantInt *Ex2Idx = nullptr;
if (match(V2,
@@ -1099,108 +1176,62 @@ public:
m_Undef())))) {
// Undefs are always profitable for extractelements.
if (!Ex2Idx)
- return VLOperands::ScoreConsecutiveExtracts;
+ return LookAheadHeuristics::ScoreConsecutiveExtracts;
if (isUndefVector(EV2) && EV2->getType() == EV1->getType())
- return VLOperands::ScoreConsecutiveExtracts;
+ return LookAheadHeuristics::ScoreConsecutiveExtracts;
if (EV2 == EV1) {
int Idx1 = Ex1Idx->getZExtValue();
int Idx2 = Ex2Idx->getZExtValue();
int Dist = Idx2 - Idx1;
// The distance is too large - still may be profitable to use
// shuffles.
+ if (std::abs(Dist) == 0)
+ return LookAheadHeuristics::ScoreSplat;
if (std::abs(Dist) > NumLanes / 2)
- return VLOperands::ScoreAltOpcodes;
- return (Dist > 0) ? VLOperands::ScoreConsecutiveExtracts
- : VLOperands::ScoreReversedExtracts;
+ return LookAheadHeuristics::ScoreSameOpcode;
+ return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
+ : LookAheadHeuristics::ScoreReversedExtracts;
}
+ return LookAheadHeuristics::ScoreAltOpcodes;
}
+ return LookAheadHeuristics::ScoreFail;
}
auto *I1 = dyn_cast<Instruction>(V1);
auto *I2 = dyn_cast<Instruction>(V2);
if (I1 && I2) {
if (I1->getParent() != I2->getParent())
- return VLOperands::ScoreFail;
- InstructionsState S = getSameOpcode({I1, I2});
+ return LookAheadHeuristics::ScoreFail;
+ SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
+ Ops.push_back(I1);
+ Ops.push_back(I2);
+ InstructionsState S = getSameOpcode(Ops);
// Note: Only consider instructions with <= 2 operands to avoid
// complexity explosion.
- if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
- return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
- : VLOperands::ScoreSameOpcode;
+ if (S.getOpcode() &&
+ (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
+ !S.isAltShuffle()) &&
+ all_of(Ops, [&S](Value *V) {
+ return cast<Instruction>(V)->getNumOperands() ==
+ S.MainOp->getNumOperands();
+ }))
+ return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
+ : LookAheadHeuristics::ScoreSameOpcode;
}
if (isa<UndefValue>(V2))
- return VLOperands::ScoreUndef;
-
- return VLOperands::ScoreFail;
- }
-
- /// Holds the values and their lanes that are taking part in the look-ahead
- /// score calculation. This is used in the external uses cost calculation.
- /// Need to hold all the lanes in case of splat/broadcast at least to
- /// correctly check for the use in the different lane.
- SmallDenseMap<Value *, SmallSet<int, 4>> InLookAheadValues;
-
- /// \returns the additional cost due to uses of \p LHS and \p RHS that are
- /// either external to the vectorized code, or require shuffling.
- int getExternalUsesCost(const std::pair<Value *, int> &LHS,
- const std::pair<Value *, int> &RHS) {
- int Cost = 0;
- std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}};
- for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
- Value *V = Values[Idx].first;
- if (isa<Constant>(V)) {
- // Since this is a function pass, it doesn't make semantic sense to
- // walk the users of a subclass of Constant. The users could be in
- // another function, or even another module that happens to be in
- // the same LLVMContext.
- continue;
- }
+ return LookAheadHeuristics::ScoreUndef;
- // Calculate the absolute lane, using the minimum relative lane of LHS
- // and RHS as base and Idx as the offset.
- int Ln = std::min(LHS.second, RHS.second) + Idx;
- assert(Ln >= 0 && "Bad lane calculation");
- unsigned UsersBudget = LookAheadUsersBudget;
- for (User *U : V->users()) {
- if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
- // The user is in the VectorizableTree. Check if we need to insert.
- int UserLn = UserTE->findLaneForValue(U);
- assert(UserLn >= 0 && "Bad lane");
- // If the values are different, check just the line of the current
- // value. If the values are the same, need to add UserInDiffLaneCost
- // only if UserLn does not match both line numbers.
- if ((LHS.first != RHS.first && UserLn != Ln) ||
- (LHS.first == RHS.first && UserLn != LHS.second &&
- UserLn != RHS.second)) {
- Cost += UserInDiffLaneCost;
- break;
- }
- } else {
- // Check if the user is in the look-ahead code.
- auto It2 = InLookAheadValues.find(U);
- if (It2 != InLookAheadValues.end()) {
- // The user is in the look-ahead code. Check the lane.
- if (!It2->getSecond().contains(Ln)) {
- Cost += UserInDiffLaneCost;
- break;
- }
- } else {
- // The user is neither in SLP tree nor in the look-ahead code.
- Cost += ExternalUseCost;
- break;
- }
- }
- // Limit the number of visited uses to cap compilation time.
- if (--UsersBudget == 0)
- break;
- }
- }
- return Cost;
+ return LookAheadHeuristics::ScoreFail;
}
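
The load branch of the score above reduces to a small piece of integer logic. Below is a sketch with the score constants inlined and Dist playing the role of getPointersDiff's result; both are simplifications for illustration, not the heuristic's actual entry points.

#include <cassert>
#include <cstdlib>

// Scores copied from the table above.
enum : int {
  ScoreFail = 0,
  ScoreAltOpcodes = 1,
  ScoreReversedLoads = 3,
  ScoreConsecutiveLoads = 4
};

static int scoreLoadPair(int Dist, int NumLanes) {
  if (Dist == 0)
    return ScoreFail;        // identical loads are handled as a splat earlier
  if (std::abs(Dist) > NumLanes / 2)
    return ScoreAltOpcodes;  // far apart: maybe a masked load/gather later
  return Dist > 0 ? ScoreConsecutiveLoads : ScoreReversedLoads;
}

int main() {
  assert(scoreLoadPair(+1, /*NumLanes=*/4) == ScoreConsecutiveLoads); // A[i], A[i+1]
  assert(scoreLoadPair(-1, /*NumLanes=*/4) == ScoreReversedLoads);    // A[i+1], A[i]
  assert(scoreLoadPair(+5, /*NumLanes=*/4) == ScoreAltOpcodes);       // holes too big
  return 0;
}
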
- /// Go through the operands of \p LHS and \p RHS recursively until \p
- /// MaxLevel, and return the cummulative score. For example:
+ /// Go through the operands of \p LHS and \p RHS recursively until
+    /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
+ /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
+ /// of \p U1 and \p U2), except at the beginning of the recursion where
+ /// these are set to nullptr.
+ ///
+ /// For example:
/// \verbatim
/// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
/// \ / \ / \ / \ /
@@ -1211,8 +1242,8 @@ public:
/// each level recursively, accumulating the score. It starts from matching
/// the additions at level 0, then moves on to the loads (level 1). The
/// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
- /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
- /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
+ /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
+ /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
/// Please note that the order of the operands does not matter, as we
/// evaluate the score of all profitable combinations of operands. In
/// other words the score of G1 and G4 is the same as G1 and G2. This
@@ -1220,18 +1251,13 @@ public:
/// Look-ahead SLP: Auto-vectorization in the presence of commutative
/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
/// Luís F. W. Góes
- int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
- const std::pair<Value *, int> &RHS, int CurrLevel,
- int MaxLevel) {
+ int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
+ Instruction *U2, int CurrLevel,
+ ArrayRef<Value *> MainAltOps) const {
- Value *V1 = LHS.first;
- Value *V2 = RHS.first;
// Get the shallow score of V1 and V2.
- int ShallowScoreAtThisLevel = std::max(
- (int)ScoreFail, getShallowScore(V1, V2, DL, SE, getNumLanes()) -
- getExternalUsesCost(LHS, RHS));
- int Lane1 = LHS.second;
- int Lane2 = RHS.second;
+ int ShallowScoreAtThisLevel =
+ getShallowScore(LHS, RHS, U1, U2, MainAltOps);
// If reached MaxLevel,
// or if V1 and V2 are not instructions,
@@ -1239,20 +1265,17 @@ public:
// or if they are not consecutive,
// or if profitable to vectorize loads or extractelements, early return
// the current cost.
- auto *I1 = dyn_cast<Instruction>(V1);
- auto *I2 = dyn_cast<Instruction>(V2);
+ auto *I1 = dyn_cast<Instruction>(LHS);
+ auto *I2 = dyn_cast<Instruction>(RHS);
if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
- ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
+ ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
(((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
+ (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
(isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
ShallowScoreAtThisLevel))
return ShallowScoreAtThisLevel;
assert(I1 && I2 && "Should have early exited.");
- // Keep track of in-tree values for determining the external-use cost.
- InLookAheadValues[V1].insert(Lane1);
- InLookAheadValues[V2].insert(Lane2);
-
// Contains the I2 operand indexes that got matched with I1 operands.
SmallSet<unsigned, 4> Op2Used;
@@ -1275,11 +1298,12 @@ public:
if (Op2Used.count(OpIdx2))
continue;
// Recursively calculate the cost at each level
- int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
- {I2->getOperand(OpIdx2), Lane2},
- CurrLevel + 1, MaxLevel);
+ int TmpScore =
+ getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
+ I1, I2, CurrLevel + 1, None);
// Look for the best score.
- if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
+ if (TmpScore > LookAheadHeuristics::ScoreFail &&
+ TmpScore > MaxTmpScore) {
MaxTmpScore = TmpScore;
MaxOpIdx2 = OpIdx2;
FoundBest = true;
@@ -1293,24 +1317,213 @@ public:
}
return ShallowScoreAtThisLevel;
}
+ };
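
The overall shape of getScoreAtLevelRec(), a shallow match plus the best pairwise score of the operands, bounded by MaxLevel, can be sketched over a toy expression node. Kind and the weights below are placeholders for the real opcode checks and score table, chosen only for illustration.

#include <cassert>
#include <cstddef>
#include <vector>

struct Node {
  int Kind;                        // stands in for opcode / load distance
  std::vector<const Node *> Ops;
};

static int shallowScore(const Node &A, const Node &B) {
  return A.Kind == B.Kind ? 2 : 0; // 2 ~ ScoreSameOpcode, 0 ~ ScoreFail
}

static int scoreAtLevel(const Node &A, const Node &B, int Level, int MaxLevel) {
  int Score = shallowScore(A, B);
  if (Level == MaxLevel || Score == 0 || A.Ops.empty() || B.Ops.empty())
    return Score;
  // Greedily pair each operand of A with its best-scoring unused operand of B.
  std::vector<bool> Used(B.Ops.size(), false);
  for (const Node *OpA : A.Ops) {
    int Best = 0, BestIdx = -1;
    for (std::size_t J = 0; J < B.Ops.size(); ++J) {
      if (Used[J])
        continue;
      int S = scoreAtLevel(*OpA, *B.Ops[J], Level + 1, MaxLevel);
      if (S > Best) { Best = S; BestIdx = static_cast<int>(J); }
    }
    if (BestIdx >= 0) { Used[BestIdx] = true; Score += Best; }
  }
  return Score;
}

int main() {
  Node LoadA0{1, {}}, LoadA1{1, {}}, LoadC0{2, {}};
  Node AddA0{0, {&LoadA0}}, AddA1{0, {&LoadA1}}, AddC0{0, {&LoadC0}};
  // Pairing (A[0]+...) with (A[1]+...) beats pairing it with (C[0]+...),
  // because the recursion also rewards the matching loads one level down.
  assert(scoreAtLevel(AddA0, AddA1, 1, 2) > scoreAtLevel(AddA0, AddC0, 1, 2));
  return 0;
}
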
+ /// A helper data structure to hold the operands of a vector of instructions.
+ /// This supports a fixed vector length for all operand vectors.
+ class VLOperands {
+ /// For each operand we need (i) the value, and (ii) the opcode that it
+ /// would be attached to if the expression was in a left-linearized form.
+ /// This is required to avoid illegal operand reordering.
+ /// For example:
+ /// \verbatim
+ /// 0 Op1
+ /// |/
+ /// Op1 Op2 Linearized + Op2
+ /// \ / ----------> |/
+ /// - -
+ ///
+ /// Op1 - Op2 (0 + Op1) - Op2
+ /// \endverbatim
+ ///
+ /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
+ ///
+ /// Another way to think of this is to track all the operations across the
+ /// path from the operand all the way to the root of the tree and to
+ /// calculate the operation that corresponds to this path. For example, the
+ /// path from Op2 to the root crosses the RHS of the '-', therefore the
+ /// corresponding operation is a '-' (which matches the one in the
+ /// linearized tree, as shown above).
+ ///
+ /// For lack of a better term, we refer to this operation as Accumulated
+ /// Path Operation (APO).
+ struct OperandData {
+ OperandData() = default;
+ OperandData(Value *V, bool APO, bool IsUsed)
+ : V(V), APO(APO), IsUsed(IsUsed) {}
+ /// The operand value.
+ Value *V = nullptr;
+ /// TreeEntries only allow a single opcode, or an alternate sequence of
+ /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
+ /// APO. It is set to 'true' if 'V' is attached to an inverse operation
+ /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
+ /// (e.g., Add/Mul)
+ bool APO = false;
+ /// Helper data for the reordering function.
+ bool IsUsed = false;
+ };
+
+ /// During operand reordering, we are trying to select the operand at lane
+ /// that matches best with the operand at the neighboring lane. Our
+ /// selection is based on the type of value we are looking for. For example,
+ /// if the neighboring lane has a load, we need to look for a load that is
+ /// accessing a consecutive address. These strategies are summarized in the
+ /// 'ReorderingMode' enumerator.
+ enum class ReorderingMode {
+ Load, ///< Matching loads to consecutive memory addresses
+ Opcode, ///< Matching instructions based on opcode (same or alternate)
+ Constant, ///< Matching constants
+ Splat, ///< Matching the same instruction multiple times (broadcast)
+ Failed, ///< We failed to create a vectorizable group
+ };
+
+ using OperandDataVec = SmallVector<OperandData, 2>;
+
+ /// A vector of operand vectors.
+ SmallVector<OperandDataVec, 4> OpsVec;
+
+ const DataLayout &DL;
+ ScalarEvolution &SE;
+ const BoUpSLP &R;
+
+ /// \returns the operand data at \p OpIdx and \p Lane.
+ OperandData &getData(unsigned OpIdx, unsigned Lane) {
+ return OpsVec[OpIdx][Lane];
+ }
+
+ /// \returns the operand data at \p OpIdx and \p Lane. Const version.
+ const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
+ return OpsVec[OpIdx][Lane];
+ }
+
+ /// Clears the used flag for all entries.
+ void clearUsed() {
+ for (unsigned OpIdx = 0, NumOperands = getNumOperands();
+ OpIdx != NumOperands; ++OpIdx)
+ for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
+ ++Lane)
+ OpsVec[OpIdx][Lane].IsUsed = false;
+ }
+
+ /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
+ void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
+ std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
+ }
+
+ /// \param Lane lane of the operands under analysis.
+ /// \param OpIdx operand index in \p Lane lane we're looking the best
+ /// candidate for.
+ /// \param Idx operand index of the current candidate value.
+ /// \returns The additional score due to possible broadcasting of the
+    /// elements in the lane. It is more profitable to have power-of-2 unique
+    /// elements in the lane, since they will be vectorized with higher
+    /// probability after removing duplicates. Currently the SLP vectorizer
+    /// supports only vectorization of a power-of-2 number of unique scalars.
+ int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
+ Value *IdxLaneV = getData(Idx, Lane).V;
+ if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
+ return 0;
+ SmallPtrSet<Value *, 4> Uniques;
+ for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
+ if (Ln == Lane)
+ continue;
+ Value *OpIdxLnV = getData(OpIdx, Ln).V;
+ if (!isa<Instruction>(OpIdxLnV))
+ return 0;
+ Uniques.insert(OpIdxLnV);
+ }
+ int UniquesCount = Uniques.size();
+ int UniquesCntWithIdxLaneV =
+ Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
+ Value *OpIdxLaneV = getData(OpIdx, Lane).V;
+ int UniquesCntWithOpIdxLaneV =
+ Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
+ if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
+ return 0;
+ return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
+ UniquesCntWithOpIdxLaneV) -
+ (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
+ }
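
The power-of-two arithmetic in getSplatScore() is easiest to see with concrete counts. The sketch below renames the two "unique scalars including X" counts for clarity and substitutes a tiny powerOf2Ceil; both are assumptions made for the example, not the patch's identifiers.

#include <cassert>
#include <cstdint>

static std::uint64_t powerOf2Ceil(std::uint64_t X) { // stand-in for llvm::PowerOf2Ceil
  std::uint64_t P = 1;
  while (P < X)
    P <<= 1;
  return P;
}

// Positive when picking the candidate leaves the operand needing less padding
// up to the next power of two than keeping the current value would.
static int splatScore(int UniquesWithCandidate, int UniquesWithCurrent) {
  return static_cast<int>(powerOf2Ceil(UniquesWithCurrent) - UniquesWithCurrent) -
         static_cast<int>(powerOf2Ceil(UniquesWithCandidate) - UniquesWithCandidate);
}

int main() {
  // Other lanes already hold 3 unique scalars. The candidate would be a 4th
  // unique value (4 is a power of two, no padding needed), while the current
  // value is a repeat (3 uniques would need padding up to 4): candidate +1.
  assert(splatScore(/*UniquesWithCandidate=*/4, /*UniquesWithCurrent=*/3) == 1);
  // Both counts already powers of two: no preference either way.
  assert(splatScore(/*UniquesWithCandidate=*/4, /*UniquesWithCurrent=*/4) == 0);
  return 0;
}
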
+
+ /// \param Lane lane of the operands under analysis.
+ /// \param OpIdx operand index in \p Lane lane we're looking the best
+ /// candidate for.
+ /// \param Idx operand index of the current candidate value.
+    /// \returns The additional score for the scalar whose users are all
+    /// vectorized.
+ int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
+ Value *IdxLaneV = getData(Idx, Lane).V;
+ Value *OpIdxLaneV = getData(OpIdx, Lane).V;
+ // Do not care about number of uses for vector-like instructions
+ // (extractelement/extractvalue with constant indices), they are extracts
+ // themselves and already externally used. Vectorization of such
+ // instructions does not add extra extractelement instruction, just may
+ // remove it.
+ if (isVectorLikeInstWithConstOps(IdxLaneV) &&
+ isVectorLikeInstWithConstOps(OpIdxLaneV))
+ return LookAheadHeuristics::ScoreAllUserVectorized;
+ auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
+ if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
+ return 0;
+ return R.areAllUsersVectorized(IdxLaneI, None)
+ ? LookAheadHeuristics::ScoreAllUserVectorized
+ : 0;
+ }
+
+ /// Score scaling factor for fully compatible instructions but with
+ /// different number of external uses. Allows better selection of the
+ /// instructions with less external uses.
+ static const int ScoreScaleFactor = 10;
/// \Returns the look-ahead score, which tells us how much the sub-trees
    /// rooted at \p LHS and \p RHS match; the more they match, the higher the
/// score. This helps break ties in an informed way when we cannot decide on
/// the order of the operands by just considering the immediate
/// predecessors.
- int getLookAheadScore(const std::pair<Value *, int> &LHS,
- const std::pair<Value *, int> &RHS) {
- InLookAheadValues.clear();
- return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
+ int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
+ int Lane, unsigned OpIdx, unsigned Idx,
+ bool &IsUsed) {
+ LookAheadHeuristics LookAhead(DL, SE, R, getNumLanes(),
+ LookAheadMaxDepth);
+ // Keep track of the instruction stack as we recurse into the operands
+ // during the look-ahead score exploration.
+ int Score =
+ LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
+ /*CurrLevel=*/1, MainAltOps);
+ if (Score) {
+ int SplatScore = getSplatScore(Lane, OpIdx, Idx);
+ if (Score <= -SplatScore) {
+ // Set the minimum score for splat-like sequence to avoid setting
+ // failed state.
+ Score = 1;
+ } else {
+ Score += SplatScore;
+          // Scale the score so that we can tell apart genuinely different
+          // operands from similar operands that differ only in whether all of
+          // their uses are vectorized. It does not affect the actual selection
+          // of the best compatible operand in general, it just allows
+          // preferring the operand whose uses are all vectorized.
+ Score *= ScoreScaleFactor;
+ Score += getExternalUseScore(Lane, OpIdx, Idx);
+ IsUsed = true;
+ }
+ }
+ return Score;
}
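
How the pieces are combined in getLookAheadScore() is worth a small worked example: a failed look-ahead stays a failure, a splat-dominated score is clamped to 1, and otherwise the sum is scaled by ScoreScaleFactor with the external-use bonus acting as a tie-breaker. The inputs below are made up for illustration.

#include <cassert>

static int combineScores(int LookAhead, int SplatScore, int ExternalUseBonus) {
  const int ScoreScaleFactor = 10;
  if (LookAhead == 0)
    return 0;                      // ScoreFail stays a failure
  if (LookAhead <= -SplatScore)
    return 1;                      // keep a minimal, non-failed score
  return (LookAhead + SplatScore) * ScoreScaleFactor + ExternalUseBonus;
}

int main() {
  // Two otherwise identical candidates: the one whose uses are all vectorized
  // (bonus 1) wins by a single point after scaling, as intended.
  assert(combineScores(4, 1, /*ExternalUseBonus=*/1) == 51);
  assert(combineScores(4, 1, /*ExternalUseBonus=*/0) == 50);
  // A negative splat score at least as large as the score clamps to 1.
  assert(combineScores(2, -3, /*ExternalUseBonus=*/1) == 1);
  return 0;
}
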
+ /// Best defined scores per lanes between the passes. Used to choose the
+ /// best operand (with the highest score) between the passes.
+ /// The key - {Operand Index, Lane}.
+ /// The value - the best score between the passes for the lane and the
+ /// operand.
+ SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
+ BestScoresPerLanes;
+
// Search all operands in Ops[*][Lane] for the one that matches best
    // Ops[OpIdx][LastLane] and return its operand index.
// If no good match can be found, return None.
- Optional<unsigned>
- getBestOperand(unsigned OpIdx, int Lane, int LastLane,
- ArrayRef<ReorderingMode> ReorderingModes) {
+ Optional<unsigned> getBestOperand(unsigned OpIdx, int Lane, int LastLane,
+ ArrayRef<ReorderingMode> ReorderingModes,
+ ArrayRef<Value *> MainAltOps) {
unsigned NumOperands = getNumOperands();
// The operand of the previous lane at OpIdx.
@@ -1318,6 +1531,8 @@ public:
// Our strategy mode for OpIdx.
ReorderingMode RMode = ReorderingModes[OpIdx];
+ if (RMode == ReorderingMode::Failed)
+ return None;
// The linearized opcode of the operand at OpIdx, Lane.
bool OpIdxAPO = getData(OpIdx, Lane).APO;
@@ -1329,7 +1544,15 @@ public:
Optional<unsigned> Idx = None;
unsigned Score = 0;
} BestOp;
-
+ BestOp.Score =
+ BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
+ .first->second;
+
+      // Track if the operand must be marked as used. If the operand is set to
+      // Score 1 explicitly (because of a non-power-of-2 number of unique
+      // scalars), we may want to re-estimate the operands again on the
+      // following iterations.
+ bool IsUsed =
+ RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
// Iterate through all unused operands and look for the best.
for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
// Get the operand at Idx and Lane.
@@ -1355,11 +1578,12 @@ public:
bool LeftToRight = Lane > LastLane;
Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
Value *OpRight = (LeftToRight) ? Op : OpLastLane;
- unsigned Score =
- getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
- if (Score > BestOp.Score) {
+ int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
+ OpIdx, Idx, IsUsed);
+ if (Score > static_cast<int>(BestOp.Score)) {
BestOp.Idx = Idx;
BestOp.Score = Score;
+ BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
}
break;
}
@@ -1368,12 +1592,12 @@ public:
BestOp.Idx = Idx;
break;
case ReorderingMode::Failed:
- return None;
+ llvm_unreachable("Not expected Failed reordering mode.");
}
}
if (BestOp.Idx) {
- getData(BestOp.Idx.getValue(), Lane).IsUsed = true;
+ getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
return BestOp.Idx;
}
// If we could not find a good match return None.
@@ -1690,6 +1914,10 @@ public:
// rest of the lanes. We are visiting the nodes in a circular fashion,
// using FirstLane as the center point and increasing the radius
// distance.
+ SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
+ for (unsigned I = 0; I < NumOperands; ++I)
+ MainAltOps[I].push_back(getData(I, FirstLane).V);
+
for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
// Visit the lane on the right and then the lane on the left.
for (int Direction : {+1, -1}) {
@@ -1702,21 +1930,29 @@ public:
// Look for a good match for each operand.
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
// Search for the operand that matches SortedOps[OpIdx][Lane-1].
- Optional<unsigned> BestIdx =
- getBestOperand(OpIdx, Lane, LastLane, ReorderingModes);
+ Optional<unsigned> BestIdx = getBestOperand(
+ OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
// By not selecting a value, we allow the operands that follow to
// select a better matching value. We will get a non-null value in
// the next run of getBestOperand().
if (BestIdx) {
// Swap the current operand with the one returned by
// getBestOperand().
- swap(OpIdx, BestIdx.getValue(), Lane);
+ swap(OpIdx, *BestIdx, Lane);
} else {
// We failed to find a best operand, set mode to 'Failed'.
ReorderingModes[OpIdx] = ReorderingMode::Failed;
// Enable the second pass.
StrategyFailed = true;
}
+ // Try to get the alternate opcode and follow it during analysis.
+ if (MainAltOps[OpIdx].size() != 2) {
+ OperandData &AltOp = getData(OpIdx, Lane);
+ InstructionsState OpS =
+ getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V});
+ if (OpS.getOpcode() && OpS.isAltShuffle())
+ MainAltOps[OpIdx].push_back(AltOp.V);
+ }
}
}
}
@@ -1780,15 +2016,109 @@ public:
#endif
};
+  /// Evaluate each pair in \p Candidates and return the index into
+  /// \p Candidates of the pair with the highest score, i.e. the one deemed to
+  /// have the best chance to form the root of a profitable tree to vectorize.
+  /// Return None if no candidate scored above LookAheadHeuristics::ScoreFail.
+  /// \param Limit Lower limit of the score considered to be good enough.
+ Optional<int>
+ findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
+ int Limit = LookAheadHeuristics::ScoreFail) {
+ LookAheadHeuristics LookAhead(*DL, *SE, *this, /*NumLanes=*/2,
+ RootLookAheadMaxDepth);
+ int BestScore = Limit;
+ Optional<int> Index = None;
+ for (int I : seq<int>(0, Candidates.size())) {
+ int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
+ Candidates[I].second,
+ /*U1=*/nullptr, /*U2=*/nullptr,
+ /*Level=*/1, None);
+ if (Score > BestScore) {
+ BestScore = Score;
+ Index = I;
+ }
+ }
+ return Index;
+ }
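
The selection loop of findBestRootPair() simply keeps the index of the first pair that beats the running best score, seeded with Limit. A sketch with a toy pair score (the real one is the recursive look-ahead score above) is shown below; the function name and scoring rule are invented for the example.

#include <cassert>
#include <optional>
#include <utility>
#include <vector>

static std::optional<int>
bestRootPair(const std::vector<std::pair<int, int>> &Candidates, int Limit = 0) {
  int BestScore = Limit;
  std::optional<int> Index;
  for (int I = 0, E = static_cast<int>(Candidates.size()); I != E; ++I) {
    // Toy pair score: reward equal values (think: two loads from consecutive
    // addresses, or two adds whose operands match).
    int Score = Candidates[I].first == Candidates[I].second ? 2 : 0;
    if (Score > BestScore) {
      BestScore = Score;
      Index = I;
    }
  }
  return Index;
}

int main() {
  assert(bestRootPair({{1, 2}, {3, 3}, {4, 5}}) == std::optional<int>(1));
  assert(!bestRootPair({{1, 2}, {4, 5}}).has_value()); // nothing beats Limit
  return 0;
}
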
+
/// Checks if the instruction is marked for deletion.
bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
- /// Marks values operands for later deletion by replacing them with Undefs.
- void eraseInstructions(ArrayRef<Value *> AV);
+ /// Removes an instruction from its block and eventually deletes it.
+ /// It's like Instruction::eraseFromParent() except that the actual deletion
+ /// is delayed until BoUpSLP is destructed.
+ void eraseInstruction(Instruction *I) {
+ DeletedInstructions.insert(I);
+ }
+
+  /// Checks if the instruction was already analyzed as a possible reduction
+  /// root.
+ bool isAnalyzedReductionRoot(Instruction *I) const {
+ return AnalyzedReductionsRoots.count(I);
+ }
+  /// Register the given instruction as already analyzed as a possible
+  /// reduction root.
+ void analyzedReductionRoot(Instruction *I) {
+ AnalyzedReductionsRoots.insert(I);
+ }
+ /// Checks if the provided list of reduced values was checked already for
+ /// vectorization.
+ bool areAnalyzedReductionVals(ArrayRef<Value *> VL) {
+ return AnalyzedReductionVals.contains(hash_value(VL));
+ }
+  /// Adds the list of reduced values to the list of values already checked
+  /// for vectorization.
+ void analyzedReductionVals(ArrayRef<Value *> VL) {
+ AnalyzedReductionVals.insert(hash_value(VL));
+ }
+ /// Clear the list of the analyzed reduction root instructions.
+ void clearReductionData() {
+ AnalyzedReductionsRoots.clear();
+ AnalyzedReductionVals.clear();
+ }
+ /// Checks if the given value is gathered in one of the nodes.
+ bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
+ return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
+ }
~BoUpSLP();
private:
+  /// Check if the operands on the edges \p Edges of the \p UserTE allow
+  /// reordering (i.e. the operands can be reordered because they have only one
+  /// user and are reorderable).
+  /// \param ReorderableGathers List of all gather nodes that require reordering
+  /// (e.g., gathers of extractelements or partially vectorizable loads).
+  /// \param GatherOps List of gather operand nodes for \p UserTE that require
+  /// reordering, a subset of \p NonVectorized.
+ bool
+ canReorderOperands(TreeEntry *UserTE,
+ SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
+ ArrayRef<TreeEntry *> ReorderableGathers,
+ SmallVectorImpl<TreeEntry *> &GatherOps);
+
+ /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
+ /// if any. If it is not vectorized (gather node), returns nullptr.
+ TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
+ ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
+ TreeEntry *TE = nullptr;
+ const auto *It = find_if(VL, [this, &TE](Value *V) {
+ TE = getTreeEntry(V);
+ return TE;
+ });
+ if (It != VL.end() && TE->isSame(VL))
+ return TE;
+ return nullptr;
+ }
+
+ /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
+ /// if any. If it is not vectorized (gather node), returns nullptr.
+ const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
+ unsigned OpIdx) const {
+ return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
+ const_cast<TreeEntry *>(UserTE), OpIdx);
+ }
+
/// Checks if all users of \p I are the part of the vectorization tree.
bool areAllUsersVectorized(Instruction *I,
ArrayRef<Value *> VectorizedVals) const;
@@ -1815,12 +2145,17 @@ private:
/// Vectorize a single entry in the tree, starting in \p VL.
Value *vectorizeTree(ArrayRef<Value *> VL);
+ /// Create a new vector from a list of scalar values. Produces a sequence
+ /// which exploits values reused across lanes, and arranges the inserts
+ /// for ease of later optimization.
+ Value *createBuildVector(ArrayRef<Value *> VL);
+
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars. If \p
/// NeedToShuffle is true, need to add a cost of reshuffling some of the
/// vector elements.
InstructionCost getGatherCost(FixedVectorType *Ty,
- const DenseSet<unsigned> &ShuffledIndices,
+ const APInt &ShuffledIndices,
bool NeedToShuffle) const;
/// Checks if the gathered \p VL can be represented as shuffle(s) of previous
@@ -1855,6 +2190,29 @@ private:
const DataLayout &DL,
ScalarEvolution &SE,
const BoUpSLP &R);
+
+ /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
+ /// users of \p TE and collects the stores. It returns the map from the store
+ /// pointers to the collected stores.
+ DenseMap<Value *, SmallVector<StoreInst *, 4>>
+ collectUserStores(const BoUpSLP::TreeEntry *TE) const;
+
+  /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
+  /// stores in \p StoresVec can form a vector instruction. If so it returns
+  /// true and populates \p ReorderIndices with the shuffle indices of the
+  /// stores when compared to the sorted vector.
+ bool CanFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
+ OrdersType &ReorderIndices) const;
+
+ /// Iterates through the users of \p TE, looking for scalar stores that can be
+ /// potentially vectorized in a future SLP-tree. If found, it keeps track of
+ /// their order and builds an order index vector for each store bundle. It
+ /// returns all these order vectors found.
+ /// We run this after the tree has formed, otherwise we may come across user
+ /// instructions that are not yet in the tree.
+ SmallVector<OrdersType, 1>
+ findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
+
struct TreeEntry {
using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
TreeEntry(VecTreeTy &Container) : Container(Container) {}
@@ -2199,15 +2557,21 @@ private:
ScalarToTreeEntry[V] = Last;
}
// Update the scheduler bundle to point to this TreeEntry.
- unsigned Lane = 0;
- for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember;
- BundleMember = BundleMember->NextInBundle) {
- BundleMember->TE = Last;
- BundleMember->Lane = Lane;
- ++Lane;
- }
- assert((!Bundle.getValue() || Lane == VL.size()) &&
+ ScheduleData *BundleMember = *Bundle;
+ assert((BundleMember || isa<PHINode>(S.MainOp) ||
+ isVectorLikeInstWithConstOps(S.MainOp) ||
+ doesNotNeedToSchedule(VL)) &&
"Bundle and VL out of sync");
+ if (BundleMember) {
+ for (Value *V : VL) {
+ if (doesNotNeedToBeScheduled(V))
+ continue;
+ assert(BundleMember && "Unexpected end of bundle.");
+ BundleMember->TE = Last;
+ BundleMember = BundleMember->NextInBundle;
+ }
+ }
+ assert(!BundleMember && "Bundle and VL out of sync");
} else {
MustGather.insert(VL.begin(), VL.end());
}
@@ -2241,7 +2605,7 @@ private:
/// Maps a specific scalar to its tree entry.
SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
- /// Maps a value to the proposed vectorizable size.
+ /// Maps a value to the proposed vectorizable size.
SmallDenseMap<Value *, unsigned> InstrElementSize;
/// A list of scalars that we found that we need to keep as scalars.
@@ -2272,12 +2636,12 @@ private:
// First check if the result is already in the cache.
AliasCacheKey key = std::make_pair(Inst1, Inst2);
Optional<bool> &result = AliasCache[key];
- if (result.hasValue()) {
+ if (result) {
return result.getValue();
}
bool aliased = true;
if (Loc1.Ptr && isSimple(Inst1))
- aliased = isModOrRefSet(AA->getModRefInfo(Inst2, Loc1));
+ aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
// Store the result in the cache.
result = aliased;
return aliased;
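
The caching pattern above (look up, return if already answered, otherwise run the expensive query once and remember the answer) can be isolated from alias analysis entirely. The sketch below replaces BatchAA with a plain callback and counts how often it runs; all names are invented for the example.

#include <cassert>
#include <functional>
#include <map>
#include <optional>
#include <utility>

using InstPair = std::pair<const void *, const void *>;
using ToyAliasCache = std::map<InstPair, std::optional<bool>>;

static bool cachedAlias(ToyAliasCache &Cache, const void *I1, const void *I2,
                        const std::function<bool()> &Query, int &QueryCount) {
  std::optional<bool> &Result = Cache[{I1, I2}];
  if (Result)
    return *Result;  // later queries for the same pair never rerun the analysis
  ++QueryCount;
  Result = Query();
  return *Result;
}

int main() {
  ToyAliasCache Cache;
  int QueryCount = 0;
  int A = 0, B = 0;
  auto Query = [] { return true; }; // stands in for the real mod/ref check
  assert(cachedAlias(Cache, &A, &B, Query, QueryCount));
  assert(cachedAlias(Cache, &A, &B, Query, QueryCount));
  assert(QueryCount == 1);          // the expensive part ran only once
  return 0;
}
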
@@ -2289,20 +2653,23 @@ private:
/// TODO: consider moving this to the AliasAnalysis itself.
DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
- /// Removes an instruction from its block and eventually deletes it.
- /// It's like Instruction::eraseFromParent() except that the actual deletion
- /// is delayed until BoUpSLP is destructed.
- /// This is required to ensure that there are no incorrect collisions in the
- /// AliasCache, which can happen if a new instruction is allocated at the
- /// same address as a previously deleted instruction.
- void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) {
- auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first;
- It->getSecond() = It->getSecond() && ReplaceOpsWithUndef;
- }
+ // Cache for pointerMayBeCaptured calls inside AA. This is preserved
+ // globally through SLP because we don't perform any action which
+ // invalidates capture results.
+ BatchAAResults BatchAA;
/// Temporary store for deleted instructions. Instructions will be deleted
- /// eventually when the BoUpSLP is destructed.
- DenseMap<Instruction *, bool> DeletedInstructions;
+ /// eventually when the BoUpSLP is destructed. The deferral is required to
+ /// ensure that there are no incorrect collisions in the AliasCache, which
+ /// can happen if a new instruction is allocated at the same address as a
+ /// previously deleted instruction.
+ DenseSet<Instruction *> DeletedInstructions;
+
+  /// Set of the instructions already analyzed as possible reduction roots.
+ SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
+
+ /// Set of hashes for the list of reduction values already being analyzed.
+ DenseSet<size_t> AnalyzedReductionVals;
/// A list of values that need to extracted out of the tree.
/// This list holds pairs of (Internal Scalar : External User). External User
@@ -2336,14 +2703,39 @@ private:
NextLoadStore = nullptr;
IsScheduled = false;
SchedulingRegionID = BlockSchedulingRegionID;
- UnscheduledDepsInBundle = UnscheduledDeps;
clearDependencies();
OpValue = OpVal;
TE = nullptr;
- Lane = -1;
+ }
+
+ /// Verify basic self consistency properties
+ void verify() {
+ if (hasValidDependencies()) {
+ assert(UnscheduledDeps <= Dependencies && "invariant");
+ } else {
+ assert(UnscheduledDeps == Dependencies && "invariant");
+ }
+
+ if (IsScheduled) {
+ assert(isSchedulingEntity() &&
+ "unexpected scheduled state");
+ for (const ScheduleData *BundleMember = this; BundleMember;
+ BundleMember = BundleMember->NextInBundle) {
+ assert(BundleMember->hasValidDependencies() &&
+ BundleMember->UnscheduledDeps == 0 &&
+ "unexpected scheduled state");
+ assert((BundleMember == this || !BundleMember->IsScheduled) &&
+ "only bundle is marked scheduled");
+ }
+ }
+
+ assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
+ "all bundle members must be in same basic block");
}
/// Returns true if the dependency information has been calculated.
+    /// Note that dependency validity can vary between instructions within
+ /// a single bundle.
bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
/// Returns true for single instructions and for bundle representatives
@@ -2353,7 +2745,7 @@ private:
/// Returns true if it represents an instruction bundle and not only a
/// single instruction.
bool isPartOfBundle() const {
- return NextInBundle != nullptr || FirstInBundle != this;
+ return NextInBundle != nullptr || FirstInBundle != this || TE;
}
/// Returns true if it is ready for scheduling, i.e. it has no more
@@ -2361,20 +2753,23 @@ private:
bool isReady() const {
assert(isSchedulingEntity() &&
"can't consider non-scheduling entity for ready list");
- return UnscheduledDepsInBundle == 0 && !IsScheduled;
+ return unscheduledDepsInBundle() == 0 && !IsScheduled;
}
- /// Modifies the number of unscheduled dependencies, also updating it for
- /// the whole bundle.
+ /// Modifies the number of unscheduled dependencies for this instruction,
+ /// and returns the number of remaining dependencies for the containing
+ /// bundle.
int incrementUnscheduledDeps(int Incr) {
+ assert(hasValidDependencies() &&
+ "increment of unscheduled deps would be meaningless");
UnscheduledDeps += Incr;
- return FirstInBundle->UnscheduledDepsInBundle += Incr;
+ return FirstInBundle->unscheduledDepsInBundle();
}
/// Sets the number of unscheduled dependencies to the number of
/// dependencies.
void resetUnscheduledDeps() {
- incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
+ UnscheduledDeps = Dependencies;
}
/// Clears all dependency information.
@@ -2382,6 +2777,19 @@ private:
Dependencies = InvalidDeps;
resetUnscheduledDeps();
MemoryDependencies.clear();
+ ControlDependencies.clear();
+ }
+
+ int unscheduledDepsInBundle() const {
+ assert(isSchedulingEntity() && "only meaningful on the bundle");
+ int Sum = 0;
+ for (const ScheduleData *BundleMember = this; BundleMember;
+ BundleMember = BundleMember->NextInBundle) {
+ if (BundleMember->UnscheduledDeps == InvalidDeps)
+ return InvalidDeps;
+ Sum += BundleMember->UnscheduledDeps;
+ }
+ return Sum;
}
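
Summing the per-member counters on demand, with InvalidDeps from any member poisoning the whole result, replaces the cached UnscheduledDepsInBundle field this patch removes. A toy linked-list version of the same walk (ToySD is invented for the sketch):

#include <cassert>

struct ToySD {
  static constexpr int InvalidDeps = -1;
  int UnscheduledDeps = InvalidDeps;
  ToySD *NextInBundle = nullptr;
};

static int unscheduledDepsInBundle(const ToySD *Bundle) {
  int Sum = 0;
  for (const ToySD *Member = Bundle; Member; Member = Member->NextInBundle) {
    if (Member->UnscheduledDeps == ToySD::InvalidDeps)
      return ToySD::InvalidDeps; // one member without valid deps poisons all
    Sum += Member->UnscheduledDeps;
  }
  return Sum;
}

int main() {
  ToySD A, B;
  A.NextInBundle = &B;
  A.UnscheduledDeps = 2;
  B.UnscheduledDeps = 0;
  assert(unscheduledDepsInBundle(&A) == 2);
  B.UnscheduledDeps = ToySD::InvalidDeps; // dependencies not calculated yet
  assert(unscheduledDepsInBundle(&A) == ToySD::InvalidDeps);
  return 0;
}
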
void dump(raw_ostream &os) const {
@@ -2402,6 +2810,12 @@ private:
Instruction *Inst = nullptr;
+ /// Opcode of the current instruction in the schedule data.
+ Value *OpValue = nullptr;
+
+ /// The TreeEntry that this instruction corresponds to.
+ TreeEntry *TE = nullptr;
+
/// Points to the head in an instruction bundle (and always to this for
/// single instructions).
ScheduleData *FirstInBundle = nullptr;
@@ -2418,6 +2832,12 @@ private:
/// This list is derived on demand in calculateDependencies().
SmallVector<ScheduleData *, 4> MemoryDependencies;
+ /// List of instructions which this instruction could be control dependent
+ /// on. Allowing such nodes to be scheduled below this one could introduce
+ /// a runtime fault which didn't exist in the original program.
+    /// e.g. this is a load or udiv following a readonly call which infinitely
+    /// loops.
+ SmallVector<ScheduleData *, 4> ControlDependencies;
+
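
Why the new control-dependence list matters can be shown with ordinary C++ rather than IR. The functions below are invented for the example and model the "load following a readonly call which infinitely loops" case from the comment above: the load is only reachable if the call returns.

#include <cstdio>

// Models a readonly call that might never return.
static int maybeSpin(bool Spin) {
  volatile bool Keep = Spin;
  while (Keep) {
    // spins forever when Spin is true (volatile read keeps the loop alive)
  }
  return 0;
}

static int readAfterCall(int *P, bool Spin) {
  int T = maybeSpin(Spin); // if this never returns, *P is never dereferenced
  return T + *P;           // hoisting the load above the call would fault for
}                          // an invalid P that the original code never reads

int main() {
  int X = 41;
  std::printf("%d\n", readAfterCall(&X, /*Spin=*/false)); // prints 41
  return 0;
}
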
/// This ScheduleData is in the current scheduling region if this matches
/// the current SchedulingRegionID of BlockScheduling.
int SchedulingRegionID = 0;
@@ -2437,22 +2857,9 @@ private:
/// Note that this is negative as long as Dependencies is not calculated.
int UnscheduledDeps = InvalidDeps;
- /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
- /// single instructions.
- int UnscheduledDepsInBundle = InvalidDeps;
-
/// True if this instruction is scheduled (or considered as scheduled in the
/// dry-run).
bool IsScheduled = false;
-
- /// Opcode of the current instruction in the schedule data.
- Value *OpValue = nullptr;
-
- /// The TreeEntry that this instruction corresponds to.
- TreeEntry *TE = nullptr;
-
- /// The lane of this node in the TreeEntry.
- int Lane = -1;
};
#ifndef NDEBUG
@@ -2467,6 +2874,21 @@ private:
friend struct DOTGraphTraits<BoUpSLP *>;
/// Contains all scheduling data for a basic block.
+  /// It does not schedule instructions which are not memory read/write
+  /// instructions and whose operands are either constants, arguments, phis, or
+  /// instructions from other blocks, or whose users are phis or belong to
+  /// other blocks. The resulting vector instructions can be placed at the
+  /// beginning of the basic block without scheduling (if their operands do not
+  /// need to be scheduled) or at the end of the block (if their users are
+  /// outside of the block). This saves some compile time and memory used by
+  /// the compiler.
+  /// ScheduleData is assigned to each instruction between the boundaries of
+  /// the tree entry, even to those which are not part of the graph. It is
+  /// required to correctly follow the dependencies between the instructions
+  /// and to schedule them correctly. ScheduleData is not allocated for
+  /// instructions which do not require scheduling, like phis, nodes with only
+  /// extractelements/insertelements, or nodes whose instructions have
+  /// uses/operands outside of the block.
struct BlockScheduling {
BlockScheduling(BasicBlock *BB)
: BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
@@ -2477,6 +2899,7 @@ private:
ScheduleEnd = nullptr;
FirstLoadStoreInRegion = nullptr;
LastLoadStoreInRegion = nullptr;
+ RegionHasStackSave = false;
// Reduce the maximum schedule region size by the size of the
// previous scheduling run.
@@ -2490,20 +2913,29 @@ private:
++SchedulingRegionID;
}
- ScheduleData *getScheduleData(Value *V) {
- ScheduleData *SD = ScheduleDataMap[V];
- if (SD && SD->SchedulingRegionID == SchedulingRegionID)
+ ScheduleData *getScheduleData(Instruction *I) {
+ if (BB != I->getParent())
+        // Avoid the lookup if it can't possibly be in the map.
+ return nullptr;
+ ScheduleData *SD = ScheduleDataMap.lookup(I);
+ if (SD && isInSchedulingRegion(SD))
return SD;
return nullptr;
}
+ ScheduleData *getScheduleData(Value *V) {
+ if (auto *I = dyn_cast<Instruction>(V))
+ return getScheduleData(I);
+ return nullptr;
+ }
+
ScheduleData *getScheduleData(Value *V, Value *Key) {
if (V == Key)
return getScheduleData(V);
auto I = ExtraScheduleDataMap.find(V);
if (I != ExtraScheduleDataMap.end()) {
- ScheduleData *SD = I->second[Key];
- if (SD && SD->SchedulingRegionID == SchedulingRegionID)
+ ScheduleData *SD = I->second.lookup(Key);
+ if (SD && isInSchedulingRegion(SD))
return SD;
}
return nullptr;
@@ -2524,7 +2956,7 @@ private:
BundleMember = BundleMember->NextInBundle) {
if (BundleMember->Inst != BundleMember->OpValue)
continue;
-
+
// Handle the def-use chain dependencies.
// Decrement the unscheduled counter and insert to ready list if ready.
@@ -2546,10 +2978,12 @@ private:
};
// If BundleMember is a vector bundle, its operands may have been
- // reordered duiring buildTree(). We therefore need to get its operands
+ // reordered during buildTree(). We therefore need to get its operands
// through the TreeEntry.
if (TreeEntry *TE = BundleMember->TE) {
- int Lane = BundleMember->Lane;
+ // Need to search for the lane since the tree entry can be reordered.
+ int Lane = std::distance(TE->Scalars.begin(),
+ find(TE->Scalars, BundleMember->Inst));
assert(Lane >= 0 && "Lane not set");
// Since vectorization tree is being built recursively this assertion
@@ -2558,7 +2992,7 @@ private:
// where their second (immediate) operand is not added. Since
// immediates do not affect scheduler behavior this is considered
// okay.
- auto *In = TE->getMainOp();
+ auto *In = BundleMember->Inst;
assert(In &&
(isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) ||
In->getNumOperands() == TE->getNumOperands()) &&
@@ -2578,7 +3012,8 @@ private:
}
// Handle the memory dependencies.
for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
- if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
+ if (MemoryDepSD->hasValidDependencies() &&
+ MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
// There are no more unscheduled dependencies after decrementing,
// so we can put the dependent instruction into the ready list.
ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
@@ -2589,6 +3024,48 @@ private:
<< "SLP: gets ready (mem): " << *DepBundle << "\n");
}
}
+ // Handle the control dependencies.
+ for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
+ if (DepSD->incrementUnscheduledDeps(-1) == 0) {
+ // There are no more unscheduled dependencies after decrementing,
+ // so we can put the dependent instruction into the ready list.
+ ScheduleData *DepBundle = DepSD->FirstInBundle;
+ assert(!DepBundle->IsScheduled &&
+ "already scheduled bundle gets ready");
+ ReadyList.insert(DepBundle);
+ LLVM_DEBUG(dbgs()
+ << "SLP: gets ready (ctl): " << *DepBundle << "\n");
+ }
+ }
+
+ }
+ }
+
+ /// Verify basic self consistency properties of the data structure.
+ void verify() {
+ if (!ScheduleStart)
+ return;
+
+ assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
+ ScheduleStart->comesBefore(ScheduleEnd) &&
+ "Not a valid scheduling region?");
+
+ for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+ auto *SD = getScheduleData(I);
+ if (!SD)
+ continue;
+ assert(isInSchedulingRegion(SD) &&
+ "primary schedule data not in window?");
+ assert(isInSchedulingRegion(SD->FirstInBundle) &&
+ "entire bundle in window!");
+ (void)SD;
+ doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
+ }
+
+ for (auto *SD : ReadyInsts) {
+ assert(SD->isSchedulingEntity() && SD->isReady() &&
+ "item in ready list not ready?");
+ (void)SD;
}
}
@@ -2599,7 +3076,7 @@ private:
auto I = ExtraScheduleDataMap.find(V);
if (I != ExtraScheduleDataMap.end())
for (auto &P : I->second)
- if (P.second->SchedulingRegionID == SchedulingRegionID)
+ if (isInSchedulingRegion(P.second))
Action(P.second);
}
@@ -2608,10 +3085,11 @@ private:
void initialFillReadyList(ReadyListType &ReadyList) {
for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
doForAllOpcodes(I, [&](ScheduleData *SD) {
- if (SD->isSchedulingEntity() && SD->isReady()) {
+ if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
+ SD->isReady()) {
ReadyList.insert(SD);
LLVM_DEBUG(dbgs()
- << "SLP: initially in ready list: " << *I << "\n");
+ << "SLP: initially in ready list: " << *SD << "\n");
}
});
}
@@ -2669,18 +3147,14 @@ private:
/// Attaches ScheduleData to Instruction.
/// Note that the mapping survives during all vectorization iterations, i.e.
/// ScheduleData structures are recycled.
- DenseMap<Value *, ScheduleData *> ScheduleDataMap;
+ DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
/// Attaches ScheduleData to Instruction with the leading key.
DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
ExtraScheduleDataMap;
- struct ReadyList : SmallVector<ScheduleData *, 8> {
- void insert(ScheduleData *SD) { push_back(SD); }
- };
-
/// The ready-list for scheduling (only used for the dry-run).
- ReadyList ReadyInsts;
+ SetVector<ScheduleData *> ReadyInsts;
/// The first instruction of the scheduling region.
Instruction *ScheduleStart = nullptr;
@@ -2696,6 +3170,11 @@ private:
/// (can be null).
ScheduleData *LastLoadStoreInRegion = nullptr;
+ /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
+ /// region? Used to optimize the dependence calculation for the
+ /// common case where there isn't.
+ bool RegionHasStackSave = false;
+
/// The current size of the scheduling region.
int ScheduleRegionSize = 0;
@@ -2704,8 +3183,8 @@ private:
/// The ID of the scheduling region. For a new vectorization iteration this
/// is incremented which "removes" all ScheduleData from the region.
- // Make sure that the initial SchedulingRegionID is greater than the
- // initial SchedulingRegionID in ScheduleData (which is 0).
+ /// Make sure that the initial SchedulingRegionID is greater than the
+ /// initial SchedulingRegionID in ScheduleData (which is 0).
int SchedulingRegionID = 1;
};
@@ -2717,7 +3196,7 @@ private:
void scheduleBlock(BlockScheduling *BS);
/// List of users to ignore during scheduling and that don't need extracting.
- ArrayRef<Value *> UserIgnoreList;
+ const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
/// sorted SmallVectors of unsigned.
@@ -2748,7 +3227,6 @@ private:
ScalarEvolution *SE;
TargetTransformInfo *TTI;
TargetLibraryInfo *TLI;
- AAResults *AA;
LoopInfo *LI;
DominatorTree *DT;
AssumptionCache *AC;
@@ -2865,20 +3343,25 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
} // end namespace llvm
BoUpSLP::~BoUpSLP() {
- for (const auto &Pair : DeletedInstructions) {
- // Replace operands of ignored instructions with Undefs in case if they were
- // marked for deletion.
- if (Pair.getSecond()) {
- Value *Undef = UndefValue::get(Pair.getFirst()->getType());
- Pair.getFirst()->replaceAllUsesWith(Undef);
- }
- Pair.getFirst()->dropAllReferences();
- }
- for (const auto &Pair : DeletedInstructions) {
- assert(Pair.getFirst()->use_empty() &&
+ SmallVector<WeakTrackingVH> DeadInsts;
+ for (auto *I : DeletedInstructions) {
+ for (Use &U : I->operands()) {
+ auto *Op = dyn_cast<Instruction>(U.get());
+ if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
+ wouldInstructionBeTriviallyDead(Op, TLI))
+ DeadInsts.emplace_back(Op);
+ }
+ I->dropAllReferences();
+ }
+ for (auto *I : DeletedInstructions) {
+ assert(I->use_empty() &&
"trying to erase instruction with users.");
- Pair.getFirst()->eraseFromParent();
+ I->eraseFromParent();
}
+
+ // Cleanup any dead scalar code feeding the vectorized instructions
+ RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
+
#ifdef EXPENSIVE_CHECKS
// If we could guarantee that this call is not extremely slow, we could
// remove the ifdef limitation (see PR47712).
@@ -2886,13 +3369,6 @@ BoUpSLP::~BoUpSLP() {
#endif
}
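// Illustrative sketch (not part of the patch): the destructor above records
// which operands of the queued instructions will become trivially dead, drops
// all references, erases the queued instructions, and only then recursively
// deletes the newly dead operands. A standalone toy model of that order of
// operations, using a use-counted node graph; all names are stand-ins and the
// "would be trivially dead" check is reduced to a use count reaching zero.
#include <vector>

struct NodeModel {
  std::vector<NodeModel *> Operands; // edges to the values this node uses
  unsigned NumUses = 0;              // number of live nodes that use this one
  bool Erased = false;
};

void eraseAndCleanupModel(std::vector<NodeModel *> Queued) {
  // Pass 1: drop references from the queued nodes, remembering operands that
  // may have just become dead (mirrors dropAllReferences()).
  std::vector<NodeModel *> MaybeDead;
  for (NodeModel *N : Queued) {
    for (NodeModel *Op : N->Operands) {
      --Op->NumUses;
      if (Op->NumUses == 0)
        MaybeDead.push_back(Op);
    }
    N->Operands.clear();
  }
  // Pass 2: erase the queued nodes themselves (mirrors eraseFromParent()).
  for (NodeModel *N : Queued)
    N->Erased = true;
  // Pass 3: recursively erase anything that became unused, mirroring
  // RecursivelyDeleteTriviallyDeadInstructions().
  while (!MaybeDead.empty()) {
    NodeModel *N = MaybeDead.back();
    MaybeDead.pop_back();
    if (N->Erased || N->NumUses != 0)
      continue;
    N->Erased = true;
    for (NodeModel *Op : N->Operands) {
      --Op->NumUses;
      if (Op->NumUses == 0)
        MaybeDead.push_back(Op);
    }
    N->Operands.clear();
  }
}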
-void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
- for (auto *V : AV) {
- if (auto *I = dyn_cast<Instruction>(V))
- eraseInstruction(I, /*ReplaceOpsWithUndef=*/true);
- };
-}
-
/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
/// contains original mask for the scalars reused in the node. Procedure
/// transform this mask in accordance with the given \p Mask.
@@ -2997,6 +3473,189 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
return None;
}
+namespace {
+/// Tracks the state we can represent the loads in the given sequence.
+enum class LoadsState { Gather, Vectorize, ScatterVectorize };
+} // anonymous namespace
+
+/// Checks if the given array of loads can be represented as a vectorized,
+/// scatter or just simple gather.
+static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
+ const TargetTransformInfo &TTI,
+ const DataLayout &DL, ScalarEvolution &SE,
+ LoopInfo &LI,
+ SmallVectorImpl<unsigned> &Order,
+ SmallVectorImpl<Value *> &PointerOps) {
+ // Check that a vectorized load would load the same memory as a scalar
+ // load. For example, we don't want to vectorize loads that are smaller
+ // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
+ // treats loading/storing it as an i8 struct. If we vectorize loads/stores
+ // from such a struct, we read/write packed bits disagreeing with the
+ // unvectorized version.
+ Type *ScalarTy = VL0->getType();
+
+ if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy))
+ return LoadsState::Gather;
+
+ // Make sure all loads in the bundle are simple - we can't vectorize
+ // atomic or volatile loads.
+ PointerOps.clear();
+ PointerOps.resize(VL.size());
+ auto *POIter = PointerOps.begin();
+ for (Value *V : VL) {
+ auto *L = cast<LoadInst>(V);
+ if (!L->isSimple())
+ return LoadsState::Gather;
+ *POIter = L->getPointerOperand();
+ ++POIter;
+ }
+
+ Order.clear();
+ // Check the order of pointer operands or that all pointers are the same.
+ bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
+ if (IsSorted || all_of(PointerOps, [&PointerOps](Value *P) {
+ if (getUnderlyingObject(P) != getUnderlyingObject(PointerOps.front()))
+ return false;
+ auto *GEP = dyn_cast<GetElementPtrInst>(P);
+ if (!GEP)
+ return false;
+ auto *GEP0 = cast<GetElementPtrInst>(PointerOps.front());
+ return GEP->getNumOperands() == 2 &&
+ ((isConstant(GEP->getOperand(1)) &&
+ isConstant(GEP0->getOperand(1))) ||
+ getSameOpcode({GEP->getOperand(1), GEP0->getOperand(1)})
+ .getOpcode());
+ })) {
+ if (IsSorted) {
+ Value *Ptr0;
+ Value *PtrN;
+ if (Order.empty()) {
+ Ptr0 = PointerOps.front();
+ PtrN = PointerOps.back();
+ } else {
+ Ptr0 = PointerOps[Order.front()];
+ PtrN = PointerOps[Order.back()];
+ }
+ Optional<int> Diff =
+ getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
+ // Check that the sorted loads are consecutive.
+ if (static_cast<unsigned>(*Diff) == VL.size() - 1)
+ return LoadsState::Vectorize;
+ }
+ // TODO: need to improve analysis of the pointers, if not all of them are
+ // GEPs or have > 2 operands, we end up with a gather node, which just
+ // increases the cost.
+ Loop *L = LI.getLoopFor(cast<LoadInst>(VL0)->getParent());
+ bool ProfitableGatherPointers =
+ static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
+ return L && L->isLoopInvariant(V);
+ })) <= VL.size() / 2 && VL.size() > 2;
+ if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
+ auto *GEP = dyn_cast<GetElementPtrInst>(P);
+ return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
+ (GEP && GEP->getNumOperands() == 2);
+ })) {
+ Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
+ for (Value *V : VL)
+ CommonAlignment =
+ std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+ if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
+ !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment))
+ return LoadsState::ScatterVectorize;
+ }
+ }
+
+ return LoadsState::Gather;
+}
+
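// Illustrative sketch (not part of the patch): the LoadsState::Vectorize case
// above boils down to checking that the first and last *sorted* pointers are
// exactly VL.size() - 1 elements apart. A standalone model of that test, with
// plain integer element offsets standing in for the pointer differences and
// assuming duplicates were already ruled out (as sortPtrAccesses does). The
// masked-gather fallback decided by the target hooks is omitted here.
#include <algorithm>
#include <vector>

enum class LoadsStateModel { Gather, Vectorize, ScatterVectorize };

LoadsStateModel classifyLoadsByOffsets(std::vector<int> ElemOffsets) {
  if (ElemOffsets.size() < 2)
    return LoadsStateModel::Gather;
  std::sort(ElemOffsets.begin(), ElemOffsets.end());
  // Consecutive accesses: offsets {k, k+1, ..., k+N-1} span exactly N - 1.
  if (ElemOffsets.back() - ElemOffsets.front() ==
      static_cast<int>(ElemOffsets.size()) - 1)
    return LoadsStateModel::Vectorize;
  // The real code may still return ScatterVectorize when the target supports
  // a legal masked gather; this simplified model just gives up.
  return LoadsStateModel::Gather;
}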
+bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
+ const DataLayout &DL, ScalarEvolution &SE,
+ SmallVectorImpl<unsigned> &SortedIndices) {
+ assert(llvm::all_of(
+ VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
+ "Expected list of pointer operands.");
+ // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
+ // Ptr into, sort and return the sorted indices with values next to one
+ // another.
+ MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
+ Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
+
+ unsigned Cnt = 1;
+ for (Value *Ptr : VL.drop_front()) {
+ bool Found = any_of(Bases, [&](auto &Base) {
+ Optional<int> Diff =
+ getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
+ /*StrictCheck=*/true);
+ if (!Diff)
+ return false;
+
+ Base.second.emplace_back(Ptr, *Diff, Cnt++);
+ return true;
+ });
+
+ if (!Found) {
+ // If we haven't found enough to usefully cluster, return early.
+ if (Bases.size() > VL.size() / 2 - 1)
+ return false;
+
+ // Not found already - add a new Base
+ Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
+ }
+ }
+
+ // For each of the bases, sort the pointers by Offset and check if any of the
+ // bases become consecutively allocated.
+ bool AnyConsecutive = false;
+ for (auto &Base : Bases) {
+ auto &Vec = Base.second;
+ if (Vec.size() > 1) {
+ llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
+ const std::tuple<Value *, int, unsigned> &Y) {
+ return std::get<1>(X) < std::get<1>(Y);
+ });
+ int InitialOffset = std::get<1>(Vec[0]);
+ AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](auto &P) {
+ return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
+ });
+ }
+ }
+
+ // Fill SortedIndices array only if it looks worthwhile to sort the ptrs.
+ SortedIndices.clear();
+ if (!AnyConsecutive)
+ return false;
+
+ for (auto &Base : Bases) {
+ for (auto &T : Base.second)
+ SortedIndices.push_back(std::get<2>(T));
+ }
+
+ assert(SortedIndices.size() == VL.size() &&
+ "Expected SortedIndices to be the size of VL");
+ return true;
+}
+
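// Illustrative sketch (not part of the patch): a standalone model of the
// clustering step above. Each pointer is reduced to a {BaseId, ElemOffset}
// pair, where ElemOffset plays the role of getPointersDiff() against the
// bucket's base; we bucket by base, stable-sort each bucket by offset, and
// only report an order when at least one bucket turned into a consecutive
// run. The compile-time bailout for too many distinct bases is omitted, and
// all names are stand-ins.
#include <algorithm>
#include <map>
#include <utility>
#include <vector>

bool clusterSortByBaseModel(const std::vector<std::pair<int, int>> &Ptrs,
                            std::vector<unsigned> &SortedIndices) {
  // BaseId -> vector of {ElemOffset, OriginalIndex}.
  std::map<int, std::vector<std::pair<int, unsigned>>> Buckets;
  for (unsigned I = 0; I < Ptrs.size(); ++I)
    Buckets[Ptrs[I].first].push_back({Ptrs[I].second, I});

  bool AnyConsecutive = false;
  for (auto &B : Buckets) {
    auto &Vec = B.second;
    std::stable_sort(Vec.begin(), Vec.end());
    bool Consecutive = Vec.size() > 1;
    for (unsigned I = 1; I < Vec.size(); ++I)
      Consecutive &= (Vec[I].first == Vec[I - 1].first + 1);
    AnyConsecutive |= Consecutive;
  }
  if (!AnyConsecutive)
    return false;

  // Emit the original indices bucket by bucket, each bucket in offset order.
  SortedIndices.clear();
  for (auto &B : Buckets)
    for (auto &P : B.second)
      SortedIndices.push_back(P.second);
  return true;
}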
+Optional<BoUpSLP::OrdersType>
+BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
+ assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
+ Type *ScalarTy = TE.Scalars[0]->getType();
+
+ SmallVector<Value *> Ptrs;
+ Ptrs.reserve(TE.Scalars.size());
+ for (Value *V : TE.Scalars) {
+ auto *L = dyn_cast<LoadInst>(V);
+ if (!L || !L->isSimple())
+ return None;
+ Ptrs.push_back(L->getPointerOperand());
+ }
+
+ BoUpSLP::OrdersType Order;
+ if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
+ return Order;
+ return None;
+}
+
Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
bool TopToBottom) {
// No need to reorder if need to shuffle reuses, still need to shuffle the
@@ -3037,6 +3696,9 @@ Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
}
if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
return CurrentOrder;
+ if (TE.Scalars.size() >= 4)
+ if (Optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
+ return Order;
}
return None;
}
@@ -3047,13 +3709,55 @@ void BoUpSLP::reorderTopToBottom() {
// ExtractElement gather nodes which can be vectorized and need to handle
// their ordering.
DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
+
+ // AltShuffles can also have a preferred ordering that leads to fewer
+ // instructions, e.g., the addsub instruction in x86.
+ DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
+
+ // Maps a TreeEntry to the reorder indices of external users.
+ DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
+ ExternalUserReorderMap;
+ // FIXME: Workaround for syntax error reported by MSVC buildbots.
+ TargetTransformInfo &TTIRef = *TTI;
// Find all reorderable nodes with the given VF.
  // Currently these are vectorized stores, loads, extracts + some gathering of
// extracts.
- for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders](
+ for_each(VectorizableTree, [this, &TTIRef, &VFToOrderedEntries,
+ &GathersToOrders, &ExternalUserReorderMap,
+ &AltShufflesToOrders](
const std::unique_ptr<TreeEntry> &TE) {
+ // Look for external users that will probably be vectorized.
+ SmallVector<OrdersType, 1> ExternalUserReorderIndices =
+ findExternalStoreUsersReorderIndices(TE.get());
+ if (!ExternalUserReorderIndices.empty()) {
+ VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
+ ExternalUserReorderMap.try_emplace(TE.get(),
+ std::move(ExternalUserReorderIndices));
+ }
+
+ // Patterns like [fadd,fsub] can be combined into a single instruction in
+ // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
+ // to take into account their order when looking for the most used order.
+ if (TE->isAltShuffle()) {
+ VectorType *VecTy =
+ FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
+ unsigned Opcode0 = TE->getOpcode();
+ unsigned Opcode1 = TE->getAltOpcode();
+ // The opcode mask selects between the two opcodes.
+ SmallBitVector OpcodeMask(TE->Scalars.size(), 0);
+ for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
+ if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
+ OpcodeMask.set(Lane);
+ // If this pattern is supported by the target then we consider the order.
+ if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
+ VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
+ AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
+ }
+ // TODO: Check the reverse order too.
+ }
+
if (Optional<OrdersType> CurrentOrder =
- getReorderingData(*TE.get(), /*TopToBottom=*/true)) {
+ getReorderingData(*TE, /*TopToBottom=*/true)) {
// Do not include ordering for nodes used in the alt opcode vectorization,
  // better to reorder them during bottom-to-top stage. If we follow the order
// here, it causes reordering of the whole graph though actually it is
@@ -3071,10 +3775,7 @@ void BoUpSLP::reorderTopToBottom() {
EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
}))
return;
- if (UserTE->UserTreeIndices.empty())
- UserTE = nullptr;
- else
- UserTE = UserTE->UserTreeIndices.back().UserTE;
+ UserTE = UserTE->UserTreeIndices.back().UserTE;
++Cnt;
}
VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
@@ -3105,11 +3806,30 @@ void BoUpSLP::reorderTopToBottom() {
if (!OpTE->ReuseShuffleIndices.empty())
continue;
// Count number of orders uses.
- const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
- if (OpTE->State == TreeEntry::NeedToGather)
- return GathersToOrders.find(OpTE)->second;
+ const auto &Order = [OpTE, &GathersToOrders,
+ &AltShufflesToOrders]() -> const OrdersType & {
+ if (OpTE->State == TreeEntry::NeedToGather) {
+ auto It = GathersToOrders.find(OpTE);
+ if (It != GathersToOrders.end())
+ return It->second;
+ }
+ if (OpTE->isAltShuffle()) {
+ auto It = AltShufflesToOrders.find(OpTE);
+ if (It != AltShufflesToOrders.end())
+ return It->second;
+ }
return OpTE->ReorderIndices;
}();
+ // First consider the order of the external scalar users.
+ auto It = ExternalUserReorderMap.find(OpTE);
+ if (It != ExternalUserReorderMap.end()) {
+ const auto &ExternalUserReorderIndices = It->second;
+ for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
+ ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
+ // No other useful reorder data in this entry.
+ if (Order.empty())
+ continue;
+ }
// Stores actually store the mask, not the order, need to invert.
if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -3199,6 +3919,57 @@ void BoUpSLP::reorderTopToBottom() {
}
}
+bool BoUpSLP::canReorderOperands(
+ TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
+ ArrayRef<TreeEntry *> ReorderableGathers,
+ SmallVectorImpl<TreeEntry *> &GatherOps) {
+ for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
+ if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
+ return OpData.first == I &&
+ OpData.second->State == TreeEntry::Vectorize;
+ }))
+ continue;
+ if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
+ // Do not reorder if operand node is used by many user nodes.
+ if (any_of(TE->UserTreeIndices,
+ [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
+ return false;
+ // Add the node to the list of the ordered nodes with the identity
+ // order.
+ Edges.emplace_back(I, TE);
+ // Add ScatterVectorize nodes to the list of operands, where just
+ // reordering of the scalars is required. Similar to the gathers, so
+ // simply add to the list of gathered ops.
+ // If there are reused scalars, process this node as a regular vectorize
+ // node, just reorder reuses mask.
+ if (TE->State != TreeEntry::Vectorize && TE->ReuseShuffleIndices.empty())
+ GatherOps.push_back(TE);
+ continue;
+ }
+ TreeEntry *Gather = nullptr;
+ if (count_if(ReorderableGathers,
+ [&Gather, UserTE, I](TreeEntry *TE) {
+ assert(TE->State != TreeEntry::Vectorize &&
+ "Only non-vectorized nodes are expected.");
+ if (any_of(TE->UserTreeIndices,
+ [UserTE, I](const EdgeInfo &EI) {
+ return EI.UserTE == UserTE && EI.EdgeIdx == I;
+ })) {
+ assert(TE->isSame(UserTE->getOperand(I)) &&
+ "Operand entry does not match operands.");
+ Gather = TE;
+ return true;
+ }
+ return false;
+ }) > 1 &&
+ !all_of(UserTE->getOperand(I), isConstant))
+ return false;
+ if (Gather)
+ GatherOps.push_back(Gather);
+ }
+ return true;
+}
+
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
SetVector<TreeEntry *> OrderedEntries;
DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
@@ -3212,49 +3983,13 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
if (TE->State != TreeEntry::Vectorize)
NonVectorized.push_back(TE.get());
if (Optional<OrdersType> CurrentOrder =
- getReorderingData(*TE.get(), /*TopToBottom=*/false)) {
+ getReorderingData(*TE, /*TopToBottom=*/false)) {
OrderedEntries.insert(TE.get());
if (TE->State != TreeEntry::Vectorize)
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
}
});
- // Checks if the operands of the users are reordarable and have only single
- // use.
- auto &&CheckOperands =
- [this, &NonVectorized](const auto &Data,
- SmallVectorImpl<TreeEntry *> &GatherOps) {
- for (unsigned I = 0, E = Data.first->getNumOperands(); I < E; ++I) {
- if (any_of(Data.second,
- [I](const std::pair<unsigned, TreeEntry *> &OpData) {
- return OpData.first == I &&
- OpData.second->State == TreeEntry::Vectorize;
- }))
- continue;
- ArrayRef<Value *> VL = Data.first->getOperand(I);
- const TreeEntry *TE = nullptr;
- const auto *It = find_if(VL, [this, &TE](Value *V) {
- TE = getTreeEntry(V);
- return TE;
- });
- if (It != VL.end() && TE->isSame(VL))
- return false;
- TreeEntry *Gather = nullptr;
- if (count_if(NonVectorized, [VL, &Gather](TreeEntry *TE) {
- assert(TE->State != TreeEntry::Vectorize &&
- "Only non-vectorized nodes are expected.");
- if (TE->isSame(VL)) {
- Gather = TE;
- return true;
- }
- return false;
- }) > 1)
- return false;
- if (Gather)
- GatherOps.push_back(Gather);
- }
- return true;
- };
// 1. Propagate order to the graph nodes, which use only reordered nodes.
// I.e., if the node has operands, that are reordered, try to make at least
// one operand order in the natural order and reorder others + reorder the
@@ -3263,7 +3998,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
while (!OrderedEntries.empty()) {
// 1. Filter out only reordered nodes.
// 2. If the entry has multiple uses - skip it and jump to the next node.
- MapVector<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
+ DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
SmallVector<TreeEntry *> Filtered;
for (TreeEntry *TE : OrderedEntries) {
if (!(TE->State == TreeEntry::Vectorize ||
@@ -3291,10 +4026,17 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// Erase filtered entries.
for_each(Filtered,
[&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); });
- for (const auto &Data : Users) {
+ SmallVector<
+ std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
+ UsersVec(Users.begin(), Users.end());
+ sort(UsersVec, [](const auto &Data1, const auto &Data2) {
+ return Data1.first->Idx > Data2.first->Idx;
+ });
+ for (auto &Data : UsersVec) {
// Check that operands are used only in the User node.
SmallVector<TreeEntry *> GatherOps;
- if (!CheckOperands(Data, GatherOps)) {
+ if (!canReorderOperands(Data.first, Data.second, NonVectorized,
+ GatherOps)) {
for_each(Data.second,
[&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
OrderedEntries.remove(Op.second);
@@ -3310,18 +4052,22 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
    // the same node may be considered several times, though it might not be
// profitable.
SmallPtrSet<const TreeEntry *, 4> VisitedOps;
+ SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
for (const auto &Op : Data.second) {
TreeEntry *OpTE = Op.second;
if (!VisitedOps.insert(OpTE).second)
continue;
- if (!OpTE->ReuseShuffleIndices.empty() ||
- (IgnoreReorder && OpTE == VectorizableTree.front().get()))
+ if (!OpTE->ReuseShuffleIndices.empty())
continue;
const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
if (OpTE->State == TreeEntry::NeedToGather)
return GathersToOrders.find(OpTE)->second;
return OpTE->ReorderIndices;
}();
+ unsigned NumOps = count_if(
+ Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
+ return P.second == OpTE;
+ });
// Stores actually store the mask, not the order, need to invert.
if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -3333,14 +4079,52 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
});
fixupOrderingIndices(CurrentOrder);
- ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
+ OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
+ NumOps;
} else {
- ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
+ OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
+ }
+ auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
+ const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
+ const TreeEntry *TE) {
+ if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
+ (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
+ (IgnoreReorder && TE->Idx == 0))
+ return true;
+ if (TE->State == TreeEntry::NeedToGather) {
+ auto It = GathersToOrders.find(TE);
+ if (It != GathersToOrders.end())
+ return !It->second.empty();
+ return true;
+ }
+ return false;
+ };
+ for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
+ TreeEntry *UserTE = EI.UserTE;
+ if (!VisitedUsers.insert(UserTE).second)
+ continue;
+ // May reorder user node if it requires reordering, has reused
+ // scalars, is an alternate op vectorize node or its op nodes require
+ // reordering.
+ if (AllowsReordering(UserTE))
+ continue;
+ // Check if users allow reordering.
+ // Currently look up just 1 level of operands to avoid an increase in
+ // compile time.
+ // It is profitable to reorder only if clearly more operands allow
+ // reordering than prefer their natural order.
+ ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
+ if (static_cast<unsigned>(count_if(
+ Ops, [UserTE, &AllowsReordering](
+ const std::pair<unsigned, TreeEntry *> &Op) {
+ return AllowsReordering(Op.second) &&
+ all_of(Op.second->UserTreeIndices,
+ [UserTE](const EdgeInfo &EI) {
+ return EI.UserTE == UserTE;
+ });
+ })) <= Ops.size() / 2)
+ ++Res.first->second;
}
- OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
- OpTE->UserTreeIndices.size();
- assert(OrdersUses[{}] > 0 && "Counter cannot be less than 0.");
- --OrdersUses[{}];
}
// If no orders - skip current nodes and jump to the next one, if any.
if (OrdersUses.empty()) {
@@ -3381,7 +4165,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
OrderedEntries.remove(TE);
if (!VisitedOps.insert(TE).second)
continue;
- if (!TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) {
+ if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
// Just reorder reuses indices.
reorderReuses(TE->ReuseShuffleIndices, Mask);
continue;
@@ -3393,6 +4177,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
TE->ReorderIndices.empty()) &&
"Non-matching sizes of user/operand entries.");
reorderOrder(TE->ReorderIndices, Mask);
+ if (IgnoreReorder && TE == VectorizableTree.front().get())
+ IgnoreReorder = false;
}
// For gathers just need to reorder its scalars.
for (TreeEntry *Gather : GatherOps) {
@@ -3484,7 +4270,7 @@ void BoUpSLP::buildExternalUses(
}
// Ignore users in the user ignore list.
- if (is_contained(UserIgnoreList, UserInst))
+ if (UserIgnoreList && UserIgnoreList->contains(UserInst))
continue;
LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
@@ -3495,78 +4281,270 @@ void BoUpSLP::buildExternalUses(
}
}
+DenseMap<Value *, SmallVector<StoreInst *, 4>>
+BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
+ DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap;
+ for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
+ Value *V = TE->Scalars[Lane];
+ // To save compilation time we stop once a value has too many users.
+ static constexpr unsigned UsersLimit = 4;
+ if (V->hasNUsesOrMore(UsersLimit))
+ break;
+
+ // Collect stores per pointer object.
+ for (User *U : V->users()) {
+ auto *SI = dyn_cast<StoreInst>(U);
+ if (SI == nullptr || !SI->isSimple() ||
+ !isValidElementType(SI->getValueOperand()->getType()))
+ continue;
+ // Skip users that already have a tree entry.
+ if (getTreeEntry(U))
+ continue;
+
+ Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
+ auto &StoresVec = PtrToStoresMap[Ptr];
+ // For now just keep one store per pointer object per lane.
+ // TODO: Extend this to support multiple stores per pointer per lane
+ if (StoresVec.size() > Lane)
+ continue;
+ // Skip if in different BBs.
+ if (!StoresVec.empty() &&
+ SI->getParent() != StoresVec.back()->getParent())
+ continue;
+ // Make sure that the stores are of the same type.
+ if (!StoresVec.empty() &&
+ SI->getValueOperand()->getType() !=
+ StoresVec.back()->getValueOperand()->getType())
+ continue;
+ StoresVec.push_back(SI);
+ }
+ }
+ return PtrToStoresMap;
+}
+
+bool BoUpSLP::CanFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
+ OrdersType &ReorderIndices) const {
+ // We check whether the stores in StoresVec can form a vector by sorting them
+ // and checking whether they are consecutive.
+
+ // To avoid calling getPointersDiff() while sorting we create a vector of
+ // pairs {store, offset from first} and sort this instead.
+ SmallVector<std::pair<StoreInst *, int>, 4> StoreOffsetVec(StoresVec.size());
+ StoreInst *S0 = StoresVec[0];
+ StoreOffsetVec[0] = {S0, 0};
+ Type *S0Ty = S0->getValueOperand()->getType();
+ Value *S0Ptr = S0->getPointerOperand();
+ for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
+ StoreInst *SI = StoresVec[Idx];
+ Optional<int> Diff =
+ getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
+ SI->getPointerOperand(), *DL, *SE,
+ /*StrictCheck=*/true);
+ // We failed to compare the pointers so just abandon this StoresVec.
+ if (!Diff)
+ return false;
+ StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
+ }
+
+ // Sort the vector based on the pointers. We create a copy because we may
+ // need the original later for calculating the reorder (shuffle) indices.
+ stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
+ const std::pair<StoreInst *, int> &Pair2) {
+ int Offset1 = Pair1.second;
+ int Offset2 = Pair2.second;
+ return Offset1 < Offset2;
+ });
+
+ // Check if the stores are consecutive by checking if their difference is 1.
+ for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
+ if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx-1].second + 1)
+ return false;
+
+ // Calculate the shuffle indices according to their offset against the sorted
+ // StoreOffsetVec.
+ ReorderIndices.reserve(StoresVec.size());
+ for (StoreInst *SI : StoresVec) {
+ unsigned Idx = find_if(StoreOffsetVec,
+ [SI](const std::pair<StoreInst *, int> &Pair) {
+ return Pair.first == SI;
+ }) -
+ StoreOffsetVec.begin();
+ ReorderIndices.push_back(Idx);
+ }
+ // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
+ // reorderTopToBottom() and reorderBottomToTop(), so we are following the
+ // same convention here.
+ auto IsIdentityOrder = [](const OrdersType &Order) {
+ for (unsigned Idx : seq<unsigned>(0, Order.size()))
+ if (Idx != Order[Idx])
+ return false;
+ return true;
+ };
+ if (IsIdentityOrder(ReorderIndices))
+ ReorderIndices.clear();
+
+ return true;
+}
+
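// Illustrative sketch (not part of the patch): the reorder-index computation
// above, modeled on plain element offsets (getPointersDiff() with StrictCheck
// is assumed to have produced distinct offsets). For stores at element
// offsets {2, 0, 1} relative to the first store, the sorted run {0, 1, 2} is
// consecutive and the reorder indices come out as {2, 0, 1}: store i goes to
// lane ReorderIndices[i]. All names are stand-ins.
#include <algorithm>
#include <vector>

bool reorderIndicesFromOffsetsModel(const std::vector<int> &Offsets,
                                    std::vector<unsigned> &ReorderIndices) {
  std::vector<int> Sorted(Offsets);
  std::sort(Sorted.begin(), Sorted.end());
  // Consecutive means adjacent sorted offsets differ by exactly one element.
  for (unsigned I = 1; I < Sorted.size(); ++I)
    if (Sorted[I] != Sorted[I - 1] + 1)
      return false;
  // ReorderIndices[i] is the position of store i's offset in the sorted run.
  ReorderIndices.clear();
  for (int Off : Offsets) {
    auto It = std::find(Sorted.begin(), Sorted.end(), Off);
    ReorderIndices.push_back(
        static_cast<unsigned>(It - Sorted.begin()));
  }
  // An identity permutation is conventionally represented as "no reordering".
  bool Identity = true;
  for (unsigned I = 0; I < ReorderIndices.size(); ++I)
    Identity &= (ReorderIndices[I] == I);
  if (Identity)
    ReorderIndices.clear();
  return true;
}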
+#ifndef NDEBUG
+LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
+ for (unsigned Idx : Order)
+ dbgs() << Idx << ", ";
+ dbgs() << "\n";
+}
+#endif
+
+SmallVector<BoUpSLP::OrdersType, 1>
+BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
+ unsigned NumLanes = TE->Scalars.size();
+
+ DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap =
+ collectUserStores(TE);
+
+ // Holds the reorder indices for each candidate store vector that is a user of
+ // the current TreeEntry.
+ SmallVector<OrdersType, 1> ExternalReorderIndices;
+
+ // Now inspect the stores collected per pointer and look for vectorization
+ // candidates. For each candidate calculate the reorder index vector and push
+ // it into `ExternalReorderIndices`
+ for (const auto &Pair : PtrToStoresMap) {
+ auto &StoresVec = Pair.second;
+ // If we have fewer than NumLanes stores, then we can't form a vector.
+ if (StoresVec.size() != NumLanes)
+ continue;
+
+ // If the stores are not consecutive then abandon this StoresVec.
+ OrdersType ReorderIndices;
+ if (!CanFormVector(StoresVec, ReorderIndices))
+ continue;
+
+ // We now know that the scalars in StoresVec can form a vector instruction,
+ // so set the reorder indices.
+ ExternalReorderIndices.push_back(ReorderIndices);
+ }
+ return ExternalReorderIndices;
+}
+
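// Illustrative sketch (not part of the patch): reorderTopToBottom() treats
// the orders returned above as extra "votes" when picking the most frequently
// requested order for a given VF (see the OrdersUses counting earlier in this
// patch). A minimal standalone model of that vote counting over plain index
// vectors; names are stand-ins and an empty vector denotes the identity order.
#include <map>
#include <vector>

using OrderModel = std::vector<unsigned>;

OrderModel pickMostUsedOrder(const std::vector<OrderModel> &CandidateOrders) {
  std::map<OrderModel, unsigned> OrdersUses;
  for (const OrderModel &O : CandidateOrders)
    ++OrdersUses[O];
  OrderModel Best; // empty == identity order
  unsigned BestCount = 0;
  for (const auto &P : OrdersUses)
    if (P.second > BestCount) {
      Best = P.first;
      BestCount = P.second;
    }
  return Best;
}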
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
- ArrayRef<Value *> UserIgnoreLst) {
+ const SmallDenseSet<Value *> &UserIgnoreLst) {
deleteTree();
- UserIgnoreList = UserIgnoreLst;
+ UserIgnoreList = &UserIgnoreLst;
if (!allSameType(Roots))
return;
buildTree_rec(Roots, 0, EdgeInfo());
}
-namespace {
-/// Tracks the state we can represent the loads in the given sequence.
-enum class LoadsState { Gather, Vectorize, ScatterVectorize };
-} // anonymous namespace
-
-/// Checks if the given array of loads can be represented as a vectorized,
-/// scatter or just simple gather.
-static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
- const TargetTransformInfo &TTI,
- const DataLayout &DL, ScalarEvolution &SE,
- SmallVectorImpl<unsigned> &Order,
- SmallVectorImpl<Value *> &PointerOps) {
- // Check that a vectorized load would load the same memory as a scalar
- // load. For example, we don't want to vectorize loads that are smaller
- // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
- // treats loading/storing it as an i8 struct. If we vectorize loads/stores
- // from such a struct, we read/write packed bits disagreeing with the
- // unvectorized version.
- Type *ScalarTy = VL0->getType();
-
- if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy))
- return LoadsState::Gather;
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
+ deleteTree();
+ if (!allSameType(Roots))
+ return;
+ buildTree_rec(Roots, 0, EdgeInfo());
+}
- // Make sure all loads in the bundle are simple - we can't vectorize
- // atomic or volatile loads.
- PointerOps.clear();
- PointerOps.resize(VL.size());
- auto *POIter = PointerOps.begin();
+/// \return true if the specified list of values has only one instruction that
+/// requires scheduling, false otherwise.
+#ifndef NDEBUG
+static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
+ Value *NeedsScheduling = nullptr;
for (Value *V : VL) {
- auto *L = cast<LoadInst>(V);
- if (!L->isSimple())
- return LoadsState::Gather;
- *POIter = L->getPointerOperand();
- ++POIter;
+ if (doesNotNeedToBeScheduled(V))
+ continue;
+ if (!NeedsScheduling) {
+ NeedsScheduling = V;
+ continue;
+ }
+ return false;
}
+ return NeedsScheduling;
+}
+#endif
- Order.clear();
- // Check the order of pointer operands.
- if (llvm::sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order)) {
- Value *Ptr0;
- Value *PtrN;
- if (Order.empty()) {
- Ptr0 = PointerOps.front();
- PtrN = PointerOps.back();
+/// Generates key/subkey pair for the given value to provide effective sorting
+/// of the values and better detection of vectorizable value sequences. The
+/// keys/subkeys can be used for better sorting of the values themselves (keys)
+/// and for sorting values within subgroups (subkeys).
+static std::pair<size_t, size_t> generateKeySubkey(
+ Value *V, const TargetLibraryInfo *TLI,
+ function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
+ bool AllowAlternate) {
+ hash_code Key = hash_value(V->getValueID() + 2);
+ hash_code SubKey = hash_value(0);
+ // Sort the loads by the distance between the pointers.
+ if (auto *LI = dyn_cast<LoadInst>(V)) {
+ Key = hash_combine(hash_value(Instruction::Load), Key);
+ if (LI->isSimple())
+ SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
+ else
+ SubKey = hash_value(LI);
+ } else if (isVectorLikeInstWithConstOps(V)) {
+ // Sort extracts by the vector operands.
+ if (isa<ExtractElementInst, UndefValue>(V))
+ Key = hash_value(Value::UndefValueVal + 1);
+ if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
+ if (!isUndefVector(EI->getVectorOperand()) &&
+ !isa<UndefValue>(EI->getIndexOperand()))
+ SubKey = hash_value(EI->getVectorOperand());
+ }
+ } else if (auto *I = dyn_cast<Instruction>(V)) {
+ // Sort other instructions just by the opcodes except for CMPInst.
+ // For CMP also sort by the predicate kind.
+ if ((isa<BinaryOperator>(I) || isa<CastInst>(I)) &&
+ isValidForAlternation(I->getOpcode())) {
+ if (AllowAlternate)
+ Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
+ else
+ Key = hash_combine(hash_value(I->getOpcode()), Key);
+ SubKey = hash_combine(
+ hash_value(I->getOpcode()), hash_value(I->getType()),
+ hash_value(isa<BinaryOperator>(I)
+ ? I->getType()
+ : cast<CastInst>(I)->getOperand(0)->getType()));
+ // For casts, look through the only operand to improve compile time.
+ if (isa<CastInst>(I)) {
+ std::pair<size_t, size_t> OpVals =
+ generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
+ /*AllowAlternate=*/true);
+ Key = hash_combine(OpVals.first, Key);
+ SubKey = hash_combine(OpVals.first, SubKey);
+ }
+ } else if (auto *CI = dyn_cast<CmpInst>(I)) {
+ CmpInst::Predicate Pred = CI->getPredicate();
+ if (CI->isCommutative())
+ Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
+ CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
+ SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
+ hash_value(SwapPred),
+ hash_value(CI->getOperand(0)->getType()));
+ } else if (auto *Call = dyn_cast<CallInst>(I)) {
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
+ if (isTriviallyVectorizable(ID)) {
+ SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
+ } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
+ SubKey = hash_combine(hash_value(I->getOpcode()),
+ hash_value(Call->getCalledFunction()));
+ } else {
+ Key = hash_combine(hash_value(Call), Key);
+ SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
+ }
+ for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
+ SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
+ hash_value(Op.Tag), SubKey);
+ } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
+ if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
+ SubKey = hash_value(Gep->getPointerOperand());
+ else
+ SubKey = hash_value(Gep);
+ } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
+ !isa<ConstantInt>(I->getOperand(1))) {
+ // Do not try to vectorize instructions with potentially high cost.
+ SubKey = hash_value(I);
} else {
- Ptr0 = PointerOps[Order.front()];
- PtrN = PointerOps[Order.back()];
+ SubKey = hash_value(I->getOpcode());
}
- Optional<int> Diff =
- getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
- // Check that the sorted loads are consecutive.
- if (static_cast<unsigned>(*Diff) == VL.size() - 1)
- return LoadsState::Vectorize;
- Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
- for (Value *V : VL)
- CommonAlignment =
- commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
- if (TTI.isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()),
- CommonAlignment))
- return LoadsState::ScatterVectorize;
+ Key = hash_combine(hash_value(I->getParent()), Key);
}
-
- return LoadsState::Gather;
+ return std::make_pair(Key, SubKey);
}
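// Illustrative sketch (not part of the patch): the (key, subkey) pairs
// produced above are meant to bucket values so that likely-vectorizable ones
// land in the same group before bundles are formed. A minimal model of that
// grouping step, where computeKeySubkey() stands in for generateKeySubkey()
// and ValueT for the values being clustered; all names are assumptions.
#include <cstddef>
#include <map>
#include <utility>
#include <vector>

using KeySubkeyModel = std::pair<std::size_t, std::size_t>;

template <typename ValueT, typename KeyFn>
std::vector<std::vector<ValueT>>
groupByKeySubkey(const std::vector<ValueT> &Vals, KeyFn computeKeySubkey) {
  std::map<KeySubkeyModel, std::vector<ValueT>> Buckets;
  for (const ValueT &V : Vals)
    Buckets[computeKeySubkey(V)].push_back(V);
  std::vector<std::vector<ValueT>> Groups;
  for (auto &B : Buckets)
    Groups.push_back(std::move(B.second));
  return Groups;
}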
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
@@ -3651,10 +4629,84 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// If all of the operands are identical or constant we have a simple solution.
// If we deal with insert/extract instructions, they all must have constant
// indices, otherwise we should gather them, not try to vectorize.
- if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode() ||
- (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(S.MainOp) &&
- !all_of(VL, isVectorLikeInstWithConstOps))) {
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
+ // Do not vectorize an alternate-opcode node with 2 elements whose operands
+ // would be gathered.
+ auto &&NotProfitableForVectorization = [&S, this,
+ Depth](ArrayRef<Value *> VL) {
+ if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
+ return false;
+ if (VectorizableTree.size() < MinTreeSize)
+ return false;
+ if (Depth >= RecursionMaxDepth - 1)
+ return true;
+ // Check if all operands are extracts, part of vector node or can build a
+ // regular vectorize node.
+ SmallVector<unsigned, 2> InstsCount(VL.size(), 0);
+ for (Value *V : VL) {
+ auto *I = cast<Instruction>(V);
+ InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
+ return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
+ }));
+ }
+ bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
+ if ((IsCommutative &&
+ std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
+ (!IsCommutative &&
+ all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
+ return true;
+ assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
+ SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
+ auto *I1 = cast<Instruction>(VL.front());
+ auto *I2 = cast<Instruction>(VL.back());
+ for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
+ Candidates.emplace_back().emplace_back(I1->getOperand(Op),
+ I2->getOperand(Op));
+ if (static_cast<unsigned>(count_if(
+ Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
+ return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
+ })) >= S.MainOp->getNumOperands() / 2)
+ return false;
+ if (S.MainOp->getNumOperands() > 2)
+ return true;
+ if (IsCommutative) {
+ // Check permuted operands.
+ Candidates.clear();
+ for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
+ Candidates.emplace_back().emplace_back(I1->getOperand(Op),
+ I2->getOperand((Op + 1) % E));
+ if (any_of(
+ Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
+ return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
+ }))
+ return false;
+ }
+ return true;
+ };
+ SmallVector<unsigned> SortedIndices;
+ BasicBlock *BB = nullptr;
+ bool AreAllSameInsts =
+ (S.getOpcode() && allSameBlock(VL)) ||
+ (S.OpValue->getType()->isPointerTy() && UserTreeIdx.UserTE &&
+ UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
+ VL.size() > 2 &&
+ all_of(VL,
+ [&BB](Value *V) {
+ auto *I = dyn_cast<GetElementPtrInst>(V);
+ if (!I)
+ return doesNotNeedToBeScheduled(V);
+ if (!BB)
+ BB = I->getParent();
+ return BB == I->getParent() && I->getNumOperands() == 2;
+ }) &&
+ BB &&
+ sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
+ SortedIndices));
+ if (allConstant(VL) || isSplat(VL) || !AreAllSameInsts ||
+ (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
+ S.OpValue) &&
+ !all_of(VL, isVectorLikeInstWithConstOps)) ||
+ NotProfitableForVectorization(VL)) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
if (TryToFindDuplicates(S))
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
@@ -3665,12 +4717,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// the same block.
// Don't vectorize ephemeral values.
- for (Value *V : VL) {
- if (EphValues.count(V)) {
- LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
- << ") is ephemeral.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
+ if (!EphValues.empty()) {
+ for (Value *V : VL) {
+ if (EphValues.count(V)) {
+ LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
+ << ") is ephemeral.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
}
}
@@ -3708,20 +4762,37 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
// The reduction nodes (stored in UserIgnoreList) also should stay scalar.
- for (Value *V : VL) {
- if (is_contained(UserIgnoreList, V)) {
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
- if (TryToFindDuplicates(S))
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
+ if (UserIgnoreList && !UserIgnoreList->empty()) {
+ for (Value *V : VL) {
+ if (UserIgnoreList && UserIgnoreList->contains(V)) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
+ if (TryToFindDuplicates(S))
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
}
}
+ // Special processing for sorted pointers for ScatterVectorize node with
+ // constant indices only.
+ if (AreAllSameInsts && !(S.getOpcode() && allSameBlock(VL)) &&
+ UserTreeIdx.UserTE &&
+ UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize) {
+ assert(S.OpValue->getType()->isPointerTy() &&
+ count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >=
+ 2 &&
+ "Expected pointers only.");
+ // Reset S to make it GetElementPtr kind of node.
+ const auto *It = find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); });
+ assert(It != VL.end() && "Expected at least one GEP.");
+ S = getSameOpcode(*It);
+ }
+
// Check that all of the users of the scalars that we want to vectorize are
// schedulable.
auto *VL0 = cast<Instruction>(S.OpValue);
- BasicBlock *BB = VL0->getParent();
+ BB = VL0->getParent();
if (!DT->isReachableFromEntry(BB)) {
// Don't go into unreachable blocks. They may contain instructions with
@@ -3739,9 +4810,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (!BSRef)
BSRef = std::make_unique<BlockScheduling>(BB);
- BlockScheduling &BS = *BSRef.get();
+ BlockScheduling &BS = *BSRef;
Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
+#ifdef EXPENSIVE_CHECKS
+ // Make sure we didn't break any internal invariants
+ BS.verify();
+#endif
if (!Bundle) {
LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
assert((!BS.getScheduleData(VL0) ||
@@ -3761,10 +4836,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Check for terminator values (e.g. invoke).
for (Value *V : VL)
- for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
- Instruction *Term = dyn_cast<Instruction>(
- cast<PHINode>(V)->getIncomingValueForBlock(
- PH->getIncomingBlock(I)));
+ for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
+ Instruction *Term = dyn_cast<Instruction>(Incoming);
if (Term && Term->isTerminator()) {
LLVM_DEBUG(dbgs()
<< "SLP: Need to swizzle PHINodes (terminator use).\n");
@@ -3908,7 +4981,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
SmallVector<Value *> PointerOps;
OrdersType CurrentOrder;
TreeEntry *TE = nullptr;
- switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, CurrentOrder,
+ switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, CurrentOrder,
PointerOps)) {
case LoadsState::Vectorize:
if (CurrentOrder.empty()) {
@@ -4089,7 +5162,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::GetElementPtr: {
// We don't combine GEPs with complicated (nested) indexing.
for (Value *V : VL) {
- if (cast<Instruction>(V)->getNumOperands() != 2) {
+ auto *I = dyn_cast<GetElementPtrInst>(V);
+ if (!I)
+ continue;
+ if (I->getNumOperands() != 2) {
LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
@@ -4100,9 +5176,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// We can't combine several GEPs into one vector if they operate on
// different types.
- Type *Ty0 = VL0->getOperand(0)->getType();
+ Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
for (Value *V : VL) {
- Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType();
+ auto *GEP = dyn_cast<GEPOperator>(V);
+ if (!GEP)
+ continue;
+ Type *CurTy = GEP->getSourceElementType();
if (Ty0 != CurTy) {
LLVM_DEBUG(dbgs()
<< "SLP: not-vectorizable GEP (different types).\n");
@@ -4113,15 +5192,22 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
}
+ bool IsScatterUser =
+ UserTreeIdx.UserTE &&
+ UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
// We don't combine GEPs with non-constant indexes.
Type *Ty1 = VL0->getOperand(1)->getType();
for (Value *V : VL) {
- auto Op = cast<Instruction>(V)->getOperand(1);
- if (!isa<ConstantInt>(Op) ||
+ auto *I = dyn_cast<GetElementPtrInst>(V);
+ if (!I)
+ continue;
+ auto *Op = I->getOperand(1);
+ if ((!IsScatterUser && !isa<ConstantInt>(Op)) ||
(Op->getType() != Ty1 &&
- Op->getType()->getScalarSizeInBits() >
- DL->getIndexSizeInBits(
- V->getType()->getPointerAddressSpace()))) {
+ ((IsScatterUser && !isa<ConstantInt>(Op)) ||
+ Op->getType()->getScalarSizeInBits() >
+ DL->getIndexSizeInBits(
+ V->getType()->getPointerAddressSpace())))) {
LLVM_DEBUG(dbgs()
<< "SLP: not-vectorizable GEP (non-constant indexes).\n");
BS.cancelScheduling(VL, VL0);
@@ -4136,9 +5222,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
SmallVector<ValueList, 2> Operands(2);
// Prepare the operand vector for pointer operands.
- for (Value *V : VL)
- Operands.front().push_back(
- cast<GetElementPtrInst>(V)->getPointerOperand());
+ for (Value *V : VL) {
+ auto *GEP = dyn_cast<GetElementPtrInst>(V);
+ if (!GEP) {
+ Operands.front().push_back(V);
+ continue;
+ }
+ Operands.front().push_back(GEP->getPointerOperand());
+ }
TE->setOperand(0, Operands.front());
// Need to cast all indices to the same type before vectorization to
// avoid crash.
@@ -4149,9 +5240,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
Type *Ty = all_of(VL,
[VL0Ty, IndexIdx](Value *V) {
- return VL0Ty == cast<GetElementPtrInst>(V)
- ->getOperand(IndexIdx)
- ->getType();
+ auto *GEP = dyn_cast<GetElementPtrInst>(V);
+ if (!GEP)
+ return true;
+ return VL0Ty == GEP->getOperand(IndexIdx)->getType();
})
? VL0Ty
: DL->getIndexType(cast<GetElementPtrInst>(VL0)
@@ -4159,10 +5251,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
->getScalarType());
// Prepare the operand vector.
for (Value *V : VL) {
- auto *Op = cast<Instruction>(V)->getOperand(IndexIdx);
- auto *CI = cast<ConstantInt>(Op);
- Operands.back().push_back(ConstantExpr::getIntegerCast(
- CI, Ty, CI->getValue().isSignBitSet()));
+ auto *I = dyn_cast<GetElementPtrInst>(V);
+ if (!I) {
+ Operands.back().push_back(
+ ConstantInt::get(Ty, 0, /*isSigned=*/false));
+ continue;
+ }
+ auto *Op = I->getOperand(IndexIdx);
+ auto *CI = dyn_cast<ConstantInt>(Op);
+ if (!CI)
+ Operands.back().push_back(Op);
+ else
+ Operands.back().push_back(ConstantExpr::getIntegerCast(
+ CI, Ty, CI->getValue().isSignBitSet()));
}
TE->setOperand(IndexIdx, Operands.back());
@@ -4268,7 +5369,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
unsigned NumArgs = CI->arg_size();
SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
for (unsigned j = 0; j != NumArgs; ++j)
- if (hasVectorInstrinsicScalarOpd(ID, j))
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, j))
ScalarArgs[j] = CI->getArgOperand(j);
for (Value *V : VL) {
CallInst *CI2 = dyn_cast<CallInst>(V);
@@ -4287,7 +5388,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Some intrinsics have scalar arguments and should be same in order for
// them to be vectorized.
for (unsigned j = 0; j != NumArgs; ++j) {
- if (hasVectorInstrinsicScalarOpd(ID, j)) {
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, j)) {
Value *A1J = CI2->getArgOperand(j);
if (ScalarArgs[j] != A1J) {
BS.cancelScheduling(VL, VL0);
@@ -4320,7 +5421,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
      // For scalar operands no need to create an entry since no need to
// vectorize it.
- if (hasVectorInstrinsicScalarOpd(ID, i))
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, i))
continue;
ValueList Operands;
// Prepare the operand vector.
@@ -4347,9 +5448,42 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
// Reorder operands if reordering would enable vectorization.
- if (isa<BinaryOperator>(VL0)) {
+ auto *CI = dyn_cast<CmpInst>(VL0);
+ if (isa<BinaryOperator>(VL0) || CI) {
ValueList Left, Right;
- reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+ if (!CI || all_of(VL, [](Value *V) {
+ return cast<CmpInst>(V)->isCommutative();
+ })) {
+ reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+ } else {
+ CmpInst::Predicate P0 = CI->getPredicate();
+ CmpInst::Predicate AltP0 = cast<CmpInst>(S.AltOp)->getPredicate();
+ assert(P0 != AltP0 &&
+ "Expected different main/alternate predicates.");
+ CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0);
+ Value *BaseOp0 = VL0->getOperand(0);
+ Value *BaseOp1 = VL0->getOperand(1);
+ // Collect operands - commute if it uses the swapped predicate or
+ // alternate operation.
+ for (Value *V : VL) {
+ auto *Cmp = cast<CmpInst>(V);
+ Value *LHS = Cmp->getOperand(0);
+ Value *RHS = Cmp->getOperand(1);
+ CmpInst::Predicate CurrentPred = Cmp->getPredicate();
+ if (P0 == AltP0Swapped) {
+ if (CI != Cmp && S.AltOp != Cmp &&
+ ((P0 == CurrentPred &&
+ !areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) ||
+ (AltP0 == CurrentPred &&
+ areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS))))
+ std::swap(LHS, RHS);
+ } else if (P0 != CurrentPred && AltP0 != CurrentPred) {
+ std::swap(LHS, RHS);
+ }
+ Left.push_back(LHS);
+ Right.push_back(RHS);
+ }
+ }
TE->setOperand(0, Left);
TE->setOperand(1, Right);
buildTree_rec(Left, Depth + 1, {TE, 0});
@@ -4493,7 +5627,9 @@ bool BoUpSLP::areAllUsersVectorized(Instruction *I,
ArrayRef<Value *> VectorizedVals) const {
return (I->hasOneUse() && is_contained(VectorizedVals, I)) ||
all_of(I->users(), [this](User *U) {
- return ScalarToTreeEntry.count(U) > 0 || MustGather.contains(U);
+ return ScalarToTreeEntry.count(U) > 0 ||
+ isVectorLikeInstWithConstOps(U) ||
+ (isa<ExtractElementInst>(U) && MustGather.contains(U));
});
}
@@ -4550,19 +5686,21 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
// Process extracts in blocks of EltsPerVector to check if the source vector
// operand can be re-used directly. If not, add the cost of creating a shuffle
// to extract the values into a vector register.
+ SmallVector<int> RegMask(EltsPerVector, UndefMaskElem);
for (auto *V : VL) {
++Idx;
- // Need to exclude undefs from analysis.
- if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
- continue;
-
// Reached the start of a new vector registers.
if (Idx % EltsPerVector == 0) {
+ RegMask.assign(EltsPerVector, UndefMaskElem);
AllConsecutive = true;
continue;
}
+ // Need to exclude undefs from analysis.
+ if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
+ continue;
+
// Check all extracts for a vector register on the target directly
// extract values in order.
unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
@@ -4570,6 +5708,7 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
CurrentIdx % EltsPerVector == Idx % EltsPerVector;
+ RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector;
}
if (AllConsecutive)
@@ -4581,10 +5720,10 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
// If we have a series of extracts which are not consecutive and hence
// cannot re-use the source vector register directly, compute the shuffle
- // cost to extract the a vector with EltsPerVector elements.
+ // cost to extract the vector with EltsPerVector elements.
Cost += TTI.getShuffleCost(
TargetTransformInfo::SK_PermuteSingleSrc,
- FixedVectorType::get(VecTy->getElementType(), EltsPerVector));
+ FixedVectorType::get(VecTy->getElementType(), EltsPerVector), RegMask);
}
return Cost;
}
@@ -4592,12 +5731,12 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
/// Build shuffle mask for shuffle graph entries and lists of main and alternate
/// operations operands.
static void
-buildSuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
- ArrayRef<int> ReusesIndices,
- const function_ref<bool(Instruction *)> IsAltOp,
- SmallVectorImpl<int> &Mask,
- SmallVectorImpl<Value *> *OpScalars = nullptr,
- SmallVectorImpl<Value *> *AltScalars = nullptr) {
+buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
+ ArrayRef<int> ReusesIndices,
+ const function_ref<bool(Instruction *)> IsAltOp,
+ SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<Value *> *OpScalars = nullptr,
+ SmallVectorImpl<Value *> *AltScalars = nullptr) {
unsigned Sz = VL.size();
Mask.assign(Sz, UndefMaskElem);
SmallVector<int> OrderMask;
@@ -4627,6 +5766,29 @@ buildSuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
}
}
+/// Checks if the specified instruction \p I is an alternate operation for the
+/// given \p MainOp and \p AltOp instructions.
+static bool isAlternateInstruction(const Instruction *I,
+ const Instruction *MainOp,
+ const Instruction *AltOp) {
+ if (auto *CI0 = dyn_cast<CmpInst>(MainOp)) {
+ auto *AltCI0 = cast<CmpInst>(AltOp);
+ auto *CI = cast<CmpInst>(I);
+ CmpInst::Predicate P0 = CI0->getPredicate();
+ CmpInst::Predicate AltP0 = AltCI0->getPredicate();
+ assert(P0 != AltP0 && "Expected different main/alternate predicates.");
+ CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0);
+ CmpInst::Predicate CurrentPred = CI->getPredicate();
+ if (P0 == AltP0Swapped)
+ return I == AltCI0 ||
+ (I != MainOp &&
+ !areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1),
+ CI->getOperand(0), CI->getOperand(1)));
+ return AltP0 == CurrentPred || AltP0Swapped == CurrentPred;
+ }
+ return I->getOpcode() == AltOp->getOpcode();
+}
+
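A compare joins the alternate group when its predicate matches the alternate predicate directly or in swapped form. A minimal sketch of that predicate test, using a hypothetical enum rather than the CmpInst API and omitting the operand-compatibility check the real code performs when the main predicate equals the swapped alternate predicate:

    #include <cassert>

    // Hypothetical predicate enum standing in for CmpInst predicates.
    enum Pred { LT, GT, LE, GE };

    static Pred swapped(Pred P) {
      switch (P) {
      case LT: return GT;
      case GT: return LT;
      case LE: return GE;
      case GE: return LE;
      }
      return P;
    }

    // A compare is an alternate operation if its predicate equals the
    // alternate predicate or its swapped form.
    static bool isAlternatePred(Pred Current, Pred Main, Pred Alt) {
      assert(Main != Alt && "Expected different main/alternate predicates.");
      (void)Main;
      return Current == Alt || Current == swapped(Alt);
    }

    int main() {
      // Main group uses '<', alternate group uses '>='; both ">=" and its
      // swapped form "<=" are recognized as alternate operations.
      bool Ok = isAlternatePred(GE, LT, GE) && isAlternatePred(LE, LT, GE) &&
                !isAlternatePred(LT, LT, GE);
      return Ok ? 0 : 1;
    }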
InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
ArrayRef<Value *> VectorizedVals) {
ArrayRef<Value*> VL = E->Scalars;
@@ -4740,7 +5902,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
SmallVector<const TreeEntry *> Entries;
Optional<TargetTransformInfo::ShuffleKind> Shuffle =
isGatherShuffledEntry(E, Mask, Entries);
- if (Shuffle.hasValue()) {
+ if (Shuffle) {
InstructionCost GatherCost = 0;
if (ShuffleVectorInst::isIdentityMask(Mask)) {
// Perfect match in the graph, will reuse the previously vectorized
@@ -4776,7 +5938,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
SmallVector<int> Mask;
Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
isFixedVectorShuffle(VL, Mask);
- if (ShuffleKind.hasValue()) {
+ if (ShuffleKind) {
// Found the bunch of extractelement instructions that must be gathered
     // into a vector and can be represented as a permutation of elements in a
// single input vector or of 2 input vectors.
@@ -4794,7 +5956,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// broadcast.
assert(VecTy == FinalVecTy &&
"No reused scalars expected for broadcast.");
- return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
+ return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy,
+ /*Mask=*/None, /*Index=*/0,
+ /*SubTp=*/nullptr, /*Args=*/VL[0]);
}
InstructionCost ReuseShuffleCost = 0;
if (NeedToShuffleReuses)
@@ -4818,8 +5982,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
!VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
SmallVector<Value *> PointerOps;
OrdersType CurrentOrder;
- LoadsState LS = canVectorizeLoads(Slice, Slice.front(), *TTI, *DL,
- *SE, CurrentOrder, PointerOps);
+ LoadsState LS =
+ canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, *SE, *LI,
+ CurrentOrder, PointerOps);
switch (LS) {
case LoadsState::Vectorize:
case LoadsState::ScatterVectorize:
@@ -4909,7 +6074,11 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
assert((E->State == TreeEntry::Vectorize ||
E->State == TreeEntry::ScatterVectorize) &&
"Unhandled state");
- assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
+ assert(E->getOpcode() &&
+ ((allSameType(VL) && allSameBlock(VL)) ||
+ (E->getOpcode() == Instruction::GetElementPtr &&
+ E->getMainOp()->getType()->isPointerTy())) &&
+ "Invalid VL");
Instruction *VL0 = E->getMainOp();
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
@@ -4981,28 +6150,60 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
assert(E->ReuseShuffleIndices.empty() &&
"Unique insertelements only are expected.");
auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
-
unsigned const NumElts = SrcVecTy->getNumElements();
unsigned const NumScalars = VL.size();
+
+ unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
+
+ unsigned OffsetBeg = *getInsertIndex(VL.front());
+ unsigned OffsetEnd = OffsetBeg;
+ for (Value *V : VL.drop_front()) {
+ unsigned Idx = *getInsertIndex(V);
+ if (OffsetBeg > Idx)
+ OffsetBeg = Idx;
+ else if (OffsetEnd < Idx)
+ OffsetEnd = Idx;
+ }
+ unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
+ if (NumOfParts > 0)
+ VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
+ unsigned VecSz =
+ (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
+ VecScalarsSz;
+ unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
+ unsigned InsertVecSz = std::min<unsigned>(
+ PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
+ ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) *
+ VecScalarsSz);
+ bool IsWholeSubvector =
+ OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
+ // Check if we can safely insert a subvector. If it is not possible, just
+ // generate a whole-sized vector and shuffle the source vector and the new
+ // subvector.
+ if (OffsetBeg + InsertVecSz > VecSz) {
+ // Align OffsetBeg to generate correct mask.
+ OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
+ InsertVecSz = VecSz;
+ }
+
APInt DemandedElts = APInt::getZero(NumElts);
// TODO: Add support for Instruction::InsertValue.
SmallVector<int> Mask;
if (!E->ReorderIndices.empty()) {
inversePermutation(E->ReorderIndices, Mask);
- Mask.append(NumElts - NumScalars, UndefMaskElem);
+ Mask.append(InsertVecSz - Mask.size(), UndefMaskElem);
} else {
- Mask.assign(NumElts, UndefMaskElem);
- std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
+ Mask.assign(VecSz, UndefMaskElem);
+ std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
}
- unsigned Offset = *getInsertIndex(VL0, 0);
bool IsIdentity = true;
- SmallVector<int> PrevMask(NumElts, UndefMaskElem);
+ SmallVector<int> PrevMask(InsertVecSz, UndefMaskElem);
Mask.swap(PrevMask);
for (unsigned I = 0; I < NumScalars; ++I) {
unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
DemandedElts.setBit(InsertIdx);
- IsIdentity &= InsertIdx - Offset == I;
- Mask[InsertIdx - Offset] = I;
+ IsIdentity &= InsertIdx - OffsetBeg == I;
+ Mask[InsertIdx - OffsetBeg] = I;
}
assert(Offset < NumElts && "Failed to find vector index offset");
@@ -5010,32 +6211,41 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
/*Insert*/ true, /*Extract*/ false);
- if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0) {
- // FIXME: Replace with SK_InsertSubvector once it is properly supported.
- unsigned Sz = PowerOf2Ceil(Offset + NumScalars);
- Cost += TTI->getShuffleCost(
- TargetTransformInfo::SK_PermuteSingleSrc,
- FixedVectorType::get(SrcVecTy->getElementType(), Sz));
- } else if (!IsIdentity) {
- auto *FirstInsert =
- cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
- return !is_contained(E->Scalars,
- cast<Instruction>(V)->getOperand(0));
- }));
- if (isUndefVector(FirstInsert->getOperand(0))) {
- Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask);
+      // First cost - resize to the actual vector size if this is not an
+      // identity shuffle or we need to shift the vector.
+ // Do not calculate the cost if the actual size is the register size and
+ // we can merge this shuffle with the following SK_Select.
+ auto *InsertVecTy =
+ FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz);
+ if (!IsIdentity)
+ Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+ InsertVecTy, Mask);
+ auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
+ return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
+ }));
+      // Second cost - permutation with a subvector, if some elements come
+      // from the initial vector, or insertion of a subvector.
+ // TODO: Implement the analysis of the FirstInsert->getOperand(0)
+ // subvector of ActualVecTy.
+ if (!isUndefVector(FirstInsert->getOperand(0)) && NumScalars != NumElts &&
+ !IsWholeSubvector) {
+ if (InsertVecSz != VecSz) {
+ auto *ActualVecTy =
+ FixedVectorType::get(SrcVecTy->getElementType(), VecSz);
+ Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy,
+ None, OffsetBeg - Offset, InsertVecTy);
} else {
- SmallVector<int> InsertMask(NumElts);
- std::iota(InsertMask.begin(), InsertMask.end(), 0);
- for (unsigned I = 0; I < NumElts; I++) {
+ for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
+ Mask[I] = I;
+ for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
+ I <= End; ++I)
if (Mask[I] != UndefMaskElem)
- InsertMask[Offset + I] = NumElts + I;
- }
- Cost +=
- TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVecTy, InsertMask);
+ Mask[I] = I + VecSz;
+ for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
+ Mask[I] = I;
+ Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
}
}
-
return Cost;
}
case Instruction::ZExt:
@@ -5116,9 +6326,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// If the selects are the only uses of the compares, they will be dead
// and we can adjust the cost by removing their cost.
if (IntrinsicAndUse.second)
- IntrinsicCost -=
- TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, MaskTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy,
+ MaskTy, VecPred, CostKind);
VecCost = std::min(VecCost, IntrinsicCost);
}
LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
@@ -5198,7 +6407,14 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
TargetTransformInfo::OperandValueKind Op1VK =
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
- TargetTransformInfo::OK_UniformConstantValue;
+ any_of(VL,
+ [](Value *V) {
+ return isa<GetElementPtrInst>(V) &&
+ !isConstant(
+ cast<GetElementPtrInst>(V)->getOperand(1));
+ })
+ ? TargetTransformInfo::OK_AnyValue
+ : TargetTransformInfo::OK_UniformConstantValue;
InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(
Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK);
@@ -5229,7 +6445,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
Align CommonAlignment = Alignment;
for (Value *V : VL)
CommonAlignment =
- commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
+ std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
VecLdCost = TTI->getGatherScatterOpCost(
Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
/*VariableMask=*/false, CommonAlignment, CostKind, VL0);
@@ -5279,7 +6495,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
((Instruction::isBinaryOp(E->getOpcode()) &&
Instruction::isBinaryOp(E->getAltOpcode())) ||
(Instruction::isCast(E->getOpcode()) &&
- Instruction::isCast(E->getAltOpcode()))) &&
+ Instruction::isCast(E->getAltOpcode())) ||
+ (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
"Invalid Shuffle Vector Operand");
InstructionCost ScalarCost = 0;
if (NeedToShuffleReuses) {
@@ -5327,6 +6544,14 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
CostKind);
+ } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
+ VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
+ Builder.getInt1Ty(),
+ CI0->getPredicate(), CostKind, VL0);
+ VecCost += TTI->getCmpSelInstrCost(
+ E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
+ cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
+ E->getAltOp());
} else {
Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
@@ -5338,16 +6563,21 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
TTI::CastContextHint::None, CostKind);
}
- SmallVector<int> Mask;
- buildSuffleEntryMask(
- E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
- [E](Instruction *I) {
- assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
- return I->getOpcode() == E->getAltOpcode();
- },
- Mask);
- CommonCost =
- TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy, Mask);
+ if (E->ReuseShuffleIndices.empty()) {
+ CommonCost =
+ TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy);
+ } else {
+ SmallVector<int> Mask;
+ buildShuffleEntryMask(
+ E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
+ [E](Instruction *I) {
+ assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+ return I->getOpcode() == E->getAltOpcode();
+ },
+ Mask);
+ CommonCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
+ FinalVecTy, Mask);
+ }
LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
return CommonCost + VecCost - ScalarCost;
}
@@ -5475,7 +6705,10 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
// No need to vectorize inserts of gathered values.
if (VectorizableTree.size() == 2 &&
isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
- VectorizableTree[1]->State == TreeEntry::NeedToGather)
+ VectorizableTree[1]->State == TreeEntry::NeedToGather &&
+ (VectorizableTree[1]->getVectorFactor() <= 2 ||
+ !(isSplat(VectorizableTree[1]->Scalars) ||
+ allConstant(VectorizableTree[1]->Scalars))))
return true;
// We can vectorize the tree if its size is greater than or equal to the
@@ -5605,20 +6838,26 @@ static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU,
return false;
auto *IE1 = VU;
auto *IE2 = V;
+ unsigned Idx1 = *getInsertIndex(IE1);
+ unsigned Idx2 = *getInsertIndex(IE2);
// Go through the vector operand of insertelement instructions trying to find
// either VU as the original vector for IE2 or V as the original vector for
// IE1.
do {
- if (IE2 == VU || IE1 == V)
- return true;
+ if (IE2 == VU)
+ return VU->hasOneUse();
+ if (IE1 == V)
+ return V->hasOneUse();
if (IE1) {
- if (IE1 != VU && !IE1->hasOneUse())
+ if ((IE1 != VU && !IE1->hasOneUse()) ||
+ getInsertIndex(IE1).value_or(Idx2) == Idx2)
IE1 = nullptr;
else
IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0));
}
if (IE2) {
- if (IE2 != V && !IE2->hasOneUse())
+ if ((IE2 != V && !IE2->hasOneUse()) ||
+ getInsertIndex(IE2).value_or(Idx1) == Idx1)
IE2 = nullptr;
else
IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0));
@@ -5627,6 +6866,153 @@ static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU,
return false;
}
+/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
+/// the buildvector sequence.
+static bool isFirstInsertElement(const InsertElementInst *IE1,
+ const InsertElementInst *IE2) {
+ if (IE1 == IE2)
+ return false;
+ const auto *I1 = IE1;
+ const auto *I2 = IE2;
+ const InsertElementInst *PrevI1;
+ const InsertElementInst *PrevI2;
+ unsigned Idx1 = *getInsertIndex(IE1);
+ unsigned Idx2 = *getInsertIndex(IE2);
+ do {
+ if (I2 == IE1)
+ return true;
+ if (I1 == IE2)
+ return false;
+ PrevI1 = I1;
+ PrevI2 = I2;
+ if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
+ getInsertIndex(I1).value_or(Idx2) != Idx2)
+ I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
+ if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
+ getInsertIndex(I2).value_or(Idx1) != Idx1)
+ I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
+ } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
+ llvm_unreachable("Two different buildvectors not expected.");
+}
+
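isFirstInsertElement decides ordering by walking both insertelement chains toward their common root. A simplified standalone model of that walk, assuming each insert only records its parent and ignoring the one-use and insert-index filtering of the real routine:

    #include <cstdio>

    // Each insert only records the insert it builds upon; the insert reached
    // by the other chain's walk comes first in the buildvector sequence.
    struct Insert {
      const Insert *Parent = nullptr;
    };

    static bool isFirst(const Insert *IE1, const Insert *IE2) {
      const Insert *I1 = IE1, *I2 = IE2;
      while (I1 || I2) {
        if (I2 == IE1)
          return true;
        if (I1 == IE2)
          return false;
        if (I1)
          I1 = I1->Parent;
        if (I2)
          I2 = I2->Parent;
      }
      return false; // Unrelated chains; the real routine asserts instead.
    }

    int main() {
      // Buildvector chain: C inserts into B, which inserts into A.
      Insert A, B, C;
      B.Parent = &A;
      C.Parent = &B;
      std::printf("A before C: %d\n", isFirst(&A, &C)); // prints 1
      std::printf("C before A: %d\n", isFirst(&C, &A)); // prints 0
      return 0;
    }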
+namespace {
+/// Returns the incoming Value * if the requested type is Value * too, or a
+/// default-constructed value otherwise.
+struct ValueSelect {
+ template <typename U>
+ static typename std::enable_if<std::is_same<Value *, U>::value, Value *>::type
+ get(Value *V) {
+ return V;
+ }
+ template <typename U>
+ static typename std::enable_if<!std::is_same<Value *, U>::value, U>::type
+ get(Value *) {
+ return U();
+ }
+};
+} // namespace
+
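ValueSelect lets the shared template below either pass real IR values through or produce default-constructed placeholders, so the same code path can drive both cost estimation and code generation. A standalone analogue with hypothetical types:

    #include <type_traits>

    struct Node {};

    // When the requested type matches the stored pointer type the pointer is
    // passed through; otherwise a default-constructed value is produced.
    struct Select {
      template <typename U>
      static std::enable_if_t<std::is_same<Node *, U>::value, Node *>
      get(Node *N) {
        return N;
      }
      template <typename U>
      static std::enable_if_t<!std::is_same<Node *, U>::value, U> get(Node *) {
        return U();
      }
    };

    int main() {
      Node N;
      Node *Same = Select::get<Node *>(&N); // Passes the pointer through.
      int Other = Select::get<int>(&N);     // Value-initialized to 0.
      return (Same == &N && Other == 0) ? 0 : 1;
    }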
+/// Analyzes the provided shuffle masks and performs the requested actions on
+/// the vectors with the given shuffle masks. It proceeds in several steps.
+/// 1. If the Base vector is not an undef vector, resize the very first mask to
+/// have a common VF and perform the action for 2 input vectors (including the
+/// non-undef Base). Other shuffle masks are combined with the result of the
+/// first stage and processed as a shuffle of 2 vectors.
+/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
+/// the action only for 1 vector with the given mask, if it is not the identity
+/// mask.
+/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
+/// vectors, combining the masks properly between the steps.
+template <typename T>
+static T *performExtractsShuffleAction(
+ MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
+ function_ref<unsigned(T *)> GetVF,
+ function_ref<std::pair<T *, bool>(T *, ArrayRef<int>)> ResizeAction,
+ function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
+ assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
+ SmallVector<int> Mask(ShuffleMask.begin()->second);
+ auto VMIt = std::next(ShuffleMask.begin());
+ T *Prev = nullptr;
+ bool IsBaseNotUndef = !isUndefVector(Base);
+ if (IsBaseNotUndef) {
+ // Base is not undef, need to combine it with the next subvectors.
+ std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask);
+ for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
+ if (Mask[Idx] == UndefMaskElem)
+ Mask[Idx] = Idx;
+ else
+ Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
+ }
+ auto *V = ValueSelect::get<T *>(Base);
+ (void)V;
+ assert((!V || GetVF(V) == Mask.size()) &&
+ "Expected base vector of VF number of elements.");
+ Prev = Action(Mask, {nullptr, Res.first});
+ } else if (ShuffleMask.size() == 1) {
+    // Base is undef and only 1 vector is shuffled - perform the action only
+    // for a single vector, if the mask is not the identity mask.
+ std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask);
+ if (Res.second)
+ // Identity mask is found.
+ Prev = Res.first;
+ else
+ Prev = Action(Mask, {ShuffleMask.begin()->first});
+ } else {
+    // Base is undef and at least 2 input vectors are shuffled - perform
+    // 2-vector shuffles step by step, combining the shuffle masks between the
+    // steps.
+ unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
+ unsigned Vec2VF = GetVF(VMIt->first);
+ if (Vec1VF == Vec2VF) {
+ // No need to resize the input vectors since they are of the same size, we
+ // can shuffle them directly.
+ ArrayRef<int> SecMask = VMIt->second;
+ for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
+ if (SecMask[I] != UndefMaskElem) {
+ assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars.");
+ Mask[I] = SecMask[I] + Vec1VF;
+ }
+ }
+ Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
+ } else {
+ // Vectors of different sizes - resize and reshuffle.
+ std::pair<T *, bool> Res1 =
+ ResizeAction(ShuffleMask.begin()->first, Mask);
+ std::pair<T *, bool> Res2 = ResizeAction(VMIt->first, VMIt->second);
+ ArrayRef<int> SecMask = VMIt->second;
+ for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
+ if (Mask[I] != UndefMaskElem) {
+ assert(SecMask[I] == UndefMaskElem && "Multiple uses of scalars.");
+ if (Res1.second)
+ Mask[I] = I;
+ } else if (SecMask[I] != UndefMaskElem) {
+ assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars.");
+ Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
+ }
+ }
+ Prev = Action(Mask, {Res1.first, Res2.first});
+ }
+ VMIt = std::next(VMIt);
+ }
+ // Perform requested actions for the remaining masks/vectors.
+ for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
+ // Shuffle other input vectors, if any.
+ std::pair<T *, bool> Res = ResizeAction(VMIt->first, VMIt->second);
+ ArrayRef<int> SecMask = VMIt->second;
+ for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
+ if (SecMask[I] != UndefMaskElem) {
+ assert((Mask[I] == UndefMaskElem || IsBaseNotUndef) &&
+ "Multiple uses of scalars.");
+ Mask[I] = (Res.second ? I : SecMask[I]) + VF;
+ } else if (Mask[I] != UndefMaskElem) {
+ Mask[I] = I;
+ }
+ }
+ Prev = Action(Mask, {Prev, Res.first});
+ }
+ return Prev;
+}
+
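For the two-vector step described above, the key operation is folding two per-source masks into one two-source shuffle mask. A small worked sketch with assumed inputs of equal VF and an undef base:

    #include <cstdio>
    #include <vector>

    // Two per-source masks of equal VF over an undef base: lanes taken from
    // the second source are offset by VF so a single two-source shuffle mask
    // covers both inputs.
    int main() {
      const int Undef = -1;
      const int VF = 4;
      std::vector<int> Mask1 = {0, Undef, 2, Undef}; // Lanes from vector 1.
      std::vector<int> Mask2 = {Undef, 1, Undef, 3}; // Lanes from vector 2.
      std::vector<int> Combined(Mask1);
      for (int I = 0; I < VF; ++I)
        if (Mask2[I] != Undef)
          Combined[I] = Mask2[I] + VF; // Second source starts at index VF.
      for (int Idx : Combined)
        std::printf("%d ", Idx); // Prints: 0 5 2 7
      std::printf("\n");
      return 0;
    }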
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost Cost = 0;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
@@ -5635,7 +7021,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
- TreeEntry &TE = *VectorizableTree[I].get();
+ TreeEntry &TE = *VectorizableTree[I];
InstructionCost C = getEntryCost(&TE, VectorizedVals);
Cost += C;
@@ -5647,9 +7033,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
SmallPtrSet<Value *, 16> ExtractCostCalculated;
InstructionCost ExtractCost = 0;
- SmallVector<unsigned> VF;
- SmallVector<SmallVector<int>> ShuffleMask;
- SmallVector<Value *> FirstUsers;
+ SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
+ SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
SmallVector<APInt> DemandedElts;
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
@@ -5678,37 +7063,55 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
Optional<unsigned> InsertIdx = getInsertIndex(VU);
if (InsertIdx) {
- auto *It = find_if(FirstUsers, [VU](Value *V) {
- return areTwoInsertFromSameBuildVector(VU,
- cast<InsertElementInst>(V));
- });
+ const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
+ auto *It =
+ find_if(FirstUsers,
+ [VU](const std::pair<Value *, const TreeEntry *> &Pair) {
+ return areTwoInsertFromSameBuildVector(
+ VU, cast<InsertElementInst>(Pair.first));
+ });
int VecId = -1;
if (It == FirstUsers.end()) {
- VF.push_back(FTy->getNumElements());
- ShuffleMask.emplace_back(VF.back(), UndefMaskElem);
+ (void)ShuffleMasks.emplace_back();
+ SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
+ if (Mask.empty())
+ Mask.assign(FTy->getNumElements(), UndefMaskElem);
// Find the insertvector, vectorized in tree, if any.
Value *Base = VU;
- while (isa<InsertElementInst>(Base)) {
+ while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
+ if (IEBase != EU.User &&
+ (!IEBase->hasOneUse() ||
+ getInsertIndex(IEBase).value_or(*InsertIdx) == *InsertIdx))
+ break;
// Build the mask for the vectorized insertelement instructions.
- if (const TreeEntry *E = getTreeEntry(Base)) {
- VU = cast<InsertElementInst>(Base);
+ if (const TreeEntry *E = getTreeEntry(IEBase)) {
+ VU = IEBase;
do {
- int Idx = E->findLaneForValue(Base);
- ShuffleMask.back()[Idx] = Idx;
- Base = cast<InsertElementInst>(Base)->getOperand(0);
+ IEBase = cast<InsertElementInst>(Base);
+ int Idx = *getInsertIndex(IEBase);
+ assert(Mask[Idx] == UndefMaskElem &&
+ "InsertElementInstruction used already.");
+ Mask[Idx] = Idx;
+ Base = IEBase->getOperand(0);
} while (E == getTreeEntry(Base));
break;
}
Base = cast<InsertElementInst>(Base)->getOperand(0);
}
- FirstUsers.push_back(VU);
- DemandedElts.push_back(APInt::getZero(VF.back()));
+ FirstUsers.emplace_back(VU, ScalarTE);
+ DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
VecId = FirstUsers.size() - 1;
} else {
+ if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
+ It->first = VU;
VecId = std::distance(FirstUsers.begin(), It);
}
- ShuffleMask[VecId][*InsertIdx] = EU.Lane;
- DemandedElts[VecId].setBit(*InsertIdx);
+ int InIdx = *InsertIdx;
+ SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
+ if (Mask.empty())
+ Mask.assign(FTy->getNumElements(), UndefMaskElem);
+ Mask[InIdx] = EU.Lane;
+ DemandedElts[VecId].setBit(InIdx);
continue;
}
}
@@ -5734,86 +7137,75 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost SpillCost = getSpillCost();
Cost += SpillCost + ExtractCost;
- if (FirstUsers.size() == 1) {
- int Limit = ShuffleMask.front().size() * 2;
- if (all_of(ShuffleMask.front(), [Limit](int Idx) { return Idx < Limit; }) &&
- !ShuffleVectorInst::isIdentityMask(ShuffleMask.front())) {
- InstructionCost C = TTI->getShuffleCost(
+ auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask) {
+ InstructionCost C = 0;
+ unsigned VF = Mask.size();
+ unsigned VecVF = TE->getVectorFactor();
+ if (VF != VecVF &&
+ (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
+ (all_of(Mask,
+ [VF](int Idx) { return Idx < 2 * static_cast<int>(VF); }) &&
+ !ShuffleVectorInst::isIdentityMask(Mask)))) {
+ SmallVector<int> OrigMask(VecVF, UndefMaskElem);
+ std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
+ OrigMask.begin());
+ C = TTI->getShuffleCost(
TTI::SK_PermuteSingleSrc,
- cast<FixedVectorType>(FirstUsers.front()->getType()),
- ShuffleMask.front());
- LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
- << " for final shuffle of insertelement external users "
- << *VectorizableTree.front()->Scalars.front() << ".\n"
- << "SLP: Current total cost = " << Cost << "\n");
+ FixedVectorType::get(TE->getMainOp()->getType(), VecVF), OrigMask);
+ LLVM_DEBUG(
+ dbgs() << "SLP: Adding cost " << C
+ << " for final shuffle of insertelement external users.\n";
+ TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
Cost += C;
+ return std::make_pair(TE, true);
}
+ return std::make_pair(TE, false);
+ };
+ // Calculate the cost of the reshuffled vectors, if any.
+ for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
+ Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
+ unsigned VF = ShuffleMasks[I].begin()->second.size();
+ auto *FTy = FixedVectorType::get(
+ cast<VectorType>(FirstUsers[I].first->getType())->getElementType(), VF);
+ auto Vector = ShuffleMasks[I].takeVector();
+ auto &&EstimateShufflesCost = [this, FTy,
+ &Cost](ArrayRef<int> Mask,
+ ArrayRef<const TreeEntry *> TEs) {
+ assert((TEs.size() == 1 || TEs.size() == 2) &&
+ "Expected exactly 1 or 2 tree entries.");
+ if (TEs.size() == 1) {
+ int Limit = 2 * Mask.size();
+ if (!all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) ||
+ !ShuffleVectorInst::isIdentityMask(Mask)) {
+ InstructionCost C =
+ TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask);
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+ << " for final shuffle of insertelement "
+ "external users.\n";
+ TEs.front()->dump();
+ dbgs() << "SLP: Current total cost = " << Cost << "\n");
+ Cost += C;
+ }
+ } else {
+ InstructionCost C =
+ TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, FTy, Mask);
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+ << " for final shuffle of vector node and external "
+ "insertelement users.\n";
+ if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
+ dbgs() << "SLP: Current total cost = " << Cost << "\n");
+ Cost += C;
+ }
+ return TEs.back();
+ };
+ (void)performExtractsShuffleAction<const TreeEntry>(
+ makeMutableArrayRef(Vector.data(), Vector.size()), Base,
+ [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
+ EstimateShufflesCost);
InstructionCost InsertCost = TTI->getScalarizationOverhead(
- cast<FixedVectorType>(FirstUsers.front()->getType()),
- DemandedElts.front(), /*Insert*/ true, /*Extract*/ false);
- LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
- << " for insertelements gather.\n"
- << "SLP: Current total cost = " << Cost << "\n");
- Cost -= InsertCost;
- } else if (FirstUsers.size() >= 2) {
- unsigned MaxVF = *std::max_element(VF.begin(), VF.end());
- // Combined masks of the first 2 vectors.
- SmallVector<int> CombinedMask(MaxVF, UndefMaskElem);
- copy(ShuffleMask.front(), CombinedMask.begin());
- APInt CombinedDemandedElts = DemandedElts.front().zextOrSelf(MaxVF);
- auto *VecTy = FixedVectorType::get(
- cast<VectorType>(FirstUsers.front()->getType())->getElementType(),
- MaxVF);
- for (int I = 0, E = ShuffleMask[1].size(); I < E; ++I) {
- if (ShuffleMask[1][I] != UndefMaskElem) {
- CombinedMask[I] = ShuffleMask[1][I] + MaxVF;
- CombinedDemandedElts.setBit(I);
- }
- }
- InstructionCost C =
- TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask);
- LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
- << " for final shuffle of vector node and external "
- "insertelement users "
- << *VectorizableTree.front()->Scalars.front() << ".\n"
- << "SLP: Current total cost = " << Cost << "\n");
- Cost += C;
- InstructionCost InsertCost = TTI->getScalarizationOverhead(
- VecTy, CombinedDemandedElts, /*Insert*/ true, /*Extract*/ false);
- LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
- << " for insertelements gather.\n"
- << "SLP: Current total cost = " << Cost << "\n");
+ cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
+ /*Insert*/ true, /*Extract*/ false);
Cost -= InsertCost;
- for (int I = 2, E = FirstUsers.size(); I < E; ++I) {
- // Other elements - permutation of 2 vectors (the initial one and the
- // next Ith incoming vector).
- unsigned VF = ShuffleMask[I].size();
- for (unsigned Idx = 0; Idx < VF; ++Idx) {
- int Mask = ShuffleMask[I][Idx];
- if (Mask != UndefMaskElem)
- CombinedMask[Idx] = MaxVF + Mask;
- else if (CombinedMask[Idx] != UndefMaskElem)
- CombinedMask[Idx] = Idx;
- }
- for (unsigned Idx = VF; Idx < MaxVF; ++Idx)
- if (CombinedMask[Idx] != UndefMaskElem)
- CombinedMask[Idx] = Idx;
- InstructionCost C =
- TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask);
- LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
- << " for final shuffle of vector node and external "
- "insertelement users "
- << *VectorizableTree.front()->Scalars.front() << ".\n"
- << "SLP: Current total cost = " << Cost << "\n");
- Cost += C;
- InstructionCost InsertCost = TTI->getScalarizationOverhead(
- cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I],
- /*Insert*/ true, /*Extract*/ false);
- LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
- << " for insertelements gather.\n"
- << "SLP: Current total cost = " << Cost << "\n");
- Cost -= InsertCost;
- }
}
#ifndef NDEBUG
@@ -5906,6 +7298,12 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
}
}
+ if (UsedTEs.empty()) {
+ assert(all_of(TE->Scalars, UndefValue::classof) &&
+ "Expected vector of undefs only.");
+ return None;
+ }
+
unsigned VF = 0;
if (UsedTEs.size() == 1) {
// Try to find the perfect match in another gather node at first.
@@ -5965,17 +7363,11 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
return None;
}
-InstructionCost
-BoUpSLP::getGatherCost(FixedVectorType *Ty,
- const DenseSet<unsigned> &ShuffledIndices,
- bool NeedToShuffle) const {
- unsigned NumElts = Ty->getNumElements();
- APInt DemandedElts = APInt::getZero(NumElts);
- for (unsigned I = 0; I < NumElts; ++I)
- if (!ShuffledIndices.count(I))
- DemandedElts.setBit(I);
+InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty,
+ const APInt &ShuffledIndices,
+ bool NeedToShuffle) const {
InstructionCost Cost =
- TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
+ TTI->getScalarizationOverhead(Ty, ~ShuffledIndices, /*Insert*/ true,
/*Extract*/ false);
if (NeedToShuffle)
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
@@ -5992,19 +7384,19 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
// Find the cost of inserting/extracting values from the vector.
// Check if the same elements are inserted several times and count them as
// shuffle candidates.
- DenseSet<unsigned> ShuffledElements;
+ APInt ShuffledElements = APInt::getZero(VL.size());
DenseSet<Value *> UniqueElements;
// Iterate in reverse order to consider insert elements with the high cost.
for (unsigned I = VL.size(); I > 0; --I) {
unsigned Idx = I - 1;
// No need to shuffle duplicates for constants.
if (isConstant(VL[Idx])) {
- ShuffledElements.insert(Idx);
+ ShuffledElements.setBit(Idx);
continue;
}
if (!UniqueElements.insert(VL[Idx]).second) {
DuplicateNonConst = true;
- ShuffledElements.insert(Idx);
+ ShuffledElements.setBit(Idx);
}
}
return getGatherCost(VecTy, ShuffledElements, DuplicateNonConst);
@@ -6029,14 +7421,83 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
// Get the basic block this bundle is in. All instructions in the bundle
- // should be in this block.
+ // should be in this block (except for extractelement-like instructions with
+  // constant indices).
auto *Front = E->getMainOp();
auto *BB = Front->getParent();
assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
+ if (E->getOpcode() == Instruction::GetElementPtr &&
+ !isa<GetElementPtrInst>(V))
+ return true;
auto *I = cast<Instruction>(V);
- return !E->isOpcodeOrAlt(I) || I->getParent() == BB;
+ return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
+ isVectorLikeInstWithConstOps(I);
}));
+ auto &&FindLastInst = [E, Front, this, &BB]() {
+ Instruction *LastInst = Front;
+ for (Value *V : E->Scalars) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ if (LastInst->getParent() == I->getParent()) {
+ if (LastInst->comesBefore(I))
+ LastInst = I;
+ continue;
+ }
+ assert(isVectorLikeInstWithConstOps(LastInst) &&
+ isVectorLikeInstWithConstOps(I) &&
+ "Expected vector-like insts only.");
+ if (!DT->isReachableFromEntry(LastInst->getParent())) {
+ LastInst = I;
+ continue;
+ }
+ if (!DT->isReachableFromEntry(I->getParent()))
+ continue;
+ auto *NodeA = DT->getNode(LastInst->getParent());
+ auto *NodeB = DT->getNode(I->getParent());
+ assert(NodeA && "Should only process reachable instructions");
+ assert(NodeB && "Should only process reachable instructions");
+ assert((NodeA == NodeB) ==
+ (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
+ "Different nodes should have different DFS numbers");
+ if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
+ LastInst = I;
+ }
+ BB = LastInst->getParent();
+ return LastInst;
+ };
+
+ auto &&FindFirstInst = [E, Front]() {
+ Instruction *FirstInst = Front;
+ for (Value *V : E->Scalars) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ if (I->comesBefore(FirstInst))
+ FirstInst = I;
+ }
+ return FirstInst;
+ };
+
+ // Set the insert point to the beginning of the basic block if the entry
+ // should not be scheduled.
+ if (E->State != TreeEntry::NeedToGather &&
+ doesNotNeedToSchedule(E->Scalars)) {
+ Instruction *InsertInst;
+ if (all_of(E->Scalars, isUsedOutsideBlock))
+ InsertInst = FindLastInst();
+ else
+ InsertInst = FindFirstInst();
+    // If the instruction is a PHI, set the insert point after all the PHIs.
+ if (isa<PHINode>(InsertInst))
+ InsertInst = BB->getFirstNonPHI();
+ BasicBlock::iterator InsertPt = InsertInst->getIterator();
+ Builder.SetInsertPoint(BB, InsertPt);
+ Builder.SetCurrentDebugLocation(Front->getDebugLoc());
+ return;
+ }
+
// The last instruction in the bundle in program order.
Instruction *LastInst = nullptr;
@@ -6045,8 +7506,10 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
// VL.back() and iterate over schedule data until we reach the end of the
// bundle. The end of the bundle is marked by null ScheduleData.
if (BlocksSchedules.count(BB)) {
- auto *Bundle =
- BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back()));
+ Value *V = E->isOneOf(E->Scalars.back());
+ if (doesNotNeedToBeScheduled(V))
+ V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
+ auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
if (Bundle && Bundle->isPartOfBundle())
for (; Bundle; Bundle = Bundle->NextInBundle)
if (Bundle->OpValue == Bundle->Inst)
@@ -6072,19 +7535,16 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
// we both exit early from buildTree_rec and that the bundle be out-of-order
// (causing us to iterate all the way to the end of the block).
if (!LastInst) {
- SmallPtrSet<Value *, 16> Bundle(E->Scalars.begin(), E->Scalars.end());
- for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
- if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I))
- LastInst = &I;
- if (Bundle.empty())
- break;
- }
+ LastInst = FindLastInst();
+    // If the instruction is a PHI, set the insert point after all the PHIs.
+ if (isa<PHINode>(LastInst))
+ LastInst = BB->getFirstNonPHI()->getPrevNode();
}
assert(LastInst && "Failed to find last instruction in bundle");
// Set the insertion point after the last instruction in the bundle. Set the
// debug location to Front.
- Builder.SetInsertPoint(BB, ++LastInst->getIterator());
+ Builder.SetInsertPoint(BB, std::next(LastInst->getIterator()));
Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
@@ -6214,8 +7674,15 @@ public:
} // namespace
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
- unsigned VF = VL.size();
+ const unsigned VF = VL.size();
InstructionsState S = getSameOpcode(VL);
+  // Special processing for a GEP bundle, which may include non-GEP values.
+ if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
+ const auto *It =
+ find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); });
+ if (It != VL.end())
+ S = getSameOpcode(*It);
+ }
if (S.getOpcode()) {
if (TreeEntry *E = getTreeEntry(S.OpValue))
if (E->isSame(VL)) {
@@ -6270,7 +7737,18 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
}
}
- // Check that every instruction appears once in this bundle.
+ // Can't vectorize this, so simply build a new vector with each lane
+ // corresponding to the requested value.
+ return createBuildVector(VL);
+}
+Value *BoUpSLP::createBuildVector(ArrayRef<Value *> VL) {
+ assert(any_of(VectorizableTree,
+ [VL](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->State == TreeEntry::NeedToGather && TE->isSame(VL);
+ }) &&
+ "Non-matching gather node.");
+ unsigned VF = VL.size();
+ // Exploit possible reuse of values across lanes.
SmallVector<int> ReuseShuffleIndicies;
SmallVector<Value *> UniqueValues;
if (VL.size() > 2) {
@@ -6303,6 +7781,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
ReuseShuffleIndicies.append(VF - ReuseShuffleIndicies.size(),
UndefMaskElem);
} else if (UniqueValues.size() >= VF - 1 || UniqueValues.size() <= 1) {
+ if (UniqueValues.empty()) {
+ assert(all_of(VL, UndefValue::classof) && "Expected list of undefs.");
+ NumValues = VF;
+ }
ReuseShuffleIndicies.clear();
UniqueValues.clear();
UniqueValues.append(VL.begin(), std::next(VL.begin(), NumValues));
@@ -6342,7 +7824,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
SmallVector<const TreeEntry *> Entries;
Optional<TargetTransformInfo::ShuffleKind> Shuffle =
isGatherShuffledEntry(E, Mask, Entries);
- if (Shuffle.hasValue()) {
+ if (Shuffle) {
assert((Entries.size() == 1 || Entries.size() == 2) &&
"Expected shuffle of 1 or 2 entries.");
Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue,
@@ -6376,14 +7858,20 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
switch (ShuffleOrOp) {
case Instruction::PHI: {
- assert(
- (E->ReorderIndices.empty() || E != VectorizableTree.front().get()) &&
- "PHI reordering is free.");
+ assert((E->ReorderIndices.empty() ||
+ E != VectorizableTree.front().get() ||
+ !E->UserTreeIndices.empty()) &&
+ "PHI reordering is free.");
auto *PH = cast<PHINode>(VL0);
Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
Value *V = NewPhi;
+
+      // Adjust the insertion point once all PHIs have been generated.
+ Builder.SetInsertPoint(&*PH->getParent()->getFirstInsertionPt());
+ Builder.SetCurrentDebugLocation(PH->getDebugLoc());
+
ShuffleBuilder.addInversedMask(E->ReorderIndices);
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
V = ShuffleBuilder.finalize(V);
@@ -6449,7 +7937,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
const unsigned NumScalars = E->Scalars.size();
- unsigned Offset = *getInsertIndex(VL0, 0);
+ unsigned Offset = *getInsertIndex(VL0);
assert(Offset < NumElts && "Failed to find vector index offset");
// Create shuffle to resize vector
@@ -6656,19 +8144,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
unsigned AS = LI->getPointerAddressSpace();
Value *PO = LI->getPointerOperand();
if (E->State == TreeEntry::Vectorize) {
-
Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS));
+ NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
// The pointer operand uses an in-tree scalar so we add the new BitCast
- // to ExternalUses list to make sure that an extract will be generated
- // in the future.
+ // or LoadInst to ExternalUses list to make sure that an extract will
+ // be generated in the future.
if (TreeEntry *Entry = getTreeEntry(PO)) {
// Find which lane we need to extract.
unsigned FoundLane = Entry->findLaneForValue(PO);
- ExternalUses.emplace_back(PO, cast<User>(VecPtr), FoundLane);
+ ExternalUses.emplace_back(
+ PO, PO != VecPtr ? cast<User>(VecPtr) : NewLI, FoundLane);
}
-
- NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
} else {
assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
Value *VecPtr = vectorizeTree(E->getOperand(0));
@@ -6676,7 +8163,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Align CommonAlignment = LI->getAlign();
for (Value *V : E->Scalars)
CommonAlignment =
- commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
+ std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
}
Value *V = propagateMetadata(NewLI, E->Scalars);
@@ -6701,17 +8188,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *ScalarPtr = SI->getPointerOperand();
Value *VecPtr = Builder.CreateBitCast(
ScalarPtr, VecValue->getType()->getPointerTo(AS));
- StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr,
- SI->getAlign());
+ StoreInst *ST =
+ Builder.CreateAlignedStore(VecValue, VecPtr, SI->getAlign());
- // The pointer operand uses an in-tree scalar, so add the new BitCast to
- // ExternalUses to make sure that an extract will be generated in the
- // future.
+ // The pointer operand uses an in-tree scalar, so add the new BitCast or
+ // StoreInst to ExternalUses to make sure that an extract will be
+ // generated in the future.
if (TreeEntry *Entry = getTreeEntry(ScalarPtr)) {
// Find which lane we need to extract.
unsigned FoundLane = Entry->findLaneForValue(ScalarPtr);
- ExternalUses.push_back(
- ExternalUser(ScalarPtr, cast<User>(VecPtr), FoundLane));
+ ExternalUses.push_back(ExternalUser(
+ ScalarPtr, ScalarPtr != VecPtr ? cast<User>(VecPtr) : ST,
+ FoundLane));
}
Value *V = propagateMetadata(ST, E->Scalars);
@@ -6733,8 +8221,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
- if (Instruction *I = dyn_cast<Instruction>(V))
- V = propagateMetadata(I, E->Scalars);
+ if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
+ SmallVector<Value *> GEPs;
+ for (Value *V : E->Scalars) {
+ if (isa<GetElementPtrInst>(V))
+ GEPs.push_back(V);
+ }
+ V = propagateMetadata(I, GEPs);
+ }
ShuffleBuilder.addInversedMask(E->ReorderIndices);
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
@@ -6767,11 +8261,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ValueList OpVL;
// Some intrinsics have scalar arguments. This argument should not be
// vectorized.
- if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) {
+ if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(IID, j)) {
CallInst *CEI = cast<CallInst>(VL0);
ScalarArg = CEI->getArgOperand(j);
OpVecs.push_back(CEI->getArgOperand(j));
- if (hasVectorInstrinsicOverloadedScalarOpd(IID, j))
+ if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j))
TysForDecl.push_back(ScalarArg->getType());
continue;
}
@@ -6779,6 +8273,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *OpVec = vectorizeTree(E->getOperand(j));
LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
OpVecs.push_back(OpVec);
+ if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j))
+ TysForDecl.push_back(OpVec->getType());
}
Function *CF;
@@ -6822,11 +8318,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
((Instruction::isBinaryOp(E->getOpcode()) &&
Instruction::isBinaryOp(E->getAltOpcode())) ||
(Instruction::isCast(E->getOpcode()) &&
- Instruction::isCast(E->getAltOpcode()))) &&
+ Instruction::isCast(E->getAltOpcode())) ||
+ (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
"Invalid Shuffle Vector Operand");
Value *LHS = nullptr, *RHS = nullptr;
- if (Instruction::isBinaryOp(E->getOpcode())) {
+ if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
setInsertPointAfterBundle(E);
LHS = vectorizeTree(E->getOperand(0));
RHS = vectorizeTree(E->getOperand(1));
@@ -6846,6 +8343,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
V1 = Builder.CreateBinOp(
static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
+ } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
+ V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
+ auto *AltCI = cast<CmpInst>(E->getAltOp());
+ CmpInst::Predicate AltPred = AltCI->getPredicate();
+ V1 = Builder.CreateCmp(AltPred, LHS, RHS);
} else {
V0 = Builder.CreateCast(
static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
@@ -6866,11 +8368,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// each vector operation.
ValueList OpScalars, AltScalars;
SmallVector<int> Mask;
- buildSuffleEntryMask(
+ buildShuffleEntryMask(
E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
[E](Instruction *I) {
assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
- return I->getOpcode() == E->getAltOpcode();
+ return isAlternateInstruction(I, E->getMainOp(), E->getAltOp());
},
Mask, &OpScalars, &AltScalars);
@@ -6901,6 +8403,17 @@ Value *BoUpSLP::vectorizeTree() {
return vectorizeTree(ExternallyUsedValues);
}
+namespace {
+/// Data type for handling buildvector sequences with the reused scalars from
+/// other tree entries.
+struct ShuffledInsertData {
+ /// List of insertelements to be replaced by shuffles.
+ SmallVector<InsertElementInst *> InsertElements;
+ /// The parent vectors and shuffle mask for the given list of inserts.
+ MapVector<Value *, SmallVector<int>> ValueMasks;
+};
+} // namespace
+
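A mock of how one ShuffledInsertData record might be populated, with hypothetical MockInsert/MockSource types and illustrative values standing in for the insertelement instructions and vectorized source values (std::map stands in for MapVector):

    #include <map>
    #include <vector>

    // Hypothetical stand-ins for insertelement instructions and vectorized
    // source values.
    struct MockInsert { int Position; };
    struct MockSource { const char *Name; };

    int main() {
      const int Undef = -1;
      // The insertelements of one buildvector to be replaced by shuffles.
      std::vector<MockInsert *> InsertElements;
      // For every vectorized source, which of its lanes lands at which
      // position of the buildvector.
      std::map<MockSource *, std::vector<int>> ValueMasks;

      MockInsert I0{0}, I1{1};
      MockSource VecA{"VecA"};
      InsertElements = {&I0, &I1};
      std::vector<int> &Mask = ValueMasks[&VecA];
      Mask.assign(4, Undef); // The destination buildvector has 4 lanes.
      Mask[I0.Position] = 2; // Lane 2 of VecA goes to position 0.
      Mask[I1.Position] = 3; // Lane 3 of VecA goes to position 1.
      return (Mask[0] == 2 && Mask[1] == 3) ? 0 : 1;
    }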
Value *
BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
// All blocks must be scheduled before any instructions are inserted.
@@ -6934,6 +8447,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
<< " values .\n");
+ SmallVector<ShuffledInsertData> ShuffledInserts;
+  // Maps a vector instruction to the original insertelement instruction.
+ DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
// Extract all of the elements with the external uses.
for (const auto &ExternalUse : ExternalUses) {
Value *Scalar = ExternalUse.Scalar;
@@ -6947,6 +8463,10 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
assert(E && "Invalid scalar");
assert(E->State != TreeEntry::NeedToGather &&
"Extracting from a gather list");
+ // Non-instruction pointers are not deleted, just skip them.
+ if (E->getOpcode() == Instruction::GetElementPtr &&
+ !isa<GetElementPtrInst>(Scalar))
+ continue;
Value *Vec = E->VectorizedValue;
assert(Vec && "Can't find vectorizable value");
@@ -6973,6 +8493,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
assert(isa<FixedVectorType>(Scalar->getType()) &&
isa<InsertElementInst>(Scalar) &&
"In-tree scalar of vector type is not insertelement?");
+ auto *IE = cast<InsertElementInst>(Scalar);
+ VectorToInsertElement.try_emplace(Vec, IE);
return Vec;
};
// If User == nullptr, the Scalar is used as extra arg. Generate
@@ -7001,6 +8523,69 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
continue;
}
+ if (auto *VU = dyn_cast<InsertElementInst>(User)) {
+ // Skip if the scalar is another vector op or Vec is not an instruction.
+ if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
+ if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
+ Optional<unsigned> InsertIdx = getInsertIndex(VU);
+ if (InsertIdx) {
+ // Need to use original vector, if the root is truncated.
+ if (MinBWs.count(Scalar) &&
+ VectorizableTree[0]->VectorizedValue == Vec)
+ Vec = VectorRoot;
+ auto *It =
+ find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
+ // Checks if 2 insertelements are from the same buildvector.
+ InsertElementInst *VecInsert = Data.InsertElements.front();
+ return areTwoInsertFromSameBuildVector(VU, VecInsert);
+ });
+ unsigned Idx = *InsertIdx;
+ if (It == ShuffledInserts.end()) {
+ (void)ShuffledInserts.emplace_back();
+ It = std::next(ShuffledInserts.begin(),
+ ShuffledInserts.size() - 1);
+ SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
+ if (Mask.empty())
+ Mask.assign(FTy->getNumElements(), UndefMaskElem);
+ // Find the insertvector, vectorized in tree, if any.
+ Value *Base = VU;
+ while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
+ if (IEBase != User &&
+ (!IEBase->hasOneUse() ||
+ getInsertIndex(IEBase).value_or(Idx) == Idx))
+ break;
+ // Build the mask for the vectorized insertelement instructions.
+ if (const TreeEntry *E = getTreeEntry(IEBase)) {
+ do {
+ IEBase = cast<InsertElementInst>(Base);
+ int IEIdx = *getInsertIndex(IEBase);
+                  assert(Mask[IEIdx] == UndefMaskElem &&
+ "InsertElementInstruction used already.");
+ Mask[IEIdx] = IEIdx;
+ Base = IEBase->getOperand(0);
+ } while (E == getTreeEntry(Base));
+ break;
+ }
+ Base = cast<InsertElementInst>(Base)->getOperand(0);
+              // After vectorization the def-use chain has changed, so we need
+              // to look through the original insertelement instructions if
+              // they were replaced by vector instructions.
+ auto It = VectorToInsertElement.find(Base);
+ if (It != VectorToInsertElement.end())
+ Base = It->second;
+ }
+ }
+ SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
+ if (Mask.empty())
+ Mask.assign(FTy->getNumElements(), UndefMaskElem);
+ Mask[Idx] = ExternalUse.Lane;
+ It->InsertElements.push_back(cast<InsertElementInst>(User));
+ continue;
+ }
+ }
+ }
+ }
+
// Generate extracts for out-of-tree users.
// Find the insertion point for the extractelement lane.
if (auto *VecI = dyn_cast<Instruction>(Vec)) {
@@ -7036,6 +8621,221 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
}
+ // Checks if the mask is an identity mask.
+ auto &&IsIdentityMask = [](ArrayRef<int> Mask, FixedVectorType *VecTy) {
+ int Limit = Mask.size();
+ return VecTy->getNumElements() == Mask.size() &&
+ all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) &&
+ ShuffleVectorInst::isIdentityMask(Mask);
+ };
+  // Tries to combine 2 different masks into a single one.
+ auto &&CombineMasks = [](SmallVectorImpl<int> &Mask, ArrayRef<int> ExtMask) {
+ SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem);
+ for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
+ if (ExtMask[I] == UndefMaskElem)
+ continue;
+ NewMask[I] = Mask[ExtMask[I]];
+ }
+ Mask.swap(NewMask);
+ };
+ // Peek through shuffles, trying to simplify the final shuffle code.
+ auto &&PeekThroughShuffles =
+ [&IsIdentityMask, &CombineMasks](Value *&V, SmallVectorImpl<int> &Mask,
+ bool CheckForLengthChange = false) {
+ while (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
+          // Exit if this is not a fixed vector type or the shuffle changes
+          // the vector length.
+ if (!isa<FixedVectorType>(SV->getType()) ||
+ (CheckForLengthChange && SV->changesLength()))
+ break;
+ // Exit if the identity or broadcast mask is found.
+ if (IsIdentityMask(Mask, cast<FixedVectorType>(SV->getType())) ||
+ SV->isZeroEltSplat())
+ break;
+ bool IsOp1Undef = isUndefVector(SV->getOperand(0));
+ bool IsOp2Undef = isUndefVector(SV->getOperand(1));
+ if (!IsOp1Undef && !IsOp2Undef)
+ break;
+ SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
+ SV->getShuffleMask().end());
+ CombineMasks(ShuffleMask, Mask);
+ Mask.swap(ShuffleMask);
+ if (IsOp2Undef)
+ V = SV->getOperand(0);
+ else
+ V = SV->getOperand(1);
+ }
+ };
+  // Smart shuffle instruction emission, walks through shuffle trees and
+ // tries to find the best matching vector for the actual shuffle
+ // instruction.
+ auto &&CreateShuffle = [this, &IsIdentityMask, &PeekThroughShuffles,
+ &CombineMasks](Value *V1, Value *V2,
+ ArrayRef<int> Mask) -> Value * {
+ assert(V1 && "Expected at least one vector value.");
+ if (V2 && !isUndefVector(V2)) {
+ // Peek through shuffles.
+ Value *Op1 = V1;
+ Value *Op2 = V2;
+ int VF =
+ cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
+ SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem);
+ SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem);
+ for (int I = 0, E = Mask.size(); I < E; ++I) {
+ if (Mask[I] < VF)
+ CombinedMask1[I] = Mask[I];
+ else
+ CombinedMask2[I] = Mask[I] - VF;
+ }
+ Value *PrevOp1;
+ Value *PrevOp2;
+ do {
+ PrevOp1 = Op1;
+ PrevOp2 = Op2;
+ PeekThroughShuffles(Op1, CombinedMask1, /*CheckForLengthChange=*/true);
+ PeekThroughShuffles(Op2, CombinedMask2, /*CheckForLengthChange=*/true);
+ // Check if we have 2 resizing shuffles - need to peek through operands
+ // again.
+ if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
+ if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2))
+ if (SV1->getOperand(0)->getType() ==
+ SV2->getOperand(0)->getType() &&
+ SV1->getOperand(0)->getType() != SV1->getType() &&
+ isUndefVector(SV1->getOperand(1)) &&
+ isUndefVector(SV2->getOperand(1))) {
+ Op1 = SV1->getOperand(0);
+ Op2 = SV2->getOperand(0);
+ SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
+ SV1->getShuffleMask().end());
+ CombineMasks(ShuffleMask1, CombinedMask1);
+ CombinedMask1.swap(ShuffleMask1);
+ SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
+ SV2->getShuffleMask().end());
+ CombineMasks(ShuffleMask2, CombinedMask2);
+ CombinedMask2.swap(ShuffleMask2);
+ }
+ } while (PrevOp1 != Op1 || PrevOp2 != Op2);
+ VF = cast<VectorType>(Op1->getType())
+ ->getElementCount()
+ .getKnownMinValue();
+ for (int I = 0, E = Mask.size(); I < E; ++I) {
+ if (CombinedMask2[I] != UndefMaskElem) {
+ assert(CombinedMask1[I] == UndefMaskElem &&
+ "Expected undefined mask element");
+ CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
+ }
+ }
+ Value *Vec = Builder.CreateShuffleVector(
+ Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
+ CombinedMask1);
+ if (auto *I = dyn_cast<Instruction>(Vec)) {
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ return Vec;
+ }
+ if (isa<PoisonValue>(V1))
+ return PoisonValue::get(FixedVectorType::get(
+ cast<VectorType>(V1->getType())->getElementType(), Mask.size()));
+ Value *Op = V1;
+ SmallVector<int> CombinedMask(Mask.begin(), Mask.end());
+ PeekThroughShuffles(Op, CombinedMask);
+ if (!isa<FixedVectorType>(Op->getType()) ||
+ !IsIdentityMask(CombinedMask, cast<FixedVectorType>(Op->getType()))) {
+ Value *Vec = Builder.CreateShuffleVector(Op, CombinedMask);
+ if (auto *I = dyn_cast<Instruction>(Vec)) {
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ return Vec;
+ }
+ return Op;
+ };
+
+ auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask) {
+ unsigned VF = Mask.size();
+ unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
+ if (VF != VecVF) {
+ if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
+ Vec = CreateShuffle(Vec, nullptr, Mask);
+ return std::make_pair(Vec, true);
+ }
+ SmallVector<int> ResizeMask(VF, UndefMaskElem);
+ for (unsigned I = 0; I < VF; ++I) {
+ if (Mask[I] != UndefMaskElem)
+ ResizeMask[Mask[I]] = Mask[I];
+ }
+ Vec = CreateShuffle(Vec, nullptr, ResizeMask);
+ }
+
+ return std::make_pair(Vec, false);
+ };
+  // Perform shuffling of the vectorized tree entries for better handling of
+ // external extracts.
+ for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
+ // Find the first and the last instruction in the list of insertelements.
+ sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
+ InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
+ InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
+ Builder.SetInsertPoint(LastInsert);
+ auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
+ Value *NewInst = performExtractsShuffleAction<Value>(
+ makeMutableArrayRef(Vector.data(), Vector.size()),
+ FirstInsert->getOperand(0),
+ [](Value *Vec) {
+ return cast<VectorType>(Vec->getType())
+ ->getElementCount()
+ .getKnownMinValue();
+ },
+ ResizeToVF,
+ [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
+ ArrayRef<Value *> Vals) {
+ assert((Vals.size() == 1 || Vals.size() == 2) &&
+ "Expected exactly 1 or 2 input values.");
+ if (Vals.size() == 1) {
+            // Do not create a shuffle if the mask is a simple identity
+ // non-resizing mask.
+ if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
+ ->getNumElements() ||
+ !ShuffleVectorInst::isIdentityMask(Mask))
+ return CreateShuffle(Vals.front(), nullptr, Mask);
+ return Vals.front();
+ }
+ return CreateShuffle(Vals.front() ? Vals.front()
+ : FirstInsert->getOperand(0),
+ Vals.back(), Mask);
+ });
+ auto It = ShuffledInserts[I].InsertElements.rbegin();
+ // Rebuild buildvector chain.
+ InsertElementInst *II = nullptr;
+ if (It != ShuffledInserts[I].InsertElements.rend())
+ II = *It;
+ SmallVector<Instruction *> Inserts;
+ while (It != ShuffledInserts[I].InsertElements.rend()) {
+ assert(II && "Must be an insertelement instruction.");
+ if (*It == II)
+ ++It;
+ else
+ Inserts.push_back(cast<Instruction>(II));
+ II = dyn_cast<InsertElementInst>(II->getOperand(0));
+ }
+ for (Instruction *II : reverse(Inserts)) {
+ II->replaceUsesOfWith(II->getOperand(0), NewInst);
+ if (auto *NewI = dyn_cast<Instruction>(NewInst))
+ if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
+ II->moveAfter(NewI);
+ NewInst = II;
+ }
+ LastInsert->replaceAllUsesWith(NewInst);
+ for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
+ IE->replaceUsesOfWith(IE->getOperand(0),
+ PoisonValue::get(IE->getOperand(0)->getType()));
+ IE->replaceUsesOfWith(IE->getOperand(1),
+ PoisonValue::get(IE->getOperand(1)->getType()));
+ eraseInstruction(IE);
+ }
+ CSEBlocks.insert(LastInsert->getParent());
+ }
+
// For each vectorized value:
for (auto &TEPtr : VectorizableTree) {
TreeEntry *Entry = TEPtr.get();
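
For a concrete picture of the ResizeToVF helper above, here is a standalone illustrative sketch in plain C++. It is not code from this patch, and UndefMaskElem is simply re-declared locally; it shows how the non-extracting case keeps every used lane at its own index and leaves the rest undefined, so the source vector is only widened or narrowed, never permuted:

    #include <cstdio>
    #include <vector>

    constexpr int UndefMaskElem = -1;

    // Build the "resize" mask: each defined element stays at its own index.
    std::vector<int> buildResizeMask(const std::vector<int> &Mask) {
      unsigned VF = Mask.size();
      std::vector<int> ResizeMask(VF, UndefMaskElem);
      for (unsigned I = 0; I < VF; ++I)
        if (Mask[I] != UndefMaskElem)
          ResizeMask[Mask[I]] = Mask[I];
      return ResizeMask;
    }

    int main() {
      // Requesting lanes {2, undef, 0, undef} from a differently sized vector.
      for (int M : buildResizeMask({2, UndefMaskElem, 0, UndefMaskElem}))
        std::printf("%d ", M); // prints: 0 -1 2 -1
      std::printf("\n");
    }

If any mask index points at or beyond the requested width, ResizeToVF instead falls back to CreateShuffle with the original mask and reports that through the returned flag.
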
@@ -7050,6 +8850,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
+ if (Entry->getOpcode() == Instruction::GetElementPtr &&
+ !isa<GetElementPtrInst>(Scalar))
+ continue;
#ifndef NDEBUG
Type *Ty = Scalar->getType();
if (!Ty->isVoidTy()) {
@@ -7057,7 +8860,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
// It is legal to delete users in the ignorelist.
- assert((getTreeEntry(U) || is_contained(UserIgnoreList, U) ||
+ assert((getTreeEntry(U) ||
+ (UserIgnoreList && UserIgnoreList->contains(U)) ||
(isa_and_nonnull<Instruction>(U) &&
isDeleted(cast<Instruction>(U)))) &&
"Deleting out-of-tree value");
@@ -7225,9 +9029,11 @@ void BoUpSLP::optimizeGatherSequence() {
BoUpSLP::ScheduleData *
BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
- ScheduleData *Bundle = nullptr;
+ ScheduleData *Bundle = nullptr;
ScheduleData *PrevInBundle = nullptr;
for (Value *V : VL) {
+ if (doesNotNeedToBeScheduled(V))
+ continue;
ScheduleData *BundleMember = getScheduleData(V);
assert(BundleMember &&
"no ScheduleData for bundle member "
@@ -7239,8 +9045,6 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
} else {
Bundle = BundleMember;
}
- BundleMember->UnscheduledDepsInBundle = 0;
- Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
// Group the instructions to a bundle.
BundleMember->FirstInBundle = Bundle;
@@ -7257,7 +9061,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
const InstructionsState &S) {
// No need to schedule PHIs, insertelement, extractelement and extractvalue
// instructions.
- if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue))
+ if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
+ doesNotNeedToSchedule(VL))
return nullptr;
// Initialize the instruction bundle.
@@ -7276,16 +9081,17 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
ReSchedule = true;
}
- if (ReSchedule) {
- resetSchedule();
- initialFillReadyList(ReadyInsts);
- }
if (Bundle) {
LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
<< " in block " << BB->getName() << "\n");
calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
}
+ if (ReSchedule) {
+ resetSchedule();
+ initialFillReadyList(ReadyInsts);
+ }
+
// Now try to schedule the new bundle or (if no bundle) just calculate
// dependencies. As soon as the bundle is "ready" it means that there are no
// cyclic dependencies and we can schedule it. Note that's important that we
@@ -7293,14 +9099,17 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
!ReadyInsts.empty()) {
ScheduleData *Picked = ReadyInsts.pop_back_val();
- if (Picked->isSchedulingEntity() && Picked->isReady())
- schedule(Picked, ReadyInsts);
+ assert(Picked->isSchedulingEntity() && Picked->isReady() &&
+ "must be ready to schedule");
+ schedule(Picked, ReadyInsts);
}
};
// Make sure that the scheduling region contains all
// instructions of the bundle.
for (Value *V : VL) {
+ if (doesNotNeedToBeScheduled(V))
+ continue;
if (!extendSchedulingRegion(V, S)) {
// If the scheduling region got new instructions at the lower end (or it
// is a new region for the first bundle). This makes it necessary to
@@ -7315,9 +9124,16 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
bool ReSchedule = false;
for (Value *V : VL) {
+ if (doesNotNeedToBeScheduled(V))
+ continue;
ScheduleData *BundleMember = getScheduleData(V);
assert(BundleMember &&
"no ScheduleData for bundle member (maybe not in same basic block)");
+
+ // Make sure we don't leave the pieces of the bundle in the ready list when
+ // the whole bundle might not be ready.
+ ReadyInsts.remove(BundleMember);
+
if (!BundleMember->IsScheduled)
continue;
// A bundle member was scheduled as single instruction before and now
@@ -7339,16 +9155,24 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
Value *OpValue) {
- if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue))
+ if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
+ doesNotNeedToSchedule(VL))
return;
+ if (doesNotNeedToBeScheduled(OpValue))
+ OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
ScheduleData *Bundle = getScheduleData(OpValue);
LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
assert(!Bundle->IsScheduled &&
"Can't cancel bundle which is already scheduled");
- assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
+ assert(Bundle->isSchedulingEntity() &&
+ (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
"tried to unbundle something which is not a bundle");
+ // Remove the bundle from the ready list.
+ if (Bundle->isReady())
+ ReadyInsts.remove(Bundle);
+
// Un-bundle: make single instructions out of the bundle.
ScheduleData *BundleMember = Bundle;
while (BundleMember) {
@@ -7356,8 +9180,8 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
BundleMember->FirstInBundle = BundleMember;
ScheduleData *Next = BundleMember->NextInBundle;
BundleMember->NextInBundle = nullptr;
- BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
- if (BundleMember->UnscheduledDepsInBundle == 0) {
+ BundleMember->TE = nullptr;
+ if (BundleMember->unscheduledDepsInBundle() == 0) {
ReadyInsts.insert(BundleMember);
}
BundleMember = Next;
@@ -7380,9 +9204,10 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
Instruction *I = dyn_cast<Instruction>(V);
assert(I && "bundle member must be an instruction");
assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
+ !doesNotNeedToBeScheduled(I) &&
"phi nodes/insertelements/extractelements/extractvalues don't need to "
"be scheduled");
- auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool {
+ auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
ScheduleData *ISD = getScheduleData(I);
if (!ISD)
return false;
@@ -7394,7 +9219,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
ExtraScheduleDataMap[I][S.OpValue] = SD;
return true;
};
- if (CheckSheduleForI(I))
+ if (CheckScheduleForI(I))
return true;
if (!ScheduleStart) {
// It's the first instruction in the new region.
@@ -7402,7 +9227,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
ScheduleStart = I;
ScheduleEnd = I->getNextNode();
if (isOneOf(S, I) != I)
- CheckSheduleForI(I);
+ CheckScheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a terminator?");
LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
return true;
@@ -7430,7 +9255,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
ScheduleStart = I;
if (isOneOf(S, I) != I)
- CheckSheduleForI(I);
+ CheckScheduleForI(I);
LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
<< "\n");
return true;
@@ -7444,7 +9269,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
nullptr);
ScheduleEnd = I->getNextNode();
if (isOneOf(S, I) != I)
- CheckSheduleForI(I);
+ CheckScheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a terminator?");
LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
return true;
@@ -7456,7 +9281,10 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
ScheduleData *NextLoadStore) {
ScheduleData *CurrentLoadStore = PrevLoadStore;
for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
- ScheduleData *SD = ScheduleDataMap[I];
+ // No need to allocate data for non-schedulable instructions.
+ if (doesNotNeedToBeScheduled(I))
+ continue;
+ ScheduleData *SD = ScheduleDataMap.lookup(I);
if (!SD) {
SD = allocateScheduleDataChunks();
ScheduleDataMap[I] = SD;
@@ -7479,6 +9307,10 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
}
CurrentLoadStore = SD;
}
+
+ if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
+ match(I, m_Intrinsic<Intrinsic::stackrestore>()))
+ RegionHasStackSave = true;
}
if (NextLoadStore) {
if (CurrentLoadStore)
@@ -7511,8 +9343,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
// Handle def-use chain dependencies.
if (BundleMember->OpValue != BundleMember->Inst) {
- ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
- if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+ if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
BundleMember->Dependencies++;
ScheduleData *DestBundle = UseSD->FirstInBundle;
if (!DestBundle->IsScheduled)
@@ -7522,10 +9353,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
}
} else {
for (User *U : BundleMember->Inst->users()) {
- assert(isa<Instruction>(U) &&
- "user of instruction must be instruction");
- ScheduleData *UseSD = getScheduleData(U);
- if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+ if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
BundleMember->Dependencies++;
ScheduleData *DestBundle = UseSD->FirstInBundle;
if (!DestBundle->IsScheduled)
@@ -7536,6 +9364,75 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
}
}
+ auto makeControlDependent = [&](Instruction *I) {
+ auto *DepDest = getScheduleData(I);
+ assert(DepDest && "must be in schedule window");
+ DepDest->ControlDependencies.push_back(BundleMember);
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = DepDest->FirstInBundle;
+ if (!DestBundle->IsScheduled)
+ BundleMember->incrementUnscheduledDeps(1);
+ if (!DestBundle->hasValidDependencies())
+ WorkList.push_back(DestBundle);
+ };
+
+ // Any instruction which isn't safe to speculate at the beginning of the
+ // block is control dependent on any early exit or non-willreturn call
+ // which precedes it.
+ if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
+ for (Instruction *I = BundleMember->Inst->getNextNode();
+ I != ScheduleEnd; I = I->getNextNode()) {
+ if (isSafeToSpeculativelyExecute(I, &*BB->begin()))
+ continue;
+
+ // Add the dependency
+ makeControlDependent(I);
+
+ if (!isGuaranteedToTransferExecutionToSuccessor(I))
+ // Everything past here must be control dependent on I.
+ break;
+ }
+ }
+
+ if (RegionHasStackSave) {
+ // If we have an inalloca alloca instruction, it needs to be scheduled
+ // after any preceding stacksave. We also need to prevent any alloca
+ // from reordering above a preceding stackrestore.
+ if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
+ match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
+ for (Instruction *I = BundleMember->Inst->getNextNode();
+ I != ScheduleEnd; I = I->getNextNode()) {
+ if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
+ match(I, m_Intrinsic<Intrinsic::stackrestore>()))
+ // Any allocas past here must be control dependent on I, and I
+ // must be memory dependent on BundleMember->Inst.
+ break;
+
+ if (!isa<AllocaInst>(I))
+ continue;
+
+ // Add the dependency
+ makeControlDependent(I);
+ }
+ }
+
+ // In addition to the cases handled just above, we need to prevent
+ // allocas from moving below a stacksave. The stackrestore case
+ // is currently thought to be conservative.
+ if (isa<AllocaInst>(BundleMember->Inst)) {
+ for (Instruction *I = BundleMember->Inst->getNextNode();
+ I != ScheduleEnd; I = I->getNextNode()) {
+ if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
+ !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
+ continue;
+
+ // Add the dependency
+ makeControlDependent(I);
+ break;
+ }
+ }
+ }
+
// Handle the memory dependencies (if any).
ScheduleData *DepDest = BundleMember->NextLoadStore;
if (!DepDest)
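
The control-dependence scan above can be modelled outside of LLVM. In the rough sketch below, the Inst struct and its two flags are invented stand-ins for isSafeToSpeculativelyExecute and isGuaranteedToTransferExecutionToSuccessor: every later instruction that is unsafe to speculate gets an edge back to the first non-transferring instruction, and the scan stops at the next such barrier, which later instructions hang off instead:

    #include <cstdio>
    #include <utility>
    #include <vector>

    struct Inst {
      const char *Name;
      bool SafeToSpeculate;    // e.g. plain arithmetic
      bool TransfersExecution; // false for may-throw or may-not-return calls
    };

    // Returns (dependent, source) pairs for the scan starting at Src.
    std::vector<std::pair<int, int>> controlDeps(const std::vector<Inst> &Block,
                                                 int Src) {
      std::vector<std::pair<int, int>> Deps;
      if (Block[Src].TransfersExecution)
        return Deps; // Src always falls through, nothing to record
      for (int I = Src + 1, E = Block.size(); I < E; ++I) {
        if (Block[I].SafeToSpeculate)
          continue;                // free to move above Src
        Deps.emplace_back(I, Src); // must stay below Src
        if (!Block[I].TransfersExecution)
          break;                   // later instructions depend on I instead
      }
      return Deps;
    }

    int main() {
      std::vector<Inst> Block = {{"call @may_throw", false, false},
                                 {"add", true, true},
                                 {"store", false, true},
                                 {"load", false, true}};
      for (auto [D, S] : controlDeps(Block, 0))
        std::printf("%s depends on %s\n", Block[D].Name, Block[S].Name);
    }

The real code records the edge on ScheduleData nodes and bumps the dependency counters, but the shape of the walk is the same.
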
@@ -7598,7 +9495,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
}
}
if (InsertInReadyList && SD->isReady()) {
- ReadyInsts.push_back(SD);
+ ReadyInsts.insert(SD);
LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
<< "\n");
}
@@ -7625,11 +9522,18 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
+ // A key point - if we got here, pre-scheduling was able to find a valid
+ // scheduling of the sub-graph of the scheduling window which consists
+ // of all vector bundles and their transitive users. As such, we do not
+ // need to reschedule anything *outside of* that subgraph.
+
BS->resetSchedule();
// For the real scheduling we use a more sophisticated ready-list: it is
// sorted by the original instruction location. This lets the final schedule
// be as close as possible to the original instruction order.
+ // WARNING: If changing this order causes a correctness issue, that means
+ // there is some missing dependence edge in the schedule data graph.
struct ScheduleDataCompare {
bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
return SD2->SchedulingPriority < SD1->SchedulingPriority;
@@ -7637,21 +9541,22 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
};
std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
- // Ensure that all dependency data is updated and fill the ready-list with
- // initial instructions.
+ // Ensure that all dependency data is updated (for nodes in the sub-graph)
+ // and fill the ready-list with initial instructions.
int Idx = 0;
- int NumToSchedule = 0;
for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
I = I->getNextNode()) {
- BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
+ BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
+ TreeEntry *SDTE = getTreeEntry(SD->Inst);
+ (void)SDTE;
assert((isVectorLikeInstWithConstOps(SD->Inst) ||
- SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr)) &&
+ SD->isPartOfBundle() ==
+ (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
"scheduler and vectorizer bundle mismatch");
SD->FirstInBundle->SchedulingPriority = Idx++;
- if (SD->isSchedulingEntity()) {
+
+ if (SD->isSchedulingEntity() && SD->isPartOfBundle())
BS->calculateDependencies(SD, false, this);
- NumToSchedule++;
- }
});
}
BS->initialFillReadyList(ReadyInsts);
@@ -7674,9 +9579,23 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
}
BS->schedule(picked, ReadyInsts);
- NumToSchedule--;
}
- assert(NumToSchedule == 0 && "could not schedule all instructions");
+
+ // Check that we didn't break any of our invariants.
+#ifdef EXPENSIVE_CHECKS
+ BS->verify();
+#endif
+
+#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
+ // Check that all schedulable entities got scheduled
+ for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
+ BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
+ if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
+ assert(SD->IsScheduled && "must be scheduled at this point");
+ }
+ });
+ }
+#endif
// Avoid duplicate scheduling of the block.
BS->ScheduleStart = nullptr;
@@ -7686,11 +9605,8 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
// If V is a store, just return the width of the stored value (or value
// truncated just before storing) without traversing the expression tree.
// This is the common case.
- if (auto *Store = dyn_cast<StoreInst>(V)) {
- if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
- return DL->getTypeSizeInBits(Trunc->getSrcTy());
+ if (auto *Store = dyn_cast<StoreInst>(V))
return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
- }
if (auto *IEI = dyn_cast<InsertElementInst>(V))
return getVectorElementSize(IEI->getOperand(1));
@@ -8092,6 +10008,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// Scan the blocks in the function in post order.
for (auto BB : post_order(&F.getEntryBlock())) {
+ // Start new block - clear the list of reduction roots.
+ R.clearReductionData();
collectSeedInstructions(BB);
// Vectorize trees that end at stores.
@@ -8122,11 +10040,10 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
}
bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
- unsigned Idx) {
+ unsigned Idx, unsigned MinVF) {
LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
<< "\n");
const unsigned Sz = R.getVectorElementSize(Chain[0]);
- const unsigned MinVF = R.getMinVecRegSize() / Sz;
unsigned VF = Chain.size();
if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
@@ -8265,9 +10182,15 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
unsigned EltSize = R.getVectorElementSize(Operands[0]);
unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize);
- unsigned MinVF = R.getMinVF(EltSize);
unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store),
MaxElts);
+ auto *Store = cast<StoreInst>(Operands[0]);
+ Type *StoreTy = Store->getValueOperand()->getType();
+ Type *ValueTy = StoreTy;
+ if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
+ ValueTy = Trunc->getSrcTy();
+ unsigned MinVF = TTI->getStoreMinimumVF(
+ R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy);
// FIXME: Is division-by-2 the correct step? Should we assert that the
// register size is a power-of-2?
@@ -8277,7 +10200,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size);
if (!VectorizedStores.count(Slice.front()) &&
!VectorizedStores.count(Slice.back()) &&
- vectorizeStoreChain(Slice, R, Cnt)) {
+ vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
// Mark the vectorized stores so that we don't vectorize them again.
VectorizedStores.insert(Slice.begin(), Slice.end());
Changed = true;
@@ -8481,7 +10404,8 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
if (!I)
return false;
- if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
+ if ((!isa<BinaryOperator>(I) && !isa<CmpInst>(I)) ||
+ isa<VectorType>(I->getType()))
return false;
Value *P = I->getParent();
@@ -8492,32 +10416,40 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
return false;
- // Try to vectorize V.
- if (tryToVectorizePair(Op0, Op1, R))
- return true;
+ // First collect all possible candidates
+ SmallVector<std::pair<Value *, Value *>, 4> Candidates;
+ Candidates.emplace_back(Op0, Op1);
auto *A = dyn_cast<BinaryOperator>(Op0);
auto *B = dyn_cast<BinaryOperator>(Op1);
// Try to skip B.
- if (B && B->hasOneUse()) {
+ if (A && B && B->hasOneUse()) {
auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
- if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
- return true;
- if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
- return true;
+ if (B0 && B0->getParent() == P)
+ Candidates.emplace_back(A, B0);
+ if (B1 && B1->getParent() == P)
+ Candidates.emplace_back(A, B1);
}
-
// Try to skip A.
- if (A && A->hasOneUse()) {
+ if (B && A && A->hasOneUse()) {
auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
- if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
- return true;
- if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
- return true;
+ if (A0 && A0->getParent() == P)
+ Candidates.emplace_back(A0, B);
+ if (A1 && A1->getParent() == P)
+ Candidates.emplace_back(A1, B);
}
- return false;
+
+ if (Candidates.size() == 1)
+ return tryToVectorizePair(Op0, Op1, R);
+
+ // We have multiple options. Try to pick the single best.
+ Optional<int> BestCandidate = R.findBestRootPair(Candidates);
+ if (!BestCandidate)
+ return false;
+ return tryToVectorizePair(Candidates[*BestCandidate].first,
+ Candidates[*BestCandidate].second, R);
}
namespace {
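
The net effect of the hunk above is collect-then-choose instead of first-match-wins. The sketch below is only a shape-level illustration: the Score lambda is invented, whereas the patch delegates the decision to R.findBestRootPair, and the candidates there are instruction operand pairs rather than integers:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct Candidate { int A, B; };

    int main() {
      // Gather every plausible pair first (Op0/Op1 plus the "skip A"/"skip B"
      // variants), then vectorize only the single best-scoring one.
      std::vector<Candidate> Candidates = {{1, 2}, {3, 4}, {5, 6}};
      auto Score = [](const Candidate &C) { return C.A + C.B; }; // invented
      auto Best = std::max_element(Candidates.begin(), Candidates.end(),
                                   [&](const Candidate &X, const Candidate &Y) {
                                     return Score(X) < Score(Y);
                                   });
      std::printf("best pair: (%d, %d)\n", Best->A, Best->B); // (5, 6)
    }
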
@@ -8552,15 +10484,16 @@ class HorizontalReduction {
using ReductionOpsType = SmallVector<Value *, 16>;
using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
ReductionOpsListType ReductionOps;
- SmallVector<Value *, 32> ReducedVals;
+ /// List of possibly reduced values.
+ SmallVector<SmallVector<Value *>> ReducedVals;
+ /// Maps reduced value to the corresponding reduction operation.
+ DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
// Use map vector to make stable output.
MapVector<Instruction *, Value *> ExtraArgs;
WeakTrackingVH ReductionRoot;
/// The type of reduction operation.
RecurKind RdxKind;
- const unsigned INVALID_OPERAND_INDEX = std::numeric_limits<unsigned>::max();
-
static bool isCmpSelMinMax(Instruction *I) {
return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
@@ -8604,26 +10537,6 @@ class HorizontalReduction {
return I->getOperand(Index);
}
- /// Checks if the ParentStackElem.first should be marked as a reduction
- /// operation with an extra argument or as extra argument itself.
- void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
- Value *ExtraArg) {
- if (ExtraArgs.count(ParentStackElem.first)) {
- ExtraArgs[ParentStackElem.first] = nullptr;
- // We ran into something like:
- // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
- // The whole ParentStackElem.first should be considered as an extra value
- // in this case.
- // Do not perform analysis of remaining operands of ParentStackElem.first
- // instruction, this whole instruction is an extra argument.
- ParentStackElem.second = INVALID_OPERAND_INDEX;
- } else {
- // We ran into something like:
- // ParentStackElem.first += ... + ExtraArg + ...
- ExtraArgs[ParentStackElem.first] = ExtraArg;
- }
- }
-
/// Creates reduction operation with the current opcode.
static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS,
Value *RHS, const Twine &Name, bool UseSelect) {
@@ -8682,7 +10595,7 @@ class HorizontalReduction {
}
/// Creates reduction operation with the current opcode with the IR flags
- /// from \p ReductionOps.
+ /// from \p ReductionOps, dropping nuw/nsw flags.
static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
Value *RHS, const Twine &Name,
const ReductionOpsListType &ReductionOps) {
@@ -8696,31 +10609,21 @@ class HorizontalReduction {
Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
if (auto *Sel = dyn_cast<SelectInst>(Op)) {
- propagateIRFlags(Sel->getCondition(), ReductionOps[0]);
- propagateIRFlags(Op, ReductionOps[1]);
+ propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
+ /*IncludeWrapFlags=*/false);
+ propagateIRFlags(Op, ReductionOps[1], nullptr,
+ /*IncludeWrapFlags=*/false);
return Op;
}
}
- propagateIRFlags(Op, ReductionOps[0]);
- return Op;
- }
-
- /// Creates reduction operation with the current opcode with the IR flags
- /// from \p I.
- static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
- Value *RHS, const Twine &Name, Instruction *I) {
- auto *SelI = dyn_cast<SelectInst>(I);
- Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, SelI != nullptr);
- if (SelI && RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
- if (auto *Sel = dyn_cast<SelectInst>(Op))
- propagateIRFlags(Sel->getCondition(), SelI->getCondition());
- }
- propagateIRFlags(Op, I);
+ propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
return Op;
}
- static RecurKind getRdxKind(Instruction *I) {
- assert(I && "Expected instruction for reduction matching");
+ static RecurKind getRdxKind(Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return RecurKind::None;
if (match(I, m_Add(m_Value(), m_Value())))
return RecurKind::Add;
if (match(I, m_Mul(m_Value(), m_Value())))
@@ -8882,7 +10785,9 @@ public:
HorizontalReduction() = default;
/// Try to find a reduction tree.
- bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst) {
+ bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst,
+ ScalarEvolution &SE, const DataLayout &DL,
+ const TargetLibraryInfo &TLI) {
assert((!Phi || is_contained(Phi->operands(), Inst)) &&
"Phi needs to use the binary operator");
assert((isa<BinaryOperator>(Inst) || isa<SelectInst>(Inst) ||
@@ -8926,124 +10831,178 @@ public:
ReductionRoot = Inst;
- // The opcode for leaf values that we perform a reduction on.
- // For example: load(x) + load(y) + load(z) + fptoui(w)
- // The leaf opcode for 'w' does not match, so we don't include it as a
- // potential candidate for the reduction.
- unsigned LeafOpcode = 0;
-
- // Post-order traverse the reduction tree starting at Inst. We only handle
- // true trees containing binary operators or selects.
- SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
- Stack.push_back(std::make_pair(Inst, getFirstOperandIndex(Inst)));
- initReductionOps(Inst);
- while (!Stack.empty()) {
- Instruction *TreeN = Stack.back().first;
- unsigned EdgeToVisit = Stack.back().second++;
- const RecurKind TreeRdxKind = getRdxKind(TreeN);
- bool IsReducedValue = TreeRdxKind != RdxKind;
-
- // Postorder visit.
- if (IsReducedValue || EdgeToVisit >= getNumberOfOperands(TreeN)) {
- if (IsReducedValue)
- ReducedVals.push_back(TreeN);
- else {
- auto ExtraArgsIter = ExtraArgs.find(TreeN);
- if (ExtraArgsIter != ExtraArgs.end() && !ExtraArgsIter->second) {
- // Check if TreeN is an extra argument of its parent operation.
- if (Stack.size() <= 1) {
- // TreeN can't be an extra argument as it is a root reduction
- // operation.
- return false;
- }
- // Yes, TreeN is an extra argument, do not add it to a list of
- // reduction operations.
- // Stack[Stack.size() - 2] always points to the parent operation.
- markExtraArg(Stack[Stack.size() - 2], TreeN);
- ExtraArgs.erase(TreeN);
- } else
- addReductionOps(TreeN);
- }
- // Retract.
- Stack.pop_back();
- continue;
- }
-
- // Visit operands.
- Value *EdgeVal = getRdxOperand(TreeN, EdgeToVisit);
- auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
- if (!EdgeInst) {
- // Edge value is not a reduction instruction or a leaf instruction.
- // (It may be a constant, function argument, or something else.)
- markExtraArg(Stack.back(), EdgeVal);
- continue;
+ // Iterate through all the operands of the possible reduction tree and
+ // gather all the reduced values, sorting them by their value id.
+ BasicBlock *BB = Inst->getParent();
+ bool IsCmpSelMinMax = isCmpSelMinMax(Inst);
+ SmallVector<Instruction *> Worklist(1, Inst);
+ // Checks if the operands of the \p TreeN instruction are also reduction
+ // operations or should be treated as reduced values or an extra argument,
+ // which is not part of the reduction.
+ auto &&CheckOperands = [this, IsCmpSelMinMax,
+ BB](Instruction *TreeN,
+ SmallVectorImpl<Value *> &ExtraArgs,
+ SmallVectorImpl<Value *> &PossibleReducedVals,
+ SmallVectorImpl<Instruction *> &ReductionOps) {
+ for (int I = getFirstOperandIndex(TreeN),
+ End = getNumberOfOperands(TreeN);
+ I < End; ++I) {
+ Value *EdgeVal = getRdxOperand(TreeN, I);
+ ReducedValsToOps[EdgeVal].push_back(TreeN);
+ auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
+ // Edge has wrong parent - mark as an extra argument.
+ if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
+ !hasSameParent(EdgeInst, BB)) {
+ ExtraArgs.push_back(EdgeVal);
+ continue;
+ }
+ // If the edge is not an instruction, or it is different from the main
+ // reduction opcode or has too many uses - possible reduced value.
+ if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
+ IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
+ !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
+ !isVectorizable(getRdxKind(EdgeInst), EdgeInst)) {
+ PossibleReducedVals.push_back(EdgeVal);
+ continue;
+ }
+ ReductionOps.push_back(EdgeInst);
}
- RecurKind EdgeRdxKind = getRdxKind(EdgeInst);
- // Continue analysis if the next operand is a reduction operation or
- // (possibly) a leaf value. If the leaf value opcode is not set,
- // the first met operation != reduction operation is considered as the
- // leaf opcode.
- // Only handle trees in the current basic block.
- // Each tree node needs to have minimal number of users except for the
- // ultimate reduction.
- const bool IsRdxInst = EdgeRdxKind == RdxKind;
- if (EdgeInst != Phi && EdgeInst != Inst &&
- hasSameParent(EdgeInst, Inst->getParent()) &&
- hasRequiredNumberOfUses(isCmpSelMinMax(Inst), EdgeInst) &&
- (!LeafOpcode || LeafOpcode == EdgeInst->getOpcode() || IsRdxInst)) {
- if (IsRdxInst) {
- // We need to be able to reassociate the reduction operations.
- if (!isVectorizable(EdgeRdxKind, EdgeInst)) {
- // I is an extra argument for TreeN (its parent operation).
- markExtraArg(Stack.back(), EdgeInst);
- continue;
- }
- } else if (!LeafOpcode) {
- LeafOpcode = EdgeInst->getOpcode();
+ };
+ // Try to regroup the reduced values so that reducing them becomes more
+ // profitable. Values are grouped by their value ids, instructions by their
+ // opcode id and/or alternate opcode id, with extra analysis for loads
+ // (grouping them by the distance between pointers) and cmp instructions
+ // (grouping them by the predicate).
+ MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
+ PossibleReducedVals;
+ initReductionOps(Inst);
+ while (!Worklist.empty()) {
+ Instruction *TreeN = Worklist.pop_back_val();
+ SmallVector<Value *> Args;
+ SmallVector<Value *> PossibleRedVals;
+ SmallVector<Instruction *> PossibleReductionOps;
+ CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
+ // If there are too many extra args, mark the instruction itself as a
+ // reduction value, not a reduction operation.
+ if (Args.size() < 2) {
+ addReductionOps(TreeN);
+ // Add extra args.
+ if (!Args.empty()) {
+ assert(Args.size() == 1 && "Expected only single argument.");
+ ExtraArgs[TreeN] = Args.front();
}
- Stack.push_back(
- std::make_pair(EdgeInst, getFirstOperandIndex(EdgeInst)));
- continue;
+ // Add reduction values. The values are sorted for better vectorization
+ // results.
+ for (Value *V : PossibleRedVals) {
+ size_t Key, Idx;
+ std::tie(Key, Idx) = generateKeySubkey(
+ V, &TLI,
+ [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) {
+ auto It = PossibleReducedVals.find(Key);
+ if (It != PossibleReducedVals.end()) {
+ for (const auto &LoadData : It->second) {
+ auto *RLI = cast<LoadInst>(LoadData.second.front().first);
+ if (getPointersDiff(RLI->getType(),
+ RLI->getPointerOperand(), LI->getType(),
+ LI->getPointerOperand(), DL, SE,
+ /*StrictCheck=*/true))
+ return hash_value(RLI->getPointerOperand());
+ }
+ }
+ return hash_value(LI->getPointerOperand());
+ },
+ /*AllowAlternate=*/false);
+ ++PossibleReducedVals[Key][Idx]
+ .insert(std::make_pair(V, 0))
+ .first->second;
+ }
+ Worklist.append(PossibleReductionOps.rbegin(),
+ PossibleReductionOps.rend());
+ } else {
+ size_t Key, Idx;
+ std::tie(Key, Idx) = generateKeySubkey(
+ TreeN, &TLI,
+ [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) {
+ auto It = PossibleReducedVals.find(Key);
+ if (It != PossibleReducedVals.end()) {
+ for (const auto &LoadData : It->second) {
+ auto *RLI = cast<LoadInst>(LoadData.second.front().first);
+ if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
+ LI->getType(), LI->getPointerOperand(),
+ DL, SE, /*StrictCheck=*/true))
+ return hash_value(RLI->getPointerOperand());
+ }
+ }
+ return hash_value(LI->getPointerOperand());
+ },
+ /*AllowAlternate=*/false);
+ ++PossibleReducedVals[Key][Idx]
+ .insert(std::make_pair(TreeN, 0))
+ .first->second;
+ }
+ }
+ auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
+ // Sort values by the total number of value kinds so that the reduction
+ // starts from the longest possible sequences of reduced values.
+ for (auto &PossibleReducedVals : PossibleReducedValsVect) {
+ auto PossibleRedVals = PossibleReducedVals.second.takeVector();
+ SmallVector<SmallVector<Value *>> PossibleRedValsVect;
+ for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
+ It != E; ++It) {
+ PossibleRedValsVect.emplace_back();
+ auto RedValsVect = It->second.takeVector();
+ stable_sort(RedValsVect, [](const auto &P1, const auto &P2) {
+ return P1.second < P2.second;
+ });
+ for (const std::pair<Value *, unsigned> &Data : RedValsVect)
+ PossibleRedValsVect.back().append(Data.second, Data.first);
}
- // I is an extra argument for TreeN (its parent operation).
- markExtraArg(Stack.back(), EdgeInst);
- }
+ stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
+ return P1.size() > P2.size();
+ });
+ ReducedVals.emplace_back();
+ for (ArrayRef<Value *> Data : PossibleRedValsVect)
+ ReducedVals.back().append(Data.rbegin(), Data.rend());
+ }
+ // Sort the reduced values by the number of values with the same/alternate
+ // opcode and/or the same pointer operand.
+ stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
+ return P1.size() > P2.size();
+ });
return true;
}
/// Attempt to vectorize the tree found by matchAssociativeReduction.
Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
+ constexpr int ReductionLimit = 4;
+ constexpr unsigned RegMaxNumber = 4;
+ constexpr unsigned RedValsMaxNumber = 128;
// If there are a sufficient number of reduction values, reduce
// to a nearby power-of-2. We can safely generate oversized
// vectors and rely on the backend to split them to legal sizes.
- unsigned NumReducedVals = ReducedVals.size();
- if (NumReducedVals < 4)
+ unsigned NumReducedVals = std::accumulate(
+ ReducedVals.begin(), ReducedVals.end(), 0,
+ [](int Num, ArrayRef<Value *> Vals) { return Num + Vals.size(); });
+ if (NumReducedVals < ReductionLimit)
return nullptr;
- // Intersect the fast-math-flags from all reduction operations.
- FastMathFlags RdxFMF;
- RdxFMF.set();
- for (ReductionOpsType &RdxOp : ReductionOps) {
- for (Value *RdxVal : RdxOp) {
- if (auto *FPMO = dyn_cast<FPMathOperator>(RdxVal))
- RdxFMF &= FPMO->getFastMathFlags();
- }
- }
-
IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
- Builder.setFastMathFlags(RdxFMF);
+ // Track the reduced values in case they are replaced by extractelement
+ // instructions because of the vectorization.
+ DenseMap<Value *, WeakTrackingVH> TrackedVals;
BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
// The same extra argument may be used several times, so log each attempt
// to use it.
for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
assert(Pair.first && "DebugLoc must be set.");
ExternallyUsedValues[Pair.second].push_back(Pair.first);
+ TrackedVals.try_emplace(Pair.second, Pair.second);
}
// The compare instruction of a min/max is the insertion point for new
// instructions and may be replaced with a new compare instruction.
- auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
+ auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
assert(isa<SelectInst>(RdxRootInst) &&
"Expected min/max reduction to have select root instruction");
Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
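
Conceptually, the regrouping done in matchAssociativeReduction behaves like the standalone sketch below. The string keys are hypothetical stand-ins for the hash keys produced by generateKeySubkey (which fold in opcode, load pointer base, cmp predicate, and so on); the point is that candidates are bucketed by key and the buckets are ordered longest-first so the widest reduction is attempted before shorter ones:

    #include <algorithm>
    #include <cstdio>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
      // (key, value) pairs standing in for the possible reduced values.
      std::vector<std::pair<std::string, int>> Vals = {
          {"load.a", 1}, {"cmp.eq", 2}, {"load.a", 3}, {"load.a", 4}, {"cmp.eq", 5}};
      std::map<std::string, std::vector<int>> Groups;
      for (const auto &[Key, V] : Vals)
        Groups[Key].push_back(V);
      std::vector<std::vector<int>> Ordered;
      for (const auto &KV : Groups)
        Ordered.push_back(KV.second);
      // Longest bucket first; stable so equally sized buckets keep their order.
      std::stable_sort(Ordered.begin(), Ordered.end(),
                       [](const auto &A, const auto &B) { return A.size() > B.size(); });
      for (const auto &G : Ordered)
        std::printf("bucket of %zu candidates\n", G.size()); // 3, then 2
    }
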
@@ -9055,164 +11014,390 @@ public:
// The reduction root is used as the insertion point for new instructions,
// so set it as externally used to prevent it from being deleted.
ExternallyUsedValues[ReductionRoot];
- SmallVector<Value *, 16> IgnoreList;
- for (ReductionOpsType &RdxOp : ReductionOps)
- IgnoreList.append(RdxOp.begin(), RdxOp.end());
-
- unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
- if (NumReducedVals > ReduxWidth) {
- // In the loop below, we are building a tree based on a window of
- // 'ReduxWidth' values.
- // If the operands of those values have common traits (compare predicate,
- // constant operand, etc), then we want to group those together to
- // minimize the cost of the reduction.
-
- // TODO: This should be extended to count common operands for
- // compares and binops.
-
- // Step 1: Count the number of times each compare predicate occurs.
- SmallDenseMap<unsigned, unsigned> PredCountMap;
- for (Value *RdxVal : ReducedVals) {
- CmpInst::Predicate Pred;
- if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value())))
- ++PredCountMap[Pred];
- }
- // Step 2: Sort the values so the most common predicates come first.
- stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) {
- CmpInst::Predicate PredA, PredB;
- if (match(A, m_Cmp(PredA, m_Value(), m_Value())) &&
- match(B, m_Cmp(PredB, m_Value(), m_Value()))) {
- return PredCountMap[PredA] > PredCountMap[PredB];
- }
- return false;
- });
- }
+ SmallDenseSet<Value *> IgnoreList;
+ for (ReductionOpsType &RdxOps : ReductionOps)
+ for (Value *RdxOp : RdxOps) {
+ if (!RdxOp)
+ continue;
+ IgnoreList.insert(RdxOp);
+ }
+ bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
+
+ // Need to track the reduced values, as they may be changed during the
+ // vectorization of subvectors.
+ for (ArrayRef<Value *> Candidates : ReducedVals)
+ for (Value *V : Candidates)
+ TrackedVals.try_emplace(V, V);
+ DenseMap<Value *, unsigned> VectorizedVals;
Value *VectorizedTree = nullptr;
- unsigned i = 0;
- while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
- ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth);
- V.buildTree(VL, IgnoreList);
- if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true))
- break;
- if (V.isLoadCombineReductionCandidate(RdxKind))
- break;
- V.reorderTopToBottom();
- V.reorderBottomToTop(/*IgnoreReorder=*/true);
- V.buildExternalUses(ExternallyUsedValues);
-
- // For a poison-safe boolean logic reduction, do not replace select
- // instructions with logic ops. All reduced values will be frozen (see
- // below) to prevent leaking poison.
- if (isa<SelectInst>(ReductionRoot) &&
- isBoolLogicOp(cast<Instruction>(ReductionRoot)) &&
- NumReducedVals != ReduxWidth)
- break;
+ bool CheckForReusedReductionOps = false;
+ // Try to vectorize elements based on their type.
+ for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
+ ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
+ InstructionsState S = getSameOpcode(OrigReducedVals);
+ SmallVector<Value *> Candidates;
+ DenseMap<Value *, Value *> TrackedToOrig;
+ for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
+ Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
+ // Check whether the reduction value was overridden by an extractelement
+ // instruction because of the vectorization, and exclude it if it is not
+ // compatible with the other values.
+ if (auto *Inst = dyn_cast<Instruction>(RdxVal))
+ if (isVectorLikeInstWithConstOps(Inst) &&
+ (!S.getOpcode() || !S.isOpcodeOrAlt(Inst)))
+ continue;
+ Candidates.push_back(RdxVal);
+ TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
+ }
+ bool ShuffledExtracts = false;
+ // Try to handle shuffled extractelements.
+ if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
+ I + 1 < E) {
+ InstructionsState NextS = getSameOpcode(ReducedVals[I + 1]);
+ if (NextS.getOpcode() == Instruction::ExtractElement &&
+ !NextS.isAltShuffle()) {
+ SmallVector<Value *> CommonCandidates(Candidates);
+ for (Value *RV : ReducedVals[I + 1]) {
+ Value *RdxVal = TrackedVals.find(RV)->second;
+ // Check whether the reduction value was overridden by the
+ // extractelement instruction because of the vectorization, and
+ // exclude it if it is not compatible with the other values.
+ if (auto *Inst = dyn_cast<Instruction>(RdxVal))
+ if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
+ continue;
+ CommonCandidates.push_back(RdxVal);
+ TrackedToOrig.try_emplace(RdxVal, RV);
+ }
+ SmallVector<int> Mask;
+ if (isFixedVectorShuffle(CommonCandidates, Mask)) {
+ ++I;
+ Candidates.swap(CommonCandidates);
+ ShuffledExtracts = true;
+ }
+ }
+ }
+ unsigned NumReducedVals = Candidates.size();
+ if (NumReducedVals < ReductionLimit)
+ continue;
- V.computeMinimumValueSizes();
+ unsigned MaxVecRegSize = V.getMaxVecRegSize();
+ unsigned EltSize = V.getVectorElementSize(Candidates[0]);
+ unsigned MaxElts = RegMaxNumber * PowerOf2Floor(MaxVecRegSize / EltSize);
+
+ unsigned ReduxWidth = std::min<unsigned>(
+ PowerOf2Floor(NumReducedVals), std::max(RedValsMaxNumber, MaxElts));
+ unsigned Start = 0;
+ unsigned Pos = Start;
+ // Restarts the vectorization attempt with a lower vector factor.
+ unsigned PrevReduxWidth = ReduxWidth;
+ bool CheckForReusedReductionOpsLocal = false;
+ auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
+ &CheckForReusedReductionOpsLocal,
+ &PrevReduxWidth, &V,
+ &IgnoreList](bool IgnoreVL = false) {
+ bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
+ if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
+ // Check if any of the reduction ops are gathered. If so, it is worth
+ // trying again with a smaller number of reduction ops.
+ CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
+ }
+ ++Pos;
+ if (Pos < NumReducedVals - ReduxWidth + 1)
+ return IsAnyRedOpGathered;
+ Pos = Start;
+ ReduxWidth /= 2;
+ return IsAnyRedOpGathered;
+ };
+ while (Pos < NumReducedVals - ReduxWidth + 1 &&
+ ReduxWidth >= ReductionLimit) {
+ // Dependency in the tree of the reduction ops - drop this attempt, try
+ // again later.
+ if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
+ Start == 0) {
+ CheckForReusedReductionOps = true;
+ break;
+ }
+ PrevReduxWidth = ReduxWidth;
+ ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
+ // Being analyzed already - skip.
+ if (V.areAnalyzedReductionVals(VL)) {
+ (void)AdjustReducedVals(/*IgnoreVL=*/true);
+ continue;
+ }
+ // Early exit if any of the reduction values were deleted during
+ // previous vectorization attempts.
+ if (any_of(VL, [&V](Value *RedVal) {
+ auto *RedValI = dyn_cast<Instruction>(RedVal);
+ if (!RedValI)
+ return false;
+ return V.isDeleted(RedValI);
+ }))
+ break;
+ V.buildTree(VL, IgnoreList);
+ if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
+ if (!AdjustReducedVals())
+ V.analyzedReductionVals(VL);
+ continue;
+ }
+ if (V.isLoadCombineReductionCandidate(RdxKind)) {
+ if (!AdjustReducedVals())
+ V.analyzedReductionVals(VL);
+ continue;
+ }
+ V.reorderTopToBottom();
+ // No need to reorder the root node at all.
+ V.reorderBottomToTop(/*IgnoreReorder=*/true);
+ // Keep the other extracted reduction values if they are used in the
+ // vectorization trees.
+ BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
+ ExternallyUsedValues);
+ for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
+ if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
+ continue;
+ for_each(ReducedVals[Cnt],
+ [&LocalExternallyUsedValues, &TrackedVals](Value *V) {
+ if (isa<Instruction>(V))
+ LocalExternallyUsedValues[TrackedVals[V]];
+ });
+ }
+ // Number of uses of the candidates in the vector of values.
+ SmallDenseMap<Value *, unsigned> NumUses;
+ for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) {
+ Value *V = Candidates[Cnt];
+ if (NumUses.count(V) > 0)
+ continue;
+ NumUses[V] = std::count(VL.begin(), VL.end(), V);
+ }
+ for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) {
+ Value *V = Candidates[Cnt];
+ if (NumUses.count(V) > 0)
+ continue;
+ NumUses[V] = std::count(VL.begin(), VL.end(), V);
+ }
+ // Gather externally used values.
+ SmallPtrSet<Value *, 4> Visited;
+ for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) {
+ Value *V = Candidates[Cnt];
+ if (!Visited.insert(V).second)
+ continue;
+ unsigned NumOps = VectorizedVals.lookup(V) + NumUses[V];
+ if (NumOps != ReducedValsToOps.find(V)->second.size())
+ LocalExternallyUsedValues[V];
+ }
+ for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) {
+ Value *V = Candidates[Cnt];
+ if (!Visited.insert(V).second)
+ continue;
+ unsigned NumOps = VectorizedVals.lookup(V) + NumUses[V];
+ if (NumOps != ReducedValsToOps.find(V)->second.size())
+ LocalExternallyUsedValues[V];
+ }
+ V.buildExternalUses(LocalExternallyUsedValues);
+
+ V.computeMinimumValueSizes();
+
+ // Intersect the fast-math-flags from all reduction operations.
+ FastMathFlags RdxFMF;
+ RdxFMF.set();
+ for (Value *U : IgnoreList)
+ if (auto *FPMO = dyn_cast<FPMathOperator>(U))
+ RdxFMF &= FPMO->getFastMathFlags();
+ // Estimate cost.
+ InstructionCost TreeCost = V.getTreeCost(VL);
+ InstructionCost ReductionCost =
+ getReductionCost(TTI, VL, ReduxWidth, RdxFMF);
+ InstructionCost Cost = TreeCost + ReductionCost;
+ if (!Cost.isValid()) {
+ LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
+ return nullptr;
+ }
+ if (Cost >= -SLPCostThreshold) {
+ V.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(
+ SV_NAME, "HorSLPNotBeneficial",
+ ReducedValsToOps.find(VL[0])->second.front())
+ << "Vectorizing horizontal reduction is possible"
+ << "but not beneficial with cost " << ore::NV("Cost", Cost)
+ << " and threshold "
+ << ore::NV("Threshold", -SLPCostThreshold);
+ });
+ if (!AdjustReducedVals())
+ V.analyzedReductionVals(VL);
+ continue;
+ }
- // Estimate cost.
- InstructionCost TreeCost =
- V.getTreeCost(makeArrayRef(&ReducedVals[i], ReduxWidth));
- InstructionCost ReductionCost =
- getReductionCost(TTI, ReducedVals[i], ReduxWidth, RdxFMF);
- InstructionCost Cost = TreeCost + ReductionCost;
- if (!Cost.isValid()) {
- LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
- return nullptr;
- }
- if (Cost >= -SLPCostThreshold) {
+ LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
+ << Cost << ". (HorRdx)\n");
V.getORE()->emit([&]() {
- return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
- cast<Instruction>(VL[0]))
- << "Vectorizing horizontal reduction is possible"
- << "but not beneficial with cost " << ore::NV("Cost", Cost)
- << " and threshold "
- << ore::NV("Threshold", -SLPCostThreshold);
+ return OptimizationRemark(
+ SV_NAME, "VectorizedHorizontalReduction",
+ ReducedValsToOps.find(VL[0])->second.front())
+ << "Vectorized horizontal reduction with cost "
+ << ore::NV("Cost", Cost) << " and with tree size "
+ << ore::NV("TreeSize", V.getTreeSize());
});
- break;
- }
- LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
- << Cost << ". (HorRdx)\n");
- V.getORE()->emit([&]() {
- return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
- cast<Instruction>(VL[0]))
- << "Vectorized horizontal reduction with cost "
- << ore::NV("Cost", Cost) << " and with tree size "
- << ore::NV("TreeSize", V.getTreeSize());
- });
+ Builder.setFastMathFlags(RdxFMF);
- // Vectorize a tree.
- DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
- Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
+ // Vectorize a tree.
+ Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues);
- // Emit a reduction. If the root is a select (min/max idiom), the insert
- // point is the compare condition of that select.
- Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
- if (isCmpSelMinMax(RdxRootInst))
- Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst));
- else
- Builder.SetInsertPoint(RdxRootInst);
+ // Emit a reduction. If the root is a select (min/max idiom), the insert
+ // point is the compare condition of that select.
+ Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
+ if (IsCmpSelMinMax)
+ Builder.SetInsertPoint(GetCmpForMinMaxReduction(RdxRootInst));
+ else
+ Builder.SetInsertPoint(RdxRootInst);
- // To prevent poison from leaking across what used to be sequential, safe,
- // scalar boolean logic operations, the reduction operand must be frozen.
- if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst))
- VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
+ // To prevent poison from leaking across what used to be sequential,
+ // safe, scalar boolean logic operations, the reduction operand must be
+ // frozen.
+ if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst))
+ VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
- Value *ReducedSubTree =
- emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
+ Value *ReducedSubTree =
+ emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
- if (!VectorizedTree) {
- // Initialize the final value in the reduction.
- VectorizedTree = ReducedSubTree;
- } else {
- // Update the final value in the reduction.
- Builder.SetCurrentDebugLocation(Loc);
- VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
- ReducedSubTree, "op.rdx", ReductionOps);
+ if (!VectorizedTree) {
+ // Initialize the final value in the reduction.
+ VectorizedTree = ReducedSubTree;
+ } else {
+ // Update the final value in the reduction.
+ Builder.SetCurrentDebugLocation(
+ cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
+ VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
+ ReducedSubTree, "op.rdx", ReductionOps);
+ }
+ // Count vectorized reduced values to exclude them from final reduction.
+ for (Value *V : VL)
+ ++VectorizedVals.try_emplace(TrackedToOrig.find(V)->second, 0)
+ .first->getSecond();
+ Pos += ReduxWidth;
+ Start = Pos;
+ ReduxWidth = PowerOf2Floor(NumReducedVals - Pos);
}
- i += ReduxWidth;
- ReduxWidth = PowerOf2Floor(NumReducedVals - i);
}
-
if (VectorizedTree) {
// Finish the reduction.
- for (; i < NumReducedVals; ++i) {
- auto *I = cast<Instruction>(ReducedVals[i]);
- Builder.SetCurrentDebugLocation(I->getDebugLoc());
- VectorizedTree =
- createOp(Builder, RdxKind, VectorizedTree, I, "", ReductionOps);
+ // Need to add the extra arguments and the possible reduction values that
+ // were not vectorized.
+ // Try to avoid dependencies between the scalar remainders after
+ // reductions.
+ auto &&FinalGen =
+ [this, &Builder,
+ &TrackedVals](ArrayRef<std::pair<Instruction *, Value *>> InstVals) {
+ unsigned Sz = InstVals.size();
+ SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
+ Sz % 2);
+ for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
+ Instruction *RedOp = InstVals[I + 1].first;
+ Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
+ Value *RdxVal1 = InstVals[I].second;
+ Value *StableRdxVal1 = RdxVal1;
+ auto It1 = TrackedVals.find(RdxVal1);
+ if (It1 != TrackedVals.end())
+ StableRdxVal1 = It1->second;
+ Value *RdxVal2 = InstVals[I + 1].second;
+ Value *StableRdxVal2 = RdxVal2;
+ auto It2 = TrackedVals.find(RdxVal2);
+ if (It2 != TrackedVals.end())
+ StableRdxVal2 = It2->second;
+ Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
+ StableRdxVal2, "op.rdx", ReductionOps);
+ ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
+ }
+ if (Sz % 2 == 1)
+ ExtraReds[Sz / 2] = InstVals.back();
+ return ExtraReds;
+ };
+ SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
+ SmallPtrSet<Value *, 8> Visited;
+ for (ArrayRef<Value *> Candidates : ReducedVals) {
+ for (Value *RdxVal : Candidates) {
+ if (!Visited.insert(RdxVal).second)
+ continue;
+ unsigned NumOps = VectorizedVals.lookup(RdxVal);
+ for (Instruction *RedOp :
+ makeArrayRef(ReducedValsToOps.find(RdxVal)->second)
+ .drop_back(NumOps))
+ ExtraReductions.emplace_back(RedOp, RdxVal);
+ }
}
for (auto &Pair : ExternallyUsedValues) {
// Add each externally used value to the final reduction.
- for (auto *I : Pair.second) {
- Builder.SetCurrentDebugLocation(I->getDebugLoc());
- VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
- Pair.first, "op.extra", I);
- }
+ for (auto *I : Pair.second)
+ ExtraReductions.emplace_back(I, Pair.first);
+ }
+ // Iterate through all not-vectorized reduction values/extra arguments.
+ while (ExtraReductions.size() > 1) {
+ SmallVector<std::pair<Instruction *, Value *>> NewReds =
+ FinalGen(ExtraReductions);
+ ExtraReductions.swap(NewReds);
+ }
+ // Final reduction.
+ if (ExtraReductions.size() == 1) {
+ Instruction *RedOp = ExtraReductions.back().first;
+ Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
+ Value *RdxVal = ExtraReductions.back().second;
+ Value *StableRdxVal = RdxVal;
+ auto It = TrackedVals.find(RdxVal);
+ if (It != TrackedVals.end())
+ StableRdxVal = It->second;
+ VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
+ StableRdxVal, "op.rdx", ReductionOps);
}
ReductionRoot->replaceAllUsesWith(VectorizedTree);
- // Mark all scalar reduction ops for deletion, they are replaced by the
- // vector reductions.
- V.eraseInstructions(IgnoreList);
+ // The original scalar reduction is expected to have no remaining
+ // uses outside the reduction tree itself. Assert that we got this
+ // correct, replace internal uses with undef, and mark for eventual
+ // deletion.
+#ifndef NDEBUG
+ SmallSet<Value *, 4> IgnoreSet;
+ for (ArrayRef<Value *> RdxOps : ReductionOps)
+ IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
+#endif
+ for (ArrayRef<Value *> RdxOps : ReductionOps) {
+ for (Value *Ignore : RdxOps) {
+ if (!Ignore)
+ continue;
+#ifndef NDEBUG
+ for (auto *U : Ignore->users()) {
+ assert(IgnoreSet.count(U) &&
+ "All users must be either in the reduction ops list.");
+ }
+#endif
+ if (!Ignore->use_empty()) {
+ Value *Undef = UndefValue::get(Ignore->getType());
+ Ignore->replaceAllUsesWith(Undef);
+ }
+ V.eraseInstruction(cast<Instruction>(Ignore));
+ }
+ }
+ } else if (!CheckForReusedReductionOps) {
+ for (ReductionOpsType &RdxOps : ReductionOps)
+ for (Value *RdxOp : RdxOps)
+ V.analyzedReductionRoot(cast<Instruction>(RdxOp));
}
return VectorizedTree;
}
- unsigned numReductionValues() const { return ReducedVals.size(); }
-
private:
/// Calculate the cost of a reduction.
InstructionCost getReductionCost(TargetTransformInfo *TTI,
- Value *FirstReducedVal, unsigned ReduxWidth,
- FastMathFlags FMF) {
+ ArrayRef<Value *> ReducedVals,
+ unsigned ReduxWidth, FastMathFlags FMF) {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ Value *FirstReducedVal = ReducedVals.front();
Type *ScalarTy = FirstReducedVal->getType();
FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
- InstructionCost VectorCost, ScalarCost;
+ InstructionCost VectorCost = 0, ScalarCost;
+ // If all of the reduced values are constant, the vector cost is 0, since
+ // the reduction value can be calculated at compile time.
+ bool AllConsts = all_of(ReducedVals, isConstant);
switch (RdxKind) {
case RecurKind::Add:
case RecurKind::Mul:
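
The FinalGen helper above, together with the loop that reapplies it, reduces the leftover scalars pairwise so the remainder forms a log-depth tree rather than one long dependency chain. A minimal sketch with plain integers (an integer add reduction is assumed here; the real code calls createOp with the matched reduction kind and keeps the instruction/value pairing for debug locations):

    #include <cstdio>
    #include <utility>
    #include <vector>

    int reduceRemainder(std::vector<int> Vals) {
      while (Vals.size() > 1) {
        std::vector<int> Next((Vals.size() + 1) / 2);
        for (size_t I = 0, E = (Vals.size() / 2) * 2; I < E; I += 2)
          Next[I / 2] = Vals[I] + Vals[I + 1]; // one "op.rdx" per pair
        if (Vals.size() % 2 == 1)
          Next.back() = Vals.back();           // odd element carried over
        Vals = std::move(Next);
      }
      return Vals.empty() ? 0 : Vals.front();
    }

    int main() {
      // Five leftovers reduce in three rounds:
      // {1,2,3,4,5} -> {3,7,5} -> {10,5} -> {15}
      std::printf("%d\n", reduceRemainder({1, 2, 3, 4, 5}));
    }
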
@@ -9222,17 +11407,22 @@ private:
case RecurKind::FAdd:
case RecurKind::FMul: {
unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
- VectorCost =
- TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
+ if (!AllConsts)
+ VectorCost =
+ TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
break;
}
case RecurKind::FMax:
case RecurKind::FMin: {
auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
- auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
- VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
- /*IsUnsigned=*/false, CostKind);
+ if (!AllConsts) {
+ auto *VecCondTy =
+ cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
+ VectorCost =
+ TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
+ /*IsUnsigned=*/false, CostKind);
+ }
CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy,
SclCondTy, RdxPred, CostKind) +
@@ -9245,11 +11435,14 @@ private:
case RecurKind::UMax:
case RecurKind::UMin: {
auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
- auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
- bool IsUnsigned =
- RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
- VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned,
- CostKind);
+ if (!AllConsts) {
+ auto *VecCondTy =
+ cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
+ bool IsUnsigned =
+ RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
+ VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
+ IsUnsigned, CostKind);
+ }
CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
SclCondTy, RdxPred, CostKind) +
@@ -9463,7 +11656,8 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
/// performed.
static bool tryToVectorizeHorReductionOrInstOperands(
PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
- TargetTransformInfo *TTI,
+ TargetTransformInfo *TTI, ScalarEvolution &SE, const DataLayout &DL,
+ const TargetLibraryInfo &TLI,
const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
if (!ShouldVectorizeHor)
return false;
@@ -9482,7 +11676,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
// horizontal reduction.
// Interrupt the process if the Root instruction itself was vectorized or all
// sub-trees not higher that RecursionMaxDepth were analyzed/vectorized.
- // Skip the analysis of CmpInsts.Compiler implements postanalysis of the
+ // Skip the analysis of CmpInsts. Compiler implements postanalysis of the
// CmpInsts so we can skip extra attempts in
// tryToVectorizeHorReductionOrInstOperands and save compile time.
std::queue<std::pair<Instruction *, unsigned>> Stack;
@@ -9490,13 +11684,16 @@ static bool tryToVectorizeHorReductionOrInstOperands(
SmallPtrSet<Value *, 8> VisitedInstrs;
SmallVector<WeakTrackingVH> PostponedInsts;
bool Res = false;
- auto &&TryToReduce = [TTI, &P, &R](Instruction *Inst, Value *&B0,
- Value *&B1) -> Value * {
+ auto &&TryToReduce = [TTI, &SE, &DL, &P, &R, &TLI](Instruction *Inst,
+ Value *&B0,
+ Value *&B1) -> Value * {
+ if (R.isAnalyzedReductionRoot(Inst))
+ return nullptr;
bool IsBinop = matchRdxBop(Inst, B0, B1);
bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
if (IsBinop || IsSelect) {
HorizontalReduction HorRdx;
- if (HorRdx.matchAssociativeReduction(P, Inst))
+ if (HorRdx.matchAssociativeReduction(P, Inst, SE, DL, TLI))
return HorRdx.tryToReduce(R, TTI);
}
return nullptr;
@@ -9541,7 +11738,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
// Do not try to vectorize CmpInst operands, this is done separately.
// Final attempt for binop args vectorization should happen after the loop
// to try to find reductions.
- if (!isa<CmpInst>(Inst))
+ if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Inst))
PostponedInsts.push_back(Inst);
}
@@ -9554,8 +11751,8 @@ static bool tryToVectorizeHorReductionOrInstOperands(
if (auto *I = dyn_cast<Instruction>(Op))
// Do not try to vectorize CmpInst operands, this is done
// separately.
- if (!isa<PHINode>(I) && !isa<CmpInst>(I) && !R.isDeleted(I) &&
- I->getParent() == BB)
+ if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
+ !R.isDeleted(I) && I->getParent() == BB)
Stack.emplace(I, Level);
}
// Try to vectorized binops where reductions were not found.
@@ -9579,8 +11776,8 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
return tryToVectorize(I, R);
};
- return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
- ExtraVectorization);
+ return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, *SE, *DL,
+ *TLI, ExtraVectorization);
}
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
@@ -9748,12 +11945,16 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(
for (auto *I : reverse(Instructions)) {
if (R.isDeleted(I))
continue;
- if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
+ if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
- else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
+ } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
- else if (isa<CmpInst>(I))
+ } else if (isa<CmpInst>(I)) {
PostponedCmps.push_back(I);
+ continue;
+ }
+    // Try to find reductions in buildvector sequences.
+ OpsChanged |= vectorizeRootInstruction(nullptr, I, BB, R, TTI);
}
if (AtTerminator) {
// Try to find reductions first.
@@ -10171,7 +12372,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
DT->getNode(I2->getParent());
assert(NodeI1 && "Should only process reachable instructions");
- assert(NodeI1 && "Should only process reachable instructions");
+ assert(NodeI2 && "Should only process reachable instructions");
assert((NodeI1 == NodeI2) ==
(NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
"Different nodes should have different DFS numbers");
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 8822c0004eb2..97f2b1a93815 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -72,17 +72,17 @@ class VPRecipeBuilder {
VPRecipeBase *tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
VFRange &Range, VPlanPtr &Plan);
- /// Check if an induction recipe should be constructed for \I. If so build and
- /// return it. If not, return null.
- VPWidenIntOrFpInductionRecipe *
- tryToOptimizeInductionPHI(PHINode *Phi, ArrayRef<VPValue *> Operands,
- VFRange &Range) const;
+ /// Check if an induction recipe should be constructed for \p Phi. If so build
+ /// and return it. If not, return null.
+ VPRecipeBase *tryToOptimizeInductionPHI(PHINode *Phi,
+ ArrayRef<VPValue *> Operands,
+ VPlan &Plan, VFRange &Range);
/// Optimize the special case where the operand of \p I is a constant integer
/// induction variable.
VPWidenIntOrFpInductionRecipe *
tryToOptimizeInductionTruncate(TruncInst *I, ArrayRef<VPValue *> Operands,
- VFRange &Range, VPlan &Plan) const;
+ VFRange &Range, VPlan &Plan);
/// Handle non-loop phi nodes. Return a VPValue, if all incoming values match
/// or a new VPBlendRecipe otherwise. Currently all such phi nodes are turned
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 342d4a074e10..4d709097c306 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -23,11 +23,10 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
-#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
@@ -35,13 +34,13 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GenericDomTreeConstruction.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>
-#include <iterator>
#include <string>
#include <vector>
@@ -60,7 +59,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) {
}
#endif
-Value *VPLane::getAsRuntimeExpr(IRBuilder<> &Builder,
+Value *VPLane::getAsRuntimeExpr(IRBuilderBase &Builder,
const ElementCount &VF) const {
switch (LaneKind) {
case VPLane::Kind::ScalableLast:
@@ -158,25 +157,25 @@ void VPBlockBase::setPlan(VPlan *ParentPlan) {
}
/// \return the VPBasicBlock that is the exit of Block, possibly indirectly.
-const VPBasicBlock *VPBlockBase::getExitBasicBlock() const {
+const VPBasicBlock *VPBlockBase::getExitingBasicBlock() const {
const VPBlockBase *Block = this;
while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
- Block = Region->getExit();
+ Block = Region->getExiting();
return cast<VPBasicBlock>(Block);
}
-VPBasicBlock *VPBlockBase::getExitBasicBlock() {
+VPBasicBlock *VPBlockBase::getExitingBasicBlock() {
VPBlockBase *Block = this;
while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
- Block = Region->getExit();
+ Block = Region->getExiting();
return cast<VPBasicBlock>(Block);
}
VPBlockBase *VPBlockBase::getEnclosingBlockWithSuccessors() {
if (!Successors.empty() || !Parent)
return this;
- assert(Parent->getExit() == this &&
- "Block w/o successors not the exit of its parent.");
+ assert(Parent->getExiting() == this &&
+ "Block w/o successors not the exiting block of its parent.");
return Parent->getEnclosingBlockWithSuccessors();
}
@@ -188,28 +187,6 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
return Parent->getEnclosingBlockWithPredecessors();
}
-VPValue *VPBlockBase::getCondBit() {
- return CondBitUser.getSingleOperandOrNull();
-}
-
-const VPValue *VPBlockBase::getCondBit() const {
- return CondBitUser.getSingleOperandOrNull();
-}
-
-void VPBlockBase::setCondBit(VPValue *CV) { CondBitUser.resetSingleOpUser(CV); }
-
-VPValue *VPBlockBase::getPredicate() {
- return PredicateUser.getSingleOperandOrNull();
-}
-
-const VPValue *VPBlockBase::getPredicate() const {
- return PredicateUser.getSingleOperandOrNull();
-}
-
-void VPBlockBase::setPredicate(VPValue *CV) {
- PredicateUser.resetSingleOpUser(CV);
-}
-
void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
SmallVector<VPBlockBase *, 8> Blocks(depth_first(Entry));
@@ -245,6 +222,52 @@ Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
// set(Def, Extract, Instance);
return Extract;
}
+BasicBlock *VPTransformState::CFGState::getPreheaderBBFor(VPRecipeBase *R) {
+ VPRegionBlock *LoopRegion = R->getParent()->getEnclosingLoopRegion();
+ return VPBB2IRBB[LoopRegion->getPreheaderVPBB()];
+}
+
+void VPTransformState::addNewMetadata(Instruction *To,
+ const Instruction *Orig) {
+ // If the loop was versioned with memchecks, add the corresponding no-alias
+ // metadata.
+ if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
+ LVer->annotateInstWithNoAlias(To, Orig);
+}
+
+void VPTransformState::addMetadata(Instruction *To, Instruction *From) {
+ propagateMetadata(To, From);
+ addNewMetadata(To, From);
+}
+
+void VPTransformState::addMetadata(ArrayRef<Value *> To, Instruction *From) {
+ for (Value *V : To) {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ addMetadata(I, From);
+ }
+}
+
+void VPTransformState::setDebugLocFromInst(const Value *V) {
+ if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
+ const DILocation *DIL = Inst->getDebugLoc();
+
+ // When a FSDiscriminator is enabled, we don't need to add the multiply
+ // factors to the discriminators.
+ if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
+ !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
+ // FIXME: For scalable vectors, assume vscale=1.
+ auto NewDIL =
+ DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
+ if (NewDIL)
+ Builder.SetCurrentDebugLocation(*NewDIL);
+ else
+ LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
+ << DIL->getFilename() << " Line: " << DIL->getLine());
+ } else
+ Builder.SetCurrentDebugLocation(DIL);
+ } else
+ Builder.SetCurrentDebugLocation(DebugLoc());
+}
BasicBlock *
VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
@@ -252,43 +275,36 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
  // Pred stands for Predecessor. Prev stands for Previous - last visited/created.
BasicBlock *PrevBB = CFG.PrevBB;
BasicBlock *NewBB = BasicBlock::Create(PrevBB->getContext(), getName(),
- PrevBB->getParent(), CFG.LastBB);
+ PrevBB->getParent(), CFG.ExitBB);
LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
// Hook up the new basic block to its predecessors.
for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
- VPBasicBlock *PredVPBB = PredVPBlock->getExitBasicBlock();
- auto &PredVPSuccessors = PredVPBB->getSuccessors();
+ VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
+ auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
- // In outer loop vectorization scenario, the predecessor BBlock may not yet
- // be visited(backedge). Mark the VPBasicBlock for fixup at the end of
- // vectorization. We do not encounter this case in inner loop vectorization
- // as we start out by building a loop skeleton with the vector loop header
- // and latch blocks. As a result, we never enter this function for the
- // header block in the non VPlan-native path.
- if (!PredBB) {
- assert(EnableVPlanNativePath &&
- "Unexpected null predecessor in non VPlan-native path");
- CFG.VPBBsToFix.push_back(PredVPBB);
- continue;
- }
-
assert(PredBB && "Predecessor basic-block not found building successor.");
auto *PredBBTerminator = PredBB->getTerminator();
    LLVM_DEBUG(dbgs() << "LV: draw edge from " << PredBB->getName() << '\n');
+
+ auto *TermBr = dyn_cast<BranchInst>(PredBBTerminator);
if (isa<UnreachableInst>(PredBBTerminator)) {
assert(PredVPSuccessors.size() == 1 &&
"Predecessor ending w/o branch must have single successor.");
+ DebugLoc DL = PredBBTerminator->getDebugLoc();
PredBBTerminator->eraseFromParent();
- BranchInst::Create(NewBB, PredBB);
+ auto *Br = BranchInst::Create(NewBB, PredBB);
+ Br->setDebugLoc(DL);
+ } else if (TermBr && !TermBr->isConditional()) {
+ TermBr->setSuccessor(0, NewBB);
} else {
- assert(PredVPSuccessors.size() == 2 &&
- "Predecessor ending with branch must have two successors.");
+ // Set each forward successor here when it is created, excluding
+ // backedges. A backward successor is set when the branch is created.
unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
- assert(!PredBBTerminator->getSuccessor(idx) &&
+ assert(!TermBr->getSuccessor(idx) &&
"Trying to reset an existing successor block.");
- PredBBTerminator->setSuccessor(idx, NewBB);
+ TermBr->setSuccessor(idx, NewBB);
}
}
return NewBB;
@@ -300,27 +316,51 @@ void VPBasicBlock::execute(VPTransformState *State) {
VPBlockBase *SingleHPred = nullptr;
BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
- // 1. Create an IR basic block, or reuse the last one if possible.
- // The last IR basic block is reused, as an optimization, in three cases:
- // A. the first VPBB reuses the loop header BB - when PrevVPBB is null;
- // B. when the current VPBB has a single (hierarchical) predecessor which
- // is PrevVPBB and the latter has a single (hierarchical) successor; and
- // C. when the current VPBB is an entry of a region replica - where PrevVPBB
- // is the exit of this region from a previous instance, or the predecessor
- // of this region.
- if (PrevVPBB && /* A */
- !((SingleHPred = getSingleHierarchicalPredecessor()) &&
- SingleHPred->getExitBasicBlock() == PrevVPBB &&
- PrevVPBB->getSingleHierarchicalSuccessor()) && /* B */
- !(Replica && getPredecessors().empty())) { /* C */
+ auto IsLoopRegion = [](VPBlockBase *BB) {
+ auto *R = dyn_cast<VPRegionBlock>(BB);
+ return R && !R->isReplicator();
+ };
+
+ // 1. Create an IR basic block, or reuse the last one or ExitBB if possible.
+ if (getPlan()->getVectorLoopRegion()->getSingleSuccessor() == this) {
+ // ExitBB can be re-used for the exit block of the Plan.
+ NewBB = State->CFG.ExitBB;
+ State->CFG.PrevBB = NewBB;
+
+ // Update the branch instruction in the predecessor to branch to ExitBB.
+ VPBlockBase *PredVPB = getSingleHierarchicalPredecessor();
+ VPBasicBlock *ExitingVPBB = PredVPB->getExitingBasicBlock();
+ assert(PredVPB->getSingleSuccessor() == this &&
+ "predecessor must have the current block as only successor");
+ BasicBlock *ExitingBB = State->CFG.VPBB2IRBB[ExitingVPBB];
+ // The Exit block of a loop is always set to be successor 0 of the Exiting
+ // block.
+ cast<BranchInst>(ExitingBB->getTerminator())->setSuccessor(0, NewBB);
+ } else if (PrevVPBB && /* A */
+ !((SingleHPred = getSingleHierarchicalPredecessor()) &&
+ SingleHPred->getExitingBasicBlock() == PrevVPBB &&
+ PrevVPBB->getSingleHierarchicalSuccessor() &&
+ (SingleHPred->getParent() == getEnclosingLoopRegion() &&
+ !IsLoopRegion(SingleHPred))) && /* B */
+ !(Replica && getPredecessors().empty())) { /* C */
+ // The last IR basic block is reused, as an optimization, in three cases:
+ // A. the first VPBB reuses the loop pre-header BB - when PrevVPBB is null;
+ // B. when the current VPBB has a single (hierarchical) predecessor which
+  // is PrevVPBB and the latter has a single (hierarchical) successor, and
+  // both are in the same non-replicator region; and
+ // C. when the current VPBB is an entry of a region replica - where PrevVPBB
+ // is the exiting VPBB of this region from a previous instance, or the
+ // predecessor of this region.
+
NewBB = createEmptyBasicBlock(State->CFG);
State->Builder.SetInsertPoint(NewBB);
// Temporarily terminate with unreachable until CFG is rewired.
UnreachableInst *Terminator = State->Builder.CreateUnreachable();
+    // Register NewBB in its loop. In innermost loops it's the same for all
+    // BBs.
+ if (State->CurrentVectorLoop)
+ State->CurrentVectorLoop->addBasicBlockToLoop(NewBB, *State->LI);
State->Builder.SetInsertPoint(Terminator);
- // Register NewBB in its loop. In innermost loops its the same for all BB's.
- Loop *L = State->LI->getLoopFor(State->CFG.LastBB);
- L->addBasicBlockToLoop(NewBB, *State->LI);
State->CFG.PrevBB = NewBB;
}
@@ -334,29 +374,6 @@ void VPBasicBlock::execute(VPTransformState *State) {
for (VPRecipeBase &Recipe : Recipes)
Recipe.execute(*State);
- VPValue *CBV;
- if (EnableVPlanNativePath && (CBV = getCondBit())) {
- assert(CBV->getUnderlyingValue() &&
- "Unexpected null underlying value for condition bit");
-
- // Condition bit value in a VPBasicBlock is used as the branch selector. In
- // the VPlan-native path case, since all branches are uniform we generate a
- // branch instruction using the condition value from vector lane 0 and dummy
- // successors. The successors are fixed later when the successor blocks are
- // visited.
- Value *NewCond = State->get(CBV, {0, 0});
-
- // Replace the temporary unreachable terminator with the new conditional
- // branch.
- auto *CurrentTerminator = NewBB->getTerminator();
- assert(isa<UnreachableInst>(CurrentTerminator) &&
- "Expected to replace unreachable terminator with conditional "
- "branch.");
- auto *CondBr = BranchInst::Create(NewBB, nullptr, NewCond);
- CondBr->setSuccessor(0, nullptr);
- ReplaceInstWithInst(CurrentTerminator, CondBr);
- }
-
LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB);
}
@@ -395,6 +412,61 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
return SplitBlock;
}
+VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() {
+ VPRegionBlock *P = getParent();
+ if (P && P->isReplicator()) {
+ P = P->getParent();
+ assert(!cast<VPRegionBlock>(P)->isReplicator() &&
+ "unexpected nested replicate regions");
+ }
+ return P;
+}
+
+static bool hasConditionalTerminator(const VPBasicBlock *VPBB) {
+ if (VPBB->empty()) {
+ assert(
+ VPBB->getNumSuccessors() < 2 &&
+ "block with multiple successors doesn't have a recipe as terminator");
+ return false;
+ }
+
+ const VPRecipeBase *R = &VPBB->back();
+ auto *VPI = dyn_cast<VPInstruction>(R);
+ bool IsCondBranch =
+ isa<VPBranchOnMaskRecipe>(R) ||
+ (VPI && (VPI->getOpcode() == VPInstruction::BranchOnCond ||
+ VPI->getOpcode() == VPInstruction::BranchOnCount));
+ (void)IsCondBranch;
+
+ if (VPBB->getNumSuccessors() >= 2 || VPBB->isExiting()) {
+ assert(IsCondBranch && "block with multiple successors not terminated by "
+ "conditional branch recipe");
+
+ return true;
+ }
+
+ assert(
+ !IsCondBranch &&
+ "block with 0 or 1 successors terminated by conditional branch recipe");
+ return false;
+}
+
+VPRecipeBase *VPBasicBlock::getTerminator() {
+ if (hasConditionalTerminator(this))
+ return &back();
+ return nullptr;
+}
+
+const VPRecipeBase *VPBasicBlock::getTerminator() const {
+ if (hasConditionalTerminator(this))
+ return &back();
+ return nullptr;
+}
+
+bool VPBasicBlock::isExiting() const {
+ return getParent()->getExitingBasicBlock() == this;
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPBlockBase::printSuccessors(raw_ostream &O, const Twine &Indent) const {
if (getSuccessors().empty()) {
@@ -411,13 +483,6 @@ void VPBlockBase::printSuccessors(raw_ostream &O, const Twine &Indent) const {
void VPBasicBlock::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << getName() << ":\n";
- if (const VPValue *Pred = getPredicate()) {
- O << Indent << "BlockPredicate:";
- Pred->printAsOperand(O, SlotTracker);
- if (const auto *PredInst = dyn_cast<VPInstruction>(Pred))
- O << " (" << PredInst->getParent()->getName() << ")";
- O << '\n';
- }
auto RecipeIndent = Indent + " ";
for (const VPRecipeBase &Recipe : *this) {
@@ -426,14 +491,6 @@ void VPBasicBlock::print(raw_ostream &O, const Twine &Indent,
}
printSuccessors(O, Indent);
-
- if (const VPValue *CBV = getCondBit()) {
- O << Indent << "CondBit: ";
- CBV->printAsOperand(O, SlotTracker);
- if (const auto *CBI = dyn_cast<VPInstruction>(CBV))
- O << " (" << CBI->getParent()->getName() << ")";
- O << '\n';
- }
}
#endif
@@ -448,25 +505,26 @@ void VPRegionBlock::execute(VPTransformState *State) {
ReversePostOrderTraversal<VPBlockBase *> RPOT(Entry);
if (!isReplicator()) {
+ // Create and register the new vector loop.
+ Loop *PrevLoop = State->CurrentVectorLoop;
+ State->CurrentVectorLoop = State->LI->AllocateLoop();
+ BasicBlock *VectorPH = State->CFG.VPBB2IRBB[getPreheaderVPBB()];
+ Loop *ParentLoop = State->LI->getLoopFor(VectorPH);
+
+ // Insert the new loop into the loop nest and register the new basic blocks
+ // before calling any utilities such as SCEV that require valid LoopInfo.
+ if (ParentLoop)
+ ParentLoop->addChildLoop(State->CurrentVectorLoop);
+ else
+ State->LI->addTopLevelLoop(State->CurrentVectorLoop);
+
// Visit the VPBlocks connected to "this", starting from it.
for (VPBlockBase *Block : RPOT) {
- if (EnableVPlanNativePath) {
- // The inner loop vectorization path does not represent loop preheader
- // and exit blocks as part of the VPlan. In the VPlan-native path, skip
- // vectorizing loop preheader block. In future, we may replace this
- // check with the check for loop preheader.
- if (Block->getNumPredecessors() == 0)
- continue;
-
- // Skip vectorizing loop exit block. In future, we may replace this
- // check with the check for loop exit.
- if (Block->getNumSuccessors() == 0)
- continue;
- }
-
LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
Block->execute(State);
}
+
+ State->CurrentVectorLoop = PrevLoop;
return;
}
@@ -508,341 +566,32 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
}
#endif
-bool VPRecipeBase::mayWriteToMemory() const {
- switch (getVPDefID()) {
- case VPWidenMemoryInstructionSC: {
- return cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
- }
- case VPReplicateSC:
- case VPWidenCallSC:
- return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
- ->mayWriteToMemory();
- case VPBranchOnMaskSC:
- return false;
- case VPWidenIntOrFpInductionSC:
- case VPWidenCanonicalIVSC:
- case VPWidenPHISC:
- case VPBlendSC:
- case VPWidenSC:
- case VPWidenGEPSC:
- case VPReductionSC:
- case VPWidenSelectSC: {
- const Instruction *I =
- dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
- (void)I;
- assert((!I || !I->mayWriteToMemory()) &&
- "underlying instruction may write to memory");
- return false;
- }
- default:
- return true;
- }
-}
-
-bool VPRecipeBase::mayReadFromMemory() const {
- switch (getVPDefID()) {
- case VPWidenMemoryInstructionSC: {
- return !cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
- }
- case VPReplicateSC:
- case VPWidenCallSC:
- return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
- ->mayReadFromMemory();
- case VPBranchOnMaskSC:
- return false;
- case VPWidenIntOrFpInductionSC:
- case VPWidenCanonicalIVSC:
- case VPWidenPHISC:
- case VPBlendSC:
- case VPWidenSC:
- case VPWidenGEPSC:
- case VPReductionSC:
- case VPWidenSelectSC: {
- const Instruction *I =
- dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
- (void)I;
- assert((!I || !I->mayReadFromMemory()) &&
- "underlying instruction may read from memory");
- return false;
- }
- default:
- return true;
- }
-}
-
-bool VPRecipeBase::mayHaveSideEffects() const {
- switch (getVPDefID()) {
- case VPBranchOnMaskSC:
- return false;
- case VPWidenIntOrFpInductionSC:
- case VPWidenCanonicalIVSC:
- case VPWidenPHISC:
- case VPBlendSC:
- case VPWidenSC:
- case VPWidenGEPSC:
- case VPReductionSC:
- case VPWidenSelectSC: {
- const Instruction *I =
- dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
- (void)I;
- assert((!I || !I->mayHaveSideEffects()) &&
- "underlying instruction has side-effects");
- return false;
- }
- case VPReplicateSC: {
- auto *R = cast<VPReplicateRecipe>(this);
- return R->getUnderlyingInstr()->mayHaveSideEffects();
- }
- default:
- return true;
- }
-}
-
-void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
- assert(!Parent && "Recipe already in some VPBasicBlock");
- assert(InsertPos->getParent() &&
- "Insertion position not in any VPBasicBlock");
- Parent = InsertPos->getParent();
- Parent->getRecipeList().insert(InsertPos->getIterator(), this);
-}
-
-void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
- assert(!Parent && "Recipe already in some VPBasicBlock");
- assert(InsertPos->getParent() &&
- "Insertion position not in any VPBasicBlock");
- Parent = InsertPos->getParent();
- Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this);
-}
-
-void VPRecipeBase::removeFromParent() {
- assert(getParent() && "Recipe not in any VPBasicBlock");
- getParent()->getRecipeList().remove(getIterator());
- Parent = nullptr;
-}
-
-iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
- assert(getParent() && "Recipe not in any VPBasicBlock");
- return getParent()->getRecipeList().erase(getIterator());
-}
-
-void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
- removeFromParent();
- insertAfter(InsertPos);
-}
-
-void VPRecipeBase::moveBefore(VPBasicBlock &BB,
- iplist<VPRecipeBase>::iterator I) {
- assert(I == BB.end() || I->getParent() == &BB);
- removeFromParent();
- Parent = &BB;
- BB.getRecipeList().insert(I, this);
-}
-
-void VPInstruction::generateInstruction(VPTransformState &State,
- unsigned Part) {
- IRBuilder<> &Builder = State.Builder;
- Builder.SetCurrentDebugLocation(DL);
-
- if (Instruction::isBinaryOp(getOpcode())) {
- Value *A = State.get(getOperand(0), Part);
- Value *B = State.get(getOperand(1), Part);
- Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B);
- State.set(this, V, Part);
- return;
- }
-
- switch (getOpcode()) {
- case VPInstruction::Not: {
- Value *A = State.get(getOperand(0), Part);
- Value *V = Builder.CreateNot(A);
- State.set(this, V, Part);
- break;
- }
- case VPInstruction::ICmpULE: {
- Value *IV = State.get(getOperand(0), Part);
- Value *TC = State.get(getOperand(1), Part);
- Value *V = Builder.CreateICmpULE(IV, TC);
- State.set(this, V, Part);
- break;
- }
- case Instruction::Select: {
- Value *Cond = State.get(getOperand(0), Part);
- Value *Op1 = State.get(getOperand(1), Part);
- Value *Op2 = State.get(getOperand(2), Part);
- Value *V = Builder.CreateSelect(Cond, Op1, Op2);
- State.set(this, V, Part);
- break;
- }
- case VPInstruction::ActiveLaneMask: {
- // Get first lane of vector induction variable.
- Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
- // Get the original loop tripcount.
- Value *ScalarTC = State.get(getOperand(1), Part);
-
- auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
- auto *PredTy = VectorType::get(Int1Ty, State.VF);
- Instruction *Call = Builder.CreateIntrinsic(
- Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()},
- {VIVElem0, ScalarTC}, nullptr, "active.lane.mask");
- State.set(this, Call, Part);
- break;
- }
- case VPInstruction::FirstOrderRecurrenceSplice: {
- // Generate code to combine the previous and current values in vector v3.
- //
- // vector.ph:
- // v_init = vector(..., ..., ..., a[-1])
- // br vector.body
- //
- // vector.body
- // i = phi [0, vector.ph], [i+4, vector.body]
- // v1 = phi [v_init, vector.ph], [v2, vector.body]
- // v2 = a[i, i+1, i+2, i+3];
- // v3 = vector(v1(3), v2(0, 1, 2))
-
- // For the first part, use the recurrence phi (v1), otherwise v2.
- auto *V1 = State.get(getOperand(0), 0);
- Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
- if (!PartMinus1->getType()->isVectorTy()) {
- State.set(this, PartMinus1, Part);
- } else {
- Value *V2 = State.get(getOperand(1), Part);
- State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1), Part);
- }
- break;
- }
-
- case VPInstruction::CanonicalIVIncrement:
- case VPInstruction::CanonicalIVIncrementNUW: {
- Value *Next = nullptr;
- if (Part == 0) {
- bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW;
- auto *Phi = State.get(getOperand(0), 0);
- // The loop step is equal to the vectorization factor (num of SIMD
- // elements) times the unroll factor (num of SIMD instructions).
- Value *Step =
- createStepForVF(Builder, Phi->getType(), State.VF, State.UF);
- Next = Builder.CreateAdd(Phi, Step, "index.next", IsNUW, false);
- } else {
- Next = State.get(this, 0);
- }
-
- State.set(this, Next, Part);
- break;
- }
- case VPInstruction::BranchOnCount: {
- if (Part != 0)
- break;
- // First create the compare.
- Value *IV = State.get(getOperand(0), Part);
- Value *TC = State.get(getOperand(1), Part);
- Value *Cond = Builder.CreateICmpEQ(IV, TC);
-
- // Now create the branch.
- auto *Plan = getParent()->getPlan();
- VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
- VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();
- if (Header->empty()) {
- assert(EnableVPlanNativePath &&
- "empty entry block only expected in VPlanNativePath");
- Header = cast<VPBasicBlock>(Header->getSingleSuccessor());
+void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
+ Value *CanonicalIVStartValue,
+ VPTransformState &State,
+ bool IsEpilogueVectorization) {
+
+ VPBasicBlock *ExitingVPBB = getVectorLoopRegion()->getExitingBasicBlock();
+ auto *Term = dyn_cast<VPInstruction>(&ExitingVPBB->back());
+ // Try to simplify BranchOnCount to 'BranchOnCond true' if TC <= VF * UF when
+ // preparing to execute the plan for the main vector loop.
+ if (!IsEpilogueVectorization && Term &&
+ Term->getOpcode() == VPInstruction::BranchOnCount &&
+ isa<ConstantInt>(TripCountV)) {
+ ConstantInt *C = cast<ConstantInt>(TripCountV);
+ uint64_t TCVal = C->getZExtValue();
+ if (TCVal && TCVal <= State.VF.getKnownMinValue() * State.UF) {
+ auto *BOC =
+ new VPInstruction(VPInstruction::BranchOnCond,
+ {getOrAddExternalDef(State.Builder.getTrue())});
+ Term->eraseFromParent();
+ ExitingVPBB->appendRecipe(BOC);
+ // TODO: Further simplifications are possible
+ // 1. Replace inductions with constants.
+ // 2. Replace vector loop region with VPBasicBlock.
}
- // TODO: Once the exit block is modeled in VPlan, use it instead of going
- // through State.CFG.LastBB.
- BasicBlock *Exit =
- cast<BranchInst>(State.CFG.LastBB->getTerminator())->getSuccessor(0);
-
- Builder.CreateCondBr(Cond, Exit, State.CFG.VPBB2IRBB[Header]);
- Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
- break;
- }
- default:
- llvm_unreachable("Unsupported opcode for instruction");
- }
-}
-
-void VPInstruction::execute(VPTransformState &State) {
- assert(!State.Instance && "VPInstruction executing an Instance");
- IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
- State.Builder.setFastMathFlags(FMF);
- for (unsigned Part = 0; Part < State.UF; ++Part)
- generateInstruction(State, Part);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPInstruction::dump() const {
- VPSlotTracker SlotTracker(getParent()->getPlan());
- print(dbgs(), "", SlotTracker);
-}
-
-void VPInstruction::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "EMIT ";
-
- if (hasResult()) {
- printAsOperand(O, SlotTracker);
- O << " = ";
- }
-
- switch (getOpcode()) {
- case VPInstruction::Not:
- O << "not";
- break;
- case VPInstruction::ICmpULE:
- O << "icmp ule";
- break;
- case VPInstruction::SLPLoad:
- O << "combined load";
- break;
- case VPInstruction::SLPStore:
- O << "combined store";
- break;
- case VPInstruction::ActiveLaneMask:
- O << "active lane mask";
- break;
- case VPInstruction::FirstOrderRecurrenceSplice:
- O << "first-order splice";
- break;
- case VPInstruction::CanonicalIVIncrement:
- O << "VF * UF + ";
- break;
- case VPInstruction::CanonicalIVIncrementNUW:
- O << "VF * UF +(nuw) ";
- break;
- case VPInstruction::BranchOnCount:
- O << "branch-on-count ";
- break;
- default:
- O << Instruction::getOpcodeName(getOpcode());
- }
-
- O << FMF;
-
- for (const VPValue *Operand : operands()) {
- O << " ";
- Operand->printAsOperand(O, SlotTracker);
}
- if (DL) {
- O << ", !dbg ";
- DL.print(O);
- }
-}
-#endif
-
-void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) {
- // Make sure the VPInstruction is a floating-point operation.
- assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
- Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
- Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
- Opcode == Instruction::FCmp) &&
- "this op can't take fast-math flags");
- FMF = FMFNew;
-}
-
-void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
- Value *CanonicalIVStartValue,
- VPTransformState &State) {
// Check if the trip count is needed, and if so build it.
if (TripCount && TripCount->getNumUsers()) {
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
@@ -868,111 +617,78 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
// When vectorizing the epilogue loop, the canonical induction start value
// needs to be changed from zero to the value after the main vector loop.
if (CanonicalIVStartValue) {
- VPValue *VPV = new VPValue(CanonicalIVStartValue);
- addExternalDef(VPV);
+ VPValue *VPV = getOrAddExternalDef(CanonicalIVStartValue);
auto *IV = getCanonicalIV();
assert(all_of(IV->users(),
[](const VPUser *U) {
+ if (isa<VPScalarIVStepsRecipe>(U))
+ return true;
auto *VPI = cast<VPInstruction>(U);
return VPI->getOpcode() ==
VPInstruction::CanonicalIVIncrement ||
VPI->getOpcode() ==
VPInstruction::CanonicalIVIncrementNUW;
}) &&
- "the canonical IV should only be used by its increments when "
+ "the canonical IV should only be used by its increments or "
+ "ScalarIVSteps when "
"resetting the start value");
IV->setOperand(0, VPV);
}
}
-/// Generate the code inside the body of the vectorized loop. Assumes a single
-/// LoopVectorBody basic-block was created for this. Introduce additional
-/// basic-blocks as needed, and fill them all.
+/// Generate the code inside the preheader and body of the vectorized loop.
+/// Assumes a single pre-header basic-block was created for this. Introduce
+/// additional basic-blocks as needed, and fill them all.
void VPlan::execute(VPTransformState *State) {
- // 0. Set the reverse mapping from VPValues to Values for code generation.
+ // Set the reverse mapping from VPValues to Values for code generation.
for (auto &Entry : Value2VPValue)
State->VPValue2Value[Entry.second] = Entry.first;
- BasicBlock *VectorPreHeaderBB = State->CFG.PrevBB;
- State->CFG.VectorPreHeader = VectorPreHeaderBB;
- BasicBlock *VectorHeaderBB = VectorPreHeaderBB->getSingleSuccessor();
- assert(VectorHeaderBB && "Loop preheader does not have a single successor.");
-
- // 1. Make room to generate basic-blocks inside loop body if needed.
- BasicBlock *VectorLatchBB = VectorHeaderBB->splitBasicBlock(
- VectorHeaderBB->getFirstInsertionPt(), "vector.body.latch");
- Loop *L = State->LI->getLoopFor(VectorHeaderBB);
- L->addBasicBlockToLoop(VectorLatchBB, *State->LI);
- // Remove the edge between Header and Latch to allow other connections.
- // Temporarily terminate with unreachable until CFG is rewired.
- // Note: this asserts the generated code's assumption that
- // getFirstInsertionPt() can be dereferenced into an Instruction.
- VectorHeaderBB->getTerminator()->eraseFromParent();
- State->Builder.SetInsertPoint(VectorHeaderBB);
- UnreachableInst *Terminator = State->Builder.CreateUnreachable();
- State->Builder.SetInsertPoint(Terminator);
-
- // 2. Generate code in loop body.
+ // Initialize CFG state.
State->CFG.PrevVPBB = nullptr;
- State->CFG.PrevBB = VectorHeaderBB;
- State->CFG.LastBB = VectorLatchBB;
+ State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
+ BasicBlock *VectorPreHeader = State->CFG.PrevBB;
+ State->Builder.SetInsertPoint(VectorPreHeader->getTerminator());
+ // Generate code in the loop pre-header and body.
for (VPBlockBase *Block : depth_first(Entry))
Block->execute(State);
- // Setup branch terminator successors for VPBBs in VPBBsToFix based on
- // VPBB's successors.
- for (auto VPBB : State->CFG.VPBBsToFix) {
- assert(EnableVPlanNativePath &&
- "Unexpected VPBBsToFix in non VPlan-native path");
- BasicBlock *BB = State->CFG.VPBB2IRBB[VPBB];
- assert(BB && "Unexpected null basic block for VPBB");
-
- unsigned Idx = 0;
- auto *BBTerminator = BB->getTerminator();
-
- for (VPBlockBase *SuccVPBlock : VPBB->getHierarchicalSuccessors()) {
- VPBasicBlock *SuccVPBB = SuccVPBlock->getEntryBasicBlock();
- BBTerminator->setSuccessor(Idx, State->CFG.VPBB2IRBB[SuccVPBB]);
- ++Idx;
- }
- }
-
- // 3. Merge the temporary latch created with the last basic-block filled.
- BasicBlock *LastBB = State->CFG.PrevBB;
- assert(isa<BranchInst>(LastBB->getTerminator()) &&
- "Expected VPlan CFG to terminate with branch");
-
- // Move both the branch and check from LastBB to VectorLatchBB.
- auto *LastBranch = cast<BranchInst>(LastBB->getTerminator());
- LastBranch->moveBefore(VectorLatchBB->getTerminator());
- VectorLatchBB->getTerminator()->eraseFromParent();
- // Move condition so it is guaranteed to be next to branch. This is only done
- // to avoid excessive test updates.
- // TODO: Remove special handling once the increments for all inductions are
- // modeled explicitly in VPlan.
- cast<Instruction>(LastBranch->getCondition())->moveBefore(LastBranch);
- // Connect LastBB to VectorLatchBB to facilitate their merge.
- BranchInst::Create(VectorLatchBB, LastBB);
-
- // Merge LastBB with Latch.
- bool Merged = MergeBlockIntoPredecessor(VectorLatchBB, nullptr, State->LI);
- (void)Merged;
- assert(Merged && "Could not merge last basic block with latch.");
- VectorLatchBB = LastBB;
+ VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
+ BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
  // Fix the latch value of canonical, reduction and first-order recurrence
  // phis in the vector loop.
- VPBasicBlock *Header = Entry->getEntryBasicBlock();
- if (Header->empty()) {
- assert(EnableVPlanNativePath);
- Header = cast<VPBasicBlock>(Header->getSingleSuccessor());
- }
+ VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
for (VPRecipeBase &R : Header->phis()) {
    // Skip phi-like recipes that generate their backedge values themselves.
- // TODO: Model their backedge values explicitly.
- if (isa<VPWidenIntOrFpInductionRecipe>(&R) || isa<VPWidenPHIRecipe>(&R))
+ if (isa<VPWidenPHIRecipe>(&R))
+ continue;
+
+ if (isa<VPWidenPointerInductionRecipe>(&R) ||
+ isa<VPWidenIntOrFpInductionRecipe>(&R)) {
+ PHINode *Phi = nullptr;
+ if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
+ Phi = cast<PHINode>(State->get(R.getVPSingleValue(), 0));
+ } else {
+ auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
+ // TODO: Split off the case that all users of a pointer phi are scalar
+ // from the VPWidenPointerInductionRecipe.
+ if (WidenPhi->onlyScalarsGenerated(State->VF))
+ continue;
+
+ auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi, 0));
+ Phi = cast<PHINode>(GEP->getPointerOperand());
+ }
+
+ Phi->setIncomingBlock(1, VectorLatchBB);
+
+ // Move the last step to the end of the latch block. This ensures
+ // consistent placement of all induction updates.
+ Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
+ Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
continue;
+ }
auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
// For canonical IV, first-order recurrences and in-order reduction phis,
@@ -993,9 +709,12 @@ void VPlan::execute(VPTransformState *State) {
}
// We do not attempt to preserve DT for outer loop vectorization currently.
- if (!EnableVPlanNativePath)
- updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB,
- L->getExitBlock());
+ if (!EnableVPlanNativePath) {
+ BasicBlock *VectorHeaderBB = State->CFG.VPBB2IRBB[Header];
+ State->DT->addNewBlock(VectorHeaderBB, VectorPreHeader);
+ updateDominatorTree(State->DT, VectorHeaderBB, VectorLatchBB,
+ State->CFG.ExitBB);
+ }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1021,6 +740,17 @@ void VPlan::print(raw_ostream &O) const {
O << '\n';
Block->print(O, "", SlotTracker);
}
+
+ if (!LiveOuts.empty())
+ O << "\n";
+ for (auto &KV : LiveOuts) {
+ O << "Live-out ";
+ KV.second->getPhi()->printAsOperand(O);
+ O << " = ";
+ KV.second->getOperand(0)->printAsOperand(O, SlotTracker);
+ O << "\n";
+ }
+
O << "}\n";
}
@@ -1034,11 +764,14 @@ LLVM_DUMP_METHOD
void VPlan::dump() const { print(dbgs()); }
#endif
-void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB,
+void VPlan::addLiveOut(PHINode *PN, VPValue *V) {
+ assert(LiveOuts.count(PN) == 0 && "an exit value for PN already exists");
+ LiveOuts.insert({PN, new VPLiveOut(PN, V)});
+}
+
+void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopHeaderBB,
BasicBlock *LoopLatchBB,
BasicBlock *LoopExitBB) {
- BasicBlock *LoopHeaderBB = LoopPreHeaderBB->getSingleSuccessor();
- assert(LoopHeaderBB && "Loop preheader does not have a single successor.");
// The vector body may be more than a single basic-block by this point.
// Update the dominator tree information inside the vector body by propagating
// it from header to latch, expecting only triangular control-flow, if any.
@@ -1075,6 +808,7 @@ void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB,
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
return (isa<VPRegionBlock>(Block) ? "cluster_N" : "N") +
Twine(getOrCreateBID(Block));
@@ -1122,8 +856,8 @@ void VPlanPrinter::dumpBlock(const VPBlockBase *Block) {
void VPlanPrinter::drawEdge(const VPBlockBase *From, const VPBlockBase *To,
bool Hidden, const Twine &Label) {
// Due to "dot" we print an edge between two regions as an edge between the
- // exit basic block and the entry basic of the respective regions.
- const VPBlockBase *Tail = From->getExitBasicBlock();
+  // exiting basic block and the entry basic block of the respective regions.
+ const VPBlockBase *Tail = From->getExitingBasicBlock();
const VPBlockBase *Head = To->getEntryBasicBlock();
OS << Indent << getUID(Tail) << " -> " << getUID(Head);
OS << " [ label=\"" << Label << '\"';
@@ -1213,328 +947,6 @@ void VPlanIngredient::print(raw_ostream &O) const {
V->printAsOperand(O, false);
}
-void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "WIDEN-CALL ";
-
- auto *CI = cast<CallInst>(getUnderlyingInstr());
- if (CI->getType()->isVoidTy())
- O << "void ";
- else {
- printAsOperand(O, SlotTracker);
- O << " = ";
- }
-
- O << "call @" << CI->getCalledFunction()->getName() << "(";
- printOperands(O, SlotTracker);
- O << ")";
-}
-
-void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "WIDEN-SELECT ";
- printAsOperand(O, SlotTracker);
- O << " = select ";
- getOperand(0)->printAsOperand(O, SlotTracker);
- O << ", ";
- getOperand(1)->printAsOperand(O, SlotTracker);
- O << ", ";
- getOperand(2)->printAsOperand(O, SlotTracker);
- O << (InvariantCond ? " (condition is loop invariant)" : "");
-}
-
-void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "WIDEN ";
- printAsOperand(O, SlotTracker);
- O << " = " << getUnderlyingInstr()->getOpcodeName() << " ";
- printOperands(O, SlotTracker);
-}
-
-void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "WIDEN-INDUCTION";
- if (getTruncInst()) {
- O << "\\l\"";
- O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
- O << " +\n" << Indent << "\" ";
- getVPValue(0)->printAsOperand(O, SlotTracker);
- } else
- O << " " << VPlanIngredient(IV);
-}
-#endif
-
-bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
- auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
- auto *StepC = dyn_cast<SCEVConstant>(getInductionDescriptor().getStep());
- return StartC && StartC->isZero() && StepC && StepC->isOne();
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "WIDEN-GEP ";
- O << (IsPtrLoopInvariant ? "Inv" : "Var");
- size_t IndicesNumber = IsIndexLoopInvariant.size();
- for (size_t I = 0; I < IndicesNumber; ++I)
- O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]";
-
- O << " ";
- printAsOperand(O, SlotTracker);
- O << " = getelementptr ";
- printOperands(O, SlotTracker);
-}
-
-void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "WIDEN-PHI ";
-
- auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
- // Unless all incoming values are modeled in VPlan print the original PHI
- // directly.
- // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
- // values as VPValues.
- if (getNumOperands() != OriginalPhi->getNumOperands()) {
- O << VPlanIngredient(OriginalPhi);
- return;
- }
-
- printAsOperand(O, SlotTracker);
- O << " = phi ";
- printOperands(O, SlotTracker);
-}
-
-void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "BLEND ";
- Phi->printAsOperand(O, false);
- O << " =";
- if (getNumIncomingValues() == 1) {
- // Not a User of any mask: not really blending, this is a
- // single-predecessor phi.
- O << " ";
- getIncomingValue(0)->printAsOperand(O, SlotTracker);
- } else {
- for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
- O << " ";
- getIncomingValue(I)->printAsOperand(O, SlotTracker);
- O << "/";
- getMask(I)->printAsOperand(O, SlotTracker);
- }
- }
-}
-
-void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "REDUCE ";
- printAsOperand(O, SlotTracker);
- O << " = ";
- getChainOp()->printAsOperand(O, SlotTracker);
- O << " +";
- if (isa<FPMathOperator>(getUnderlyingInstr()))
- O << getUnderlyingInstr()->getFastMathFlags();
- O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " (";
- getVecOp()->printAsOperand(O, SlotTracker);
- if (getCondOp()) {
- O << ", ";
- getCondOp()->printAsOperand(O, SlotTracker);
- }
- O << ")";
-}
-
-void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
-
- if (!getUnderlyingInstr()->getType()->isVoidTy()) {
- printAsOperand(O, SlotTracker);
- O << " = ";
- }
- O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " ";
- printOperands(O, SlotTracker);
-
- if (AlsoPack)
- O << " (S->V)";
-}
-
-void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "PHI-PREDICATED-INSTRUCTION ";
- printAsOperand(O, SlotTracker);
- O << " = ";
- printOperands(O, SlotTracker);
-}
-
-void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "WIDEN ";
-
- if (!isStore()) {
- printAsOperand(O, SlotTracker);
- O << " = ";
- }
- O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";
-
- printOperands(O, SlotTracker);
-}
-#endif
-
-void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
- Value *Start = getStartValue()->getLiveInIRValue();
- PHINode *EntryPart = PHINode::Create(
- Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt());
- EntryPart->addIncoming(Start, State.CFG.VectorPreHeader);
- EntryPart->setDebugLoc(DL);
- for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
- State.set(this, EntryPart, Part);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "EMIT ";
- printAsOperand(O, SlotTracker);
- O << " = CANONICAL-INDUCTION";
-}
-#endif
-
-void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
- Value *CanonicalIV = State.get(getOperand(0), 0);
- Type *STy = CanonicalIV->getType();
- IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
- ElementCount VF = State.VF;
- Value *VStart = VF.isScalar()
- ? CanonicalIV
- : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
- for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
- Value *VStep = createStepForVF(Builder, STy, VF, Part);
- if (VF.isVector()) {
- VStep = Builder.CreateVectorSplat(VF, VStep);
- VStep = Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
- }
- Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
- State.set(this, CanonicalVectorIV, Part);
- }
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "EMIT ";
- printAsOperand(O, SlotTracker);
- O << " = WIDEN-CANONICAL-INDUCTION ";
- printOperands(O, SlotTracker);
-}
-#endif
-
-void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
- auto &Builder = State.Builder;
- // Create a vector from the initial value.
- auto *VectorInit = getStartValue()->getLiveInIRValue();
-
- Type *VecTy = State.VF.isScalar()
- ? VectorInit->getType()
- : VectorType::get(VectorInit->getType(), State.VF);
-
- if (State.VF.isVector()) {
- auto *IdxTy = Builder.getInt32Ty();
- auto *One = ConstantInt::get(IdxTy, 1);
- IRBuilder<>::InsertPointGuard Guard(Builder);
- Builder.SetInsertPoint(State.CFG.VectorPreHeader->getTerminator());
- auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
- auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
- VectorInit = Builder.CreateInsertElement(
- PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
- }
-
- // Create a phi node for the new recurrence.
- PHINode *EntryPart = PHINode::Create(
- VecTy, 2, "vector.recur", &*State.CFG.PrevBB->getFirstInsertionPt());
- EntryPart->addIncoming(VectorInit, State.CFG.VectorPreHeader);
- State.set(this, EntryPart, 0);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
- printAsOperand(O, SlotTracker);
- O << " = phi ";
- printOperands(O, SlotTracker);
-}
-#endif
-
-void VPReductionPHIRecipe::execute(VPTransformState &State) {
- PHINode *PN = cast<PHINode>(getUnderlyingValue());
- auto &Builder = State.Builder;
-
- // In order to support recurrences we need to be able to vectorize Phi nodes.
- // Phi nodes have cycles, so we need to vectorize them in two stages. This is
- // stage #1: We create a new vector PHI node with no incoming edges. We'll use
- // this value when we vectorize all of the instructions that use the PHI.
- bool ScalarPHI = State.VF.isScalar() || IsInLoop;
- Type *VecTy =
- ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
-
- BasicBlock *HeaderBB = State.CFG.PrevBB;
- assert(State.LI->getLoopFor(HeaderBB)->getHeader() == HeaderBB &&
- "recipe must be in the vector loop header");
- unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
- for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
- Value *EntryPart =
- PHINode::Create(VecTy, 2, "vec.phi", &*HeaderBB->getFirstInsertionPt());
- State.set(this, EntryPart, Part);
- }
-
- // Reductions do not have to start at zero. They can start with
- // any loop invariant values.
- VPValue *StartVPV = getStartValue();
- Value *StartV = StartVPV->getLiveInIRValue();
-
- Value *Iden = nullptr;
- RecurKind RK = RdxDesc.getRecurrenceKind();
- if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
- RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) {
- // MinMax reduction have the start value as their identify.
- if (ScalarPHI) {
- Iden = StartV;
- } else {
- IRBuilderBase::InsertPointGuard IPBuilder(Builder);
- Builder.SetInsertPoint(State.CFG.VectorPreHeader->getTerminator());
- StartV = Iden =
- Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
- }
- } else {
- Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
- RdxDesc.getFastMathFlags());
-
- if (!ScalarPHI) {
- Iden = Builder.CreateVectorSplat(State.VF, Iden);
- IRBuilderBase::InsertPointGuard IPBuilder(Builder);
- Builder.SetInsertPoint(State.CFG.VectorPreHeader->getTerminator());
- Constant *Zero = Builder.getInt32(0);
- StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
- }
- }
-
- for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
- Value *EntryPart = State.get(this, Part);
- // Make sure to add the reduction start value only to the
- // first unroll part.
- Value *StartVal = (Part == 0) ? StartV : Iden;
- cast<PHINode>(EntryPart)->addIncoming(StartVal, State.CFG.VectorPreHeader);
- }
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "WIDEN-REDUCTION-PHI ";
-
- printAsOperand(O, SlotTracker);
- O << " = phi ";
- printOperands(O, SlotTracker);
-}
#endif
template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);
@@ -1594,7 +1006,10 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
continue;
assert(isa<VPInstruction>(&VPI) && "Can only handle VPInstructions");
auto *VPInst = cast<VPInstruction>(&VPI);
- auto *Inst = cast<Instruction>(VPInst->getUnderlyingValue());
+
+ auto *Inst = dyn_cast_or_null<Instruction>(VPInst->getUnderlyingValue());
+ if (!Inst)
+ continue;
auto *IG = IAI.getInterleaveGroup(Inst);
if (!IG)
continue;
@@ -1622,7 +1037,7 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan,
InterleavedAccessInfo &IAI) {
Old2NewTy Old2New;
- visitRegion(cast<VPRegionBlock>(Plan.getEntry()), Old2New, IAI);
+ visitRegion(Plan.getVectorLoopRegion(), Old2New, IAI);
}
void VPSlotTracker::assignSlot(const VPValue *V) {
@@ -1632,8 +1047,8 @@ void VPSlotTracker::assignSlot(const VPValue *V) {
void VPSlotTracker::assignSlots(const VPlan &Plan) {
- for (const VPValue *V : Plan.VPExternalDefs)
- assignSlot(V);
+ for (const auto &P : Plan.VPExternalDefs)
+ assignSlot(P.second);
assignSlot(&Plan.VectorTripCount);
if (Plan.BackedgeTakenCount)
@@ -1651,7 +1066,19 @@ void VPSlotTracker::assignSlots(const VPlan &Plan) {
}
bool vputils::onlyFirstLaneUsed(VPValue *Def) {
- return all_of(Def->users(), [Def](VPUser *U) {
- return cast<VPRecipeBase>(U)->onlyFirstLaneUsed(Def);
- });
+ return all_of(Def->users(),
+ [Def](VPUser *U) { return U->onlyFirstLaneUsed(Def); });
+}
+
+VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
+ ScalarEvolution &SE) {
+ if (auto *E = dyn_cast<SCEVConstant>(Expr))
+ return Plan.getOrAddExternalDef(E->getValue());
+ if (auto *E = dyn_cast<SCEVUnknown>(Expr))
+ return Plan.getOrAddExternalDef(E->getValue());
+
+ VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock();
+ VPValue *Step = new VPExpandSCEVRecipe(Expr, SE);
+ Preheader->appendRecipe(cast<VPRecipeBase>(Step->getDef()));
+ return Step;
}
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
index bcaabca692cc..09da4a545d0d 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -25,27 +25,26 @@
#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
-#include "VPlanLoopInfo.h"
#include "VPlanValue.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/ilist.h"
#include "llvm/ADT/ilist_node.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/Support/InstructionCost.h"
+#include "llvm/IR/FMF.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
-#include <map>
#include <string>
namespace llvm {
@@ -54,6 +53,7 @@ class BasicBlock;
class DominatorTree;
class InductionDescriptor;
class InnerLoopVectorizer;
+class IRBuilderBase;
class LoopInfo;
class raw_ostream;
class RecurrenceDescriptor;
@@ -67,10 +67,11 @@ class VPlanSlp;
/// Returns a calculation for the total number of elements for a given \p VF.
/// For fixed width vectors this value is a constant, whereas for scalable
/// vectors it is an expression determined at runtime.
-Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF);
+Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);
/// Return a value for Step multiplied by VF.
-Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, int64_t Step);
+Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
+ int64_t Step);
/// A range of powers-of-2 vectorization factors with fixed start and
/// adjustable end. The range includes start and excludes end, e.g.,:
@@ -151,7 +152,7 @@ public:
/// Returns an expression describing the lane index that can be used at
/// runtime.
- Value *getAsRuntimeExpr(IRBuilder<> &Builder, const ElementCount &VF) const;
+ Value *getAsRuntimeExpr(IRBuilderBase &Builder, const ElementCount &VF) const;
/// Returns the Kind of lane offset.
Kind getKind() const { return LaneKind; }
@@ -199,10 +200,10 @@ struct VPIteration {
/// needed for generating the output IR.
struct VPTransformState {
VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
- DominatorTree *DT, IRBuilder<> &Builder,
+ DominatorTree *DT, IRBuilderBase &Builder,
InnerLoopVectorizer *ILV, VPlan *Plan)
- : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan) {
- }
+ : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan),
+ LVer(nullptr) {}
/// The chosen Vectorization and Unroll Factors of the loop being vectorized.
ElementCount VF;
@@ -298,6 +299,27 @@ struct VPTransformState {
Iter->second[Instance.Part][CacheIdx] = V;
}
+ /// Add additional metadata to \p To that was not present on \p Orig.
+ ///
+ /// Currently this is used to add the noalias annotations based on the
+ /// inserted memchecks. Use this for instructions that are *cloned* into the
+ /// vector loop.
+ void addNewMetadata(Instruction *To, const Instruction *Orig);
+
+ /// Add metadata from one instruction to another.
+ ///
+ /// This includes both the original MDs from \p From and additional ones (\see
+ /// addNewMetadata). Use this for *newly created* instructions in the vector
+ /// loop.
+ void addMetadata(Instruction *To, Instruction *From);
+
+ /// Similar to the previous function but it adds the metadata to a
+ /// vector of instructions.
+ void addMetadata(ArrayRef<Value *> To, Instruction *From);
+
+ /// Set the debug location in the builder using the debug location in \p V.
+ void setDebugLocFromInst(const Value *V);
+
/// Hold state information used when constructing the CFG of the output IR,
/// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
struct CFGState {
@@ -308,26 +330,19 @@ struct VPTransformState {
/// header BasicBlock.
BasicBlock *PrevBB = nullptr;
- /// The last IR BasicBlock in the output IR. Set to the new latch
- /// BasicBlock, used for placing the newly created BasicBlocks.
- BasicBlock *LastBB = nullptr;
-
- /// The IR BasicBlock that is the preheader of the vector loop in the output
- /// IR.
- /// FIXME: The vector preheader should also be modeled in VPlan, so any code
- /// that needs to be added to the preheader gets directly generated by
- /// VPlan. There should be no need to manage a pointer to the IR BasicBlock.
- BasicBlock *VectorPreHeader = nullptr;
+ /// The last IR BasicBlock in the output IR. Set to the exit block of the
+ /// vector loop.
+ BasicBlock *ExitBB = nullptr;
/// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
/// of replication, maps the BasicBlock of the last replica created.
SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;
- /// Vector of VPBasicBlocks whose terminator instruction needs to be fixed
- /// up at the end of vector code generation.
- SmallVector<VPBasicBlock *, 8> VPBBsToFix;
-
CFGState() = default;
+
+ /// Returns the BasicBlock* mapped to the pre-header of the loop region
+ /// containing \p R.
+ BasicBlock *getPreheaderBBFor(VPRecipeBase *R);
} CFG;
/// Hold a pointer to LoopInfo to register new basic blocks in the loop.
@@ -337,7 +352,7 @@ struct VPTransformState {
DominatorTree *DT;
/// Hold a reference to the IRBuilder used to generate output IR code.
- IRBuilder<> &Builder;
+ IRBuilderBase &Builder;
VPValue2ValueTy VPValue2Value;
@@ -353,41 +368,16 @@ struct VPTransformState {
/// Holds recipes that may generate a poison value that is used after
/// vectorization, even when their operands are not poison.
SmallPtrSet<VPRecipeBase *, 16> MayGeneratePoisonRecipes;
-};
-
-/// VPUsers instance used by VPBlockBase to manage CondBit and the block
-/// predicate. Currently VPBlockUsers are used in VPBlockBase for historical
-/// reasons, but in the future the only VPUsers should either be recipes or
-/// live-outs.VPBlockBase uses.
-struct VPBlockUser : public VPUser {
- VPBlockUser() : VPUser({}, VPUserID::Block) {}
- VPValue *getSingleOperandOrNull() {
- if (getNumOperands() == 1)
- return getOperand(0);
+ /// The loop object for the current parent region, or nullptr.
+ Loop *CurrentVectorLoop = nullptr;
- return nullptr;
- }
- const VPValue *getSingleOperandOrNull() const {
- if (getNumOperands() == 1)
- return getOperand(0);
-
- return nullptr;
- }
-
- void resetSingleOpUser(VPValue *NewVal) {
- assert(getNumOperands() <= 1 && "Didn't expect more than one operand!");
- if (!NewVal) {
- if (getNumOperands() == 1)
- removeLastOperand();
- return;
- }
-
- if (getNumOperands() == 1)
- setOperand(0, NewVal);
- else
- addOperand(NewVal);
- }
+ /// LoopVersioning. It's only set up (non-null) if memchecks were
+ /// used.
+ ///
+ /// This is currently only used to add no-alias metadata based on the
+ /// memchecks. The actual versioning is performed manually.
+ std::unique_ptr<LoopVersioning> LVer;
};
/// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
@@ -410,16 +400,6 @@ class VPBlockBase {
/// List of successor blocks.
SmallVector<VPBlockBase *, 1> Successors;
- /// Successor selector managed by a VPUser. For blocks with zero or one
- /// successors, there is no operand. Otherwise there is exactly one operand
- /// which is the branch condition.
- VPBlockUser CondBitUser;
-
- /// If the block is predicated, its predicate is stored as an operand of this
- /// VPUser to maintain the def-use relations. Otherwise there is no operand
- /// here.
- VPBlockUser PredicateUser;
-
/// VPlan containing the block. Can only be set on the entry block of the
/// plan.
VPlan *Plan = nullptr;
@@ -493,11 +473,11 @@ public:
const VPBasicBlock *getEntryBasicBlock() const;
VPBasicBlock *getEntryBasicBlock();
- /// \return the VPBasicBlock that is the exit of this VPBlockBase,
+ /// \return the VPBasicBlock that is the exiting block of this VPBlockBase,
/// recursively, if the latter is a VPRegionBlock. Otherwise, if this
/// VPBlockBase is a VPBasicBlock, it is returned.
- const VPBasicBlock *getExitBasicBlock() const;
- VPBasicBlock *getExitBasicBlock();
+ const VPBasicBlock *getExitingBasicBlock() const;
+ VPBasicBlock *getExitingBasicBlock();
const VPBlocksTy &getSuccessors() const { return Successors; }
VPBlocksTy &getSuccessors() { return Successors; }
@@ -565,20 +545,6 @@ public:
return getEnclosingBlockWithPredecessors()->getSinglePredecessor();
}
- /// \return the condition bit selecting the successor.
- VPValue *getCondBit();
- /// \return the condition bit selecting the successor.
- const VPValue *getCondBit() const;
- /// Set the condition bit selecting the successor.
- void setCondBit(VPValue *CV);
-
- /// \return the block's predicate.
- VPValue *getPredicate();
- /// \return the block's predicate.
- const VPValue *getPredicate() const;
- /// Set the block's predicate.
- void setPredicate(VPValue *Pred);
-
/// Set a given VPBlockBase \p Successor as the single successor of this
/// VPBlockBase. This VPBlockBase is not added as predecessor of \p Successor.
/// This VPBlockBase must have no successors.
@@ -588,14 +554,11 @@ public:
}
/// Set two given VPBlockBases \p IfTrue and \p IfFalse to be the two
- /// successors of this VPBlockBase. \p Condition is set as the successor
- /// selector. This VPBlockBase is not added as predecessor of \p IfTrue or \p
- /// IfFalse. This VPBlockBase must have no successors.
- void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
- VPValue *Condition) {
+ /// successors of this VPBlockBase. This VPBlockBase is not added as
+ /// predecessor of \p IfTrue or \p IfFalse. This VPBlockBase must have no
+ /// successors.
+ void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse) {
assert(Successors.empty() && "Setting two successors when others exist.");
- assert(Condition && "Setting two successors without condition!");
- setCondBit(Condition);
appendSuccessor(IfTrue);
appendSuccessor(IfFalse);
}
@@ -612,11 +575,8 @@ public:
/// Remove all the predecessor of this block.
void clearPredecessors() { Predecessors.clear(); }
- /// Remove all the successors of this block and set to null its condition bit
- void clearSuccessors() {
- Successors.clear();
- setCondBit(nullptr);
- }
+ /// Remove all the successors of this block.
+ void clearSuccessors() { Successors.clear(); }
/// The method which generates the output IR that correspond to this
/// VPBlockBase, thereby "executing" the VPlan.
@@ -665,6 +625,32 @@ public:
#endif
};
+/// A value that is used outside the VPlan. The operand of the user needs to be
+/// added to the associated LCSSA phi node.
+class VPLiveOut : public VPUser {
+ PHINode *Phi;
+
+public:
+ VPLiveOut(PHINode *Phi, VPValue *Op)
+ : VPUser({Op}, VPUser::VPUserID::LiveOut), Phi(Phi) {}
+
+ /// Fixup the wrapped LCSSA phi node in the unique exit block. This simply
+ /// means we need to add the appropriate incoming value from the middle
+ /// block as exiting edges from the scalar epilogue loop (if present) are
+ /// already in place, and we exit the vector loop exclusively to the middle
+ /// block.
+ void fixPhi(VPlan &Plan, VPTransformState &State);
+
+ /// Returns true if the VPLiveOut uses scalars of operand \p Op.
+ bool usesScalars(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return true;
+ }
+
+ PHINode *getPhi() const { return Phi; }
+};
+
/// VPRecipeBase is a base class modeling a sequence of one or more output IR
/// instructions. VPRecipeBase owns the VPValues it defines through VPDef
/// and is responsible for deleting its defined values. Single-value
@@ -699,6 +685,9 @@ public:
/// Insert an unlinked recipe into a basic block immediately before
/// the specified recipe.
void insertBefore(VPRecipeBase *InsertPos);
+ /// Insert an unlinked recipe into \p BB immediately before the insertion
+ /// point \p IP.
+ void insertBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator IP);
/// Insert an unlinked Recipe into a basic block immediately after
/// the specified Recipe.
@@ -759,14 +748,6 @@ public:
bool mayReadOrWriteMemory() const {
return mayReadFromMemory() || mayWriteToMemory();
}
-
- /// Returns true if the recipe only uses the first lane of operand \p Op.
- /// Conservatively returns false.
- virtual bool onlyFirstLaneUsed(const VPValue *Op) const {
- assert(is_contained(operands(), Op) &&
- "Op must be an operand of the recipe");
- return false;
- }
};
inline bool VPUser::classof(const VPDef *Def) {
@@ -804,6 +785,7 @@ public:
CanonicalIVIncrement,
CanonicalIVIncrementNUW,
BranchOnCount,
+ BranchOnCond
};
private:
@@ -892,6 +874,7 @@ public:
case Instruction::Unreachable:
case Instruction::Fence:
case Instruction::AtomicRMW:
+ case VPInstruction::BranchOnCond:
case VPInstruction::BranchOnCount:
return false;
default:
@@ -1049,27 +1032,25 @@ public:
};
/// A recipe for handling phi nodes of integer and floating-point inductions,
-/// producing their vector and scalar values.
+/// producing their vector values.
class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue {
PHINode *IV;
const InductionDescriptor &IndDesc;
- bool NeedsScalarIV;
bool NeedsVectorIV;
public:
- VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start,
+ VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
const InductionDescriptor &IndDesc,
- bool NeedsScalarIV, bool NeedsVectorIV)
- : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(IV, this),
- IV(IV), IndDesc(IndDesc), NeedsScalarIV(NeedsScalarIV),
+ bool NeedsVectorIV)
+ : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start, Step}),
+ VPValue(IV, this), IV(IV), IndDesc(IndDesc),
NeedsVectorIV(NeedsVectorIV) {}
- VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start,
+ VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
const InductionDescriptor &IndDesc,
- TruncInst *Trunc, bool NeedsScalarIV,
- bool NeedsVectorIV)
- : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(Trunc, this),
- IV(IV), IndDesc(IndDesc), NeedsScalarIV(NeedsScalarIV),
+ TruncInst *Trunc, bool NeedsVectorIV)
+ : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start, Step}),
+ VPValue(Trunc, this), IV(IV), IndDesc(IndDesc),
NeedsVectorIV(NeedsVectorIV) {}
~VPWidenIntOrFpInductionRecipe() override = default;
@@ -1093,6 +1074,10 @@ public:
VPValue *getStartValue() { return getOperand(0); }
const VPValue *getStartValue() const { return getOperand(0); }
+ /// Returns the step value of the induction.
+ VPValue *getStepValue() { return getOperand(1); }
+ const VPValue *getStepValue() const { return getOperand(1); }
+
/// Returns the first defined value as TruncInst, if it is one or nullptr
/// otherwise.
TruncInst *getTruncInst() {
@@ -1102,6 +1087,8 @@ public:
return dyn_cast_or_null<TruncInst>(getVPValue(0)->getUnderlyingValue());
}
+ PHINode *getPHINode() { return IV; }
+
/// Returns the induction descriptor for the recipe.
const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
@@ -1115,9 +1102,6 @@ public:
return TruncI ? TruncI->getType() : IV->getType();
}
- /// Returns true if a scalar phi needs to be created for the induction.
- bool needsScalarIV() const { return NeedsScalarIV; }
-
/// Returns true if a vector phi needs to be created for the induction.
bool needsVectorIV() const { return NeedsVectorIV; }
};
@@ -1167,6 +1151,9 @@ public:
VPValue *getStartValue() {
return getNumOperands() == 0 ? nullptr : getOperand(0);
}
+ VPValue *getStartValue() const {
+ return getNumOperands() == 0 ? nullptr : getOperand(0);
+ }
/// Returns the incoming value from the loop backedge.
VPValue *getBackedgeValue() {
@@ -1180,6 +1167,52 @@ public:
}
};
+class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
+ const InductionDescriptor &IndDesc;
+
+ /// ScalarEvolution instance used to expand the step.
+ /// FIXME: move expansion of step to the pre-header, once it is modeled
+ /// explicitly.
+ ScalarEvolution &SE;
+
+public:
+ /// Create a new VPWidenPointerInductionRecipe for \p Phi with start value \p
+ /// Start.
+ VPWidenPointerInductionRecipe(PHINode *Phi, VPValue *Start,
+ const InductionDescriptor &IndDesc,
+ ScalarEvolution &SE)
+ : VPHeaderPHIRecipe(VPVWidenPointerInductionSC, VPWidenPointerInductionSC,
+ Phi),
+ IndDesc(IndDesc), SE(SE) {
+ addOperand(Start);
+ }
+
+ ~VPWidenPointerInductionRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *B) {
+ return B->getVPDefID() == VPRecipeBase::VPWidenPointerInductionSC;
+ }
+ static inline bool classof(const VPHeaderPHIRecipe *R) {
+ return R->getVPDefID() == VPRecipeBase::VPWidenPointerInductionSC;
+ }
+ static inline bool classof(const VPValue *V) {
+ return V->getVPValueID() == VPValue::VPVWidenPointerInductionSC;
+ }
+
+ /// Generate vector values for the pointer induction.
+ void execute(VPTransformState &State) override;
+
+ /// Returns true if only scalar values will be generated.
+ bool onlyScalarsGenerated(ElementCount VF);
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// A recipe for handling header phis that are widened in the vector loop.
/// In the VPlan native path, all incoming VPValues & VPBasicBlock pairs are
/// managed in the recipe directly.
@@ -1363,9 +1396,8 @@ public:
"Op must be an operand of the recipe");
// Recursing through Blend recipes only, must terminate at header phi's the
// latest.
- return all_of(users(), [this](VPUser *U) {
- return cast<VPRecipeBase>(U)->onlyFirstLaneUsed(this);
- });
+ return all_of(users(),
+ [this](VPUser *U) { return U->onlyFirstLaneUsed(this); });
}
};
@@ -1440,6 +1472,15 @@ public:
unsigned getNumStoreOperands() const {
return getNumOperands() - (HasMask ? 2 : 1);
}
+
+ /// The recipe only uses the first lane of the address.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getAddr() && all_of(getStoredValues(), [Op](VPValue *StoredV) {
+ return Op != StoredV;
+ });
+ }
};
/// A recipe to represent inloop reduction operations, performing a reduction on
@@ -1551,6 +1592,13 @@ public:
"Op must be an operand of the recipe");
return isUniform();
}
+
+ /// Returns true if the recipe uses scalars of operand \p Op.
+ bool usesScalars(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return true;
+ }
};
/// A recipe for generating conditional branches on the bits of a mask.
@@ -1590,6 +1638,13 @@ public:
// Mask is optional.
return getNumOperands() == 1 ? getOperand(0) : nullptr;
}
+
+ /// Returns true if the recipe uses scalars of operand \p Op.
+ bool usesScalars(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return true;
+ }
};
/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
@@ -1619,6 +1674,13 @@ public:
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif
+
+ /// Returns true if the recipe uses scalars of operand \p Op.
+ bool usesScalars(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return true;
+ }
};
/// A Recipe for widening load/store operations.
@@ -1627,7 +1689,7 @@ public:
/// - For store: Address, stored value, optional mask
/// TODO: We currently execute only per-part unless a specific instance is
/// provided.
-class VPWidenMemoryInstructionRecipe : public VPRecipeBase, public VPValue {
+class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
Instruction &Ingredient;
// Whether the loaded-from / stored-to addresses are consecutive.
@@ -1649,10 +1711,10 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase, public VPValue {
public:
VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
bool Consecutive, bool Reverse)
- : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}),
- VPValue(VPValue::VPVMemoryInstructionSC, &Load, this), Ingredient(Load),
+ : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}), Ingredient(Load),
Consecutive(Consecutive), Reverse(Reverse) {
assert((Consecutive || !Reverse) && "Reverse implies consecutive");
+ new VPValue(VPValue::VPVMemoryInstructionSC, &Load, this);
setMask(Mask);
}
@@ -1660,7 +1722,6 @@ public:
VPValue *StoredValue, VPValue *Mask,
bool Consecutive, bool Reverse)
: VPRecipeBase(VPWidenMemoryInstructionSC, {Addr, StoredValue}),
- VPValue(VPValue::VPVMemoryInstructionSC, &Store, this),
Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse) {
assert((Consecutive || !Reverse) && "Reverse implies consecutive");
setMask(Mask);
@@ -1714,9 +1775,42 @@ public:
"Op must be an operand of the recipe");
// Widened, consecutive memory operations only demand the first lane of
- // their address.
- return Op == getAddr() && isConsecutive();
+ // their address, unless the same operand is also stored. That latter can
+ // happen with opaque pointers.
+ return Op == getAddr() && isConsecutive() &&
+ (!isStore() || Op != getStoredValue());
+ }
+
+ Instruction &getIngredient() const { return Ingredient; }
+};
+
+/// Recipe to expand a SCEV expression.
+class VPExpandSCEVRecipe : public VPRecipeBase, public VPValue {
+ const SCEV *Expr;
+ ScalarEvolution &SE;
+
+public:
+ VPExpandSCEVRecipe(const SCEV *Expr, ScalarEvolution &SE)
+ : VPRecipeBase(VPExpandSCEVSC, {}), VPValue(nullptr, this), Expr(Expr),
+ SE(SE) {}
+
+ ~VPExpandSCEVRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPDef *D) {
+ return D->getVPDefID() == VPExpandSCEVSC;
}
+
+ /// Expand the wrapped SCEV expression and generate code computing its value.
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ const SCEV *getSCEV() const { return Expr; }
};
/// Canonical scalar induction phi of the vector loop. Starting at the specified
@@ -1738,6 +1832,12 @@ public:
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPCanonicalIVPHISC;
}
+ static inline bool classof(const VPHeaderPHIRecipe *D) {
+ return D->getVPDefID() == VPCanonicalIVPHISC;
+ }
+ static inline bool classof(const VPValue *V) {
+ return V->getVPValueID() == VPValue::VPVCanonicalIVPHISC;
+ }
/// Generate the canonical scalar induction phi of the vector loop.
void execute(VPTransformState &State) override;
@@ -1803,6 +1903,64 @@ public:
}
};
+/// A recipe for handling phi nodes of integer and floating-point inductions,
+/// producing their scalar values.
+class VPScalarIVStepsRecipe : public VPRecipeBase, public VPValue {
+ /// Scalar type to use for the generated values.
+ Type *Ty;
+ /// If not nullptr, truncate the generated values to TruncToTy.
+ Type *TruncToTy;
+ const InductionDescriptor &IndDesc;
+
+public:
+ VPScalarIVStepsRecipe(Type *Ty, const InductionDescriptor &IndDesc,
+ VPValue *CanonicalIV, VPValue *Start, VPValue *Step,
+ Type *TruncToTy)
+ : VPRecipeBase(VPScalarIVStepsSC, {CanonicalIV, Start, Step}),
+ VPValue(nullptr, this), Ty(Ty), TruncToTy(TruncToTy), IndDesc(IndDesc) {
+ }
+
+ ~VPScalarIVStepsRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPDef *D) {
+ return D->getVPDefID() == VPRecipeBase::VPScalarIVStepsSC;
+ }
+ /// Extra classof implementations to allow directly casting from VPUser ->
+ /// VPScalarIVStepsRecipe.
+ static inline bool classof(const VPUser *U) {
+ auto *R = dyn_cast<VPRecipeBase>(U);
+ return R && R->getVPDefID() == VPRecipeBase::VPScalarIVStepsSC;
+ }
+ static inline bool classof(const VPRecipeBase *R) {
+ return R->getVPDefID() == VPRecipeBase::VPScalarIVStepsSC;
+ }
+
+ /// Generate the scalarized versions of the phi node as needed by its users.
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ /// Returns true if the induction is canonical, i.e. starting at 0 and
+ /// incremented by UF * VF (= the original IV is incremented by 1).
+ bool isCanonical() const;
+
+ VPCanonicalIVPHIRecipe *getCanonicalIV() const;
+ VPValue *getStartValue() const { return getOperand(1); }
+ VPValue *getStepValue() const { return getOperand(2); }
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return true;
+ }
+};
+
/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
/// holds a sequence of zero or more VPRecipe's each representing a sequence of
/// output IR instructions. All PHI-like recipes must come before any non-PHI recipes.
@@ -1895,6 +2053,8 @@ public:
/// SplitAt to the new block. Returns the new block.
VPBasicBlock *splitAt(iterator SplitAt);
+ VPRegionBlock *getEnclosingLoopRegion();
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print this VPBasicBlock to \p O, prefixing all lines with \p Indent. \p
/// SlotTracker is used to print unnamed VPValues using consecutive numbers.
@@ -1906,6 +2066,14 @@ public:
using VPBlockBase::print; // Get the print(raw_stream &O) version.
#endif
+ /// If the block has multiple successors, return the branch recipe terminating
+ /// the block. If there is no successor or only a single one, return nullptr.
+ VPRecipeBase *getTerminator();
+ const VPRecipeBase *getTerminator() const;
+
+ /// Returns true if the block is exiting its parent region.
+ bool isExiting() const;
+
private:
/// Create an IR BasicBlock to hold the output instructions generated by this
/// VPBasicBlock, and return it. Update the CFGState accordingly.
@@ -1913,7 +2081,7 @@ private:
};
/// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks
-/// which form a Single-Entry-Single-Exit subgraph of the output IR CFG.
+/// which form a Single-Entry-Single-Exiting subgraph of the output IR CFG.
/// A VPRegionBlock may indicate that its contents are to be replicated several
/// times. This is designed to support predicated scalarization, in which a
/// scalar if-then code structure needs to be generated VF * UF times. Having
@@ -1924,25 +2092,26 @@ class VPRegionBlock : public VPBlockBase {
/// Hold the Single Entry of the SESE region modelled by the VPRegionBlock.
VPBlockBase *Entry;
- /// Hold the Single Exit of the SESE region modelled by the VPRegionBlock.
- VPBlockBase *Exit;
+ /// Hold the Single Exiting block of the SESE region modelled by the
+ /// VPRegionBlock.
+ VPBlockBase *Exiting;
/// An indicator whether this region is to generate multiple replicated
/// instances of output IR corresponding to its VPBlockBases.
bool IsReplicator;
public:
- VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exit,
+ VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
const std::string &Name = "", bool IsReplicator = false)
- : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exit(Exit),
+ : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exiting(Exiting),
IsReplicator(IsReplicator) {
assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
- assert(Exit->getSuccessors().empty() && "Exit block has successors.");
+ assert(Exiting->getSuccessors().empty() && "Exit block has successors.");
Entry->setParent(this);
- Exit->setParent(this);
+ Exiting->setParent(this);
}
VPRegionBlock(const std::string &Name = "", bool IsReplicator = false)
- : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exit(nullptr),
+ : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr),
IsReplicator(IsReplicator) {}
~VPRegionBlock() override {
@@ -1976,16 +2145,22 @@ public:
// DominatorTreeBase representing the Graph type.
VPBlockBase &front() const { return *Entry; }
- const VPBlockBase *getExit() const { return Exit; }
- VPBlockBase *getExit() { return Exit; }
+ const VPBlockBase *getExiting() const { return Exiting; }
+ VPBlockBase *getExiting() { return Exiting; }
- /// Set \p ExitBlock as the exit VPBlockBase of this VPRegionBlock. \p
- /// ExitBlock must have no successors.
- void setExit(VPBlockBase *ExitBlock) {
- assert(ExitBlock->getSuccessors().empty() &&
+ /// Set \p ExitingBlock as the exiting VPBlockBase of this VPRegionBlock. \p
+ /// ExitingBlock must have no successors.
+ void setExiting(VPBlockBase *ExitingBlock) {
+ assert(ExitingBlock->getSuccessors().empty() &&
"Exit block cannot have successors.");
- Exit = ExitBlock;
- ExitBlock->setParent(this);
+ Exiting = ExitingBlock;
+ ExitingBlock->setParent(this);
+ }
+
+ /// Returns the pre-header VPBasicBlock of the loop region.
+ VPBasicBlock *getPreheaderVPBB() {
+ assert(!isReplicator() && "should only get pre-header of loop regions");
+ return getSinglePredecessor()->getExitingBasicBlock();
}
/// An indicator whether this region is to generate multiple replicated
@@ -2119,11 +2294,11 @@ struct GraphTraits<Inverse<VPRegionBlock *>>
using nodes_iterator = df_iterator<NodeRef>;
static NodeRef getEntryNode(Inverse<GraphRef> N) {
- return N.Graph->getExit();
+ return N.Graph->getExiting();
}
static nodes_iterator nodes_begin(GraphRef N) {
- return nodes_iterator::begin(N->getExit());
+ return nodes_iterator::begin(N->getExiting());
}
static nodes_iterator nodes_end(GraphRef N) {
@@ -2281,12 +2456,9 @@ class VPlan {
/// Holds the name of the VPlan, for printing.
std::string Name;
- /// Holds all the external definitions created for this VPlan.
- // TODO: Introduce a specific representation for external definitions in
- // VPlan. External definitions must be immutable and hold a pointer to its
- // underlying IR that will be used to implement its structural comparison
- // (operators '==' and '<').
- SetVector<VPValue *> VPExternalDefs;
+ /// Holds all the external definitions created for this VPlan. External
+ /// definitions must be immutable and hold a pointer to their underlying IR.
+ DenseMap<Value *, VPValue *> VPExternalDefs;
/// Represents the trip count of the original loop, for folding
/// the tail.
@@ -2307,13 +2479,13 @@ class VPlan {
/// to be free when the plan's destructor is called.
SmallVector<VPValue *, 16> VPValuesToFree;
- /// Holds the VPLoopInfo analysis for this VPlan.
- VPLoopInfo VPLInfo;
-
/// Indicates whether it is safe use the Value2VPValue mapping or if the
/// mapping cannot be used any longer, because it is stale.
bool Value2VPValueEnabled = true;
+ /// Values used outside the plan.
+ MapVector<PHINode *, VPLiveOut *> LiveOuts;
+
public:
VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {
if (Entry)
@@ -2321,6 +2493,8 @@ public:
}
~VPlan() {
+ clearLiveOuts();
+
if (Entry) {
VPValue DummyValue;
for (VPBlockBase *Block : depth_first(Entry))
@@ -2334,13 +2508,14 @@ public:
delete TripCount;
if (BackedgeTakenCount)
delete BackedgeTakenCount;
- for (VPValue *Def : VPExternalDefs)
- delete Def;
+ for (auto &P : VPExternalDefs)
+ delete P.second;
}
/// Prepare the plan for execution, setting up the required live-in values.
void prepareToExecute(Value *TripCount, Value *VectorTripCount,
- Value *CanonicalIVStartValue, VPTransformState &State);
+ Value *CanonicalIVStartValue, VPTransformState &State,
+ bool IsEpilogueVectorization);
/// Generate the IR code for this VPlan.
void execute(struct VPTransformState *State);
@@ -2383,9 +2558,13 @@ public:
void setName(const Twine &newName) { Name = newName.str(); }
- /// Add \p VPVal to the pool of external definitions if it's not already
- /// in the pool.
- void addExternalDef(VPValue *VPVal) { VPExternalDefs.insert(VPVal); }
+ /// Get the existing or add a new external definition for \p V.
+ VPValue *getOrAddExternalDef(Value *V) {
+ auto I = VPExternalDefs.insert({V, nullptr});
+ if (I.second)
+ I.first->second = new VPValue(V);
+ return I.first->second;
+ }
void addVPValue(Value *V) {
assert(Value2VPValueEnabled &&
@@ -2432,10 +2611,6 @@ public:
Value2VPValue.erase(V);
}
- /// Return the VPLoopInfo analysis for this VPlan.
- VPLoopInfo &getVPLoopInfo() { return VPLInfo; }
- const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; }
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print this VPlan to \p O.
void print(raw_ostream &O) const;
@@ -2465,7 +2640,10 @@ public:
/// Returns the VPRegionBlock of the vector loop.
VPRegionBlock *getVectorLoopRegion() {
- return cast<VPRegionBlock>(getEntry());
+ return cast<VPRegionBlock>(getEntry()->getSingleSuccessor());
+ }
+ const VPRegionBlock *getVectorLoopRegion() const {
+ return cast<VPRegionBlock>(getEntry()->getSingleSuccessor());
}
/// Returns the canonical induction recipe of the vector loop.
@@ -2478,6 +2656,23 @@ public:
return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin());
}
+ void addLiveOut(PHINode *PN, VPValue *V);
+
+ void clearLiveOuts() {
+ for (auto &KV : LiveOuts)
+ delete KV.second;
+ LiveOuts.clear();
+ }
+
+ void removeLiveOut(PHINode *PN) {
+ delete LiveOuts[PN];
+ LiveOuts.erase(PN);
+ }
+
+ const MapVector<PHINode *, VPLiveOut *> &getLiveOuts() const {
+ return LiveOuts;
+ }
+
private:
/// Add to the given dominator tree the header block and every new basic block
/// that was created between it and the latch block, inclusive.
@@ -2567,9 +2762,8 @@ public:
/// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p
/// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p
/// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. \p BlockPtr's
- /// successors are moved from \p BlockPtr to \p NewBlock and \p BlockPtr's
- /// conditional bit is propagated to \p NewBlock. \p NewBlock must have
- /// neither successors nor predecessors.
+ /// successors are moved from \p BlockPtr to \p NewBlock. \p NewBlock must
+ /// have neither successors nor predecessors.
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) {
assert(NewBlock->getSuccessors().empty() &&
NewBlock->getPredecessors().empty() &&
@@ -2580,24 +2774,22 @@ public:
disconnectBlocks(BlockPtr, Succ);
connectBlocks(NewBlock, Succ);
}
- NewBlock->setCondBit(BlockPtr->getCondBit());
- BlockPtr->setCondBit(nullptr);
connectBlocks(BlockPtr, NewBlock);
}
/// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p
/// BlockPtr. Add \p IfTrue and \p IfFalse as successors of \p BlockPtr and \p
/// BlockPtr as predecessor of \p IfTrue and \p IfFalse. Propagate \p BlockPtr
- /// parent to \p IfTrue and \p IfFalse. \p Condition is set as the successor
- /// selector. \p BlockPtr must have no successors and \p IfTrue and \p IfFalse
- /// must have neither successors nor predecessors.
+ /// parent to \p IfTrue and \p IfFalse. \p BlockPtr must have no successors
+ /// and \p IfTrue and \p IfFalse must have neither successors nor
+ /// predecessors.
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
- VPValue *Condition, VPBlockBase *BlockPtr) {
+ VPBlockBase *BlockPtr) {
assert(IfTrue->getSuccessors().empty() &&
"Can't insert IfTrue with successors.");
assert(IfFalse->getSuccessors().empty() &&
"Can't insert IfFalse with successors.");
- BlockPtr->setTwoSuccessors(IfTrue, IfFalse, Condition);
+ BlockPtr->setTwoSuccessors(IfTrue, IfFalse);
IfTrue->setPredecessors({BlockPtr});
IfFalse->setPredecessors({BlockPtr});
IfTrue->setParent(BlockPtr->getParent());
@@ -2639,8 +2831,8 @@ public:
R.moveBefore(*PredVPBB, PredVPBB->end());
VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
auto *ParentRegion = cast<VPRegionBlock>(Block->getParent());
- if (ParentRegion->getExit() == Block)
- ParentRegion->setExit(PredVPBB);
+ if (ParentRegion->getExiting() == Block)
+ ParentRegion->setExiting(PredVPBB);
SmallVector<VPBlockBase *> Successors(Block->successors());
for (auto *Succ : Successors) {
VPBlockUtils::disconnectBlocks(Block, Succ);
@@ -2650,41 +2842,6 @@ public:
return PredVPBB;
}
- /// Returns true if the edge \p FromBlock -> \p ToBlock is a back-edge.
- static bool isBackEdge(const VPBlockBase *FromBlock,
- const VPBlockBase *ToBlock, const VPLoopInfo *VPLI) {
- assert(FromBlock->getParent() == ToBlock->getParent() &&
- FromBlock->getParent() && "Must be in same region");
- const VPLoop *FromLoop = VPLI->getLoopFor(FromBlock);
- const VPLoop *ToLoop = VPLI->getLoopFor(ToBlock);
- if (!FromLoop || !ToLoop || FromLoop != ToLoop)
- return false;
-
- // A back-edge is a branch from the loop latch to its header.
- return ToLoop->isLoopLatch(FromBlock) && ToBlock == ToLoop->getHeader();
- }
-
- /// Returns true if \p Block is a loop latch
- static bool blockIsLoopLatch(const VPBlockBase *Block,
- const VPLoopInfo *VPLInfo) {
- if (const VPLoop *ParentVPL = VPLInfo->getLoopFor(Block))
- return ParentVPL->isLoopLatch(Block);
-
- return false;
- }
-
- /// Count and return the number of succesors of \p PredBlock excluding any
- /// backedges.
- static unsigned countSuccessorsNoBE(VPBlockBase *PredBlock,
- VPLoopInfo *VPLI) {
- unsigned Count = 0;
- for (VPBlockBase *SuccBlock : PredBlock->getSuccessors()) {
- if (!VPBlockUtils::isBackEdge(PredBlock, SuccBlock, VPLI))
- Count++;
- }
- return Count;
- }
-
/// Return an iterator range over \p Range which only includes \p BlockTy
/// blocks. The accesses are casted to \p BlockTy.
template <typename BlockTy, typename T>
@@ -2845,6 +3002,13 @@ namespace vputils {
/// Returns true if only the first lane of \p Def is used.
bool onlyFirstLaneUsed(VPValue *Def);
+/// Get or create a VPValue that corresponds to the expansion of \p Expr. If \p
+/// Expr is a SCEVConstant or SCEVUnknown, return a VPValue wrapping the live-in
+/// value. Otherwise return a VPExpandSCEVRecipe to expand \p Expr. If \p Plan's
+/// pre-header already contains a recipe expanding \p Expr, return it. If not,
+/// create a new one.
+VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
+ ScalarEvolution &SE);
} // end namespace vputils
} // end namespace llvm
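
The getOrAddExternalDef change above replaces the SetVector of external definitions with a map keyed by the underlying IR value, using an insert-then-fill idiom so a wrapper is allocated only when the key is new and repeated lookups return the same object. A minimal standalone analogue of that idiom, with hypothetical stand-in types and the same manual ownership cleanup as the plan destructor:

#include <map>
#include <string>

struct IRValue { std::string Name; };       // stand-in for the underlying IR value
struct PlanValue { IRValue *Underlying; };  // stand-in for its plan-level wrapper

class PlanSketch {
  // Keyed by the underlying value so repeated requests deduplicate wrappers.
  std::map<IRValue *, PlanValue *> ExternalDefs;

public:
  PlanValue *getOrAddExternalDef(IRValue *V) {
    // Insert a placeholder first; only allocate when the key was really new.
    auto [It, Inserted] = ExternalDefs.insert({V, nullptr});
    if (Inserted)
      It->second = new PlanValue{V};
    return It->second;
  }

  ~PlanSketch() {
    for (auto &P : ExternalDefs)  // the sketch owns its wrappers, like the plan
      delete P.second;
  }
};

Repeated calls with the same key return the same wrapper, which is what lets the map deduplicate live-in values.
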
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index 379988733312..84b0dac862b6 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -42,9 +42,6 @@ private:
// Vectorization plan that we are working on.
VPlan &Plan;
- // Output Top Region.
- VPRegionBlock *TopRegion = nullptr;
-
// Builder of the VPlan instruction-level representation.
VPBuilder VPIRBuilder;
@@ -59,6 +56,9 @@ private:
// Hold phi node's that need to be fixed once the plain CFG has been built.
SmallVector<PHINode *, 8> PhisToFix;
+ /// Maps loops in the original IR to their corresponding region.
+ DenseMap<Loop *, VPRegionBlock *> Loop2Region;
+
// Utility functions.
void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB);
void fixPhiNodes();
@@ -73,8 +73,9 @@ public:
PlainCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P)
: TheLoop(Lp), LI(LI), Plan(P) {}
- // Build the plain CFG and return its Top Region.
- VPRegionBlock *buildPlainCFG();
+ /// Build plain CFG for TheLoop. Return the pre-header VPBasicBlock connected
+ /// to a new VPRegionBlock (TopRegion) enclosing the plain CFG.
+ VPBasicBlock *buildPlainCFG();
};
} // anonymous namespace
@@ -106,19 +107,32 @@ void PlainCFGBuilder::fixPhiNodes() {
}
}
-// Create a new empty VPBasicBlock for an incoming BasicBlock or retrieve an
-// existing one if it was already created.
+// Create a new empty VPBasicBlock for an incoming BasicBlock in the region
+// corresponding to the containing loop or retrieve an existing one if it was
+// already created. If no region exists yet for the loop containing \p BB, a new
+// one is created.
VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
auto BlockIt = BB2VPBB.find(BB);
if (BlockIt != BB2VPBB.end())
// Retrieve existing VPBB.
return BlockIt->second;
+ // Get or create a region for the loop containing BB.
+ Loop *CurrentLoop = LI->getLoopFor(BB);
+ VPRegionBlock *ParentR = nullptr;
+ if (CurrentLoop) {
+ auto Iter = Loop2Region.insert({CurrentLoop, nullptr});
+ if (Iter.second)
+ Iter.first->second = new VPRegionBlock(
+ CurrentLoop->getHeader()->getName().str(), false /*isReplicator*/);
+ ParentR = Iter.first->second;
+ }
+
// Create new VPBB.
LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << BB->getName() << "\n");
VPBasicBlock *VPBB = new VPBasicBlock(BB->getName());
BB2VPBB[BB] = VPBB;
- VPBB->setParent(TopRegion);
+ VPBB->setParent(ParentR);
return VPBB;
}
@@ -182,8 +196,7 @@ VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) {
// A and B: Create VPValue and add it to the pool of external definitions and
// to the Value->VPValue map.
- VPValue *NewVPVal = new VPValue(IRVal);
- Plan.addExternalDef(NewVPVal);
+ VPValue *NewVPVal = Plan.getOrAddExternalDef(IRVal);
IRDef2VPValue[IRVal] = NewVPVal;
return NewVPVal;
}
@@ -203,10 +216,13 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
"Instruction shouldn't have been visited.");
if (auto *Br = dyn_cast<BranchInst>(Inst)) {
- // Branch instruction is not explicitly represented in VPlan but we need
- // to represent its condition bit when it's conditional.
- if (Br->isConditional())
- getOrCreateVPOperand(Br->getCondition());
+ // Conditional branch instructions are represented using BranchOnCond
+ // recipes.
+ if (Br->isConditional()) {
+ VPValue *Cond = getOrCreateVPOperand(Br->getCondition());
+ VPBB->appendRecipe(
+ new VPInstruction(VPInstruction::BranchOnCond, {Cond}));
+ }
// Skip the rest of the Instruction processing for Branch instructions.
continue;
@@ -238,11 +254,8 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
}
// Main interface to build the plain CFG.
-VPRegionBlock *PlainCFGBuilder::buildPlainCFG() {
- // 1. Create the Top Region. It will be the parent of all VPBBs.
- TopRegion = new VPRegionBlock("TopRegion", false /*isReplicator*/);
-
- // 2. Scan the body of the loop in a topological order to visit each basic
+VPBasicBlock *PlainCFGBuilder::buildPlainCFG() {
+ // 1. Scan the body of the loop in a topological order to visit each basic
// block after having visited its predecessor basic blocks. Create a VPBB for
// each BB and link it to its successor and predecessor VPBBs. Note that
// predecessors must be set in the same order as they are in the incoming IR.
@@ -251,21 +264,20 @@ VPRegionBlock *PlainCFGBuilder::buildPlainCFG() {
// Loop PH needs to be explicitly visited since it's not taken into account by
// LoopBlocksDFS.
- BasicBlock *PreheaderBB = TheLoop->getLoopPreheader();
- assert((PreheaderBB->getTerminator()->getNumSuccessors() == 1) &&
+ BasicBlock *ThePreheaderBB = TheLoop->getLoopPreheader();
+ assert((ThePreheaderBB->getTerminator()->getNumSuccessors() == 1) &&
"Unexpected loop preheader");
- VPBasicBlock *PreheaderVPBB = getOrCreateVPBB(PreheaderBB);
- for (auto &I : *PreheaderBB) {
+ VPBasicBlock *ThePreheaderVPBB = getOrCreateVPBB(ThePreheaderBB);
+ ThePreheaderVPBB->setName("vector.ph");
+ for (auto &I : *ThePreheaderBB) {
if (I.getType()->isVoidTy())
continue;
- VPValue *VPV = new VPValue(&I);
- Plan.addExternalDef(VPV);
- IRDef2VPValue[&I] = VPV;
+ IRDef2VPValue[&I] = Plan.getOrAddExternalDef(&I);
}
// Create empty VPBB for Loop H so that we can link PH->H.
VPBlockBase *HeaderVPBB = getOrCreateVPBB(TheLoop->getHeader());
- // Preheader's predecessors will be set during the loop RPO traversal below.
- PreheaderVPBB->setOneSuccessor(HeaderVPBB);
+ HeaderVPBB->setName("vector.body");
+ ThePreheaderVPBB->setOneSuccessor(HeaderVPBB);
LoopBlocksRPO RPO(TheLoop);
RPO.perform(LI);
@@ -295,16 +307,13 @@ VPRegionBlock *PlainCFGBuilder::buildPlainCFG() {
// Get VPBB's condition bit.
assert(isa<BranchInst>(TI) && "Unsupported terminator!");
- auto *Br = cast<BranchInst>(TI);
- Value *BrCond = Br->getCondition();
// Look up the branch condition to get the corresponding VPValue
// representing the condition bit in VPlan (which may be in another VPBB).
- assert(IRDef2VPValue.count(BrCond) &&
+ assert(IRDef2VPValue.count(cast<BranchInst>(TI)->getCondition()) &&
"Missing condition bit in IRDef2VPValue!");
- VPValue *VPCondBit = IRDef2VPValue[BrCond];
- // Link successors using condition bit.
- VPBB->setTwoSuccessors(SuccVPBB0, SuccVPBB1, VPCondBit);
+ // Link successors.
+ VPBB->setTwoSuccessors(SuccVPBB0, SuccVPBB1);
} else
llvm_unreachable("Number of successors not supported.");
@@ -312,30 +321,61 @@ VPRegionBlock *PlainCFGBuilder::buildPlainCFG() {
setVPBBPredsFromBB(VPBB, BB);
}
- // 3. Process outermost loop exit. We created an empty VPBB for the loop
+ // 2. Process outermost loop exit. We created an empty VPBB for the loop
// single exit BB during the RPO traversal of the loop body, but its
// instructions weren't visited because the exit BB is not part of the loop.
BasicBlock *LoopExitBB = TheLoop->getUniqueExitBlock();
assert(LoopExitBB && "Loops with multiple exits are not supported.");
VPBasicBlock *LoopExitVPBB = BB2VPBB[LoopExitBB];
- createVPInstructionsForVPBB(LoopExitVPBB, LoopExitBB);
// Loop exit was already set as successor of the loop exiting BB.
// We only set its predecessor VPBB now.
setVPBBPredsFromBB(LoopExitVPBB, LoopExitBB);
+ // 3. Fix up region blocks for loops. For each loop,
+ // * use the header block as entry to the corresponding region,
+ // * use the latch block as exit of the corresponding region,
+ // * set the region as successor of the loop pre-header, and
+ // * set the exit block as successor to the region.
+ SmallVector<Loop *> LoopWorkList;
+ LoopWorkList.push_back(TheLoop);
+ while (!LoopWorkList.empty()) {
+ Loop *L = LoopWorkList.pop_back_val();
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Exiting = L->getLoopLatch();
+ assert(Exiting == L->getExitingBlock() &&
+ "Latch must be the only exiting block");
+ VPRegionBlock *Region = Loop2Region[L];
+ VPBasicBlock *HeaderVPBB = getOrCreateVPBB(Header);
+ VPBasicBlock *ExitingVPBB = getOrCreateVPBB(Exiting);
+
+ // Disconnect backedge and pre-header from header.
+ VPBasicBlock *PreheaderVPBB = getOrCreateVPBB(L->getLoopPreheader());
+ VPBlockUtils::disconnectBlocks(PreheaderVPBB, HeaderVPBB);
+ VPBlockUtils::disconnectBlocks(ExitingVPBB, HeaderVPBB);
+
+ Region->setParent(PreheaderVPBB->getParent());
+ Region->setEntry(HeaderVPBB);
+ VPBlockUtils::connectBlocks(PreheaderVPBB, Region);
+
+ // Disconnect exit block from exiting (=latch) block, set exiting block and
+ // connect region to exit block.
+ VPBasicBlock *ExitVPBB = getOrCreateVPBB(L->getExitBlock());
+ VPBlockUtils::disconnectBlocks(ExitingVPBB, ExitVPBB);
+ Region->setExiting(ExitingVPBB);
+ VPBlockUtils::connectBlocks(Region, ExitVPBB);
+
+ // Queue sub-loops for processing.
+ LoopWorkList.append(L->begin(), L->end());
+ }
// 4. The whole CFG has been built at this point so all the input Values must
// have a VPlan counterpart. Fix VPlan phi nodes by adding their corresponding
// VPlan operands.
fixPhiNodes();
- // 5. Final Top Region setup. Set outermost loop pre-header and single exit as
- // Top Region entry and exit.
- TopRegion->setEntry(PreheaderVPBB);
- TopRegion->setExit(LoopExitVPBB);
- return TopRegion;
+ return ThePreheaderVPBB;
}
-VPRegionBlock *VPlanHCFGBuilder::buildPlainCFG() {
+VPBasicBlock *VPlanHCFGBuilder::buildPlainCFG() {
PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan);
return PCFGBuilder.buildPlainCFG();
}
@@ -343,20 +383,15 @@ VPRegionBlock *VPlanHCFGBuilder::buildPlainCFG() {
// Public interface to build a H-CFG.
void VPlanHCFGBuilder::buildHierarchicalCFG() {
// Build the plain CFG (wrapped in a Top Region) and set its pre-header as the VPlan entry.
- VPRegionBlock *TopRegion = buildPlainCFG();
- Plan.setEntry(TopRegion);
+ VPBasicBlock *EntryVPBB = buildPlainCFG();
+ Plan.setEntry(EntryVPBB);
LLVM_DEBUG(Plan.setName("HCFGBuilder: Plain CFG\n"); dbgs() << Plan);
+ VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
Verifier.verifyHierarchicalCFG(TopRegion);
// Compute the dominator tree for the plain CFG.
VPDomTree.recalculate(*TopRegion);
LLVM_DEBUG(dbgs() << "Dominator Tree after building the plain CFG.\n";
VPDomTree.print(dbgs()));
-
- // Compute VPLInfo and keep it in Plan.
- VPLoopInfo &VPLInfo = Plan.getVPLoopInfo();
- VPLInfo.analyze(VPDomTree);
- LLVM_DEBUG(dbgs() << "VPLoop Info After buildPlainCFG:\n";
- VPLInfo.print(dbgs()));
}
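
The loop-region fix-up added to buildPlainCFG above walks the loop nest with an explicit worklist: pop a loop, rewire its header and latch into a region, then queue its sub-loops. A compact standalone sketch of that traversal shape, using a hypothetical loop-tree node instead of llvm::Loop:

#include <string>
#include <vector>

struct LoopNode {
  std::string Header;                // name of the header block, for illustration
  std::vector<LoopNode *> SubLoops;  // nested loops
};

// Iterative walk mirroring "pop_back_val(); ...; append(L->begin(), L->end())":
// every loop in the nest is visited exactly once, outermost first.
std::vector<std::string> collectRegionHeaders(LoopNode *Outermost) {
  std::vector<std::string> Headers;
  std::vector<LoopNode *> Worklist{Outermost};
  while (!Worklist.empty()) {
    LoopNode *L = Worklist.back();
    Worklist.pop_back();
    Headers.push_back(L->Header);    // one region is created per loop
    Worklist.insert(Worklist.end(), L->SubLoops.begin(), L->SubLoops.end());
  }
  return Headers;
}
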
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
index 238ee7e6347c..2d52990af268 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
@@ -24,13 +24,15 @@
#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
-#include "VPlan.h"
#include "VPlanDominatorTree.h"
#include "VPlanVerifier.h"
namespace llvm {
class Loop;
+class LoopInfo;
+class VPRegionBlock;
+class VPlan;
class VPlanTestBase;
/// Main class to build the VPlan H-CFG for an incoming IR.
@@ -55,9 +57,9 @@ private:
// are introduced.
VPDominatorTree VPDomTree;
- /// Build plain CFG for TheLoop. Return a new VPRegionBlock (TopRegion)
- /// enclosing the plain CFG.
- VPRegionBlock *buildPlainCFG();
+ /// Build plain CFG for TheLoop. Return the pre-header VPBasicBlock connected
+ /// to a new VPRegionBlock (TopRegion) enclosing the plain CFG.
+ VPBasicBlock *buildPlainCFG();
public:
VPlanHCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P)
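
The header above now forward-declares LoopInfo, VPRegionBlock and VPlan instead of pulling in VPlan.h, which works because the builder only stores references and pointers to them. A tiny illustration of why a forward declaration suffices in that situation (names here are made-up stand-ins, not the real declarations):

// Declarations only: the complete types are never needed in this header.
class PlanSketch;
class BasicBlockSketch;

class HCFGBuilderSketch {
  PlanSketch &Plan;                      // reference member: incomplete type is fine
  BasicBlockSketch *Preheader = nullptr; // pointer member: likewise
public:
  explicit HCFGBuilderSketch(PlanSketch &P) : Plan(P) {}
  BasicBlockSketch *getPreheader() const { return Preheader; }
};
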
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanLoopInfo.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanLoopInfo.h
deleted file mode 100644
index 5208f2d58e2b..000000000000
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanLoopInfo.h
+++ /dev/null
@@ -1,44 +0,0 @@
-//===-- VPLoopInfo.h --------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines VPLoopInfo analysis and VPLoop class. VPLoopInfo is a
-/// specialization of LoopInfoBase for VPBlockBase. VPLoops is a specialization
-/// of LoopBase that is used to hold loop metadata from VPLoopInfo. Further
-/// information can be found in VectorizationPlanner.rst.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H
-
-#include "llvm/Analysis/LoopInfoImpl.h"
-
-namespace llvm {
-class VPBlockBase;
-
-/// Hold analysis information for every loop detected by VPLoopInfo. It is an
-/// instantiation of LoopBase.
-class VPLoop : public LoopBase<VPBlockBase, VPLoop> {
-private:
- friend class LoopInfoBase<VPBlockBase, VPLoop>;
- explicit VPLoop(VPBlockBase *VPB) : LoopBase<VPBlockBase, VPLoop>(VPB) {}
-};
-
-/// VPLoopInfo provides analysis of natural loop for VPBlockBase-based
-/// Hierarchical CFG. It is a specialization of LoopInfoBase class.
-// TODO: VPLoopInfo is initially computed on top of the VPlan plain CFG, which
-// is the same as the incoming IR CFG. If it's more efficient than running the
-// whole loop detection algorithm, we may want to create a mechanism to
-// translate LoopInfo into VPLoopInfo. However, that would require significant
-// changes in LoopInfoBase class.
-typedef LoopInfoBase<VPBlockBase, VPLoop> VPLoopInfo;
-
-} // namespace llvm
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
deleted file mode 100644
index e879a33db6ee..000000000000
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ /dev/null
@@ -1,248 +0,0 @@
-//===-- VPlanPredicator.cpp -------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file implements the VPlanPredicator class which contains the public
-/// interfaces to predicate and linearize the VPlan region.
-///
-//===----------------------------------------------------------------------===//
-
-#include "VPlanPredicator.h"
-#include "VPlan.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/GraphTraits.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-#define DEBUG_TYPE "VPlanPredicator"
-
-using namespace llvm;
-
-// Generate VPInstructions at the beginning of CurrBB that calculate the
-// predicate being propagated from PredBB to CurrBB depending on the edge type
-// between them. For example if:
-// i. PredBB is controlled by predicate %BP, and
-// ii. The edge PredBB->CurrBB is the false edge, controlled by the condition
-// bit value %CBV then this function will generate the following two
-// VPInstructions at the start of CurrBB:
-// %IntermediateVal = not %CBV
-// %FinalVal = and %BP %IntermediateVal
-// It returns %FinalVal.
-VPValue *VPlanPredicator::getOrCreateNotPredicate(VPBasicBlock *PredBB,
- VPBasicBlock *CurrBB) {
- VPValue *CBV = PredBB->getCondBit();
-
- // Set the intermediate value - this is either 'CBV', or 'not CBV'
- // depending on the edge type.
- EdgeType ET = getEdgeTypeBetween(PredBB, CurrBB);
- VPValue *IntermediateVal = nullptr;
- switch (ET) {
- case EdgeType::TRUE_EDGE:
- // CurrBB is the true successor of PredBB - nothing to do here.
- IntermediateVal = CBV;
- break;
-
- case EdgeType::FALSE_EDGE:
- // CurrBB is the False successor of PredBB - compute not of CBV.
- IntermediateVal = Builder.createNot(CBV, {});
- break;
- }
-
- // Now AND intermediate value with PredBB's block predicate if it has one.
- VPValue *BP = PredBB->getPredicate();
- if (BP)
- return Builder.createAnd(BP, IntermediateVal, {});
- else
- return IntermediateVal;
-}
-
-// Generate a tree of ORs for all IncomingPredicates in WorkList.
-// Note: This function destroys the original Worklist.
-//
-// P1 P2 P3 P4 P5
-// \ / \ / /
-// OR1 OR2 /
-// \ | /
-// \ +/-+
-// \ / |
-// OR3 |
-// \ |
-// OR4 <- Returns this
-// |
-//
-// The algorithm uses a worklist of predicates as its main data structure.
-// We pop a pair of values from the front (e.g. P1 and P2), generate an OR
-// (in this example OR1), and push it back. In this example the worklist
-// contains {P3, P4, P5, OR1}.
-// The process iterates until we have only one element in the Worklist (OR4).
-// The last element is the root predicate which is returned.
-VPValue *VPlanPredicator::genPredicateTree(std::list<VPValue *> &Worklist) {
- if (Worklist.empty())
- return nullptr;
-
- // The worklist initially contains all the leaf nodes. Initialize the tree
- // using them.
- while (Worklist.size() >= 2) {
- // Pop a pair of values from the front.
- VPValue *LHS = Worklist.front();
- Worklist.pop_front();
- VPValue *RHS = Worklist.front();
- Worklist.pop_front();
-
- // Create an OR of these values.
- VPValue *Or = Builder.createOr(LHS, RHS, {});
-
- // Push OR to the back of the worklist.
- Worklist.push_back(Or);
- }
-
- assert(Worklist.size() == 1 && "Expected 1 item in worklist");
-
- // The root is the last node in the worklist.
- VPValue *Root = Worklist.front();
-
- // This root needs to replace the existing block predicate. This is done in
- // the caller function.
- return Root;
-}
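
The worklist reduction documented above (in the now-removed VPlanPredicator) repeatedly pops two predicates from the front, ORs them, and pushes the result to the back until a single root remains. A standalone sketch of that reduction over plain strings, purely to illustrate the scheme rather than the deleted VPlan API:

#include <list>
#include <string>

std::string genPredicateTree(std::list<std::string> Worklist) {
  if (Worklist.empty())
    return "";
  while (Worklist.size() >= 2) {
    std::string LHS = Worklist.front(); Worklist.pop_front();
    std::string RHS = Worklist.front(); Worklist.pop_front();
    Worklist.push_back("(" + LHS + " | " + RHS + ")");  // OR of the pair
  }
  // For inputs P1..P5 this yields "((P3 | P4) | (P5 | (P1 | P2)))".
  return Worklist.front();
}
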
-
-// Return whether the edge FromBlock -> ToBlock is a TRUE_EDGE or FALSE_EDGE
-VPlanPredicator::EdgeType
-VPlanPredicator::getEdgeTypeBetween(VPBlockBase *FromBlock,
- VPBlockBase *ToBlock) {
- unsigned Count = 0;
- for (VPBlockBase *SuccBlock : FromBlock->getSuccessors()) {
- if (SuccBlock == ToBlock) {
- assert(Count < 2 && "Switch not supported currently");
- return (Count == 0) ? EdgeType::TRUE_EDGE : EdgeType::FALSE_EDGE;
- }
- Count++;
- }
-
- llvm_unreachable("Broken getEdgeTypeBetween");
-}
-
-// Generate all predicates needed for CurrBlock by going through its immediate
-// predecessor blocks.
-void VPlanPredicator::createOrPropagatePredicates(VPBlockBase *CurrBlock,
- VPRegionBlock *Region) {
- // Blocks that dominate region exit inherit the predicate from the region.
- // Return after setting the predicate.
- if (VPDomTree.dominates(CurrBlock, Region->getExit())) {
- VPValue *RegionBP = Region->getPredicate();
- CurrBlock->setPredicate(RegionBP);
- return;
- }
-
- // Collect all incoming predicates in a worklist.
- std::list<VPValue *> IncomingPredicates;
-
- // Set the builder's insertion point to the top of the current BB
- VPBasicBlock *CurrBB = cast<VPBasicBlock>(CurrBlock->getEntryBasicBlock());
- Builder.setInsertPoint(CurrBB, CurrBB->begin());
-
- // For each predecessor, generate the VPInstructions required for
- // computing 'BP AND (not) CBV' at the top of CurrBB.
- // Collect the outcome of this calculation for all predecessors
- // into IncomingPredicates.
- for (VPBlockBase *PredBlock : CurrBlock->getPredecessors()) {
- // Skip back-edges
- if (VPBlockUtils::isBackEdge(PredBlock, CurrBlock, VPLI))
- continue;
-
- VPValue *IncomingPredicate = nullptr;
- unsigned NumPredSuccsNoBE =
- VPBlockUtils::countSuccessorsNoBE(PredBlock, VPLI);
-
- // If there is an unconditional branch to the currBB, then we don't create
- // edge predicates. We use the predecessor's block predicate instead.
- if (NumPredSuccsNoBE == 1)
- IncomingPredicate = PredBlock->getPredicate();
- else if (NumPredSuccsNoBE == 2) {
- // Emit recipes into CurrBlock if required
- assert(isa<VPBasicBlock>(PredBlock) && "Only BBs have multiple exits");
- IncomingPredicate =
- getOrCreateNotPredicate(cast<VPBasicBlock>(PredBlock), CurrBB);
- } else
- llvm_unreachable("FIXME: switch statement ?");
-
- if (IncomingPredicate)
- IncomingPredicates.push_back(IncomingPredicate);
- }
-
- // Logically OR all incoming predicates by building the Predicate Tree.
- VPValue *Predicate = genPredicateTree(IncomingPredicates);
-
- // Now update the block's predicate with the new one.
- CurrBlock->setPredicate(Predicate);
-}
-
-// Generate all predicates needed for Region.
-void VPlanPredicator::predicateRegionRec(VPRegionBlock *Region) {
- VPBasicBlock *EntryBlock = cast<VPBasicBlock>(Region->getEntry());
- ReversePostOrderTraversal<VPBlockBase *> RPOT(EntryBlock);
-
- // Generate edge predicates and append them to the block predicate. RPO is
- // necessary since the predecessor blocks' block predicate needs to be set
- // before the current block's block predicate can be computed.
- for (VPBlockBase *Block : RPOT) {
- // TODO: Handle nested regions once we start generating the same.
- assert(!isa<VPRegionBlock>(Block) && "Nested region not expected");
- createOrPropagatePredicates(Block, Region);
- }
-}
-
-// Linearize the CFG within Region.
-// TODO: Predication and linearization need RPOT for every region.
-// This traversal is expensive. Since predication is not adding new
-// blocks, we should be able to compute RPOT once in predication and
-// reuse it here. This becomes even more important once we have nested
-// regions.
-void VPlanPredicator::linearizeRegionRec(VPRegionBlock *Region) {
- ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry());
- VPBlockBase *PrevBlock = nullptr;
-
- for (VPBlockBase *CurrBlock : RPOT) {
- // TODO: Handle nested regions once we start generating the same.
- assert(!isa<VPRegionBlock>(CurrBlock) && "Nested region not expected");
-
- // Linearize control flow by adding an unconditional edge between PrevBlock
- // and CurrBlock skipping loop headers and latches to keep intact loop
- // header predecessors and loop latch successors.
- if (PrevBlock && !VPLI->isLoopHeader(CurrBlock) &&
- !VPBlockUtils::blockIsLoopLatch(PrevBlock, VPLI)) {
-
- LLVM_DEBUG(dbgs() << "Linearizing: " << PrevBlock->getName() << "->"
- << CurrBlock->getName() << "\n");
-
- PrevBlock->clearSuccessors();
- CurrBlock->clearPredecessors();
- VPBlockUtils::connectBlocks(PrevBlock, CurrBlock);
- }
-
- PrevBlock = CurrBlock;
- }
-}
-
-// Entry point. The driver function for the predicator.
-void VPlanPredicator::predicate() {
- // Predicate the blocks within Region.
- predicateRegionRec(cast<VPRegionBlock>(Plan.getEntry()));
-
- // Linearize the blocks within Region.
- linearizeRegionRec(cast<VPRegionBlock>(Plan.getEntry()));
-}
-
-VPlanPredicator::VPlanPredicator(VPlan &Plan)
- : Plan(Plan), VPLI(&(Plan.getVPLoopInfo())) {
- // FIXME: Predicator is currently computing the dominator information for the
- // top region. Once we start storing dominator information in a VPRegionBlock,
- // we can avoid this recalculation.
- VPDomTree.recalculate(*(cast<VPRegionBlock>(Plan.getEntry())));
-}
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.h
deleted file mode 100644
index a5db9a54da3c..000000000000
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.h
+++ /dev/null
@@ -1,74 +0,0 @@
-//===-- VPlanPredicator.h ---------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines the VPlanPredicator class which contains the public
-/// interfaces to predicate and linearize the VPlan region.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H
-
-#include "LoopVectorizationPlanner.h"
-#include "VPlan.h"
-#include "VPlanDominatorTree.h"
-
-namespace llvm {
-
-class VPlanPredicator {
-private:
- enum class EdgeType {
- TRUE_EDGE,
- FALSE_EDGE,
- };
-
- // VPlan being predicated.
- VPlan &Plan;
-
- // VPLoopInfo for Plan's HCFG.
- VPLoopInfo *VPLI;
-
- // Dominator tree for Plan's HCFG.
- VPDominatorTree VPDomTree;
-
- // VPlan builder used to generate VPInstructions for block predicates.
- VPBuilder Builder;
-
- /// Get the type of edge from \p FromBlock to \p ToBlock. Returns TRUE_EDGE if
- /// \p ToBlock is either the unconditional successor or the conditional true
- /// successor of \p FromBlock and FALSE_EDGE otherwise.
- EdgeType getEdgeTypeBetween(VPBlockBase *FromBlock, VPBlockBase *ToBlock);
-
- /// Create and return VPValue corresponding to the predicate for the edge from
- /// \p PredBB to \p CurrentBlock.
- VPValue *getOrCreateNotPredicate(VPBasicBlock *PredBB, VPBasicBlock *CurrBB);
-
- /// Generate and return the result of ORing all the predicate VPValues in \p
- /// Worklist.
- VPValue *genPredicateTree(std::list<VPValue *> &Worklist);
-
- /// Create or propagate predicate for \p CurrBlock in region \p Region using
- /// predicate(s) of its predecessor(s)
- void createOrPropagatePredicates(VPBlockBase *CurrBlock,
- VPRegionBlock *Region);
-
- /// Predicate the CFG within \p Region.
- void predicateRegionRec(VPRegionBlock *Region);
-
- /// Linearize the CFG within \p Region.
- void linearizeRegionRec(VPRegionBlock *Region);
-
-public:
- VPlanPredicator(VPlan &Plan);
-
- /// Predicate Plan's HCFG.
- void predicate();
-};
-} // end namespace llvm
-#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
new file mode 100644
index 000000000000..92422b17457c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -0,0 +1,840 @@
+//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains implementations for different VPlan recipes.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlan.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include <cassert>
+
+using namespace llvm;
+
+extern cl::opt<bool> EnableVPlanNativePath;
+
+bool VPRecipeBase::mayWriteToMemory() const {
+ switch (getVPDefID()) {
+ case VPWidenMemoryInstructionSC: {
+ return cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
+ }
+ case VPReplicateSC:
+ case VPWidenCallSC:
+ return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
+ ->mayWriteToMemory();
+ case VPBranchOnMaskSC:
+ return false;
+ case VPWidenIntOrFpInductionSC:
+ case VPWidenCanonicalIVSC:
+ case VPWidenPHISC:
+ case VPBlendSC:
+ case VPWidenSC:
+ case VPWidenGEPSC:
+ case VPReductionSC:
+ case VPWidenSelectSC: {
+ const Instruction *I =
+ dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
+ (void)I;
+ assert((!I || !I->mayWriteToMemory()) &&
+ "underlying instruction may write to memory");
+ return false;
+ }
+ default:
+ return true;
+ }
+}
+
+bool VPRecipeBase::mayReadFromMemory() const {
+ switch (getVPDefID()) {
+ case VPWidenMemoryInstructionSC: {
+ return !cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
+ }
+ case VPReplicateSC:
+ case VPWidenCallSC:
+ return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
+ ->mayReadFromMemory();
+ case VPBranchOnMaskSC:
+ return false;
+ case VPWidenIntOrFpInductionSC:
+ case VPWidenCanonicalIVSC:
+ case VPWidenPHISC:
+ case VPBlendSC:
+ case VPWidenSC:
+ case VPWidenGEPSC:
+ case VPReductionSC:
+ case VPWidenSelectSC: {
+ const Instruction *I =
+ dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
+ (void)I;
+ assert((!I || !I->mayReadFromMemory()) &&
+ "underlying instruction may read from memory");
+ return false;
+ }
+ default:
+ return true;
+ }
+}
+
+bool VPRecipeBase::mayHaveSideEffects() const {
+ switch (getVPDefID()) {
+ case VPWidenIntOrFpInductionSC:
+ case VPWidenPointerInductionSC:
+ case VPWidenCanonicalIVSC:
+ case VPWidenPHISC:
+ case VPBlendSC:
+ case VPWidenSC:
+ case VPWidenGEPSC:
+ case VPReductionSC:
+ case VPWidenSelectSC:
+ case VPScalarIVStepsSC: {
+ const Instruction *I =
+ dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
+ (void)I;
+ assert((!I || !I->mayHaveSideEffects()) &&
+ "underlying instruction has side-effects");
+ return false;
+ }
+ case VPReplicateSC: {
+ auto *R = cast<VPReplicateRecipe>(this);
+ return R->getUnderlyingInstr()->mayHaveSideEffects();
+ }
+ default:
+ return true;
+ }
+}
+
+void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
+ auto Lane = VPLane::getLastLaneForVF(State.VF);
+ VPValue *ExitValue = getOperand(0);
+ if (Plan.isUniformAfterVectorization(ExitValue))
+ Lane = VPLane::getFirstLane();
+ Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
+ State.Builder.GetInsertBlock());
+}
+
+void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
+ assert(!Parent && "Recipe already in some VPBasicBlock");
+ assert(InsertPos->getParent() &&
+ "Insertion position not in any VPBasicBlock");
+ Parent = InsertPos->getParent();
+ Parent->getRecipeList().insert(InsertPos->getIterator(), this);
+}
+
+void VPRecipeBase::insertBefore(VPBasicBlock &BB,
+ iplist<VPRecipeBase>::iterator I) {
+ assert(!Parent && "Recipe already in some VPBasicBlock");
+ assert(I == BB.end() || I->getParent() == &BB);
+ Parent = &BB;
+ BB.getRecipeList().insert(I, this);
+}
+
+void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
+ assert(!Parent && "Recipe already in some VPBasicBlock");
+ assert(InsertPos->getParent() &&
+ "Insertion position not in any VPBasicBlock");
+ Parent = InsertPos->getParent();
+ Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this);
+}
+
+void VPRecipeBase::removeFromParent() {
+ assert(getParent() && "Recipe not in any VPBasicBlock");
+ getParent()->getRecipeList().remove(getIterator());
+ Parent = nullptr;
+}
+
+iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
+ assert(getParent() && "Recipe not in any VPBasicBlock");
+ return getParent()->getRecipeList().erase(getIterator());
+}
+
+void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
+ removeFromParent();
+ insertAfter(InsertPos);
+}
+
+void VPRecipeBase::moveBefore(VPBasicBlock &BB,
+ iplist<VPRecipeBase>::iterator I) {
+ removeFromParent();
+ insertBefore(BB, I);
+}
+
+void VPInstruction::generateInstruction(VPTransformState &State,
+ unsigned Part) {
+ IRBuilderBase &Builder = State.Builder;
+ Builder.SetCurrentDebugLocation(DL);
+
+ if (Instruction::isBinaryOp(getOpcode())) {
+ Value *A = State.get(getOperand(0), Part);
+ Value *B = State.get(getOperand(1), Part);
+ Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B);
+ State.set(this, V, Part);
+ return;
+ }
+
+ switch (getOpcode()) {
+ case VPInstruction::Not: {
+ Value *A = State.get(getOperand(0), Part);
+ Value *V = Builder.CreateNot(A);
+ State.set(this, V, Part);
+ break;
+ }
+ case VPInstruction::ICmpULE: {
+ Value *IV = State.get(getOperand(0), Part);
+ Value *TC = State.get(getOperand(1), Part);
+ Value *V = Builder.CreateICmpULE(IV, TC);
+ State.set(this, V, Part);
+ break;
+ }
+ case Instruction::Select: {
+ Value *Cond = State.get(getOperand(0), Part);
+ Value *Op1 = State.get(getOperand(1), Part);
+ Value *Op2 = State.get(getOperand(2), Part);
+ Value *V = Builder.CreateSelect(Cond, Op1, Op2);
+ State.set(this, V, Part);
+ break;
+ }
+ case VPInstruction::ActiveLaneMask: {
+ // Get first lane of vector induction variable.
+ Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
+ // Get the original loop tripcount.
+ Value *ScalarTC = State.get(getOperand(1), Part);
+
+ auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
+ auto *PredTy = VectorType::get(Int1Ty, State.VF);
+ Instruction *Call = Builder.CreateIntrinsic(
+ Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()},
+ {VIVElem0, ScalarTC}, nullptr, "active.lane.mask");
+ State.set(this, Call, Part);
+ break;
+ }
+ case VPInstruction::FirstOrderRecurrenceSplice: {
+ // Generate code to combine the previous and current values in vector v3.
+ //
+ // vector.ph:
+ // v_init = vector(..., ..., ..., a[-1])
+ // br vector.body
+ //
+ // vector.body
+ // i = phi [0, vector.ph], [i+4, vector.body]
+ // v1 = phi [v_init, vector.ph], [v2, vector.body]
+ // v2 = a[i, i+1, i+2, i+3];
+ // v3 = vector(v1(3), v2(0, 1, 2))
+
+ // For the first part, use the recurrence phi (v1), otherwise v2.
+ auto *V1 = State.get(getOperand(0), 0);
+ Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
+ if (!PartMinus1->getType()->isVectorTy()) {
+ State.set(this, PartMinus1, Part);
+ } else {
+ Value *V2 = State.get(getOperand(1), Part);
+ State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1), Part);
+ }
+ break;
+ }
+ case VPInstruction::CanonicalIVIncrement:
+ case VPInstruction::CanonicalIVIncrementNUW: {
+ Value *Next = nullptr;
+ if (Part == 0) {
+ bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW;
+ auto *Phi = State.get(getOperand(0), 0);
+ // The loop step is equal to the vectorization factor (num of SIMD
+ // elements) times the unroll factor (num of SIMD instructions).
+ Value *Step =
+ createStepForVF(Builder, Phi->getType(), State.VF, State.UF);
+ Next = Builder.CreateAdd(Phi, Step, "index.next", IsNUW, false);
+ } else {
+ Next = State.get(this, 0);
+ }
+
+ State.set(this, Next, Part);
+ break;
+ }
+ case VPInstruction::BranchOnCond: {
+ if (Part != 0)
+ break;
+
+ Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
+ VPRegionBlock *ParentRegion = getParent()->getParent();
+ VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
+
+ // Replace the temporary unreachable terminator with a new conditional
+ // branch, hooking it up to the backward destination for exiting blocks now
+ // and to the forward destination(s) later when they are created.
+ BranchInst *CondBr =
+ Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
+
+ if (getParent()->isExiting())
+ CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
+
+ CondBr->setSuccessor(0, nullptr);
+ Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
+ break;
+ }
+ case VPInstruction::BranchOnCount: {
+ if (Part != 0)
+ break;
+ // First create the compare.
+ Value *IV = State.get(getOperand(0), Part);
+ Value *TC = State.get(getOperand(1), Part);
+ Value *Cond = Builder.CreateICmpEQ(IV, TC);
+
+ // Now create the branch.
+ auto *Plan = getParent()->getPlan();
+ VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
+ VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();
+
+ // Replace the temporary unreachable terminator with a new conditional
+ // branch, hooking it up to the backward destination (the header) now and to
+ // the forward destination (the exit/middle block) later when it is created.
+ // Note that CreateCondBr expects a valid BB as first argument, so we need
+ // to set it to nullptr later.
+ BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
+ State.CFG.VPBB2IRBB[Header]);
+ CondBr->setSuccessor(0, nullptr);
+ Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
+ break;
+ }
+ default:
+ llvm_unreachable("Unsupported opcode for instruction");
+ }
+}
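
Two of the cases above have simple scalar equivalents worth spelling out: get.active.lane.mask sets lane l when base + l is below the trip count, and the canonical IV increment advances the scalar index by VF * UF per vector iteration. A rough standalone model in plain C++ (illustrative names, not the LLVM intrinsics):

#include <cassert>
#include <cstdint>
#include <vector>

// Scalar model of llvm.get.active.lane.mask(base, tripcount) for VF lanes.
std::vector<bool> activeLaneMask(uint64_t Base, uint64_t TripCount,
                                 unsigned VF) {
  std::vector<bool> Mask(VF);
  for (unsigned L = 0; L < VF; ++L)
    Mask[L] = (Base + L) < TripCount;
  return Mask;
}

// Model of the canonical IV increment: index.next = index + VF * UF.
uint64_t canonicalIVIncrement(uint64_t Index, unsigned VF, unsigned UF) {
  return Index + uint64_t(VF) * UF;
}

int main() {
  // Trip count 10, VF = 4: the iteration starting at index 8 enables only
  // the first two lanes.
  std::vector<bool> M = activeLaneMask(8, 10, 4);
  assert(M[0] && M[1] && !M[2] && !M[3]);
  // VF = 4, UF = 2 advances the scalar index by 8 per vector iteration.
  assert(canonicalIVIncrement(0, 4, 2) == 8);
  return 0;
}
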
+
+void VPInstruction::execute(VPTransformState &State) {
+ assert(!State.Instance && "VPInstruction executing an Instance");
+ IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
+ State.Builder.setFastMathFlags(FMF);
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ generateInstruction(State, Part);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPInstruction::dump() const {
+ VPSlotTracker SlotTracker(getParent()->getPlan());
+ print(dbgs(), "", SlotTracker);
+}
+
+void VPInstruction::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "EMIT ";
+
+ if (hasResult()) {
+ printAsOperand(O, SlotTracker);
+ O << " = ";
+ }
+
+ switch (getOpcode()) {
+ case VPInstruction::Not:
+ O << "not";
+ break;
+ case VPInstruction::ICmpULE:
+ O << "icmp ule";
+ break;
+ case VPInstruction::SLPLoad:
+ O << "combined load";
+ break;
+ case VPInstruction::SLPStore:
+ O << "combined store";
+ break;
+ case VPInstruction::ActiveLaneMask:
+ O << "active lane mask";
+ break;
+ case VPInstruction::FirstOrderRecurrenceSplice:
+ O << "first-order splice";
+ break;
+ case VPInstruction::CanonicalIVIncrement:
+ O << "VF * UF + ";
+ break;
+ case VPInstruction::CanonicalIVIncrementNUW:
+ O << "VF * UF +(nuw) ";
+ break;
+ case VPInstruction::BranchOnCond:
+ O << "branch-on-cond";
+ break;
+ case VPInstruction::BranchOnCount:
+ O << "branch-on-count ";
+ break;
+ default:
+ O << Instruction::getOpcodeName(getOpcode());
+ }
+
+ O << FMF;
+
+ for (const VPValue *Operand : operands()) {
+ O << " ";
+ Operand->printAsOperand(O, SlotTracker);
+ }
+
+ if (DL) {
+ O << ", !dbg ";
+ DL.print(O);
+ }
+}
+#endif
+
+void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) {
+ // Make sure the VPInstruction is a floating-point operation.
+ assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
+ Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
+ Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
+ Opcode == Instruction::FCmp) &&
+ "this op can't take fast-math flags");
+ FMF = FMFNew;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "WIDEN-CALL ";
+
+ auto *CI = cast<CallInst>(getUnderlyingInstr());
+ if (CI->getType()->isVoidTy())
+ O << "void ";
+ else {
+ printAsOperand(O, SlotTracker);
+ O << " = ";
+ }
+
+ O << "call @" << CI->getCalledFunction()->getName() << "(";
+ printOperands(O, SlotTracker);
+ O << ")";
+}
+
+void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "WIDEN-SELECT ";
+ printAsOperand(O, SlotTracker);
+ O << " = select ";
+ getOperand(0)->printAsOperand(O, SlotTracker);
+ O << ", ";
+ getOperand(1)->printAsOperand(O, SlotTracker);
+ O << ", ";
+ getOperand(2)->printAsOperand(O, SlotTracker);
+ O << (InvariantCond ? " (condition is loop invariant)" : "");
+}
+
+void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "WIDEN ";
+ printAsOperand(O, SlotTracker);
+ O << " = " << getUnderlyingInstr()->getOpcodeName() << " ";
+ printOperands(O, SlotTracker);
+}
+
+void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "WIDEN-INDUCTION";
+ if (getTruncInst()) {
+ O << "\\l\"";
+ O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
+ O << " +\n" << Indent << "\" ";
+ getVPValue(0)->printAsOperand(O, SlotTracker);
+ } else
+ O << " " << VPlanIngredient(IV);
+
+ O << ", ";
+ getStepValue()->printAsOperand(O, SlotTracker);
+}
+#endif
+
+bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
+ auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
+ auto *StepC = dyn_cast<SCEVConstant>(getInductionDescriptor().getStep());
+ return StartC && StartC->isZero() && StepC && StepC->isOne();
+}
+
+VPCanonicalIVPHIRecipe *VPScalarIVStepsRecipe::getCanonicalIV() const {
+ return cast<VPCanonicalIVPHIRecipe>(getOperand(0));
+}
+
+bool VPScalarIVStepsRecipe::isCanonical() const {
+ auto *CanIV = getCanonicalIV();
+ // The start value of the steps-recipe must match the start value of the
+ // canonical induction and it must step by 1.
+ if (CanIV->getStartValue() != getStartValue())
+ return false;
+ auto *StepVPV = getStepValue();
+ if (StepVPV->getDef())
+ return false;
+ auto *StepC = dyn_cast_or_null<ConstantInt>(StepVPV->getLiveInIRValue());
+ return StepC && StepC->isOne();
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent;
+ printAsOperand(O, SlotTracker);
+ O << Indent << "= SCALAR-STEPS ";
+ printOperands(O, SlotTracker);
+}
+
+void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "WIDEN-GEP ";
+ O << (IsPtrLoopInvariant ? "Inv" : "Var");
+ size_t IndicesNumber = IsIndexLoopInvariant.size();
+ for (size_t I = 0; I < IndicesNumber; ++I)
+ O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]";
+
+ O << " ";
+ printAsOperand(O, SlotTracker);
+ O << " = getelementptr ";
+ printOperands(O, SlotTracker);
+}
+
+void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "BLEND ";
+ Phi->printAsOperand(O, false);
+ O << " =";
+ if (getNumIncomingValues() == 1) {
+ // Not a User of any mask: not really blending, this is a
+ // single-predecessor phi.
+ O << " ";
+ getIncomingValue(0)->printAsOperand(O, SlotTracker);
+ } else {
+ for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
+ O << " ";
+ getIncomingValue(I)->printAsOperand(O, SlotTracker);
+ O << "/";
+ getMask(I)->printAsOperand(O, SlotTracker);
+ }
+ }
+}
+
+void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "REDUCE ";
+ printAsOperand(O, SlotTracker);
+ O << " = ";
+ getChainOp()->printAsOperand(O, SlotTracker);
+ O << " +";
+ if (isa<FPMathOperator>(getUnderlyingInstr()))
+ O << getUnderlyingInstr()->getFastMathFlags();
+ O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " (";
+ getVecOp()->printAsOperand(O, SlotTracker);
+ if (getCondOp()) {
+ O << ", ";
+ getCondOp()->printAsOperand(O, SlotTracker);
+ }
+ O << ")";
+ if (RdxDesc->IntermediateStore)
+ O << " (with final reduction value stored in invariant address sank "
+ "outside of loop)";
+}
+
+void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
+
+ if (!getUnderlyingInstr()->getType()->isVoidTy()) {
+ printAsOperand(O, SlotTracker);
+ O << " = ";
+ }
+ if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
+ O << "call @" << CB->getCalledFunction()->getName() << "(";
+ interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
+ O, [&O, &SlotTracker](VPValue *Op) {
+ Op->printAsOperand(O, SlotTracker);
+ });
+ O << ")";
+ } else {
+ O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " ";
+ printOperands(O, SlotTracker);
+ }
+
+ if (AlsoPack)
+ O << " (S->V)";
+}
+
+void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "PHI-PREDICATED-INSTRUCTION ";
+ printAsOperand(O, SlotTracker);
+ O << " = ";
+ printOperands(O, SlotTracker);
+}
+
+void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "WIDEN ";
+
+ if (!isStore()) {
+ getVPSingleValue()->printAsOperand(O, SlotTracker);
+ O << " = ";
+ }
+ O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";
+
+ printOperands(O, SlotTracker);
+}
+#endif
+
+void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
+ Value *Start = getStartValue()->getLiveInIRValue();
+ PHINode *EntryPart = PHINode::Create(
+ Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt());
+
+ BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+ EntryPart->addIncoming(Start, VectorPH);
+ EntryPart->setDebugLoc(DL);
+ for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
+ State.set(this, EntryPart, Part);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "EMIT ";
+ printAsOperand(O, SlotTracker);
+ O << " = CANONICAL-INDUCTION";
+}
+#endif
+
+bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) {
+ bool IsUniform = vputils::onlyFirstLaneUsed(this);
+ return all_of(users(),
+ [&](const VPUser *U) { return U->usesScalars(this); }) &&
+ (IsUniform || !VF.isScalable());
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "EMIT ";
+ printAsOperand(O, SlotTracker);
+ O << " = WIDEN-POINTER-INDUCTION ";
+ getStartValue()->printAsOperand(O, SlotTracker);
+ O << ", " << *IndDesc.getStep();
+}
+#endif
+
+void VPExpandSCEVRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "cannot be used in per-lane");
+ const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
+ SCEVExpander Exp(SE, DL, "induction");
+
+ Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
+ &*State.Builder.GetInsertPoint());
+
+ for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
+ State.set(this, Res, Part);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "EMIT ";
+ getVPSingleValue()->printAsOperand(O, SlotTracker);
+ O << " = EXPAND SCEV " << *Expr;
+}
+#endif
+
+void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
+ Value *CanonicalIV = State.get(getOperand(0), 0);
+ Type *STy = CanonicalIV->getType();
+ IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
+ ElementCount VF = State.VF;
+ Value *VStart = VF.isScalar()
+ ? CanonicalIV
+ : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
+ for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
+ Value *VStep = createStepForVF(Builder, STy, VF, Part);
+ if (VF.isVector()) {
+ VStep = Builder.CreateVectorSplat(VF, VStep);
+ VStep =
+ Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
+ }
+ Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
+ State.set(this, CanonicalVectorIV, Part);
+ }
+}
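
The widened canonical IV built above is just the scalar index broadcast to every lane plus a per-part offset: lane L of part Part holds ScalarIV + Part * VF + L. A quick fixed-VF model (plain C++, illustrative only):

#include <cassert>
#include <cstdint>
#include <vector>

// Lane values of the widened canonical IV for one unroll part:
// vec.iv[Part][L] = ScalarIV + Part * VF + L.
std::vector<uint64_t> widenCanonicalIV(uint64_t ScalarIV, unsigned VF,
                                       unsigned Part) {
  std::vector<uint64_t> Lanes(VF);
  for (unsigned L = 0; L < VF; ++L)
    Lanes[L] = ScalarIV + uint64_t(Part) * VF + L;
  return Lanes;
}

int main() {
  // ScalarIV = 8, VF = 4: part 0 covers 8..11, part 1 covers 12..15.
  assert(widenCanonicalIV(8, 4, 0) == (std::vector<uint64_t>{8, 9, 10, 11}));
  assert(widenCanonicalIV(8, 4, 1) == (std::vector<uint64_t>{12, 13, 14, 15}));
  return 0;
}
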
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "EMIT ";
+ printAsOperand(O, SlotTracker);
+ O << " = WIDEN-CANONICAL-INDUCTION ";
+ printOperands(O, SlotTracker);
+}
+#endif
+
+void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
+ auto &Builder = State.Builder;
+ // Create a vector from the initial value.
+ auto *VectorInit = getStartValue()->getLiveInIRValue();
+
+ Type *VecTy = State.VF.isScalar()
+ ? VectorInit->getType()
+ : VectorType::get(VectorInit->getType(), State.VF);
+
+ BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+ if (State.VF.isVector()) {
+ auto *IdxTy = Builder.getInt32Ty();
+ auto *One = ConstantInt::get(IdxTy, 1);
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(VectorPH->getTerminator());
+ auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
+ auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
+ VectorInit = Builder.CreateInsertElement(
+ PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
+ }
+
+ // Create a phi node for the new recurrence.
+ PHINode *EntryPart = PHINode::Create(
+ VecTy, 2, "vector.recur", &*State.CFG.PrevBB->getFirstInsertionPt());
+ EntryPart->addIncoming(VectorInit, VectorPH);
+ State.set(this, EntryPart, 0);
+}
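
The initial value is placed in the last lane so that the first-order-recurrence splice (offset -1) rotates it into lane 0 of the first vector iteration. A small standalone model of both steps, assuming a fixed VF and using plain std::vector in place of IR vectors (illustrative only):

#include <cassert>
#include <vector>

// The scalar initial value goes into the last lane of the init vector.
std::vector<int> recurInit(int Init, unsigned VF, int Poison = 0) {
  std::vector<int> V(VF, Poison);
  V[VF - 1] = Init;
  return V;
}

// Splice with offset -1: { Prev[VF-1], Cur[0], ..., Cur[VF-2] }.
std::vector<int> spliceMinus1(const std::vector<int> &Prev,
                              const std::vector<int> &Cur) {
  std::vector<int> Out;
  Out.push_back(Prev.back());
  Out.insert(Out.end(), Cur.begin(), Cur.end() - 1);
  return Out;
}

int main() {
  // VF = 4, a[-1] = 7, first vector iteration loads a[0..3] = {10,11,12,13}.
  std::vector<int> VInit = recurInit(7, 4);
  std::vector<int> V2 = {10, 11, 12, 13};
  // The spliced value feeds lane L with a[L-1], exactly the recurrence input.
  assert(spliceMinus1(VInit, V2) == (std::vector<int>{7, 10, 11, 12}));
  return 0;
}
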
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
+ printAsOperand(O, SlotTracker);
+ O << " = phi ";
+ printOperands(O, SlotTracker);
+}
+#endif
+
+void VPReductionPHIRecipe::execute(VPTransformState &State) {
+ PHINode *PN = cast<PHINode>(getUnderlyingValue());
+ auto &Builder = State.Builder;
+
+ // In order to support recurrences we need to be able to vectorize Phi nodes.
+ // Phi nodes have cycles, so we need to vectorize them in two stages. This is
+ // stage #1: We create a new vector PHI node with no incoming edges. We'll use
+ // this value when we vectorize all of the instructions that use the PHI.
+ bool ScalarPHI = State.VF.isScalar() || IsInLoop;
+ Type *VecTy =
+ ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
+
+ BasicBlock *HeaderBB = State.CFG.PrevBB;
+ assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
+ "recipe must be in the vector loop header");
+ unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
+ for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
+ Value *EntryPart =
+ PHINode::Create(VecTy, 2, "vec.phi", &*HeaderBB->getFirstInsertionPt());
+ State.set(this, EntryPart, Part);
+ }
+
+ BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+
+ // Reductions do not have to start at zero. They can start with
+ // any loop invariant values.
+ VPValue *StartVPV = getStartValue();
+ Value *StartV = StartVPV->getLiveInIRValue();
+
+ Value *Iden = nullptr;
+ RecurKind RK = RdxDesc.getRecurrenceKind();
+ if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
+ RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) {
+ // MinMax reductions have the start value as their identity.
+ if (ScalarPHI) {
+ Iden = StartV;
+ } else {
+ IRBuilderBase::InsertPointGuard IPBuilder(Builder);
+ Builder.SetInsertPoint(VectorPH->getTerminator());
+ StartV = Iden =
+ Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
+ }
+ } else {
+ Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
+ RdxDesc.getFastMathFlags());
+
+ if (!ScalarPHI) {
+ Iden = Builder.CreateVectorSplat(State.VF, Iden);
+ IRBuilderBase::InsertPointGuard IPBuilder(Builder);
+ Builder.SetInsertPoint(VectorPH->getTerminator());
+ Constant *Zero = Builder.getInt32(0);
+ StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
+ }
+ }
+
+ for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
+ Value *EntryPart = State.get(this, Part);
+ // Make sure to add the reduction start value only to the
+ // first unroll part.
+ Value *StartVal = (Part == 0) ? StartV : Iden;
+ cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
+ }
+}
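
For the common (non-min/max, not-in-loop) case, only the part-0 phi receives the start value, in lane 0, while every other lane and unroll part starts at the reduction identity, so the start value is folded in exactly once. A toy model for an integer add reduction (plain C++, illustrative only):

#include <cassert>
#include <cstdint>
#include <vector>

// Start values of the reduction phis for an integer add reduction with
// identity 0: part 0 starts as { StartV, 0, 0, ... }, the other unroll parts
// start as all-identity.
std::vector<std::vector<int64_t>> reductionPhiInit(int64_t StartV, unsigned VF,
                                                   unsigned UF) {
  std::vector<std::vector<int64_t>> Parts(UF, std::vector<int64_t>(VF, 0));
  Parts[0][0] = StartV;
  return Parts;
}

int main() {
  auto Parts = reductionPhiInit(/*StartV=*/100, /*VF=*/4, /*UF=*/2);
  int64_t Total = 0;
  for (const auto &P : Parts)
    for (int64_t L : P)
      Total += L;
  assert(Total == 100); // The start value is counted exactly once.
  return 0;
}
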
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "WIDEN-REDUCTION-PHI ";
+
+ printAsOperand(O, SlotTracker);
+ O << " = phi ";
+ printOperands(O, SlotTracker);
+}
+#endif
+
+void VPWidenPHIRecipe::execute(VPTransformState &State) {
+ assert(EnableVPlanNativePath &&
+ "Non-native vplans are not expected to have VPWidenPHIRecipes.");
+
+ // Currently we enter here in the VPlan-native path for non-induction
+ // PHIs where all control flow is uniform. We simply widen these PHIs.
+ // Create a vector phi with no operands - the vector phi operands will be
+ // set at the end of vector code generation.
+ VPBasicBlock *Parent = getParent();
+ VPRegionBlock *LoopRegion = Parent->getEnclosingLoopRegion();
+ unsigned StartIdx = 0;
+ // For phis in header blocks of loop regions, use the index of the value
+ // coming from the preheader.
+ if (LoopRegion->getEntryBasicBlock() == Parent) {
+ for (unsigned I = 0; I < getNumOperands(); ++I) {
+ if (getIncomingBlock(I) ==
+ LoopRegion->getSinglePredecessor()->getExitingBasicBlock())
+ StartIdx = I;
+ }
+ }
+ Value *Op0 = State.get(getOperand(StartIdx), 0);
+ Type *VecTy = Op0->getType();
+ Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
+ State.set(this, VecPhi, 0);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "WIDEN-PHI ";
+
+ auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
+ // Unless all incoming values are modeled in VPlan print the original PHI
+ // directly.
+ // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
+ // values as VPValues.
+ if (getNumOperands() != OriginalPhi->getNumOperands()) {
+ O << VPlanIngredient(OriginalPhi);
+ return;
+ }
+
+ printAsOperand(O, SlotTracker);
+ O << " = phi ";
+ printOperands(O, SlotTracker);
+}
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
index 9e19e172dea5..3a7e77fd9efd 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
@@ -15,16 +15,10 @@
//===----------------------------------------------------------------------===//
#include "VPlan.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/PostOrderIterator.h"
+#include "VPlanValue.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
@@ -32,12 +26,9 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <algorithm>
#include <cassert>
-#include <iterator>
#include <utility>
using namespace llvm;
@@ -396,7 +387,7 @@ VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
return markFailed();
assert(getOpcode(Values) && "Opcodes for all values must match");
- unsigned ValuesOpcode = getOpcode(Values).getValue();
+ unsigned ValuesOpcode = *getOpcode(Values);
SmallVector<VPValue *, 4> CombinedOperands;
if (areCommutative(Values)) {
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 70ce773a8a85..cca484e13bf1 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -13,6 +13,8 @@
#include "VPlanTransforms.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/IVDescriptors.h"
using namespace llvm;
@@ -22,17 +24,15 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
GetIntOrFpInductionDescriptor,
SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE) {
- auto *TopRegion = cast<VPRegionBlock>(Plan->getEntry());
- ReversePostOrderTraversal<VPBlockBase *> RPOT(TopRegion->getEntry());
-
- for (VPBlockBase *Base : RPOT) {
- // Do not widen instructions in pre-header and exit blocks.
- if (Base->getNumPredecessors() == 0 || Base->getNumSuccessors() == 0)
- continue;
-
- VPBasicBlock *VPBB = Base->getEntryBasicBlock();
+ ReversePostOrderTraversal<VPBlockRecursiveTraversalWrapper<VPBlockBase *>>
+ RPOT(Plan->getEntry());
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+ VPRecipeBase *Term = VPBB->getTerminator();
+ auto EndIter = Term ? Term->getIterator() : VPBB->end();
// Introduce each ingredient into VPlan.
- for (VPRecipeBase &Ingredient : llvm::make_early_inc_range(*VPBB)) {
+ for (VPRecipeBase &Ingredient :
+ make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
+
VPValue *VPV = Ingredient.getVPSingleValue();
Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
if (DeadInstructions.count(Inst)) {
@@ -47,8 +47,10 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue());
if (const auto *II = GetIntOrFpInductionDescriptor(Phi)) {
VPValue *Start = Plan->getOrAddVPValue(II->getStartValue());
+ VPValue *Step =
+ vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE);
NewRecipe =
- new VPWidenIntOrFpInductionRecipe(Phi, Start, *II, false, true);
+ new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II, true);
} else {
Plan->addVPValue(Phi, VPPhi);
continue;
@@ -295,14 +297,19 @@ bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) {
}
void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) {
- SmallVector<std::pair<VPRecipeBase *, VPValue *>> CastsToRemove;
- for (auto &Phi : Plan.getEntry()->getEntryBasicBlock()->phis()) {
+ for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
if (!IV || IV->getTruncInst())
continue;
- // Visit all casts connected to IV and in Casts. Collect them.
- // remember them for removal.
+ // A sequence of IR Casts has potentially been recorded for IV, which
+ // *must be bypassed* when the IV is vectorized, because the vectorized IV
+ // will produce the desired casted value. This sequence forms a def-use
+ // chain and is provided in reverse order, ending with the cast that uses
+ // the IV phi. Search for the recipe of the last cast in the chain and
+ // replace it with the original IV. Note that only the final cast is
+ // expected to have users outside the cast-chain and the dead casts left
+ // over will be cleaned up later.
auto &Casts = IV->getInductionDescriptor().getCastInsts();
VPValue *FindMyCast = IV;
for (Instruction *IRCast : reverse(Casts)) {
@@ -315,14 +322,9 @@ void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) {
break;
}
}
- assert(FoundUserCast && "Missing a cast to remove");
- CastsToRemove.emplace_back(FoundUserCast, IV);
FindMyCast = FoundUserCast->getVPSingleValue();
}
- }
- for (auto &E : CastsToRemove) {
- E.first->getVPSingleValue()->replaceAllUsesWith(E.second);
- E.first->eraseFromParent();
+ FindMyCast->replaceAllUsesWith(IV);
}
}
@@ -358,3 +360,73 @@ void VPlanTransforms::removeRedundantCanonicalIVs(VPlan &Plan) {
}
}
}
+
+void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
+ ReversePostOrderTraversal<VPBlockRecursiveTraversalWrapper<VPBlockBase *>>
+ RPOT(Plan.getEntry());
+
+ for (VPBasicBlock *VPBB : reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
+ // The recipes in the block are processed in reverse order, to catch chains
+ // of dead recipes.
+ for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
+ if (R.mayHaveSideEffects() || any_of(R.definedValues(), [](VPValue *V) {
+ return V->getNumUsers() > 0;
+ }))
+ continue;
+ R.eraseFromParent();
+ }
+ }
+}
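
Walking each block bottom-up is what lets a single sweep catch chains: erasing a dead user drops the use counts of its operands before they are visited. A toy model of that sweep over a flat "recipe" array (plain C++, illustrative types):

#include <cassert>
#include <cstddef>
#include <vector>

// A "recipe" is dead if it has no side effects and no remaining users.
struct ToyRecipe {
  bool HasSideEffects = false;
  unsigned NumUsers = 0;
  std::vector<size_t> Operands; // indices of defining recipes
  bool Erased = false;
};

void removeDeadRecipes(std::vector<ToyRecipe> &Block) {
  // Visit bottom-up so a removed user exposes its operands as dead.
  for (size_t I = Block.size(); I-- > 0;) {
    ToyRecipe &R = Block[I];
    if (R.HasSideEffects || R.NumUsers > 0)
      continue;
    R.Erased = true;
    for (size_t Op : R.Operands)
      --Block[Op].NumUsers;
  }
}

int main() {
  // %0 = ...            (only used by %1)
  // %1 = add %0, %0     (no users, no side effects)
  // store ...           (has side effects, must stay)
  std::vector<ToyRecipe> Block(3);
  Block[0].NumUsers = 2;
  Block[1].Operands = {0, 0};
  Block[2].HasSideEffects = true;
  removeDeadRecipes(Block);
  assert(Block[1].Erased && Block[0].Erased && !Block[2].Erased);
  return 0;
}
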
+
+void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
+ SmallVector<VPRecipeBase *> ToRemove;
+ VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+ bool HasOnlyVectorVFs = !Plan.hasVF(ElementCount::getFixed(1));
+ for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
+ auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
+ if (!IV)
+ continue;
+ if (HasOnlyVectorVFs &&
+ none_of(IV->users(), [IV](VPUser *U) { return U->usesScalars(IV); }))
+ continue;
+
+ const InductionDescriptor &ID = IV->getInductionDescriptor();
+ VPValue *Step =
+ vputils::getOrCreateVPValueForSCEVExpr(Plan, ID.getStep(), SE);
+ Instruction *TruncI = IV->getTruncInst();
+ VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(
+ IV->getPHINode()->getType(), ID, Plan.getCanonicalIV(),
+ IV->getStartValue(), Step, TruncI ? TruncI->getType() : nullptr);
+ HeaderVPBB->insert(Steps, HeaderVPBB->getFirstNonPhi());
+
+ // Update scalar users of IV to use Step instead. Use SetVector to ensure
+ // the list of users doesn't contain duplicates.
+ SetVector<VPUser *> Users(IV->user_begin(), IV->user_end());
+ for (VPUser *U : Users) {
+ if (HasOnlyVectorVFs && !U->usesScalars(IV))
+ continue;
+ for (unsigned I = 0, E = U->getNumOperands(); I != E; I++) {
+ if (U->getOperand(I) != IV)
+ continue;
+ U->setOperand(I, Steps);
+ }
+ }
+ }
+}
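
The rewrite at the end only redirects users that actually need scalar lane values; vector users keep the widened IV. A toy model of that operand rewrite (plain C++, string-named values, illustrative only):

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

struct ToyUser {
  bool UsesScalars;
  std::vector<std::string> Operands;
};

// Redirect scalar users from the widened IV to the scalar-steps value.
void rewriteScalarUsers(std::vector<ToyUser> &Users, const std::string &IV,
                        const std::string &Steps, bool HasOnlyVectorVFs) {
  for (ToyUser &U : Users) {
    if (HasOnlyVectorVFs && !U.UsesScalars)
      continue;
    std::replace(U.Operands.begin(), U.Operands.end(), IV, Steps);
  }
}

int main() {
  std::vector<ToyUser> Users = {
      {/*UsesScalars=*/true, {"%widen.iv", "%x"}},   // e.g. an address compute
      {/*UsesScalars=*/false, {"%widen.iv", "%y"}}}; // a genuinely vector user
  rewriteScalarUsers(Users, "%widen.iv", "%scalar.steps",
                     /*HasOnlyVectorVFs=*/true);
  assert(Users[0].Operands[0] == "%scalar.steps");
  assert(Users[1].Operands[0] == "%widen.iv");
  return 0;
}
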
+
+void VPlanTransforms::removeRedundantExpandSCEVRecipes(VPlan &Plan) {
+ DenseMap<const SCEV *, VPValue *> SCEV2VPV;
+
+ for (VPRecipeBase &R :
+ make_early_inc_range(*Plan.getEntry()->getEntryBasicBlock())) {
+ auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
+ if (!ExpR)
+ continue;
+
+ auto I = SCEV2VPV.insert({ExpR->getSCEV(), ExpR});
+ if (I.second)
+ continue;
+ ExpR->replaceAllUsesWith(I.first->second);
+ ExpR->eraseFromParent();
+ }
+}
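
This is the usual CSE-by-key pattern: try to insert each expansion into a map keyed by its SCEV and, on a hit, fold the duplicate into the first occurrence. A standalone sketch keyed by strings instead of SCEV pointers (illustrative only):

#include <cassert>
#include <string>
#include <unordered_map>
#include <vector>

// Keep the first expansion of each expression and drop later duplicates.
std::vector<std::string>
dedupExpansions(const std::vector<std::string> &Expansions) {
  std::unordered_map<std::string, size_t> Seen;
  std::vector<std::string> Kept;
  for (const std::string &E : Expansions) {
    auto I = Seen.insert({E, Kept.size()});
    if (I.second) // first time this expression is seen: keep it
      Kept.push_back(E);
    // otherwise a real implementation would replace all uses of the duplicate
    // with the recipe at Kept[I.first->second] and erase it.
  }
  return Kept;
}

int main() {
  auto Kept = dedupExpansions({"{0,+,4}", "%n", "{0,+,4}"});
  assert((Kept == std::vector<std::string>{"{0,+,4}", "%n"}));
  return 0;
}
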
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index e74409a86466..3372e255dff7 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -14,8 +14,7 @@
#define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
#include "VPlan.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
namespace llvm {
@@ -23,6 +22,7 @@ class InductionDescriptor;
class Instruction;
class PHINode;
class ScalarEvolution;
+class Loop;
struct VPlanTransforms {
/// Replaces the VPInstructions in \p Plan with corresponding
@@ -49,6 +49,18 @@ struct VPlanTransforms {
/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
/// recipe, if it exists.
static void removeRedundantCanonicalIVs(VPlan &Plan);
+
+ static void removeDeadRecipes(VPlan &Plan);
+
+ /// If any user of a VPWidenIntOrFpInductionRecipe needs scalar values,
+ /// provide them by building scalar steps off of the canonical scalar IV and
+ /// update the original IV's users. This is an optional optimization to
+ /// reduce the need for vector extracts.
+ static void optimizeInductions(VPlan &Plan, ScalarEvolution &SE);
+
+ /// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
+ /// them with already existing recipes expanding the same SCEV expression.
+ static void removeRedundantExpandSCEVRecipes(VPlan &Plan);
};
} // namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 5296d2b9485c..5fc676834331 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -106,6 +106,7 @@ public:
VPVFirstOrderRecurrencePHISC,
VPVWidenPHISC,
VPVWidenIntOrFpInductionSC,
+ VPVWidenPointerInductionSC,
VPVPredInstPHI,
VPVReductionPHISC,
};
@@ -207,9 +208,7 @@ public:
/// Subclass identifier (for isa/dyn_cast).
enum class VPUserID {
Recipe,
- // TODO: Currently VPUsers are used in VPBlockBase, but in the future the
- // only VPUsers should either be recipes or live-outs.
- Block
+ LiveOut,
};
private:
@@ -286,6 +285,22 @@ public:
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *Recipe);
+
+ /// Returns true if the VPUser uses scalars of operand \p Op. The default
+ /// implementation conservatively falls back to onlyFirstLaneUsed.
+ virtual bool usesScalars(const VPValue *Op) const {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return onlyFirstLaneUsed(Op);
+ }
+
+ /// Returns true if the VPUser only uses the first lane of operand \p Op.
+ /// Conservatively returns false.
+ virtual bool onlyFirstLaneUsed(const VPValue *Op) const {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return false;
+ }
};
/// This class augments a recipe with a set of VPValues defined by the recipe.
@@ -327,10 +342,12 @@ public:
/// type identification.
using VPRecipeTy = enum {
VPBranchOnMaskSC,
+ VPExpandSCEVSC,
VPInstructionSC,
VPInterleaveSC,
VPReductionSC,
VPReplicateSC,
+ VPScalarIVStepsSC,
VPWidenCallSC,
VPWidenCanonicalIVSC,
VPWidenGEPSC,
@@ -344,6 +361,7 @@ public:
VPFirstOrderRecurrencePHISC,
VPWidenPHISC,
VPWidenIntOrFpInductionSC,
+ VPWidenPointerInductionSC,
VPPredInstPHISC,
VPReductionPHISC,
VPFirstPHISC = VPBlendSC,
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index d36f250995e1..f917883145c0 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -43,17 +43,20 @@ static bool hasDuplicates(const SmallVectorImpl<VPBlockBase *> &VPBlockVec) {
/// \p Region. Checks in this function are generic for VPBlockBases. They are
/// not specific for VPBasicBlocks or VPRegionBlocks.
static void verifyBlocksInRegion(const VPRegionBlock *Region) {
- for (const VPBlockBase *VPB :
- make_range(df_iterator<const VPBlockBase *>::begin(Region->getEntry()),
- df_iterator<const VPBlockBase *>::end(Region->getExit()))) {
+ for (const VPBlockBase *VPB : make_range(
+ df_iterator<const VPBlockBase *>::begin(Region->getEntry()),
+ df_iterator<const VPBlockBase *>::end(Region->getExiting()))) {
// Check block's parent.
assert(VPB->getParent() == Region && "VPBlockBase has wrong parent");
+ auto *VPBB = dyn_cast<VPBasicBlock>(VPB);
// Check block's condition bit.
- if (VPB->getNumSuccessors() > 1)
- assert(VPB->getCondBit() && "Missing condition bit!");
+ if (VPB->getNumSuccessors() > 1 || (VPBB && VPBB->isExiting()))
+ assert(VPBB && VPBB->getTerminator() &&
+ "Block has multiple successors but doesn't "
+ "have a proper branch recipe!");
else
- assert(!VPB->getCondBit() && "Unexpected condition bit!");
+ assert((!VPBB || !VPBB->getTerminator()) && "Unexpected branch recipe!");
// Check block's successors.
const auto &Successors = VPB->getSuccessors();
@@ -94,13 +97,14 @@ static void verifyBlocksInRegion(const VPRegionBlock *Region) {
/// VPBlockBases. Do not recurse inside nested VPRegionBlocks.
static void verifyRegion(const VPRegionBlock *Region) {
const VPBlockBase *Entry = Region->getEntry();
- const VPBlockBase *Exit = Region->getExit();
+ const VPBlockBase *Exiting = Region->getExiting();
- // Entry and Exit shouldn't have any predecessor/successor, respectively.
+ // Entry and Exiting shouldn't have any predecessor/successor, respectively.
assert(!Entry->getNumPredecessors() && "Region entry has predecessors.");
- assert(!Exit->getNumSuccessors() && "Region exit has successors.");
+ assert(!Exiting->getNumSuccessors() &&
+ "Region exiting block has successors.");
(void)Entry;
- (void)Exit;
+ (void)Exiting;
verifyBlocksInRegion(Region);
}
@@ -111,9 +115,9 @@ static void verifyRegionRec(const VPRegionBlock *Region) {
verifyRegion(Region);
// Recurse inside nested regions.
- for (const VPBlockBase *VPB :
- make_range(df_iterator<const VPBlockBase *>::begin(Region->getEntry()),
- df_iterator<const VPBlockBase *>::end(Region->getExit()))) {
+ for (const VPBlockBase *VPB : make_range(
+ df_iterator<const VPBlockBase *>::begin(Region->getEntry()),
+ df_iterator<const VPBlockBase *>::end(Region->getExiting()))) {
if (const auto *SubRegion = dyn_cast<VPRegionBlock>(VPB))
verifyRegionRec(SubRegion);
}
@@ -157,7 +161,7 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) {
}
}
- const VPRegionBlock *TopRegion = cast<VPRegionBlock>(Plan.getEntry());
+ const VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
const VPBasicBlock *Entry = dyn_cast<VPBasicBlock>(TopRegion->getEntry());
if (!Entry) {
errs() << "VPlan entry block is not a VPBasicBlock\n";
@@ -170,19 +174,19 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) {
return false;
}
- const VPBasicBlock *Exit = dyn_cast<VPBasicBlock>(TopRegion->getExit());
- if (!Exit) {
- errs() << "VPlan exit block is not a VPBasicBlock\n";
+ const VPBasicBlock *Exiting = dyn_cast<VPBasicBlock>(TopRegion->getExiting());
+ if (!Exiting) {
+ errs() << "VPlan exiting block is not a VPBasicBlock\n";
return false;
}
- if (Exit->empty()) {
- errs() << "VPlan vector loop exit must end with BranchOnCount "
+ if (Exiting->empty()) {
+ errs() << "VPlan vector loop exiting block must end with BranchOnCount "
"VPInstruction but is empty\n";
return false;
}
- auto *LastInst = dyn_cast<VPInstruction>(std::prev(Exit->end()));
+ auto *LastInst = dyn_cast<VPInstruction>(std::prev(Exiting->end()));
if (!LastInst || LastInst->getOpcode() != VPInstruction::BranchOnCount) {
errs() << "VPlan vector loop exit must end with BranchOnCount "
"VPInstruction\n";
@@ -197,10 +201,17 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) {
errs() << "region entry block has predecessors\n";
return false;
}
- if (Region->getExit()->getNumSuccessors() != 0) {
- errs() << "region exit block has successors\n";
+ if (Region->getExiting()->getNumSuccessors() != 0) {
+ errs() << "region exiting block has successors\n";
return false;
}
}
+
+ for (auto &KV : Plan.getLiveOuts())
+ if (KV.second->getNumOperands() != 1) {
+ errs() << "live outs must have a single operand\n";
+ return false;
+ }
+
return true;
}
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 258f6c67e54d..90598937affc 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -103,11 +103,13 @@ private:
bool foldSingleElementStore(Instruction &I);
bool scalarizeLoadExtract(Instruction &I);
bool foldShuffleOfBinops(Instruction &I);
+ bool foldShuffleFromReductions(Instruction &I);
+ bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
void replaceValue(Value &Old, Value &New) {
Old.replaceAllUsesWith(&New);
- New.takeName(&Old);
if (auto *NewI = dyn_cast<Instruction>(&New)) {
+ New.takeName(&Old);
Worklist.pushUsersToWorkList(*NewI);
Worklist.pushValue(NewI);
}
@@ -255,12 +257,12 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
ExtractElementInst *VectorCombine::getShuffleExtract(
ExtractElementInst *Ext0, ExtractElementInst *Ext1,
unsigned PreferredExtractIndex = InvalidIndex) const {
- assert(isa<ConstantInt>(Ext0->getIndexOperand()) &&
- isa<ConstantInt>(Ext1->getIndexOperand()) &&
- "Expected constant extract indexes");
+ auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
+ auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
+ assert(Index0C && Index1C && "Expected constant extract indexes");
- unsigned Index0 = cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue();
- unsigned Index1 = cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue();
+ unsigned Index0 = Index0C->getZExtValue();
+ unsigned Index1 = Index1C->getZExtValue();
// If the extract indexes are identical, no shuffle is needed.
if (Index0 == Index1)
@@ -306,9 +308,10 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
const Instruction &I,
ExtractElementInst *&ConvertToShuffle,
unsigned PreferredExtractIndex) {
- assert(isa<ConstantInt>(Ext0->getOperand(1)) &&
- isa<ConstantInt>(Ext1->getOperand(1)) &&
- "Expected constant extract indexes");
+ auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getOperand(1));
+ auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getOperand(1));
+ assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
+
unsigned Opcode = I.getOpcode();
Type *ScalarTy = Ext0->getType();
auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType());
@@ -331,8 +334,8 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
// Get cost estimates for the extract elements. These costs will factor into
// both sequences.
- unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue();
- unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue();
+ unsigned Ext0Index = Ext0IndexC->getZExtValue();
+ unsigned Ext1Index = Ext1IndexC->getZExtValue();
InstructionCost Extract0Cost =
TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index);
@@ -694,8 +697,9 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
ScalarInst->copyIRFlags(&I);
// Fold the vector constants in the original vectors into a new base vector.
- Constant *NewVecC = IsCmp ? ConstantExpr::getCompare(Pred, VecC0, VecC1)
- : ConstantExpr::get(Opcode, VecC0, VecC1);
+ Value *NewVecC =
+ IsCmp ? Builder.CreateCmp(Pred, VecC0, VecC1)
+ : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
replaceValue(I, *Insert);
return true;
@@ -1015,12 +1019,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
return false;
NumInstChecked++;
}
- }
-
- if (!LastCheckedInst)
- LastCheckedInst = UI;
- else if (LastCheckedInst->comesBefore(UI))
LastCheckedInst = UI;
+ }
auto ScalarIdx = canScalarizeAccess(FixedVT, UI->getOperand(1), &I, AC, DT);
if (!ScalarIdx.isSafe()) {
@@ -1117,6 +1117,339 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
return true;
}
+/// Given a commutative reduction, the order of the input lanes does not alter
+/// the results. We can use this to remove certain shuffles feeding the
+/// reduction, removing the need to shuffle at all.
+bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
+ auto *II = dyn_cast<IntrinsicInst>(&I);
+ if (!II)
+ return false;
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::vector_reduce_add:
+ case Intrinsic::vector_reduce_mul:
+ case Intrinsic::vector_reduce_and:
+ case Intrinsic::vector_reduce_or:
+ case Intrinsic::vector_reduce_xor:
+ case Intrinsic::vector_reduce_smin:
+ case Intrinsic::vector_reduce_smax:
+ case Intrinsic::vector_reduce_umin:
+ case Intrinsic::vector_reduce_umax:
+ break;
+ default:
+ return false;
+ }
+
+ // Find all the inputs when looking through operations that do not alter the
+ // lane order (binops, for example). Currently we look for a single shuffle,
+ // and can ignore splat values.
+ std::queue<Value *> Worklist;
+ SmallPtrSet<Value *, 4> Visited;
+ ShuffleVectorInst *Shuffle = nullptr;
+ if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
+ Worklist.push(Op);
+
+ while (!Worklist.empty()) {
+ Value *CV = Worklist.front();
+ Worklist.pop();
+ if (Visited.contains(CV))
+ continue;
+
+ // Splats don't change the order, so can be safely ignored.
+ if (isSplatValue(CV))
+ continue;
+
+ Visited.insert(CV);
+
+ if (auto *CI = dyn_cast<Instruction>(CV)) {
+ if (CI->isBinaryOp()) {
+ for (auto *Op : CI->operand_values())
+ Worklist.push(Op);
+ continue;
+ } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
+ if (Shuffle && Shuffle != SV)
+ return false;
+ Shuffle = SV;
+ continue;
+ }
+ }
+
+ // Anything else is currently an unknown node.
+ return false;
+ }
+
+ if (!Shuffle)
+ return false;
+
+ // Check all uses of the binary ops and shuffles are also included in the
+ // lane-invariant operations (Visited should be the list of lanewise
+ // instructions, including the shuffle that we found).
+ for (auto *V : Visited)
+ for (auto *U : V->users())
+ if (!Visited.contains(U) && U != &I)
+ return false;
+
+ FixedVectorType *VecType =
+ dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
+ if (!VecType)
+ return false;
+ FixedVectorType *ShuffleInputType =
+ dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
+ if (!ShuffleInputType)
+ return false;
+ int NumInputElts = ShuffleInputType->getNumElements();
+
+ // Find the mask from sorting the lanes into order. This is most likely to
+  // become an identity or concat mask. Undef elements are pushed to the end.
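+  // (Illustrative example: a reverse mask <3,2,1,0> sorts to the identity
+  // <0,1,2,3>, so the rewritten shuffle can fold away entirely; undef entries,
+  // compared as unsigned, land at the end.)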
+ SmallVector<int> ConcatMask;
+ Shuffle->getShuffleMask(ConcatMask);
+ sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
+ bool UsesSecondVec =
+ any_of(ConcatMask, [&](int M) { return M >= NumInputElts; });
+ InstructionCost OldCost = TTI.getShuffleCost(
+ UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
+ Shuffle->getShuffleMask());
+ InstructionCost NewCost = TTI.getShuffleCost(
+ UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
+ ConcatMask);
+
+ LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
+ << "\n");
+ LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
+ << "\n");
+ if (NewCost < OldCost) {
+ Builder.SetInsertPoint(Shuffle);
+ Value *NewShuffle = Builder.CreateShuffleVector(
+ Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
+ LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
+ replaceValue(*Shuffle, *NewShuffle);
+ }
+
+  // See if we can re-use foldSelectShuffle, getting it to reduce the size of
+  // the shuffles or put them into a nicer order, as it is free to ignore the
+  // lane order of the shuffles.
+ return foldSelectShuffle(*Shuffle, true);
+}
+
+/// This method looks for groups of shuffles acting on binops, of the form:
+/// %x = shuffle ...
+/// %y = shuffle ...
+/// %a = binop %x, %y
+/// %b = binop %x, %y
+/// shuffle %a, %b, selectmask
+/// We may, especially if the shuffle is wider than legal, be able to convert
+/// the shuffle to a form where only parts of a and b need to be computed. On
+/// architectures with no obvious "select" shuffle, this can reduce the total
+/// number of operations if the target reports them as cheaper.
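+///
+/// As an illustrative example, with <4 x i32> operands a select mask such as
+/// <0, 5, 2, 7> only needs lanes 0 and 2 of %a and lanes 1 and 3 of %b, so
+/// each binop can in principle be performed on a narrower, packed vector and
+/// the original lane order rebuilt with cheaper shuffles.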
+bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
+ auto *SVI = dyn_cast<ShuffleVectorInst>(&I);
+ auto *VT = dyn_cast<FixedVectorType>(I.getType());
+ if (!SVI || !VT)
+ return false;
+ auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
+ if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
+ VT != Op0->getType())
+ return false;
+ auto *SVI0A = dyn_cast<ShuffleVectorInst>(Op0->getOperand(0));
+ auto *SVI0B = dyn_cast<ShuffleVectorInst>(Op0->getOperand(1));
+ auto *SVI1A = dyn_cast<ShuffleVectorInst>(Op1->getOperand(0));
+ auto *SVI1B = dyn_cast<ShuffleVectorInst>(Op1->getOperand(1));
+ auto checkSVNonOpUses = [&](Instruction *I) {
+ if (!I || I->getOperand(0)->getType() != VT)
+ return true;
+ return any_of(I->users(), [&](User *U) { return U != Op0 && U != Op1; });
+ };
+ if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
+ checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
+ return false;
+
+ // Collect all the uses that are shuffles that we can transform together. We
+ // may not have a single shuffle, but a group that can all be transformed
+ // together profitably.
+ SmallVector<ShuffleVectorInst *> Shuffles;
+ auto collectShuffles = [&](Instruction *I) {
+ for (auto *U : I->users()) {
+ auto *SV = dyn_cast<ShuffleVectorInst>(U);
+ if (!SV || SV->getType() != VT)
+ return false;
+ if (!llvm::is_contained(Shuffles, SV))
+ Shuffles.push_back(SV);
+ }
+ return true;
+ };
+ if (!collectShuffles(Op0) || !collectShuffles(Op1))
+ return false;
+  // When coming from a reduction, we need to be processing a single shuffle,
+  // otherwise the other uses will not be lane-invariant.
+ if (FromReduction && Shuffles.size() > 1)
+ return false;
+
+ // For each of the output shuffles, we try to sort all the first vector
+  // elements to the beginning, followed by the second vector elements at the
+  // end. If the binops are legalized to smaller vectors, this may reduce the
+  // total number of binops. We compute the ReconstructMask needed to convert
+ // back to the original lane order.
+ SmallVector<int> V1, V2;
+ SmallVector<SmallVector<int>> ReconstructMasks;
+ int MaxV1Elt = 0, MaxV2Elt = 0;
+ unsigned NumElts = VT->getNumElements();
+ for (ShuffleVectorInst *SVN : Shuffles) {
+ SmallVector<int> Mask;
+ SVN->getShuffleMask(Mask);
+
+ // Check the operands are the same as the original, or reversed (in which
+ // case we need to commute the mask).
+ Value *SVOp0 = SVN->getOperand(0);
+ Value *SVOp1 = SVN->getOperand(1);
+ if (SVOp0 == Op1 && SVOp1 == Op0) {
+ std::swap(SVOp0, SVOp1);
+ ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
+ }
+ if (SVOp0 != Op0 || SVOp1 != Op1)
+ return false;
+
+    // Calculate the reconstruction mask for this shuffle, as the mask needed
+    // to take the packed values from Op0/Op1 and reconstruct them into the
+    // original lane order.
+ SmallVector<int> ReconstructMask;
+ for (unsigned I = 0; I < Mask.size(); I++) {
+ if (Mask[I] < 0) {
+ ReconstructMask.push_back(-1);
+ } else if (Mask[I] < static_cast<int>(NumElts)) {
+ MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
+ auto It = find(V1, Mask[I]);
+ if (It != V1.end())
+ ReconstructMask.push_back(It - V1.begin());
+ else {
+ ReconstructMask.push_back(V1.size());
+ V1.push_back(Mask[I]);
+ }
+ } else {
+ MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
+ auto It = find(V2, Mask[I] - NumElts);
+ if (It != V2.end())
+ ReconstructMask.push_back(NumElts + It - V2.begin());
+ else {
+ ReconstructMask.push_back(NumElts + V2.size());
+ V2.push_back(Mask[I] - NumElts);
+ }
+ }
+ }
+
+    // For reductions, we know that the output lane ordering doesn't alter the
+    // result. Sorting the mask in-order can help simplify the shuffle away.
+ if (FromReduction)
+ sort(ReconstructMask);
+ ReconstructMasks.push_back(ReconstructMask);
+ }
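+  // (Worked example, illustrative: with NumElts = 4 and the single mask
+  // <0, 5, 2, 7>, V1 becomes [0, 2], V2 becomes [1, 3], and the reconstruction
+  // mask is <0, 4, 1, 5>, interleaving lanes 0-1 of the packed first binop
+  // with lanes 0-1 of the packed second binop.)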
+
+  // If the maximum elements used from V1 and V2 are not larger than the new
+  // vectors, the vectors are already packed and performing the optimization
+ // again will likely not help any further. This also prevents us from getting
+ // stuck in a cycle in case the costs do not also rule it out.
+ if (V1.empty() || V2.empty() ||
+ (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
+ MaxV2Elt == static_cast<int>(V2.size()) - 1))
+ return false;
+
+  // Calculate the masks needed for the new input shuffles, which get padded
+  // with undef.
+ SmallVector<int> V1A, V1B, V2A, V2B;
+ for (unsigned I = 0; I < V1.size(); I++) {
+ V1A.push_back(SVI0A->getMaskValue(V1[I]));
+ V1B.push_back(SVI0B->getMaskValue(V1[I]));
+ }
+ for (unsigned I = 0; I < V2.size(); I++) {
+ V2A.push_back(SVI1A->getMaskValue(V2[I]));
+ V2B.push_back(SVI1B->getMaskValue(V2[I]));
+ }
+ while (V1A.size() < NumElts) {
+ V1A.push_back(UndefMaskElem);
+ V1B.push_back(UndefMaskElem);
+ }
+ while (V2A.size() < NumElts) {
+ V2A.push_back(UndefMaskElem);
+ V2B.push_back(UndefMaskElem);
+ }
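+  // (Continuing the illustrative example: with V1 = [0, 2], V1A and V1B each
+  // take the original input shuffles' mask values at lanes 0 and 2 and are
+  // then padded out to the full 4 elements with undef.)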
+
+ auto AddShuffleCost = [&](InstructionCost C, ShuffleVectorInst *SV) {
+ return C +
+ TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, SV->getShuffleMask());
+ };
+ auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
+ return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask);
+ };
+
+ // Get the costs of the shuffles + binops before and after with the new
+ // shuffle masks.
+ InstructionCost CostBefore =
+ TTI.getArithmeticInstrCost(Op0->getOpcode(), VT) +
+ TTI.getArithmeticInstrCost(Op1->getOpcode(), VT);
+ CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
+ InstructionCost(0), AddShuffleCost);
+ // This set helps us only cost each unique shuffle once.
+ SmallPtrSet<ShuffleVectorInst *, 4> InputShuffles(
+ {SVI0A, SVI0B, SVI1A, SVI1B});
+ CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
+ InstructionCost(0), AddShuffleCost);
+
+ // The new binops will be unused for lanes past the used shuffle lengths.
+ // These types attempt to get the correct cost for that from the target.
+ FixedVectorType *Op0SmallVT =
+ FixedVectorType::get(VT->getScalarType(), V1.size());
+ FixedVectorType *Op1SmallVT =
+ FixedVectorType::get(VT->getScalarType(), V2.size());
+ InstructionCost CostAfter =
+ TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT) +
+ TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT);
+ CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
+ InstructionCost(0), AddShuffleMaskCost);
+ std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
+ CostAfter +=
+ std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
+ InstructionCost(0), AddShuffleMaskCost);
+
+ if (CostBefore <= CostAfter)
+ return false;
+
+ // The cost model has passed, create the new instructions.
+ Builder.SetInsertPoint(SVI0A);
+ Value *NSV0A = Builder.CreateShuffleVector(SVI0A->getOperand(0),
+ SVI0A->getOperand(1), V1A);
+ Builder.SetInsertPoint(SVI0B);
+ Value *NSV0B = Builder.CreateShuffleVector(SVI0B->getOperand(0),
+ SVI0B->getOperand(1), V1B);
+ Builder.SetInsertPoint(SVI1A);
+ Value *NSV1A = Builder.CreateShuffleVector(SVI1A->getOperand(0),
+ SVI1A->getOperand(1), V2A);
+ Builder.SetInsertPoint(SVI1B);
+ Value *NSV1B = Builder.CreateShuffleVector(SVI1B->getOperand(0),
+ SVI1B->getOperand(1), V2B);
+ Builder.SetInsertPoint(Op0);
+ Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
+ NSV0A, NSV0B);
+ if (auto *I = dyn_cast<Instruction>(NOp0))
+ I->copyIRFlags(Op0, true);
+ Builder.SetInsertPoint(Op1);
+ Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
+ NSV1A, NSV1B);
+ if (auto *I = dyn_cast<Instruction>(NOp1))
+ I->copyIRFlags(Op1, true);
+
+ for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
+ Builder.SetInsertPoint(Shuffles[S]);
+ Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
+ replaceValue(*Shuffles[S], *NSV);
+ }
+
+ Worklist.pushValue(NSV0A);
+ Worklist.pushValue(NSV0B);
+ Worklist.pushValue(NSV1A);
+ Worklist.pushValue(NSV1B);
+ for (auto *S : Shuffles)
+ Worklist.add(S);
+ return true;
+}
+
/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
bool VectorCombine::run() {
@@ -1136,6 +1469,8 @@ bool VectorCombine::run() {
MadeChange |= foldBitcastShuf(I);
MadeChange |= foldExtractedCmps(I);
MadeChange |= foldShuffleOfBinops(I);
+ MadeChange |= foldShuffleFromReductions(I);
+ MadeChange |= foldSelectShuffle(I);
}
MadeChange |= scalarizeBinopOrCmp(I);
MadeChange |= scalarizeLoadExtract(I);
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/Vectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/Vectorize.cpp
index 010ca28fc237..208e5eeea864 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/Vectorize.cpp
@@ -15,7 +15,6 @@
#include "llvm/Transforms/Vectorize.h"
#include "llvm-c/Initialization.h"
#include "llvm-c/Transforms/Vectorize.h"
-#include "llvm/Analysis/Passes.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/InitializePasses.h"
#include "llvm/PassRegistry.h"