| author | Dimitry Andric <dim@FreeBSD.org> | 2022-07-04 19:20:19 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2023-02-08 19:02:26 +0000 |
| commit | 81ad626541db97eb356e2c1d4a20eb2a26a766ab (patch) | |
| tree | 311b6a8987c32b1e1dcbab65c54cfac3fdb56175 /contrib/llvm-project/llvm/lib/Transforms/Vectorize | |
| parent | 5fff09660e06a66bed6482da9c70df328e16bbb6 (diff) | |
| parent | 145449b1e420787bb99721a429341fa6be3adfb6 (diff) | |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize')
21 files changed, 6532 insertions, 3776 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 97c2acb7d4c7..f59fc3a6dd60 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -62,14 +62,13 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" -#include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -497,7 +496,7 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, if (PtrDelta.urem(Stride) != 0) return false; unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits(); - APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth); + APInt IdxDiff = PtrDelta.udiv(Stride).zext(IdxBitWidth); // Only look through a ZExt/SExt. if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA)) @@ -1298,10 +1297,16 @@ bool Vectorizer::vectorizeLoadChain( CV->replaceAllUsesWith(V); } - // Bitcast might not be an Instruction, if the value being loaded is a - // constant. In that case, no need to reorder anything. - if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast)) - reorder(BitcastInst); + // Since we might have opaque pointers we might end up using the pointer + // operand of the first load (wrt. memory loaded) for the vector load. Since + // this first load might not be the first in the block we potentially need to + // reorder the pointer operand (and its operands). If we have a bitcast though + // it might be before the load and should be the reorder start instruction. + // "Might" because for opaque pointers the "bitcast" is just the first loads + // pointer operand, as oppposed to something we inserted at the right position + // ourselves. + Instruction *BCInst = dyn_cast<Instruction>(Bitcast); + reorder((BCInst && BCInst != L0->getPointerOperand()) ? 
BCInst : LI); eraseInstructions(Chain); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 81e5aa223c07..6242d9a93fc1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -17,7 +17,9 @@ #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/IntrinsicInst.h" @@ -31,8 +33,6 @@ using namespace PatternMatch; #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME -extern cl::opt<bool> EnableVPlanPredication; - static cl::opt<bool> EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); @@ -439,6 +439,26 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, return false; } +/// Returns true if A and B have same pointer operands or same SCEVs addresses +static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A, + StoreInst *B) { + // Compare store + if (A == B) + return true; + + // Otherwise Compare pointers + Value *APtr = A->getPointerOperand(); + Value *BPtr = B->getPointerOperand(); + if (APtr == BPtr) + return true; + + // Otherwise compare address SCEVs + if (SE->getSCEV(APtr) == SE->getSCEV(BPtr)) + return true; + + return false; +} + int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy, Value *Ptr) const { const ValueToValueMap &Strides = @@ -487,7 +507,7 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() { // FIXME: We skip these checks when VPlan predication is enabled as we // want to allow divergent branches. This whole check will be removed // once VPlan predication is on by default. - if (!EnableVPlanPredication && Br && Br->isConditional() && + if (Br && Br->isConditional() && !TheLoop->isLoopInvariant(Br->getCondition()) && !LI->isLoopHeader(Br->getSuccessor(0)) && !LI->isLoopHeader(Br->getSuccessor(1))) { @@ -572,7 +592,7 @@ void LoopVectorizationLegality::addInductionPhi( // on predicates that only hold within the loop, since allowing the exit // currently means re-using this SCEV outside the loop (see PR33706 for more // details). 
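For illustration, the new storeToSameAddress helper above (compare the stores themselves, then their pointer operands, then their SCEVs) supports the legality change further down that lets loops storing a reduction value to a loop-invariant address be vectorized. A minimal sketch of that source pattern; the function and variable names are illustrative, not taken from the patch:

```cpp
// Each iteration stores the running reduction to the same invariant address;
// only the final value of 'sum' must be visible in *out after the loop, so
// the store can be sunk and the loop vectorized (with runtime checks that
// *out does not alias the input, as the patch notes).
void reduce_into(const int *a, int n, int *out) {
  int sum = 0;
  for (int i = 0; i < n; ++i) {
    sum += a[i];
    *out = sum; // invariant store of the reduction
  }
}
```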
- if (PSE.getUnionPredicate().isAlwaysTrue()) { + if (PSE.getPredicate().isAlwaysTrue()) { AllowedExit.insert(Phi); AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch())); } @@ -676,7 +696,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { RecurrenceDescriptor RedDes; if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, - DT)) { + DT, PSE.getSE())) { Requirements->addExactFPMathInst(RedDes.getExactFPMathInst()); AllowedExit.insert(RedDes.getLoopExitInstr()); Reductions[Phi] = RedDes; @@ -770,7 +790,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { auto *SE = PSE.getSE(); Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI); for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) - if (hasVectorInstrinsicScalarOpd(IntrinID, i)) { + if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, i)) { if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) { reportVectorizationFailure("Found unvectorizable intrinsic", "intrinsic instruction cannot be vectorized", @@ -849,7 +869,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // used outside the loop only if the SCEV predicates within the loop is // same as outside the loop. Allowing the exit means reusing the SCEV // outside the loop. - if (PSE.getUnionPredicate().isAlwaysTrue()) { + if (PSE.getPredicate().isAlwaysTrue()) { AllowedExit.insert(&I); continue; } @@ -911,15 +931,70 @@ bool LoopVectorizationLegality::canVectorizeMemory() { if (!LAI->canVectorizeMemory()) return false; - if (LAI->hasDependenceInvolvingLoopInvariantAddress()) { - reportVectorizationFailure("Stores to a uniform address", - "write to a loop invariant address could not be vectorized", - "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop); - return false; + // We can vectorize stores to invariant address when final reduction value is + // guaranteed to be stored at the end of the loop. Also, if decision to + // vectorize loop is made, runtime checks are added so as to make sure that + // invariant address won't alias with any other objects. + if (!LAI->getStoresToInvariantAddresses().empty()) { + // For each invariant address, check its last stored value is unconditional. + for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) { + if (isInvariantStoreOfReduction(SI) && + blockNeedsPredication(SI->getParent())) { + reportVectorizationFailure( + "We don't allow storing to uniform addresses", + "write of conditional recurring variant value to a loop " + "invariant address could not be vectorized", + "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop); + return false; + } + } + + if (LAI->hasDependenceInvolvingLoopInvariantAddress()) { + // For each invariant address, check its last stored value is the result + // of one of our reductions. + // + // We do not check if dependence with loads exists because they are + // currently rejected earlier in LoopAccessInfo::analyzeLoop. In case this + // behaviour changes we have to modify this code. + ScalarEvolution *SE = PSE.getSE(); + SmallVector<StoreInst *, 4> UnhandledStores; + for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) { + if (isInvariantStoreOfReduction(SI)) { + // Earlier stores to this address are effectively deadcode. + // With opaque pointers it is possible for one pointer to be used with + // different sizes of stored values: + // store i32 0, ptr %x + // store i8 0, ptr %x + // The latest store doesn't complitely overwrite the first one in the + // example. 
That is why we have to make sure that types of stored + // values are same. + // TODO: Check that bitwidth of unhandled store is smaller then the + // one that overwrites it and add a test. + erase_if(UnhandledStores, [SE, SI](StoreInst *I) { + return storeToSameAddress(SE, SI, I) && + I->getValueOperand()->getType() == + SI->getValueOperand()->getType(); + }); + continue; + } + UnhandledStores.push_back(SI); + } + + bool IsOK = UnhandledStores.empty(); + // TODO: we should also validate against InvariantMemSets. + if (!IsOK) { + reportVectorizationFailure( + "We don't allow storing to uniform addresses", + "write to a loop invariant address could not " + "be vectorized", + "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop); + return false; + } + } } Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); - PSE.addPredicate(LAI->getPSE().getUnionPredicate()); + PSE.addPredicate(LAI->getPSE().getPredicate()); return true; } @@ -949,6 +1024,26 @@ bool LoopVectorizationLegality::canVectorizeFPMath( })); } +bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) { + return any_of(getReductionVars(), [&](auto &Reduction) -> bool { + const RecurrenceDescriptor &RdxDesc = Reduction.second; + return RdxDesc.IntermediateStore == SI; + }); +} + +bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value *V) { + return any_of(getReductionVars(), [&](auto &Reduction) -> bool { + const RecurrenceDescriptor &RdxDesc = Reduction.second; + if (!RdxDesc.IntermediateStore) + return false; + + ScalarEvolution *SE = PSE.getSE(); + Value *InvariantAddress = RdxDesc.IntermediateStore->getPointerOperand(); + return V == InvariantAddress || + SE->getSCEV(V) == SE->getSCEV(InvariantAddress); + }); +} + bool LoopVectorizationLegality::isInductionPhi(const Value *V) const { Value *In0 = const_cast<Value *>(V); PHINode *PN = dyn_cast_or_null<PHINode>(In0); @@ -969,6 +1064,16 @@ LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode *Phi) const { return nullptr; } +const InductionDescriptor * +LoopVectorizationLegality::getPointerInductionDescriptor(PHINode *Phi) const { + if (!isInductionPhi(Phi)) + return nullptr; + auto &ID = getInductionVars().find(Phi)->second; + if (ID.getKind() == InductionDescriptor::IK_PtrInduction) + return &ID; + return nullptr; +} + bool LoopVectorizationLegality::isCastedInductionVariable( const Value *V) const { auto *Inst = dyn_cast<Instruction>(V); @@ -1266,7 +1371,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; - if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) { + if (PSE.getPredicate().getComplexity() > SCEVThreshold) { reportVectorizationFailure("Too many SCEV checks needed", "Too many SCEV assumptions need to be made and checked at runtime", "TooManySCEVRunTimeChecks", ORE, TheLoop); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 71eb39a18d2f..0cb2032fa45a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -25,6 +25,7 @@ #define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H #include "VPlan.h" +#include "llvm/Support/InstructionCost.h" namespace llvm { @@ -59,7 +60,7 @@ class VPBuilder { } public: - VPBuilder() {} 
+ VPBuilder() = default; /// Clear the insertion point: created instructions will not be inserted into /// a block. @@ -187,12 +188,16 @@ struct VectorizationFactor { /// Cost of the loop with that width. InstructionCost Cost; - VectorizationFactor(ElementCount Width, InstructionCost Cost) - : Width(Width), Cost(Cost) {} + /// Cost of the scalar loop. + InstructionCost ScalarCost; + + VectorizationFactor(ElementCount Width, InstructionCost Cost, + InstructionCost ScalarCost) + : Width(Width), Cost(Cost), ScalarCost(ScalarCost) {} /// Width 1 means no vectorization, cost 0 means uncomputed cost. static VectorizationFactor Disabled() { - return {ElementCount::getFixed(1), 0}; + return {ElementCount::getFixed(1), 0, 0}; } bool operator==(const VectorizationFactor &rhs) const { @@ -298,8 +303,12 @@ public: /// Generate the IR code for the body of the vectorized loop according to the /// best selected \p VF, \p UF and VPlan \p BestPlan. + /// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue + /// vectorization re-using plans for both the main and epilogue vector loops. + /// It should be removed once the re-use issue has been fixed. void executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, - InnerLoopVectorizer &LB, DominatorTree *DT); + InnerLoopVectorizer &LB, DominatorTree *DT, + bool IsEpilogueVectorization); #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void printPlans(raw_ostream &O); @@ -319,6 +328,9 @@ public: getDecisionAndClampRange(const std::function<bool(ElementCount)> &Predicate, VFRange &Range); + /// Check if the number of runtime checks exceeds the threshold. + bool requiresTooManyRuntimeChecks() const; + protected: /// Collect the instructions from the original loop that would be trivially /// dead in the vectorized loop if generated. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 51d2c6237af1..b637b2d5ddae 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -58,7 +58,6 @@ #include "VPRecipeBuilder.h" #include "VPlan.h" #include "VPlanHCFGBuilder.h" -#include "VPlanPredicator.h" #include "VPlanTransforms.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" @@ -112,7 +111,6 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" @@ -144,10 +142,10 @@ #include <algorithm> #include <cassert> #include <cstdint> -#include <cstdlib> #include <functional> #include <iterator> #include <limits> +#include <map> #include <memory> #include <string> #include <tuple> @@ -346,13 +344,6 @@ cl::opt<bool> EnableVPlanNativePath( cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization.")); -// FIXME: Remove this switch once we have divergence analysis. Currently we -// assume divergent non-backedge branches when this switch is true. -cl::opt<bool> EnableVPlanPredication( - "enable-vplan-predication", cl::init(false), cl::Hidden, - cl::desc("Enable VPlan-native vectorization path predicator with " - "support for outer loop vectorization.")); - // This flag enables the stress testing of the VPlan H-CFG construction in the // VPlan-native vectorization path. It must be used in conjuction with // -enable-vplan-native-path. 
-vplan-verify-hcfg can also be used to enable the @@ -481,7 +472,7 @@ public: VPTransformState &State); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. - void fixVectorizedLoop(VPTransformState &State); + void fixVectorizedLoop(VPTransformState &State, VPlan &Plan); // Return true if any runtime check is added. bool areSafetyChecksAdded() { return AddedSafetyChecks; } @@ -491,12 +482,6 @@ public: /// new unrolled loop, where UF is the unroll factor. using VectorParts = SmallVector<Value *, 2>; - /// Vectorize a single first-order recurrence or pointer induction PHINode in - /// a block. This method handles the induction variable canonicalization. It - /// supports both VF = 1 for unrolled loops and arbitrary length vectors. - void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR, - VPTransformState &State); - /// A helper function to scalarize a single Instruction in the innermost loop. /// Generates a sequence of scalar instances for each lane between \p MinLane /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, @@ -506,13 +491,6 @@ public: const VPIteration &Instance, bool IfPredicateInstr, VPTransformState &State); - /// Widen an integer or floating-point induction variable \p IV. If \p Trunc - /// is provided, the integer induction variable will first be truncated to - /// the corresponding type. \p CanonicalIV is the scalar value generated for - /// the canonical induction variable. - void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, - VPTransformState &State, Value *CanonicalIV); - /// Construct the vector value of a scalarized value \p V one lane at a time. void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, VPTransformState &State); @@ -527,13 +505,8 @@ public: ArrayRef<VPValue *> StoredValues, VPValue *BlockInMask = nullptr); - /// Set the debug location in the builder \p Ptr using the debug location in - /// \p V. If \p Ptr is None then it uses the class member's Builder. - void setDebugLocFromInst(const Value *V, - Optional<IRBuilder<> *> CustomBuilder = None); - - /// Fix the non-induction PHIs in the OrigPHIsToFix vector. - void fixNonInductionPHIs(VPTransformState &State); + /// Fix the non-induction PHIs in \p Plan. + void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State); /// Returns true if the reordering of FP operations is not allowed, but we are /// able to vectorize with strict in-order reductions for the given RdxDesc. @@ -546,17 +519,6 @@ public: /// element. virtual Value *getBroadcastInstrs(Value *V); - /// Add metadata from one instruction to another. - /// - /// This includes both the original MDs from \p From and additional ones (\see - /// addNewMetadata). Use this for *newly created* instructions in the vector - /// loop. - void addMetadata(Instruction *To, Instruction *From); - - /// Similar to the previous function but it adds the metadata to a - /// vector of instructions. - void addMetadata(ArrayRef<Value *> To, Instruction *From); - // Returns the resume value (bc.merge.rdx) for a reduction as // generated by fixReduction. PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc); @@ -575,13 +537,9 @@ protected: /// Set up the values of the IVs correctly when exiting the vector loop. 
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, - Value *CountRoundDown, Value *EndValue, - BasicBlock *MiddleBlock); - - /// Introduce a conditional branch (on true, condition to be set later) at the - /// end of the header=latch connecting it to itself (across the backedge) and - /// to the exit block of \p L. - void createHeaderBranch(Loop *L); + Value *VectorTripCount, Value *EndValue, + BasicBlock *MiddleBlock, BasicBlock *VectorHeader, + VPlan &Plan); /// Handle all cross-iteration phis in the header. void fixCrossIterationPHIs(VPTransformState &State); @@ -595,16 +553,9 @@ protected: void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); /// Clear NSW/NUW flags from reduction instructions if necessary. - void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, + void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, VPTransformState &State); - /// Fixup the LCSSA phi nodes in the unique exit block. This simply - /// means we need to add the appropriate incoming value from the middle - /// block as exiting edges from the scalar epilogue loop (if present) are - /// already in place, and we exit the vector loop exclusively to the middle - /// block. - void fixLCSSAPHIs(VPTransformState &State); - /// Iteratively sink the scalarized operands of a predicated instruction into /// the block that was created for it. void sinkScalarOperands(Instruction *PredInst); @@ -613,30 +564,11 @@ protected: /// represented as. void truncateToMinimalBitwidths(VPTransformState &State); - /// Compute scalar induction steps. \p ScalarIV is the scalar induction - /// variable on which to base the steps, \p Step is the size of the step, and - /// \p EntryVal is the value from the original loop that maps to the steps. - /// Note that \p EntryVal doesn't have to be an induction variable - it - /// can also be a truncate instruction. - void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, - const InductionDescriptor &ID, VPValue *Def, - VPTransformState &State); - - /// Create a vector induction phi node based on an existing scalar one. \p - /// EntryVal is the value from the original loop that maps to the vector phi - /// node, and \p Step is the loop-invariant step. If \p EntryVal is a - /// truncate instruction, instead of widening the original IV, we widen a - /// version of the IV truncated to \p EntryVal's type. - void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, - Value *Step, Value *Start, - Instruction *EntryVal, VPValue *Def, - VPTransformState &State); - /// Returns (and creates if needed) the original loop trip count. - Value *getOrCreateTripCount(Loop *NewLoop); + Value *getOrCreateTripCount(BasicBlock *InsertBlock); /// Returns (and creates if needed) the trip count of the widened loop. - Value *getOrCreateVectorTripCount(Loop *NewLoop); + Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); /// Returns a bitcasted value to the requested vector type. /// Also handles bitcasts of vector<float> <-> vector<pointer> types. @@ -645,33 +577,21 @@ protected: /// Emit a bypass check to see if the vector trip count is zero, including if /// it overflows. - void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); + void emitIterationCountCheck(BasicBlock *Bypass); /// Emit a bypass check to see if all of the SCEV assumptions we've /// had to make are correct. Returns the block containing the checks or /// nullptr if no checks have been added. 
- BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass); + BasicBlock *emitSCEVChecks(BasicBlock *Bypass); /// Emit bypass checks to check any memory assumptions we may have made. /// Returns the block containing the checks or nullptr if no checks have been /// added. - BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); - - /// Compute the transformed value of Index at offset StartValue using step - /// StepValue. - /// For integer induction, returns StartValue + Index * StepValue. - /// For pointer induction, returns StartValue[Index * StepValue]. - /// FIXME: The newly created binary instructions should contain nsw/nuw - /// flags, which can be found from the original scalar operations. - Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, - const DataLayout &DL, - const InductionDescriptor &ID, - BasicBlock *VectorHeader) const; + BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass); /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, - /// vector loop preheader, middle block and scalar preheader. Also - /// allocate a loop object for the new vector loop and return it. - Loop *createVectorLoopSkeleton(StringRef Prefix); + /// vector loop preheader, middle block and scalar preheader. + void createVectorLoopSkeleton(StringRef Prefix); /// Create new phi nodes for the induction variables to resume iteration count /// in the scalar epilogue, from where the vectorized loop left off. @@ -680,21 +600,12 @@ protected: /// block, the \p AdditionalBypass pair provides information about the bypass /// block and the end value on the edge from bypass to this loop. void createInductionResumeValues( - Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); /// Complete the loop skeleton by adding debug MDs, creating appropriate /// conditional branches in the middle block, preparing the builder and - /// running the verifier. Take in the vector loop \p L as argument, and return - /// the preheader of the completed vector loop. - BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); - - /// Add additional metadata to \p To that was not present on \p Orig. - /// - /// Currently this is used to add the noalias annotations based on the - /// inserted memchecks. Use this for instructions that are *cloned* into the - /// vector loop. - void addNewMetadata(Instruction *To, const Instruction *Orig); + /// running the verifier. Return the preheader of the completed vector loop. + BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID); /// Collect poison-generating recipes that may generate a poison value that is /// used after vectorization, even when their operands are not poison. Those @@ -741,13 +652,6 @@ protected: /// Interface to emit optimization remarks. OptimizationRemarkEmitter *ORE; - /// LoopVersioning. It's only set up (non-null) if memchecks were - /// used. - /// - /// This is currently only used to add no-alias metadata based on the - /// memchecks. The actually versioning is performed manually. - std::unique_ptr<LoopVersioning> LVer; - /// The vectorization SIMD factor to use. Each vector will have this many /// vector elements. ElementCount VF; @@ -774,9 +678,6 @@ protected: /// there can be multiple exiting edges reaching this block. BasicBlock *LoopExitBlock; - /// The vector loop body. - BasicBlock *LoopVectorBody; - /// The scalar loop body. BasicBlock *LoopScalarBody; @@ -805,10 +706,6 @@ protected: // so we can later fix-up the external users of the induction variables. 
DenseMap<PHINode *, Value *> IVEndValues; - // Vector of original scalar PHIs whose corresponding widened PHIs need to be - // fixed up at the end of vector code generation. - SmallVector<PHINode *, 8> OrigPHIsToFix; - /// BFI and PSI are used to check for profile guided size optimizations. BlockFrequencyInfo *BFI; ProfileSummaryInfo *PSI; @@ -936,8 +833,7 @@ protected: /// Emits an iteration count bypass check once for the main loop (when \p /// ForEpilogue is false) and once for the epilogue loop (when \p /// ForEpilogue is true). - BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, - bool ForEpilogue); + BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue); void printDebugTracesAtStart() override; void printDebugTracesAtEnd() override; }; @@ -956,7 +852,9 @@ public: BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks) : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, LVL, CM, BFI, PSI, Checks) {} + EPI, LVL, CM, BFI, PSI, Checks) { + TripCount = EPI.TripCount; + } /// Implements the interface for creating a vectorized skeleton using the /// *epilogue loop* strategy (ie the second pass of vplan execution). std::pair<BasicBlock *, Value *> @@ -966,7 +864,7 @@ protected: /// Emits an iteration count bypass check after the main vector loop has /// finished to see if there are any iterations left to execute by either /// the vector epilogue or the scalar epilogue. - BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L, + BasicBlock *emitMinimumVectorEpilogueIterCountCheck( BasicBlock *Bypass, BasicBlock *Insert); void printDebugTracesAtStart() override; @@ -993,31 +891,6 @@ static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { return I; } -void InnerLoopVectorizer::setDebugLocFromInst( - const Value *V, Optional<IRBuilder<> *> CustomBuilder) { - IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder; - if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) { - const DILocation *DIL = Inst->getDebugLoc(); - - // When a FSDiscriminator is enabled, we don't need to add the multiply - // factors to the discriminators. - if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && - !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) { - // FIXME: For scalable vectors, assume vscale=1. - auto NewDIL = - DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); - if (NewDIL) - B->SetCurrentDebugLocation(NewDIL.getValue()); - else - LLVM_DEBUG(dbgs() - << "Failed to create new discriminator: " - << DIL->getFilename() << " Line: " << DIL->getLine()); - } else - B->SetCurrentDebugLocation(DIL); - } else - B->SetCurrentDebugLocation(DebugLoc()); -} - /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I /// is passed, the message relates to that particular instruction. #ifndef NDEBUG @@ -1059,7 +932,7 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, namespace llvm { /// Return a value for Step multiplied by VF. -Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, +Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step) { assert(Ty->isIntegerTy() && "Expected an integer step"); Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue()); @@ -1067,12 +940,13 @@ Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, } /// Return the runtime value for VF. 
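createStepForVF above and getRuntimeVF below (both now taking IRBuilderBase) materialize the element count at run time: a plain constant for fixed-width vectors, and the known minimum element count scaled by vscale for scalable ones. A minimal arithmetic sketch of the values they produce, with assumed inputs; this is not LLVM API code:

```cpp
#include <cstdint>

// knownMinVF: minimum lanes encoded in the type, e.g. 4 for <vscale x 4 x i32>.
// vscale: the target's runtime scaling factor (conceptually 1 for fixed VFs).
uint64_t runtimeVF(uint64_t knownMinVF, uint64_t vscale, bool scalable) {
  return scalable ? knownMinVF * vscale : knownMinVF;
}

// Step used e.g. by the minimum-iteration check: VF * UF, scaled the same way.
uint64_t stepForVF(uint64_t knownMinVF, uint64_t UF, uint64_t vscale,
                   bool scalable) {
  return runtimeVF(knownMinVF, vscale, scalable) * UF;
}
```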
-Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { +Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); return VF.isScalable() ? B.CreateVScale(EC) : EC; } -static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) { +static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy, + ElementCount VF) { assert(FTy->isFloatingPointTy() && "Expected floating point type!"); Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits()); Value *RuntimeVF = getRuntimeVF(B, IntTy, VF); @@ -1119,14 +993,6 @@ static std::string getDebugLocString(const Loop *L) { } #endif -void InnerLoopVectorizer::addNewMetadata(Instruction *To, - const Instruction *Orig) { - // If the loop was versioned with memchecks, add the corresponding no-alias - // metadata. - if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) - LVer->annotateInstWithNoAlias(To, Orig); -} - void InnerLoopVectorizer::collectPoisonGeneratingRecipes( VPTransformState &State) { @@ -1151,6 +1017,7 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( // handled. if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || isa<VPInterleaveRecipe>(CurRec) || + isa<VPScalarIVStepsRecipe>(CurRec) || isa<VPCanonicalIVPHIRecipe>(CurRec)) continue; @@ -1176,10 +1043,10 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { for (VPRecipeBase &Recipe : *VPBB) { if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) { - Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr(); + Instruction &UnderlyingInstr = WidenRec->getIngredient(); VPDef *AddrDef = WidenRec->getAddr()->getDef(); - if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr && - Legal->blockNeedsPredication(UnderlyingInstr->getParent())) + if (AddrDef && WidenRec->isConsecutive() && + Legal->blockNeedsPredication(UnderlyingInstr.getParent())) collectPoisonGeneratingInstrsInBackwardSlice( cast<VPRecipeBase>(AddrDef)); } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) { @@ -1206,20 +1073,6 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( } } -void InnerLoopVectorizer::addMetadata(Instruction *To, - Instruction *From) { - propagateMetadata(To, From); - addNewMetadata(To, From); -} - -void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, - Instruction *From) { - for (Value *V : To) { - if (Instruction *I = dyn_cast<Instruction>(V)) - addMetadata(I, From); - } -} - PHINode *InnerLoopVectorizer::getReductionResumeValue( const RecurrenceDescriptor &RdxDesc) { auto It = ReductionResumeValues.find(&RdxDesc); @@ -1363,7 +1216,7 @@ public: /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, /// the IsOrdered flag of RdxDesc is set and we do not allow reordering /// of FP operations. - bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) { + bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const { return !Hints->allowReordering() && RdxDesc.isOrdered(); } @@ -1718,15 +1571,10 @@ private: /// \return the maximized element count based on the targets vector /// registers and the loop trip-count, but limited to a maximum safe VF. /// This is a helper function of computeFeasibleMaxVF. 
- /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure - /// issue that occurred on one of the buildbots which cannot be reproduced - /// without having access to the properietary compiler (see comments on - /// D98509). The issue is currently under investigation and this workaround - /// will be removed as soon as possible. ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, - const ElementCount &MaxSafeVF, + ElementCount MaxSafeVF, bool FoldTailByMasking); /// \return the maximum legal scalable VF, based on the safe max number @@ -2017,7 +1865,7 @@ public: /// there is no vector code generation, the check blocks are removed /// completely. void Create(Loop *L, const LoopAccessInfo &LAI, - const SCEVUnionPredicate &UnionPred) { + const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { BasicBlock *LoopHeader = L->getHeader(); BasicBlock *Preheader = L->getLoopPreheader(); @@ -2040,9 +1888,19 @@ public: MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, "vector.memcheck"); - MemRuntimeCheckCond = - addRuntimeChecks(MemCheckBlock->getTerminator(), L, - RtPtrChecking.getChecks(), MemCheckExp); + auto DiffChecks = RtPtrChecking.getDiffChecks(); + if (DiffChecks) { + MemRuntimeCheckCond = addDiffRuntimeChecks( + MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp, + [VF](IRBuilderBase &B, unsigned Bits) { + return getRuntimeVF(B, B.getIntNTy(Bits), VF); + }, + IC); + } else { + MemRuntimeCheckCond = + addRuntimeChecks(MemCheckBlock->getTerminator(), L, + RtPtrChecking.getChecks(), MemCheckExp); + } assert(MemRuntimeCheckCond && "no RT checks generated although RtPtrChecking " "claimed checks are required"); @@ -2114,12 +1972,16 @@ public: /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and /// adjusts the branches to branch to the vector preheader or \p Bypass, /// depending on the generated condition. - BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, + BasicBlock *emitSCEVChecks(BasicBlock *Bypass, BasicBlock *LoopVectorPreHeader, BasicBlock *LoopExitBlock) { if (!SCEVCheckCond) return nullptr; - if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) + + Value *Cond = SCEVCheckCond; + // Mark the check as used, to prevent it from being removed during cleanup. + SCEVCheckCond = nullptr; + if (auto *C = dyn_cast<ConstantInt>(Cond)) if (C->isZero()) return nullptr; @@ -2138,18 +2000,15 @@ public: DT->addNewBlock(SCEVCheckBlock, Pred); DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); - ReplaceInstWithInst( - SCEVCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); - // Mark the check as used, to prevent it from being removed during cleanup. - SCEVCheckCond = nullptr; + ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, Cond)); return SCEVCheckBlock; } /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts /// the branches to branch to the vector preheader or \p Bypass, depending on /// the generated condition. - BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, + BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, BasicBlock *LoopVectorPreHeader) { // Check if we generated code that checks in runtime if arrays overlap. if (!MemRuntimeCheckCond) @@ -2346,7 +2205,7 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { /// \p Opcode is relevant for FP induction variable. 
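getStepVector, defined next, builds the per-lane values of a widened induction: a vector of consecutive step multiples added to the broadcast start value. A scalar sketch of the integer (Add/Mul) case with assumed inputs, not the FP variant:

```cpp
#include <cstdint>
#include <vector>

// Lane i receives Val + (StartIdx + i) * Step.
std::vector<int64_t> stepVectorLanes(int64_t Val, int64_t StartIdx,
                                     int64_t Step, unsigned VF) {
  std::vector<int64_t> Lanes(VF);
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    Lanes[Lane] = Val + (StartIdx + Lane) * Step;
  return Lanes;
}
// e.g. stepVectorLanes(/*Val=*/10, /*StartIdx=*/0, /*Step=*/3, /*VF=*/4)
//      yields {10, 13, 16, 19}.
```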
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, Instruction::BinaryOps BinOp, ElementCount VF, - IRBuilder<> &Builder) { + IRBuilderBase &Builder) { assert(VF.isVector() && "only vector VFs are supported"); // Create and check the types. @@ -2362,9 +2221,8 @@ static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, // Create a vector of consecutive numbers from zero to VF. VectorType *InitVecValVTy = ValVTy; - Type *InitVecValSTy = STy; if (STy->isFloatingPointTy()) { - InitVecValSTy = + Type *InitVecValSTy = IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); InitVecValVTy = VectorType::get(InitVecValSTy, VLen); } @@ -2394,198 +2252,12 @@ static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); } -void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( - const InductionDescriptor &II, Value *Step, Value *Start, - Instruction *EntryVal, VPValue *Def, VPTransformState &State) { - IRBuilder<> &Builder = State.Builder; - assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && - "Expected either an induction phi-node or a truncate of it!"); - - // Construct the initial value of the vector IV in the vector loop preheader - auto CurrIP = Builder.saveIP(); - Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - if (isa<TruncInst>(EntryVal)) { - assert(Start->getType()->isIntegerTy() && - "Truncation requires an integer type"); - auto *TruncType = cast<IntegerType>(EntryVal->getType()); - Step = Builder.CreateTrunc(Step, TruncType); - Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); - } - - Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); - Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); - Value *SteppedStart = getStepVector( - SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder); - - // We create vector phi nodes for both integer and floating-point induction - // variables. Here, we determine the kind of arithmetic we will perform. - Instruction::BinaryOps AddOp; - Instruction::BinaryOps MulOp; - if (Step->getType()->isIntegerTy()) { - AddOp = Instruction::Add; - MulOp = Instruction::Mul; - } else { - AddOp = II.getInductionOpcode(); - MulOp = Instruction::FMul; - } - - // Multiply the vectorization factor by the step using integer or - // floating-point arithmetic as appropriate. - Type *StepType = Step->getType(); - Value *RuntimeVF; - if (Step->getType()->isFloatingPointTy()) - RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); - else - RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); - Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); - - // Create a vector splat to use in the induction update. - // - // FIXME: If the step is non-constant, we create the vector splat with - // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't - // handle a constant vector splat. - Value *SplatVF = isa<Constant>(Mul) - ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) - : Builder.CreateVectorSplat(State.VF, Mul); - Builder.restoreIP(CurrIP); - - // We may need to add the step a number of times, depending on the unroll - // factor. The last of those goes into the PHI. 
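For concreteness, the unrolled parts described above advance the vector IV by VF * Step per part, and the value after the last part is what feeds the vec.ind PHI across the backedge. A sketch with assumed VF = 4 and UF = 2 (fixed-width, integer induction):

```cpp
#include <array>
#include <cstdint>

// Part p, lane l holds Start + (p * VF + l) * Step; the "step.add" between
// parts corresponds to adding the splat of VF * Step.
std::array<std::array<int64_t, 4>, 2> unrolledVectorIV(int64_t Start,
                                                       int64_t Step) {
  std::array<std::array<int64_t, 4>, 2> Parts{};
  for (unsigned Part = 0; Part < 2; ++Part)
    for (unsigned Lane = 0; Lane < 4; ++Lane)
      Parts[Part][Lane] = Start + int64_t(Part * 4 + Lane) * Step;
  return Parts; // Start = 0, Step = 1 gives {0,1,2,3} and {4,5,6,7}.
}
```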
- PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", - &*LoopVectorBody->getFirstInsertionPt()); - VecInd->setDebugLoc(EntryVal->getDebugLoc()); - Instruction *LastInduction = VecInd; - for (unsigned Part = 0; Part < UF; ++Part) { - State.set(Def, LastInduction, Part); - - if (isa<TruncInst>(EntryVal)) - addMetadata(LastInduction, EntryVal); - - LastInduction = cast<Instruction>( - Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); - LastInduction->setDebugLoc(EntryVal->getDebugLoc()); - } - - // Move the last step to the end of the latch block. This ensures consistent - // placement of all induction updates. - auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); - auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); - LastInduction->moveBefore(Br); - LastInduction->setName("vec.ind.next"); - - VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); - VecInd->addIncoming(LastInduction, LoopVectorLatch); -} - -void InnerLoopVectorizer::widenIntOrFpInduction( - PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State, - Value *CanonicalIV) { - Value *Start = Def->getStartValue()->getLiveInIRValue(); - const InductionDescriptor &ID = Def->getInductionDescriptor(); - TruncInst *Trunc = Def->getTruncInst(); - IRBuilder<> &Builder = State.Builder; - assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); - assert(!State.VF.isZero() && "VF must be non-zero"); - - // The value from the original loop to which we are mapping the new induction - // variable. - Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; - - auto &DL = EntryVal->getModule()->getDataLayout(); - - // Generate code for the induction step. Note that induction steps are - // required to be loop-invariant - auto CreateStepValue = [&](const SCEV *Step) -> Value * { - assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && - "Induction step should be loop invariant"); - if (PSE.getSE()->isSCEVable(IV->getType())) { - SCEVExpander Exp(*PSE.getSE(), DL, "induction"); - return Exp.expandCodeFor(Step, Step->getType(), - State.CFG.VectorPreHeader->getTerminator()); - } - return cast<SCEVUnknown>(Step)->getValue(); - }; - - // The scalar value to broadcast. This is derived from the canonical - // induction variable. If a truncation type is given, truncate the canonical - // induction variable and step. Otherwise, derive these values from the - // induction descriptor. - auto CreateScalarIV = [&](Value *&Step) -> Value * { - Value *ScalarIV = CanonicalIV; - Type *NeededType = IV->getType(); - if (!Def->isCanonical() || ScalarIV->getType() != NeededType) { - ScalarIV = - NeededType->isIntegerTy() - ? Builder.CreateSExtOrTrunc(ScalarIV, NeededType) - : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType); - ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID, - State.CFG.PrevBB); - ScalarIV->setName("offset.idx"); - } - if (Trunc) { - auto *TruncType = cast<IntegerType>(Trunc->getType()); - assert(Step->getType()->isIntegerTy() && - "Truncation requires an integer step"); - ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); - Step = Builder.CreateTrunc(Step, TruncType); - } - return ScalarIV; - }; - - // Fast-math-flags propagate from the original induction instruction. 
- IRBuilder<>::FastMathFlagGuard FMFG(Builder); - if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) - Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); - - // Now do the actual transformations, and start with creating the step value. - Value *Step = CreateStepValue(ID.getStep()); - if (State.VF.isScalar()) { - Value *ScalarIV = CreateScalarIV(Step); - Type *ScalarTy = IntegerType::get(ScalarIV->getContext(), - Step->getType()->getScalarSizeInBits()); - - for (unsigned Part = 0; Part < UF; ++Part) { - Value *StartIdx = ConstantInt::get(ScalarTy, Part); - Value *EntryPart; - if (Step->getType()->isFloatingPointTy()) { - StartIdx = Builder.CreateUIToFP(StartIdx, Step->getType()); - Value *MulOp = Builder.CreateFMul(StartIdx, Step); - EntryPart = Builder.CreateBinOp(ID.getInductionOpcode(), ScalarIV, - MulOp, "induction"); - } else { - EntryPart = Builder.CreateAdd( - ScalarIV, Builder.CreateMul(StartIdx, Step), "induction"); - } - State.set(Def, EntryPart, Part); - if (Trunc) { - assert(!Step->getType()->isFloatingPointTy() && - "fp inductions shouldn't be truncated"); - addMetadata(EntryPart, Trunc); - } - } - return; - } - - // Create a new independent vector induction variable, if one is needed. - if (Def->needsVectorIV()) - createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); - - if (Def->needsScalarIV()) { - // Create scalar steps that can be used by instructions we will later - // scalarize. Note that the addition of the scalar steps will not increase - // the number of instructions in the loop in the common case prior to - // InstCombine. We will be trading one vector extract for each scalar step. - Value *ScalarIV = CreateScalarIV(Step); - buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); - } -} - -void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, - Instruction *EntryVal, - const InductionDescriptor &ID, - VPValue *Def, - VPTransformState &State) { - IRBuilder<> &Builder = State.Builder; +/// Compute scalar induction steps. \p ScalarIV is the scalar induction +/// variable on which to base the steps, \p Step is the size of the step. +static void buildScalarSteps(Value *ScalarIV, Value *Step, + const InductionDescriptor &ID, VPValue *Def, + VPTransformState &State) { + IRBuilderBase &Builder = State.Builder; // We shouldn't have to build scalar steps if we aren't vectorizing. assert(State.VF.isVector() && "VF should be greater than one"); // Get the value type and ensure it and the step have the same integer type. @@ -2656,6 +2328,103 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, } } +// Generate code for the induction step. Note that induction steps are +// required to be loop-invariant +static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, + Instruction *InsertBefore, + Loop *OrigLoop = nullptr) { + const DataLayout &DL = SE.getDataLayout(); + assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && + "Induction step should be loop invariant"); + if (auto *E = dyn_cast<SCEVUnknown>(Step)) + return E->getValue(); + + SCEVExpander Exp(SE, DL, "induction"); + return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); +} + +/// Compute the transformed value of Index at offset StartValue using step +/// StepValue. +/// For integer induction, returns StartValue + Index * StepValue. +/// For pointer induction, returns StartValue[Index * StepValue]. 
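A scalar restatement of the two common cases documented above, with an assumed element type of int for the pointer case; this mirrors the formulas only, not the IR emission:

```cpp
#include <cstdint>

// Integer induction: StartValue + Index * StepValue.
int64_t transformedIntIndex(int64_t Start, int64_t Index, int64_t Step) {
  return Start + Index * Step;
}

// Pointer induction: &StartValue[Index * StepValue], i.e. a GEP over the
// induction's element type.
int *transformedPtrIndex(int *Start, int64_t Index, int64_t Step) {
  return Start + Index * Step;
}
```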
+/// FIXME: The newly created binary instructions should contain nsw/nuw +/// flags, which can be found from the original scalar operations. +static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, + Value *StartValue, Value *Step, + const InductionDescriptor &ID) { + assert(Index->getType()->getScalarType() == Step->getType() && + "Index scalar type does not match StepValue type"); + + // Note: the IR at this point is broken. We cannot use SE to create any new + // SCEV and then expand it, hoping that SCEV's simplification will give us + // a more optimal code. Unfortunately, attempt of doing so on invalid IR may + // lead to various SCEV crashes. So all we can do is to use builder and rely + // on InstCombine for future simplifications. Here we handle some trivial + // cases only. + auto CreateAdd = [&B](Value *X, Value *Y) { + assert(X->getType() == Y->getType() && "Types don't match!"); + if (auto *CX = dyn_cast<ConstantInt>(X)) + if (CX->isZero()) + return Y; + if (auto *CY = dyn_cast<ConstantInt>(Y)) + if (CY->isZero()) + return X; + return B.CreateAdd(X, Y); + }; + + // We allow X to be a vector type, in which case Y will potentially be + // splatted into a vector with the same element count. + auto CreateMul = [&B](Value *X, Value *Y) { + assert(X->getType()->getScalarType() == Y->getType() && + "Types don't match!"); + if (auto *CX = dyn_cast<ConstantInt>(X)) + if (CX->isOne()) + return Y; + if (auto *CY = dyn_cast<ConstantInt>(Y)) + if (CY->isOne()) + return X; + VectorType *XVTy = dyn_cast<VectorType>(X->getType()); + if (XVTy && !isa<VectorType>(Y->getType())) + Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); + return B.CreateMul(X, Y); + }; + + switch (ID.getKind()) { + case InductionDescriptor::IK_IntInduction: { + assert(!isa<VectorType>(Index->getType()) && + "Vector indices not supported for integer inductions yet"); + assert(Index->getType() == StartValue->getType() && + "Index type does not match StartValue type"); + if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) + return B.CreateSub(StartValue, Index); + auto *Offset = CreateMul(Index, Step); + return CreateAdd(StartValue, Offset); + } + case InductionDescriptor::IK_PtrInduction: { + assert(isa<Constant>(Step) && + "Expected constant step for pointer induction"); + return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); + } + case InductionDescriptor::IK_FpInduction: { + assert(!isa<VectorType>(Index->getType()) && + "Vector indices not supported for FP inductions yet"); + assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); + auto InductionBinOp = ID.getInductionBinOp(); + assert(InductionBinOp && + (InductionBinOp->getOpcode() == Instruction::FAdd || + InductionBinOp->getOpcode() == Instruction::FSub) && + "Original bin op should be defined for FP induction"); + + Value *MulExp = B.CreateFMul(Step, Index); + return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, + "induction"); + } + case InductionDescriptor::IK_NoInduction: + return nullptr; + } + llvm_unreachable("invalid enum"); +} + void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, VPTransformState &State) { @@ -2738,7 +2507,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( for (unsigned Part = 0; Part < UF; Part++) { Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); - setDebugLocFromInst(AddrPart); + State.setDebugLocFromInst(AddrPart); // Notice current instruction could be any index. 
Need to adjust the address // to the member of index 0. @@ -2764,7 +2533,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); } - setDebugLocFromInst(Instr); + State.setDebugLocFromInst(Instr); Value *PoisonVec = PoisonValue::get(VecTy); Value *MaskForGaps = nullptr; @@ -2919,8 +2688,6 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, if (!Instance.isFirstIteration()) return; - setDebugLocFromInst(Instr); - // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); @@ -2937,21 +2704,23 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) Cloned->dropPoisonGeneratingFlags(); - State.Builder.SetInsertPoint(Builder.GetInsertBlock(), - Builder.GetInsertPoint()); + if (Instr->getDebugLoc()) + State.setDebugLocFromInst(Instr); + // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. for (auto &I : enumerate(RepRecipe->operands())) { auto InputInstance = Instance; VPValue *Operand = I.value(); - if (State.Plan->isUniformAfterVectorization(Operand)) + VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand); + if (OperandR && OperandR->isUniform()) InputInstance.Lane = VPLane::getFirstLane(); Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); } - addNewMetadata(Cloned, Instr); + State.addNewMetadata(Cloned, Instr); // Place the cloned scalar in the new loop. - Builder.Insert(Cloned); + State.Builder.Insert(Cloned); State.set(RepRecipe, Cloned, Instance); @@ -2964,29 +2733,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, PredicatedInstructions.push_back(Cloned); } -void InnerLoopVectorizer::createHeaderBranch(Loop *L) { - BasicBlock *Header = L->getHeader(); - assert(!L->getLoopLatch() && "loop should not have a latch at this point"); - - IRBuilder<> B(Header->getTerminator()); - Instruction *OldInst = - getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); - setDebugLocFromInst(OldInst, &B); - - // Connect the header to the exit and header blocks and replace the old - // terminator. - B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header); - - // Now we have two terminators. Remove the old one from the block. - Header->getTerminator()->eraseFromParent(); -} - -Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { +Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) { if (TripCount) return TripCount; - assert(L && "Create Trip Count for null loop."); - IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + assert(InsertBlock); + IRBuilder<> Builder(InsertBlock->getTerminator()); // Find the loop boundaries. ScalarEvolution *SE = PSE.getSE(); const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); @@ -3010,7 +2762,7 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { const SCEV *ExitCount = SE->getAddExpr( BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); - const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + const DataLayout &DL = InsertBlock->getModule()->getDataLayout(); // Expand the trip count and place the new instructions in the preheader. // Notice that the pre-header does not change, only the loop body. @@ -3018,22 +2770,23 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { // Count holds the overall loop count (N). 
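Two small formulas summarize the trip counts handled here, under the usual non-tail-folded setup (an illustration, not LLVM code): the scalar trip count is the backedge-taken count plus one, and the vector trip count is the largest multiple of the step VF * UF that does not exceed it, leaving the remainder for the scalar epilogue.

```cpp
#include <cstdint>

uint64_t scalarTripCount(uint64_t backedgeTakenCount) {
  return backedgeTakenCount + 1; // Count holds the overall loop count (N).
}

uint64_t vectorTripCount(uint64_t N, uint64_t step /* VF * UF */) {
  return N - (N % step); // remainder iterations run in the scalar loop
}
```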
TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), - L->getLoopPreheader()->getTerminator()); + InsertBlock->getTerminator()); if (TripCount->getType()->isPointerTy()) TripCount = CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", - L->getLoopPreheader()->getTerminator()); + InsertBlock->getTerminator()); return TripCount; } -Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { +Value * +InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { if (VectorTripCount) return VectorTripCount; - Value *TC = getOrCreateTripCount(L); - IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + Value *TC = getOrCreateTripCount(InsertBlock); + IRBuilder<> Builder(InsertBlock->getTerminator()); Type *Ty = TC->getType(); // This is where we can make the step a runtime constant. @@ -3045,6 +2798,8 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { // overflows: the vector induction variable will eventually wrap to zero given // that it starts at zero and its Step is a power of two; the loop will then // exit, with the last early-exit vector comparison also producing all-true. + // For scalable vectors the VF is not guaranteed to be a power of 2, but this + // is accounted for in emitIterationCountCheck that adds an overflow check. if (Cost->foldTailByMasking()) { assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && "VF*UF must be a power of 2 when folding tail by masking"); @@ -3107,9 +2862,8 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); } -void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, - BasicBlock *Bypass) { - Value *Count = getOrCreateTripCount(L); +void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { + Value *Count = getOrCreateTripCount(LoopVectorPreHeader); // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. BasicBlock *const TCCheckBlock = LoopVectorPreHeader; @@ -3124,10 +2878,23 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, : ICmpInst::ICMP_ULT; // If tail is to be folded, vector loop takes care of all iterations. + Type *CountTy = Count->getType(); Value *CheckMinIters = Builder.getFalse(); - if (!Cost->foldTailByMasking()) { - Value *Step = createStepForVF(Builder, Count->getType(), VF, UF); + Value *Step = createStepForVF(Builder, CountTy, VF, UF); + if (!Cost->foldTailByMasking()) CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); + else if (VF.isScalable()) { + // vscale is not necessarily a power-of-2, which means we cannot guarantee + // an overflow to zero when updating induction variables and so an + // additional overflow check is required before entering the vector loop. + + // Get the maximum unsigned value for the type. + Value *MaxUIntTripCount = + ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); + Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); + + // Don't execute the vector loop if (UMax - n) < (VF * UF). + CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step); } // Create new preheader for vector loop. 
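The extra guard added above for scalable vectors can be read as plain unsigned arithmetic: the vector loop is bypassed whenever the trip count is so close to the type's maximum that adding VF * UF to the induction variable could wrap. A sketch with 64-bit counts assumed:

```cpp
#include <cstdint>
#include <limits>

// Mirrors "don't execute the vector loop if (UMax - n) < (VF * UF)".
bool bypassVectorLoop(uint64_t tripCount, uint64_t step /* VF * UF */) {
  const uint64_t UMax = std::numeric_limits<uint64_t>::max();
  return (UMax - tripCount) < step;
}
```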
LoopVectorPreHeader = @@ -3152,10 +2919,10 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, LoopBypassBlocks.push_back(TCCheckBlock); } -BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { +BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { BasicBlock *const SCEVCheckBlock = - RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); + RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); if (!SCEVCheckBlock) return nullptr; @@ -3180,14 +2947,13 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { return SCEVCheckBlock; } -BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, - BasicBlock *Bypass) { +BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { // VPlan-native path does not do any analysis for runtime checks currently. if (EnableVPlanNativePath) return nullptr; BasicBlock *const MemCheckBlock = - RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); + RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); // Check if we generated code that checks in runtime if arrays overlap. We put // the checks into a separate block to make the more common case of few @@ -3201,7 +2967,8 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, "to vectorize."); ORE->emit([&]() { return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", - L->getStartLoc(), L->getHeader()) + OrigLoop->getStartLoc(), + OrigLoop->getHeader()) << "Code-size may be reduced by not forcing " "vectorization, or by source-code modifications " "eliminating the need for runtime checks " @@ -3213,116 +2980,10 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, AddedSafetyChecks = true; - // We currently don't use LoopVersioning for the actual loop cloning but we - // still use it to add the noalias metadata. - LVer = std::make_unique<LoopVersioning>( - *Legal->getLAI(), - Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, - DT, PSE.getSE()); - LVer->prepareNoAliasMetadata(); return MemCheckBlock; } -Value *InnerLoopVectorizer::emitTransformedIndex( - IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, - const InductionDescriptor &ID, BasicBlock *VectorHeader) const { - - SCEVExpander Exp(*SE, DL, "induction"); - auto Step = ID.getStep(); - auto StartValue = ID.getStartValue(); - assert(Index->getType()->getScalarType() == Step->getType() && - "Index scalar type does not match StepValue type"); - - // Note: the IR at this point is broken. We cannot use SE to create any new - // SCEV and then expand it, hoping that SCEV's simplification will give us - // a more optimal code. Unfortunately, attempt of doing so on invalid IR may - // lead to various SCEV crashes. So all we can do is to use builder and rely - // on InstCombine for future simplifications. Here we handle some trivial - // cases only. - auto CreateAdd = [&B](Value *X, Value *Y) { - assert(X->getType() == Y->getType() && "Types don't match!"); - if (auto *CX = dyn_cast<ConstantInt>(X)) - if (CX->isZero()) - return Y; - if (auto *CY = dyn_cast<ConstantInt>(Y)) - if (CY->isZero()) - return X; - return B.CreateAdd(X, Y); - }; - - // We allow X to be a vector type, in which case Y will potentially be - // splatted into a vector with the same element count. 
- auto CreateMul = [&B](Value *X, Value *Y) { - assert(X->getType()->getScalarType() == Y->getType() && - "Types don't match!"); - if (auto *CX = dyn_cast<ConstantInt>(X)) - if (CX->isOne()) - return Y; - if (auto *CY = dyn_cast<ConstantInt>(Y)) - if (CY->isOne()) - return X; - VectorType *XVTy = dyn_cast<VectorType>(X->getType()); - if (XVTy && !isa<VectorType>(Y->getType())) - Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); - return B.CreateMul(X, Y); - }; - - // Get a suitable insert point for SCEV expansion. For blocks in the vector - // loop, choose the end of the vector loop header (=VectorHeader), because - // the DomTree is not kept up-to-date for additional blocks generated in the - // vector loop. By using the header as insertion point, we guarantee that the - // expanded instructions dominate all their uses. - auto GetInsertPoint = [this, &B, VectorHeader]() { - BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); - if (InsertBB != LoopVectorBody && - LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB)) - return VectorHeader->getTerminator(); - return &*B.GetInsertPoint(); - }; - - switch (ID.getKind()) { - case InductionDescriptor::IK_IntInduction: { - assert(!isa<VectorType>(Index->getType()) && - "Vector indices not supported for integer inductions yet"); - assert(Index->getType() == StartValue->getType() && - "Index type does not match StartValue type"); - if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) - return B.CreateSub(StartValue, Index); - auto *Offset = CreateMul( - Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); - return CreateAdd(StartValue, Offset); - } - case InductionDescriptor::IK_PtrInduction: { - assert(isa<SCEVConstant>(Step) && - "Expected constant step for pointer induction"); - return B.CreateGEP( - ID.getElementType(), StartValue, - CreateMul(Index, - Exp.expandCodeFor(Step, Index->getType()->getScalarType(), - GetInsertPoint()))); - } - case InductionDescriptor::IK_FpInduction: { - assert(!isa<VectorType>(Index->getType()) && - "Vector indices not supported for FP inductions yet"); - assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); - auto InductionBinOp = ID.getInductionBinOp(); - assert(InductionBinOp && - (InductionBinOp->getOpcode() == Instruction::FAdd || - InductionBinOp->getOpcode() == Instruction::FSub) && - "Original bin op should be defined for FP induction"); - - Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); - Value *MulExp = B.CreateFMul(StepValue, Index); - return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, - "induction"); - } - case InductionDescriptor::IK_NoInduction: - return nullptr; - } - llvm_unreachable("invalid enum"); -} - -Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { +void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopScalarBody = OrigLoop->getHeader(); LoopVectorPreHeader = OrigLoop->getLoopPreheader(); assert(LoopVectorPreHeader && "Invalid loop structure"); @@ -3354,43 +3015,24 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); - // We intentionally don't let SplitBlock to update LoopInfo since - // LoopVectorBody should belong to another loop than LoopVectorPreHeader. - // LoopVectorBody is explicitly added to the correct place few lines later. 
- LoopVectorBody = - SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, - nullptr, nullptr, Twine(Prefix) + "vector.body"); - - // Update dominator for loop exit. + // Update dominator for loop exit. During skeleton creation, only the vector + // pre-header and the middle block are created. The vector loop is entirely + // created during VPlan exection. if (!Cost->requiresScalarEpilogue(VF)) // If there is an epilogue which must run, there's no edge from the // middle block to exit blocks and thus no need to update the immediate // dominator of the exit blocks. DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); - - // Create and register the new vector loop. - Loop *Lp = LI->AllocateLoop(); - Loop *ParentLoop = OrigLoop->getParentLoop(); - - // Insert the new loop into the loop nest and register the new basic blocks - // before calling any utilities such as SCEV that require valid LoopInfo. - if (ParentLoop) { - ParentLoop->addChildLoop(Lp); - } else { - LI->addTopLevelLoop(Lp); - } - Lp->addBasicBlockToLoop(LoopVectorBody, *LI); - return Lp; } void InnerLoopVectorizer::createInductionResumeValues( - Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) { + std::pair<BasicBlock *, Value *> AdditionalBypass) { assert(((AdditionalBypass.first && AdditionalBypass.second) || (!AdditionalBypass.first && !AdditionalBypass.second)) && "Inconsistent information about additional bypass."); - Value *VectorTripCount = getOrCreateVectorTripCount(L); - assert(VectorTripCount && L && "Expected valid arguments"); + Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); + assert(VectorTripCount && "Expected valid arguments"); // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the // PHIs that are left in the scalar version of the loop. @@ -3403,19 +3045,13 @@ void InnerLoopVectorizer::createInductionResumeValues( PHINode *OrigPhi = InductionEntry.first; InductionDescriptor II = InductionEntry.second; - // Create phi nodes to merge from the backedge-taken check block. - PHINode *BCResumeVal = - PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", - LoopScalarPreHeader->getTerminator()); - // Copy original phi DL over to the new one. - BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); Value *&EndValue = IVEndValues[OrigPhi]; Value *EndValueFromAdditionalBypass = AdditionalBypass.second; if (OrigPhi == OldInduction) { // We know what the end value is. EndValue = VectorTripCount; } else { - IRBuilder<> B(L->getLoopPreheader()->getTerminator()); + IRBuilder<> B(LoopVectorPreHeader->getTerminator()); // Fast-math-flags propagate from the original induction instruction. 
if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) @@ -3424,10 +3060,10 @@ void InnerLoopVectorizer::createInductionResumeValues( Type *StepType = II.getStep()->getType(); Instruction::CastOps CastOp = CastInst::getCastOpcode(VectorTripCount, true, StepType, true); - Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); - const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); - EndValue = - emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); + Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc"); + Value *Step = + CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); + EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); EndValue->setName("ind.end"); // Compute the end value for the additional bypass (if applicable). @@ -3435,13 +3071,23 @@ void InnerLoopVectorizer::createInductionResumeValues( B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, StepType, true); - CRD = - B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); + Value *Step = + CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); + VTC = + B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc"); EndValueFromAdditionalBypass = - emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); + emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); EndValueFromAdditionalBypass->setName("ind.end"); } } + + // Create phi nodes to merge from the backedge-taken check block. + PHINode *BCResumeVal = + PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", + LoopScalarPreHeader->getTerminator()); + // Copy original phi DL over to the new one. + BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); + // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); @@ -3460,13 +3106,10 @@ void InnerLoopVectorizer::createInductionResumeValues( } } -BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, - MDNode *OrigLoopID) { - assert(L && "Expected valid loop."); - +BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) { // The trip counts should be cached by now. - Value *Count = getOrCreateTripCount(L); - Value *VectorTripCount = getOrCreateVectorTripCount(L); + Value *Count = getOrCreateTripCount(LoopVectorPreHeader); + Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); @@ -3491,14 +3134,8 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); } - // Get ready to start creating new instructions into the vectorized body. - assert(LoopVectorPreHeader == L->getLoopPreheader() && - "Inconsistent vector loop preheader"); - Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); - #ifdef EXPENSIVE_CHECKS assert(DT->verify(DominatorTree::VerificationLevel::Fast)); - LI->verify(*DT); #endif return LoopVectorPreHeader; @@ -3521,7 +3158,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() { |/ | | v | [ ] \ - | [ ]_| <-- vector loop. + | [ ]_| <-- vector loop (created during VPlan execution). | | | v \ -[ ] <--- middle-block. @@ -3548,34 +3185,32 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() { // simply happens to be prone to hitting this in practice. 
In theory, we // can hit the same issue for any SCEV, or ValueTracking query done during // mutation. See PR49900. - getOrCreateTripCount(OrigLoop); + getOrCreateTripCount(OrigLoop->getLoopPreheader()); // Create an empty vector loop, and prepare basic blocks for the runtime // checks. - Loop *Lp = createVectorLoopSkeleton(""); + createVectorLoopSkeleton(""); // Now, compare the new count to zero. If it is zero skip the vector loop and // jump to the scalar loop. This check also covers the case where the // backedge-taken count is uint##_max: adding one to it will overflow leading // to an incorrect trip count of zero. In this (rare) case we will also jump // to the scalar loop. - emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); + emitIterationCountCheck(LoopScalarPreHeader); // Generate the code to check any assumptions that we've made for SCEV // expressions. - emitSCEVChecks(Lp, LoopScalarPreHeader); + emitSCEVChecks(LoopScalarPreHeader); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. - emitMemRuntimeChecks(Lp, LoopScalarPreHeader); - - createHeaderBranch(Lp); + emitMemRuntimeChecks(LoopScalarPreHeader); // Emit phis for the new starting index of the scalar loop. - createInductionResumeValues(Lp); + createInductionResumeValues(); - return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; + return {completeLoopSkeleton(OrigLoopID), nullptr}; } // Fix up external users of the induction variable. At this point, we are @@ -3584,8 +3219,9 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() { // value for the IV when arriving directly from the middle block. void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, - Value *CountRoundDown, Value *EndValue, - BasicBlock *MiddleBlock) { + Value *VectorTripCount, Value *EndValue, + BasicBlock *MiddleBlock, + BasicBlock *VectorHeader, VPlan &Plan) { // There are two kinds of external IV usages - those that use the value // computed in the last iteration (the PHI) and those that use the penultimate // value (the value that feeds into the phi from the loop latch). @@ -3612,8 +3248,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, for (User *U : OrigPhi->users()) { auto *UI = cast<Instruction>(U); if (!OrigLoop->contains(UI)) { - const DataLayout &DL = - OrigLoop->getHeader()->getModule()->getDataLayout(); assert(isa<PHINode>(UI) && "Expected LCSSA form"); IRBuilder<> B(MiddleBlock->getTerminator()); @@ -3623,15 +3257,18 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); Value *CountMinusOne = B.CreateSub( - CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); + VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); Value *CMO = !II.getStep()->getType()->isIntegerTy() ? 
B.CreateCast(Instruction::SIToFP, CountMinusOne, II.getStep()->getType()) : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); CMO->setName("cast.cmo"); + + Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), + VectorHeader->getTerminator()); Value *Escape = - emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody); + emitTransformedIndex(B, CMO, II.getStartValue(), Step, II); Escape->setName("ind.escape"); MissingVals[UI] = Escape; } @@ -3644,8 +3281,10 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, // In this case, if IV1 has an external use, we need to avoid adding both // "last value of IV1" and "penultimate value of IV2". So, verify that we // don't already have an incoming value for the middle block. - if (PHI->getBasicBlockIndex(MiddleBlock) == -1) + if (PHI->getBasicBlockIndex(MiddleBlock) == -1) { PHI->addIncoming(I.second, MiddleBlock); + Plan.removeLiveOut(PHI); + } } } @@ -3924,18 +3563,16 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { } } -void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { +void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, + VPlan &Plan) { // Insert truncates and extends for any truncated instructions as hints to // InstCombine. if (VF.isVector()) truncateToMinimalBitwidths(State); // Fix widened non-induction PHIs by setting up the PHI operands. - if (OrigPHIsToFix.size()) { - assert(EnableVPlanNativePath && - "Unexpected non-induction PHIs for fixup in non VPlan-native path"); - fixNonInductionPHIs(State); - } + if (EnableVPlanNativePath) + fixNonInductionPHIs(Plan, State); // At this point every instruction in the original loop is widened to a // vector form. Now we need to fix the recurrences in the loop. These PHI @@ -3946,24 +3583,37 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { // Forget the original basic block. PSE.getSE()->forgetLoop(OrigLoop); - // If we inserted an edge from the middle block to the unique exit block, - // update uses outside the loop (phis) to account for the newly inserted - // edge. - if (!Cost->requiresScalarEpilogue(VF)) { + VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); + Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); + if (Cost->requiresScalarEpilogue(VF)) { + // No edge from the middle block to the unique exit block has been inserted + // and there is nothing to fix from vector loop; phis should have incoming + // from scalar loop only. + Plan.clearLiveOuts(); + } else { + // If we inserted an edge from the middle block to the unique exit block, + // update uses outside the loop (phis) to account for the newly inserted + // edge. + // Fix-up external users of the induction variables. for (auto &Entry : Legal->getInductionVars()) fixupIVUsers(Entry.first, Entry.second, - getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), - IVEndValues[Entry.first], LoopMiddleBlock); - - fixLCSSAPHIs(State); + getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), + IVEndValues[Entry.first], LoopMiddleBlock, + VectorLoop->getHeader(), Plan); } + // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated + // in the exit block, so update the builder. + State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI()); + for (auto &KV : Plan.getLiveOuts()) + KV.second->fixPhi(Plan, State); + for (Instruction *PI : PredicatedInstructions) sinkScalarOperands(&*PI); // Remove redundant induction instructions. 
- cse(LoopVectorBody); + cse(VectorLoop->getHeader()); // Set/update profile weights for the vector and remainder loops as original // loop iterations are now distributed among them. Note that original loop @@ -3978,9 +3628,9 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { // For scalable vectorization we can't know at compile time how many iterations // of the loop are handled in one vector iteration, so instead assume a pessimistic // vscale of '1'. - setProfileInfoAfterUnrolling( - LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), - LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); + setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop, + LI->getLoopFor(LoopScalarBody), + VF.getKnownMinValue() * UF); } void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { @@ -3990,7 +3640,8 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { // the currently empty PHI nodes. At this point every instruction in the // original loop is widened to a vector form so we can use them to construct // the incoming edges. - VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock(); + VPBasicBlock *Header = + State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); for (VPRecipeBase &R : Header->phis()) { if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) fixReduction(ReductionPhi, State); @@ -4106,8 +3757,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence( // and thus no phis which needed updated. if (!Cost->requiresScalarEpilogue(VF)) for (PHINode &LCSSAPhi : LoopExitBlock->phis()) - if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) + if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) { LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); + State.Plan->removeLiveOut(&LCSSAPhi); + } } void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, @@ -4121,14 +3774,14 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, RecurKind RK = RdxDesc.getRecurrenceKind(); TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); - setDebugLocFromInst(ReductionStartValue); + State.setDebugLocFromInst(ReductionStartValue); VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); // This is the vector-clone of the value that leaves the loop. Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); // Wrap flags are in general invalid after vectorization, clear them. - clearReductionWrapFlags(RdxDesc, State); + clearReductionWrapFlags(PhiR, State); // Before each round, move the insertion point right between // the PHIs and the values we are going to write. @@ -4136,9 +3789,13 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // instructions. Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); - setDebugLocFromInst(LoopExitInst); + State.setDebugLocFromInst(LoopExitInst); Type *PhiTy = OrigPhi->getType(); + + VPBasicBlock *LatchVPBB = + PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock(); + BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB]; // If tail is folded by masking, the vector value to leave the loop should be // a Select choosing between the vectorized LoopExitInst and vectorized Phi, // instead of the former. 
For an inloop reduction the reduction will already @@ -4146,17 +3803,20 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { for (unsigned Part = 0; Part < UF; ++Part) { Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); - Value *Sel = nullptr; + SelectInst *Sel = nullptr; for (User *U : VecLoopExitInst->users()) { if (isa<SelectInst>(U)) { assert(!Sel && "Reduction exit feeding two selects"); - Sel = U; + Sel = cast<SelectInst>(U); } else assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); } assert(Sel && "Reduction exit feeds no select"); State.reset(LoopExitInstDef, Sel, Part); + if (isa<FPMathOperator>(Sel)) + Sel->setFastMathFlags(RdxDesc.getFastMathFlags()); + // If the target can create a predicated operator for the reduction at no // extra cost in the loop (for example a predicated vadd), it can be // cheaper for the select to remain in the loop than be sunk out of it, @@ -4168,8 +3828,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, TargetTransformInfo::ReductionFlags())) { auto *VecRdxPhi = cast<PHINode>(State.get(PhiR, Part)); - VecRdxPhi->setIncomingValueForBlock( - LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); + VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel); } } } @@ -4180,8 +3839,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); - Builder.SetInsertPoint( - LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); + Builder.SetInsertPoint(VectorLoopLatch->getTerminator()); VectorParts RdxParts(UF); for (unsigned Part = 0; Part < UF; ++Part) { RdxParts[Part] = State.get(LoopExitInstDef, Part); @@ -4212,7 +3870,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // conditional branch, and (c) other passes may add new predecessors which // terminate on this line. This is the easiest way to ensure we don't // accidentally cause an extra step back into the loop while debugging. - setDebugLocFromInst(LoopMiddleBlock->getTerminator()); + State.setDebugLocFromInst(LoopMiddleBlock->getTerminator()); if (PhiR->isOrdered()) ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); else { @@ -4269,6 +3927,17 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // Set the resume value for this reduction ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); + // If there were stores of the reduction value to a uniform memory address + // inside the loop, create the final store here. + if (StoreInst *SI = RdxDesc.IntermediateStore) { + StoreInst *NewSI = + Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand()); + propagateMetadata(NewSI, SI); + + // If the reduction value is used in other places, + // then let the code below create PHI's for that. + } + // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. @@ -4277,8 +3946,10 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // fixFirstOrderRecurrence for a more complete explaination of the logic. 
if (!Cost->requiresScalarEpilogue(VF)) for (PHINode &LCSSAPhi : LoopExitBlock->phis()) - if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) + if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) { LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); + State.Plan->removeLiveOut(&LCSSAPhi); + } // Fix the scalar loop reduction variable with the incoming reduction sum // from the vector body and from the backedge value. @@ -4291,63 +3962,35 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); } -void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, +void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, VPTransformState &State) { + const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); RecurKind RK = RdxDesc.getRecurrenceKind(); if (RK != RecurKind::Add && RK != RecurKind::Mul) return; - Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); - assert(LoopExitInstr && "null loop exit instruction"); - SmallVector<Instruction *, 8> Worklist; - SmallPtrSet<Instruction *, 8> Visited; - Worklist.push_back(LoopExitInstr); - Visited.insert(LoopExitInstr); + SmallVector<VPValue *, 8> Worklist; + SmallPtrSet<VPValue *, 8> Visited; + Worklist.push_back(PhiR); + Visited.insert(PhiR); while (!Worklist.empty()) { - Instruction *Cur = Worklist.pop_back_val(); - if (isa<OverflowingBinaryOperator>(Cur)) - for (unsigned Part = 0; Part < UF; ++Part) { - // FIXME: Should not rely on getVPValue at this point. - Value *V = State.get(State.Plan->getVPValue(Cur, true), Part); - cast<Instruction>(V)->dropPoisonGeneratingFlags(); + VPValue *Cur = Worklist.pop_back_val(); + for (unsigned Part = 0; Part < UF; ++Part) { + Value *V = State.get(Cur, Part); + if (!isa<OverflowingBinaryOperator>(V)) + break; + cast<Instruction>(V)->dropPoisonGeneratingFlags(); } - for (User *U : Cur->users()) { - Instruction *UI = cast<Instruction>(U); - if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && - Visited.insert(UI).second) - Worklist.push_back(UI); - } - } -} - -void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { - for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { - if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) - // Some phis were already hand updated by the reduction and recurrence - // code above, leave them alone. - continue; - - auto *IncomingValue = LCSSAPhi.getIncomingValue(0); - // Non-instruction incoming values will have only one value. - - VPLane Lane = VPLane::getFirstLane(); - if (isa<Instruction>(IncomingValue) && - !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), - VF)) - Lane = VPLane::getLastLaneForVF(VF); - - // Can be a loop invariant incoming value or the last scalar value to be - // extracted from the vectorized loop. - // FIXME: Should not rely on getVPValue at this point. - Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); - Value *lastIncomingValue = - OrigLoop->isLoopInvariant(IncomingValue) - ? 
IncomingValue - : State.get(State.Plan->getVPValue(IncomingValue, true), - VPIteration(UF - 1, Lane)); - LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); + for (VPUser *U : Cur->users()) { + auto *UserRecipe = dyn_cast<VPRecipeBase>(U); + if (!UserRecipe) + continue; + for (VPValue *V : UserRecipe->definedValues()) + if (Visited.insert(V).second) + Worklist.push_back(V); + } } } @@ -4425,17 +4068,23 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { } while (Changed); } -void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { - for (PHINode *OrigPhi : OrigPHIsToFix) { - VPWidenPHIRecipe *VPPhi = - cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); - PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); - // Make sure the builder has a valid insert point. - Builder.SetInsertPoint(NewPhi); - for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { - VPValue *Inc = VPPhi->getIncomingValue(i); - VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); - NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); +void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, + VPTransformState &State) { + auto Iter = depth_first( + VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry())); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { + for (VPRecipeBase &P : VPBB->phis()) { + VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P); + if (!VPPhi) + continue; + PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); + // Make sure the builder has a valid insert point. + Builder.SetInsertPoint(NewPhi); + for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { + VPValue *Inc = VPPhi->getIncomingValue(i); + VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); + NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); + } } } } @@ -4445,139 +4094,6 @@ bool InnerLoopVectorizer::useOrderedReductions( return Cost->useOrderedReductions(RdxDesc); } -void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, - VPWidenPHIRecipe *PhiR, - VPTransformState &State) { - PHINode *P = cast<PHINode>(PN); - if (EnableVPlanNativePath) { - // Currently we enter here in the VPlan-native path for non-induction - // PHIs where all control flow is uniform. We simply widen these PHIs. - // Create a vector phi with no operands - the vector phi operands will be - // set at the end of vector code generation. - Type *VecTy = (State.VF.isScalar()) - ? PN->getType() - : VectorType::get(PN->getType(), State.VF); - Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); - State.set(PhiR, VecPhi, 0); - OrigPHIsToFix.push_back(P); - - return; - } - - assert(PN->getParent() == OrigLoop->getHeader() && - "Non-header phis should have been handled elsewhere"); - - // In order to support recurrences we need to be able to vectorize Phi nodes. - // Phi nodes have cycles, so we need to vectorize them in two stages. This is - // stage #1: We create a new vector PHI node with no incoming edges. We'll use - // this value when we vectorize all of the instructions that use the PHI. - - assert(!Legal->isReductionVariable(P) && - "reductions should be handled elsewhere"); - - setDebugLocFromInst(P); - - // This PHINode must be an induction variable. - // Make sure that we know about it. 
- assert(Legal->getInductionVars().count(P) && "Not an induction variable"); - - InductionDescriptor II = Legal->getInductionVars().lookup(P); - const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); - - auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV(); - PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); - - // FIXME: The newly created binary instructions should contain nsw/nuw flags, - // which can be found from the original scalar operations. - switch (II.getKind()) { - case InductionDescriptor::IK_NoInduction: - llvm_unreachable("Unknown induction"); - case InductionDescriptor::IK_IntInduction: - case InductionDescriptor::IK_FpInduction: - llvm_unreachable("Integer/fp induction is handled elsewhere."); - case InductionDescriptor::IK_PtrInduction: { - // Handle the pointer induction variable case. - assert(P->getType()->isPointerTy() && "Unexpected type."); - - if (Cost->isScalarAfterVectorization(P, State.VF)) { - // This is the normalized GEP that starts counting at zero. - Value *PtrInd = - Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType()); - // Determine the number of scalars we need to generate for each unroll - // iteration. If the instruction is uniform, we only need to generate the - // first lane. Otherwise, we generate all VF values. - bool IsUniform = vputils::onlyFirstLaneUsed(PhiR); - assert((IsUniform || !State.VF.isScalable()) && - "Cannot scalarize a scalable VF"); - unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); - - for (unsigned Part = 0; Part < UF; ++Part) { - Value *PartStart = - createStepForVF(Builder, PtrInd->getType(), VF, Part); - - for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - Value *Idx = Builder.CreateAdd( - PartStart, ConstantInt::get(PtrInd->getType(), Lane)); - Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); - Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), - DL, II, State.CFG.PrevBB); - SclrGep->setName("next.gep"); - State.set(PhiR, SclrGep, VPIteration(Part, Lane)); - } - } - return; - } - assert(isa<SCEVConstant>(II.getStep()) && - "Induction step not a SCEV constant!"); - Type *PhiType = II.getStep()->getType(); - - // Build a pointer phi - Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue(); - Type *ScStValueType = ScalarStartValue->getType(); - PHINode *NewPointerPhi = - PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); - NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); - - // A pointer induction, performed by using a gep - BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); - Instruction *InductionLoc = LoopLatch->getTerminator(); - const SCEV *ScalarStep = II.getStep(); - SCEVExpander Exp(*PSE.getSE(), DL, "induction"); - Value *ScalarStepValue = - Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); - Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); - Value *NumUnrolledElems = - Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); - Value *InductionGEP = GetElementPtrInst::Create( - II.getElementType(), NewPointerPhi, - Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", - InductionLoc); - NewPointerPhi->addIncoming(InductionGEP, LoopLatch); - - // Create UF many actual address geps that use the pointer - // phi as base and a vectorized version of the step value - // (<step*0, ..., step*N>) as offset. 
- for (unsigned Part = 0; Part < State.UF; ++Part) { - Type *VecPhiType = VectorType::get(PhiType, State.VF); - Value *StartOffsetScalar = - Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); - Value *StartOffset = - Builder.CreateVectorSplat(State.VF, StartOffsetScalar); - // Create a vector of consecutive numbers from zero to VF. - StartOffset = - Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); - - Value *GEP = Builder.CreateGEP( - II.getElementType(), NewPointerPhi, - Builder.CreateMul( - StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), - "vector.gep")); - State.set(PhiR, GEP, Part); - } - } - } -} - /// A helper function for checking whether an integer division-related /// instruction may divide by zero (in which case it must be predicated if /// executed conditionally in the scalar code). @@ -4601,7 +4117,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, VPTransformState &State) { assert(!isa<DbgInfoIntrinsic>(I) && "DbgInfoIntrinsic should have been dropped during VPlan construction"); - setDebugLocFromInst(&I); + State.setDebugLocFromInst(&I); Module *M = I.getParent()->getParent()->getParent(); auto *CI = cast<CallInst>(&I); @@ -4631,13 +4147,13 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, // Some intrinsics have a scalar argument - don't replace it with a // vector. Value *Arg; - if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) + if (!UseVectorIntrinsic || + !isVectorIntrinsicWithScalarOpAtArg(ID, I.index())) Arg = State.get(I.value(), Part); - else { + else Arg = State.get(I.value(), VPIteration(0, 0)); - if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) - TysForDecl.push_back(Arg->getType()); - } + if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index())) + TysForDecl.push_back(Arg->getType()); Args.push_back(Arg); } @@ -4665,7 +4181,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, V->copyFastMathFlags(CI); State.set(Def, V, Part); - addMetadata(V, &I); + State.addMetadata(V, &I); } } @@ -4676,6 +4192,14 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && "This function should not be visited twice for the same VF"); + // This avoids any chances of creating a REPLICATE recipe during planning + // since that would result in generation of scalarized code during execution, + // which is not supported for scalable vectors. + if (VF.isScalable()) { + Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); + return; + } + SmallSetVector<Instruction *, 8> Worklist; // These sets are used to seed the analysis with pointers used by memory @@ -4765,7 +4289,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { } // Insert the forced scalars. - // FIXME: Currently widenPHIInstruction() often creates a dead vector + // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector // induction variable when the PHI user is scalarized. auto ForcedScalar = ForcedScalars.find(VF); if (ForcedScalar != ForcedScalars.end()) @@ -4892,6 +4416,27 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( if (hasIrregularType(ScalarTy, DL)) return false; + // If the group involves a non-integral pointer, we may not be able to + // losslessly cast all values to a common type. 
+ unsigned InterleaveFactor = Group->getFactor(); + bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); + for (unsigned i = 0; i < InterleaveFactor; i++) { + Instruction *Member = Group->getMember(i); + if (!Member) + continue; + auto *MemberTy = getLoadStoreType(Member); + bool MemberNI = DL.isNonIntegralPointerType(MemberTy); + // Don't coerce non-integral pointers to integers or vice versa. + if (MemberNI != ScalarNI) { + // TODO: Consider adding special nullptr value case here + return false; + } else if (MemberNI && ScalarNI && + ScalarTy->getPointerAddressSpace() != + MemberTy->getPointerAddressSpace()) { + return false; + } + } + // Check if masking is required. // A Group may need masking for one of two reasons: it resides in a block that // needs predication, or it was decided to use masking to deal with gaps @@ -5174,7 +4719,7 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() { return true; } - if (!PSE.getUnionPredicate().getPredicates().empty()) { + if (!PSE.getPredicate().isAlwaysTrue()) { reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", "runtime SCEV checks needed. Enable vectorization of this " "loop with '#pragma clang loop vectorize(enable)' when " @@ -5465,14 +5010,6 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } } - // For scalable vectors don't use tail folding for low trip counts or - // optimizing for code size. We only permit this if the user has explicitly - // requested it. - if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate && - ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate && - MaxFactors.ScalableVF.isVector()) - MaxFactors.ScalableVF = ElementCount::getScalable(0); - // If we don't know the precise trip count, or if the trip count that we // found modulo the vectorization factor is not zero, try to fold the tail // by masking. @@ -5515,7 +5052,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, - const ElementCount &MaxSafeVF, bool FoldTailByMasking) { + ElementCount MaxSafeVF, bool FoldTailByMasking) { bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); TypeSize WidestRegister = TTI.getRegisterBitWidth( ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector @@ -5560,9 +5097,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( return ElementCount::getFixed(ClampedConstTripCount); } + TargetTransformInfo::RegisterKind RegKind = + ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector + : TargetTransformInfo::RGK_FixedWidthVector; ElementCount MaxVF = MaxVectorElementCount; - if (TTI.shouldMaximizeVectorBandwidth() || - (MaximizeBandwidth && isScalarEpilogueAllowed())) { + if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && + TTI.shouldMaximizeVectorBandwidth(RegKind))) { auto MaxVectorElementCountMaxBW = ElementCount::get( PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), ComputeScalableMaxVF); @@ -5600,6 +5140,11 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( MaxVF = MinVF; } } + + // Invalidate any widening decisions we might have made, in case the loop + // requires prediction (decided later), but we have already made some + // load/store widening decisions. 
+ invalidateCostModelingDecisions(); } return MaxVF; } @@ -5667,7 +5212,8 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( assert(VFCandidates.count(ElementCount::getFixed(1)) && "Expected Scalar VF to be a candidate"); - const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); + const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, + ExpectedCost); VectorizationFactor ChosenFactor = ScalarCost; bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; @@ -5685,12 +5231,12 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( continue; VectorizationCostTy C = expectedCost(i, &InvalidCosts); - VectorizationFactor Candidate(i, C.first); + VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); #ifndef NDEBUG unsigned AssumedMinimumVscale = 1; if (Optional<unsigned> VScale = getVScaleForTuning()) - AssumedMinimumVscale = VScale.getValue(); + AssumedMinimumVscale = *VScale; unsigned Width = Candidate.Width.isScalable() ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale @@ -5878,7 +5424,7 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor( LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); if (LVP.hasPlanWithVF(ForcedEC)) - return {ForcedEC, 0}; + return {ForcedEC, 0, 0}; else { LLVM_DEBUG( dbgs() @@ -5908,7 +5454,7 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor( if (MainLoopVF.isScalable()) { EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); if (Optional<unsigned> VScale = getVScaleForTuning()) - EstimatedRuntimeVF *= VScale.getValue(); + EstimatedRuntimeVF *= *VScale; } for (auto &NextVF : ProfitableVFs) @@ -6144,9 +5690,15 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, return IC; } - // Note that if we've already vectorized the loop we will have done the - // runtime check and so interleaving won't require further checks. - bool InterleavingRequiresRuntimePointerCheck = + // For any scalar loop that either requires runtime checks or predication we + // are better off leaving this to the unroller. Note that if we've already + // vectorized the loop we will have done the runtime check and so interleaving + // won't require further checks. + bool ScalarInterleavingRequiresPredication = + (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { + return Legal->blockNeedsPredication(BB); + })); + bool ScalarInterleavingRequiresRuntimePointerCheck = (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); // We want to interleave small loops in order to reduce the loop overhead and @@ -6156,7 +5708,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, << "LV: VF is " << VF << '\n'); const bool AggressivelyInterleaveReductions = TTI.enableAggressiveInterleaving(HasReductions); - if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { + if (!ScalarInterleavingRequiresRuntimePointerCheck && + !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { // We assume that the cost overhead is 1 and we use the cost model // to estimate the cost of the loop and interleave until the cost of the // loop overhead is about 5% of the cost of the loop. 
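
[Annotation] The selectInterleaveCount() hunk above now leaves a scalar (VF=1) loop to the unroller when any of its blocks needs predication, in addition to the existing bail-out for runtime pointer checks. A minimal standalone sketch of that gate follows; the struct, function names and the numeric threshold are illustrative assumptions, not LLVM's actual SmallLoopCost option or cost-model API.

#include <cstdio>

struct ScalarLoopProfile {
  bool NeedsRuntimePointerCheck; // stands in for Legal->getRuntimePointerChecking()->Need
  bool AnyBlockNeedsPredication; // stands in for any_of(blocks, blockNeedsPredication)
  unsigned LoopCost;             // expected scalar loop cost
};

// Illustrative cutoff only; the real value comes from the SmallLoopCost option.
constexpr unsigned SmallLoopCostThreshold = 20;

bool mayInterleaveSmallScalarLoop(const ScalarLoopProfile &P) {
  bool RequiresPredication = P.AnyBlockNeedsPredication;
  bool RequiresRuntimePointerCheck = P.NeedsRuntimePointerCheck;
  // Interleave only when neither gate fires and the loop is "small".
  return !RequiresRuntimePointerCheck && !RequiresPredication &&
         P.LoopCost < SmallLoopCostThreshold;
}

int main() {
  ScalarLoopProfile Predicated{false, true, 10};
  ScalarLoopProfile Plain{false, false, 10};
  std::printf("predicated small loop interleaved: %s\n",
              mayInterleaveSmallScalarLoop(Predicated) ? "yes" : "no");
  std::printf("plain small loop interleaved:      %s\n",
              mayInterleaveSmallScalarLoop(Plain) ? "yes" : "no");
}

With these inputs the predicated loop is rejected and the plain loop is accepted, which is exactly the behavioural change the hunk introduces for scalar interleaving.
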
@@ -6319,16 +5872,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); - // A lambda that gets the register usage for the given type and VF. - const auto &TTICapture = TTI; - auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { + auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned { if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) return 0; - InstructionCost::CostType RegUsage = - *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); - assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && - "Nonsensical values for register usage."); - return RegUsage; + return TTI.getRegUsageForType(VectorType::get(Ty, VF)); }; for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { @@ -7079,10 +6626,17 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, bool TypeNotScalarized = false; if (VF.isVector() && VectorTy->isVectorTy()) { - unsigned NumParts = TTI.getNumberOfParts(VectorTy); - if (NumParts) - TypeNotScalarized = NumParts < VF.getKnownMinValue(); - else + if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) { + if (VF.isScalable()) + // <vscale x 1 x iN> is assumed to be profitable over iN because + // scalable registers are a distinct register class from scalar ones. + // If we ever find a target which wants to lower scalable vectors + // back to scalars, we'll need to update this code to explicitly + // ask TTI about the register class uses for each part. + TypeNotScalarized = NumParts <= VF.getKnownMinValue(); + else + TypeNotScalarized = NumParts < VF.getKnownMinValue(); + } else C = InstructionCost::getInvalid(); } return VectorizationCostTy(C, TypeNotScalarized); @@ -7158,8 +6712,6 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { Cost = getGatherScatterCost(&I, VF); setWideningDecision(&I, VF, CM_GatherScatter, Cost); } else { - assert((isa<LoadInst>(&I) || !VF.isScalable()) && - "Cannot yet scalarize uniform stores"); Cost = getUniformMemOpCost(&I, VF); setWideningDecision(&I, VF, CM_Scalarize, Cost); } @@ -7517,8 +7069,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, InstWidening Decision = getWideningDecision(I, Width); assert(Decision != CM_Unknown && "CM decision should be taken at this point"); - if (Decision == CM_Scalarize) + if (Decision == CM_Scalarize) { + if (VF.isScalable() && isa<StoreInst>(I)) + // We can't scalarize a scalable vector store (even a uniform one + // currently), return an invalid cost so as to prevent vectorization. + return InstructionCost::getInvalid(); Width = ElementCount::getFixed(1); + } } VectorTy = ToVectorTy(getLoadStoreType(I), Width); return getMemoryInstructionCost(I, VF); @@ -7686,6 +7243,16 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { // Ignore ephemeral values. CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); + // Find all stores to invariant variables. Since they are going to sink + // outside the loop we do not need calculate cost for them. + for (BasicBlock *BB : TheLoop->blocks()) + for (Instruction &I : *BB) { + StoreInst *SI; + if ((SI = dyn_cast<StoreInst>(&I)) && + Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) + ValuesToIgnore.insert(&I); + } + // Ignore type-promoting instructions we identified during reduction // detection. 
for (auto &Reduction : Legal->getReductionVars()) { @@ -7787,7 +7354,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { if (VPlanBuildStressTest) return VectorizationFactor::Disabled(); - return {VF, 0 /*Cost*/}; + return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; } LLVM_DEBUG( @@ -7796,6 +7363,14 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { return VectorizationFactor::Disabled(); } +bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const { + unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); + return (NumRuntimePointerChecks > + VectorizerParams::RuntimeMemoryCheckThreshold && + !Hints.allowReordering()) || + NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; +} + Optional<VectorizationFactor> LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { assert(OrigLoop->isInnermost() && "Inner loop expected."); @@ -7830,7 +7405,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { CM.collectInLoopReductions(); buildVPlansWithVPRecipes(UserVF, UserVF); LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0}}; + return {{UserVF, 0, 0}}; } else reportVectorizationInfo("UserVF ignored because of invalid costs.", "InvalidCost", ORE, OrigLoop); @@ -7864,30 +7439,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { return VectorizationFactor::Disabled(); // Select the optimal vectorization factor. - auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); - - // Check if it is profitable to vectorize with runtime checks. - unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); - if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { - bool PragmaThresholdReached = - NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; - bool ThresholdReached = - NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; - if ((ThresholdReached && !Hints.allowReordering()) || - PragmaThresholdReached) { - ORE->emit([&]() { - return OptimizationRemarkAnalysisAliasing( - DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), - OrigLoop->getHeader()) - << "loop not vectorized: cannot prove it is safe to reorder " - "memory operations"; - }); - LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); - Hints.emitRemarkWithHints(); - return VectorizationFactor::Disabled(); - } - } - return SelectedVF; + return CM.selectVectorizationFactor(VFCandidates); } VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { @@ -7940,17 +7492,36 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, InnerLoopVectorizer &ILV, - DominatorTree *DT) { + DominatorTree *DT, + bool IsEpilogueVectorization) { LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF << '\n'); // Perform the actual loop transformation. - // 1. Create a new empty loop. Unlink the old loop and connect the new one. + // 1. Set up the skeleton for vectorization, including vector pre-header and + // middle block. The vector loop is created during VPlan execution. VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; Value *CanonicalIVStartValue; std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = ILV.createVectorizedLoopSkeleton(); + + // Only use noalias metadata when using memory checks guaranteeing no overlap + // across all iterations. 
+ const LoopAccessInfo *LAI = ILV.Legal->getLAI(); + if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && + !LAI->getRuntimePointerChecking()->getDiffChecks()) { + + // We currently don't use LoopVersioning for the actual loop cloning but we + // still use it to add the noalias metadata. + // TODO: Find a better way to re-use LoopVersioning functionality to add + // metadata. + State.LVer = std::make_unique<LoopVersioning>( + *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, + PSE.getSE()); + State.LVer->prepareNoAliasMetadata(); + } + ILV.collectPoisonGeneratingRecipes(State); ILV.printDebugTracesAtStart(); @@ -7966,7 +7537,9 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, // 2. Copy and widen instructions from the old loop into the new loop. BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), ILV.getOrCreateVectorTripCount(nullptr), - CanonicalIVStartValue, State); + CanonicalIVStartValue, State, + IsEpilogueVectorization); + BestVPlan.execute(&State); // Keep all loop hints from the original loop on the vector loop (we'll @@ -7977,8 +7550,10 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupVectorized}); - Loop *L = LI->getLoopFor(State.CFG.PrevBB); - if (VectorizedLoopID.hasValue()) + VPBasicBlock *HeaderVPBB = + BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); + Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); + if (VectorizedLoopID) L->setLoopID(VectorizedLoopID.getValue()); else { // Keep all loop hints from the original loop on the vector loop (we'll @@ -7995,7 +7570,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. - ILV.fixVectorizedLoop(State); + ILV.fixVectorizedLoop(State, BestVPlan); ILV.printDebugTracesAtEnd(); } @@ -8066,22 +7641,31 @@ Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } std::pair<BasicBlock *, Value *> EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { MDNode *OrigLoopID = OrigLoop->getLoopID(); - Loop *Lp = createVectorLoopSkeleton(""); + + // Workaround! Compute the trip count of the original loop and cache it + // before we start modifying the CFG. This code has a systemic problem + // wherein it tries to run analysis over partially constructed IR; this is + // wrong, and not simply for SCEV. The trip count of the original loop + // simply happens to be prone to hitting this in practice. In theory, we + // can hit the same issue for any SCEV, or ValueTracking query done during + // mutation. See PR49900. + getOrCreateTripCount(OrigLoop->getLoopPreheader()); + createVectorLoopSkeleton(""); // Generate the code to check the minimum iteration count of the vector // epilogue (see below). EPI.EpilogueIterationCountCheck = - emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); + emitIterationCountCheck(LoopScalarPreHeader, true); EPI.EpilogueIterationCountCheck->setName("iter.check"); // Generate the code to check any assumptions that we've made for SCEV // expressions. - EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); + EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); // Generate the code that checks at runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. 
- EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); + EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); // Generate the iteration count check for the main loop, *after* the check // for the epilogue loop, so that the path-length is shorter for the case @@ -8090,19 +7674,17 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { // trip count. Note: the branch will get updated later on when we vectorize // the epilogue. EPI.MainLoopIterationCountCheck = - emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); + emitIterationCountCheck(LoopScalarPreHeader, false); // Generate the induction variable. - Value *CountRoundDown = getOrCreateVectorTripCount(Lp); - EPI.VectorTripCount = CountRoundDown; - createHeaderBranch(Lp); + EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); // Skip induction resume value creation here because they will be created in // the second pass. If we created them here, they wouldn't be used anyway, // because the vplan in the second pass still contains the inductions from the // original loop. - return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; + return {completeLoopSkeleton(OrigLoopID), nullptr}; } void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { @@ -8122,13 +7704,13 @@ void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { }); } -BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( - Loop *L, BasicBlock *Bypass, bool ForEpilogue) { - assert(L && "Expected valid Loop."); +BasicBlock * +EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, + bool ForEpilogue) { assert(Bypass && "Expected valid bypass basic block."); ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; - Value *Count = getOrCreateTripCount(L); + Value *Count = getOrCreateTripCount(LoopVectorPreHeader); // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. BasicBlock *const TCCheckBlock = LoopVectorPreHeader; @@ -8187,7 +7769,7 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( std::pair<BasicBlock *, Value *> EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { MDNode *OrigLoopID = OrigLoop->getLoopID(); - Loop *Lp = createVectorLoopSkeleton("vec.epilog."); + createVectorLoopSkeleton("vec.epilog."); // Now, compare the remaining count and if there aren't enough iterations to // execute the vectorized epilogue skip to the scalar part. @@ -8196,7 +7778,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { LoopVectorPreHeader = SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, LI, nullptr, "vec.epilog.ph"); - emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, + emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, VecEpilogueIterationCountCheck); // Adjust the control flow taking the state info from the main loop @@ -8268,9 +7850,6 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), EPI.MainLoopIterationCountCheck); - // Generate the induction variable. - createHeaderBranch(Lp); - // Generate induction resume values. These variables save the new starting // indexes for the scalar loop. They are used to test if there are any tail // iterations left once the vector loop has completed. 
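
[Annotation] The induction resume values generated in this skeleton are easiest to see with concrete numbers. The sketch below uses an assumed trip count and assumed main/epilogue VF*UF products in plain C++ (not the SCEV/IR machinery used above) to show how the main vector trip count, the epilogue trip count and the scalar tail relate; the AdditionalBypass value mentioned below the hunk is the point where the epilogue resumes.

#include <cstdio>

int main() {
  unsigned TripCount = 1003;  // assumed original loop trip count
  unsigned MainVFxUF = 16;    // assumed elements per main vector-loop iteration
  unsigned EpilogueVFxUF = 4; // assumed elements per epilogue vector-loop iteration

  // The main vector loop only runs full VF*UF chunks.
  unsigned MainVectorTripCount = TripCount - TripCount % MainVFxUF;  // 992
  unsigned Remaining = TripCount - MainVectorTripCount;              // 11

  // The epilogue resumes at MainVectorTripCount (the AdditionalBypass value)
  // and again peels off whole EpilogueVFxUF chunks.
  unsigned EpilogueVectorTripCount =
      MainVectorTripCount + (Remaining - Remaining % EpilogueVFxUF); // 1000

  // bc.resume.val for the scalar loop: whatever neither vector loop covered.
  std::printf("main: [0,%u)  epilogue: [%u,%u)  scalar tail: [%u,%u)\n",
              MainVectorTripCount, MainVectorTripCount,
              EpilogueVectorTripCount, EpilogueVectorTripCount, TripCount);
}

For these assumed numbers the main loop covers [0,992), the vectorized epilogue covers [992,1000), and the scalar remainder handles the last three iterations, which is the "are there any tail iterations left" test the resume values feed.
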
@@ -8278,15 +7857,15 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { // check, then the resume value for the induction variable comes from // the trip count of the main vector loop, hence passing the AdditionalBypass // argument. - createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck, - EPI.VectorTripCount} /* AdditionalBypass */); + createInductionResumeValues({VecEpilogueIterationCountCheck, + EPI.VectorTripCount} /* AdditionalBypass */); - return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal}; + return {completeLoopSkeleton(OrigLoopID), EPResumeVal}; } BasicBlock * EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( - Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { + BasicBlock *Bypass, BasicBlock *Insert) { assert(EPI.TripCount && "Expected trip count to have been safed in the first pass."); @@ -8427,7 +8006,8 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { // constructing the desired canonical IV in the header block as its first // non-phi instructions. assert(CM.foldTailByMasking() && "must fold the tail"); - VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); + VPBasicBlock *HeaderVPBB = + Plan->getVectorLoopRegion()->getEntryBasicBlock(); auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); @@ -8469,8 +8049,6 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, "Must be called with either a load or store"); auto willWiden = [&](ElementCount VF) -> bool { - if (VF.isScalar()) - return false; LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, VF); assert(Decision != LoopVectorizationCostModel::CM_Unknown && @@ -8507,11 +8085,12 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, Mask, Consecutive, Reverse); } -static VPWidenIntOrFpInductionRecipe * -createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc, - VPValue *Start, const InductionDescriptor &IndDesc, - LoopVectorizationCostModel &CM, Loop &OrigLoop, - VFRange &Range) { +/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also +/// insert a recipe to expand the step for the induction recipe. +static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes( + PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, + const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM, + VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) { // Returns true if an instruction \p I should be scalarized instead of // vectorized for the chosen vectorization factor. auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { @@ -8519,18 +8098,6 @@ createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc, CM.isProfitableToScalarize(I, VF); }; - bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { - // Returns true if we should generate a scalar version of \p IV. 
- if (ShouldScalarizeInstruction(PhiOrTrunc, VF)) - return true; - auto isScalarInst = [&](User *U) -> bool { - auto *I = cast<Instruction>(U); - return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF); - }; - return any_of(PhiOrTrunc->users(), isScalarInst); - }, - Range); bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { return ShouldScalarizeInstruction(PhiOrTrunc, VF); @@ -8538,30 +8105,38 @@ createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc, Range); assert(IndDesc.getStartValue() == Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); + assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && + "step must be loop invariant"); + + VPValue *Step = + vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { - return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, TruncI, - NeedsScalarIV, !NeedsScalarIVOnly); + return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI, + !NeedsScalarIVOnly); } assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); - return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, NeedsScalarIV, + return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, !NeedsScalarIVOnly); } -VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI( - PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) const { +VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( + PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) { // Check if this is an integer or fp induction. If so, build the recipe that // produces its scalar and vector values. if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) - return createWidenInductionRecipe(Phi, Phi, Operands[0], *II, CM, *OrigLoop, - Range); + return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan, + *PSE.getSE(), *OrigLoop, Range); + // Check if this is pointer induction. If so, build the recipe for it. + if (auto *II = Legal->getPointerInductionDescriptor(Phi)) + return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II, + *PSE.getSE()); return nullptr; } VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( - TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, - VPlan &Plan) const { + TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) { // Optimize the special case where the source is a constant integer // induction variable. Notice that we can only optimize the 'trunc' case // because (a) FP conversions lose precision, (b) sext/zext may wrap, and @@ -8582,7 +8157,8 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( auto *Phi = cast<PHINode>(I->getOperand(0)); const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); - return createWidenInductionRecipe(Phi, I, Start, II, CM, *OrigLoop, Range); + return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan, + *PSE.getSE(), *OrigLoop, Range); } return nullptr; } @@ -8599,13 +8175,30 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, return Operands[0]; } + unsigned NumIncoming = Phi->getNumIncomingValues(); + // For in-loop reductions, we do not need to create an additional select. 
+ VPValue *InLoopVal = nullptr; + for (unsigned In = 0; In < NumIncoming; In++) { + PHINode *PhiOp = + dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue()); + if (PhiOp && CM.isInLoopReduction(PhiOp)) { + assert(!InLoopVal && "Found more than one in-loop reduction!"); + InLoopVal = Operands[In]; + } + } + + assert((!InLoopVal || NumIncoming == 2) && + "Found an in-loop reduction for PHI with unexpected number of " + "incoming values"); + if (InLoopVal) + return Operands[Operands[0] == InLoopVal ? 1 : 0]; + // We know that all PHIs in non-header blocks are converted into selects, so // we don't have to worry about the insertion order and we can just use the // builder. At this point we generate the predication tree. There may be // duplications since this is a simple recursive scan, but future // optimizations will clean it up. SmallVector<VPValue *, 2> OperandsWithMask; - unsigned NumIncoming = Phi->getNumIncomingValues(); for (unsigned In = 0; In < NumIncoming; In++) { VPValue *EdgeMask = @@ -8711,6 +8304,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, case Instruction::URem: case Instruction::Xor: case Instruction::ZExt: + case Instruction::Freeze: return true; } return false; @@ -8836,14 +8430,14 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, Plan->removeVPValueFor(Instr); Plan->addVPValue(Instr, PHIRecipe); } - auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); + auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); - VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); + VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true); // Note: first set Entry as region entry and then connect successors starting // from it in order, to propagate the "parent" of each VPBasicBlock. - VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); - VPBlockUtils::connectBlocks(Pred, Exit); + VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry); + VPBlockUtils::connectBlocks(Pred, Exiting); return Region; } @@ -8852,52 +8446,37 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, ArrayRef<VPValue *> Operands, VFRange &Range, VPlanPtr &Plan) { - // First, check for specific widening recipes that deal with calls, memory - // operations, inductions and Phi nodes. - if (auto *CI = dyn_cast<CallInst>(Instr)) - return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); - - if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) - return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); - + // First, check for specific widening recipes that deal with inductions, Phi + // nodes, calls and memory operations. 
VPRecipeBase *Recipe; if (auto Phi = dyn_cast<PHINode>(Instr)) { if (Phi->getParent() != OrigLoop->getHeader()) return tryToBlend(Phi, Operands, Plan); - if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) + if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) return toVPRecipeResult(Recipe); VPHeaderPHIRecipe *PhiRecipe = nullptr; - if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { - VPValue *StartV = Operands[0]; - if (Legal->isReductionVariable(Phi)) { - const RecurrenceDescriptor &RdxDesc = - Legal->getReductionVars().find(Phi)->second; - assert(RdxDesc.getRecurrenceStartValue() == - Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); - PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, - CM.isInLoopReduction(Phi), - CM.useOrderedReductions(RdxDesc)); - } else { - PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); - } - - // Record the incoming value from the backedge, so we can add the incoming - // value from the backedge after all recipes have been created. - recordRecipeOf(cast<Instruction>( - Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); - PhisToFix.push_back(PhiRecipe); + assert((Legal->isReductionVariable(Phi) || + Legal->isFirstOrderRecurrence(Phi)) && + "can only widen reductions and first-order recurrences here"); + VPValue *StartV = Operands[0]; + if (Legal->isReductionVariable(Phi)) { + const RecurrenceDescriptor &RdxDesc = + Legal->getReductionVars().find(Phi)->second; + assert(RdxDesc.getRecurrenceStartValue() == + Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); + PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, + CM.isInLoopReduction(Phi), + CM.useOrderedReductions(RdxDesc)); } else { - // TODO: record backedge value for remaining pointer induction phis. - assert(Phi->getType()->isPointerTy() && - "only pointer phis should be handled here"); - assert(Legal->getInductionVars().count(Phi) && - "Not an induction variable"); - InductionDescriptor II = Legal->getInductionVars().lookup(Phi); - VPValue *Start = Plan->getOrAddVPValue(II.getStartValue()); - PhiRecipe = new VPWidenPHIRecipe(Phi, Start); + PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); } + // Record the incoming value from the backedge, so we can add the incoming + // value from the backedge after all recipes have been created. + recordRecipeOf(cast<Instruction>( + Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); + PhisToFix.push_back(PhiRecipe); return toVPRecipeResult(PhiRecipe); } @@ -8906,6 +8485,17 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, Range, *Plan))) return toVPRecipeResult(Recipe); + // All widen recipes below deal only with VF > 1. + if (LoopVectorizationPlanner::getDecisionAndClampRange( + [&](ElementCount VF) { return VF.isScalar(); }, Range)) + return nullptr; + + if (auto *CI = dyn_cast<CallInst>(Instr)) + return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); + + if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) + return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); + if (!shouldWiden(Instr, Range)) return nullptr; @@ -8979,15 +8569,13 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a // BranchOnCount VPInstruction to the latch. 
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, - bool HasNUW, bool IsVPlanNative) { + bool HasNUW) { Value *StartIdx = ConstantInt::get(IdxTy, 0); auto *StartV = Plan.getOrAddVPValue(StartIdx); auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); - if (IsVPlanNative) - Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); Header->insert(CanonicalIVPHI, Header->begin()); auto *CanonicalIVIncrement = @@ -8996,11 +8584,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, {CanonicalIVPHI}, DL); CanonicalIVPHI->addOperand(CanonicalIVIncrement); - VPBasicBlock *EB = TopRegion->getExitBasicBlock(); - if (IsVPlanNative) { - EB = cast<VPBasicBlock>(EB->getSinglePredecessor()); - EB->setCondBit(nullptr); - } + VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); EB->appendRecipe(CanonicalIVIncrement); auto *BranchOnCount = @@ -9009,6 +8593,26 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, EB->appendRecipe(BranchOnCount); } +// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the +// original exit block. +static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, + VPBasicBlock *MiddleVPBB, Loop *OrigLoop, + VPlan &Plan) { + BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); + BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); + // Only handle single-exit loops with unique exit blocks for now. + if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) + return; + + // Introduce VPUsers modeling the exit values. + for (PHINode &ExitPhi : ExitBB->phis()) { + Value *IncomingValue = + ExitPhi.getIncomingValueForBlock(ExitingBB); + VPValue *V = Plan.getOrAddVPValue(IncomingValue, true); + Plan.addLiveOut(&ExitPhi, V); + } +} + VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, const MapVector<Instruction *, Instruction *> &SinkAfter) { @@ -9037,7 +8641,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( RecipeBuilder.recordRecipeOf(Phi); for (auto &R : ReductionOperations) { RecipeBuilder.recordRecipeOf(R); - // For min/max reducitons, where we have a pair of icmp/select, we also + // For min/max reductions, where we have a pair of icmp/select, we also // need to record the ICmp recipe, so it can be removed later. assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && "Only min/max recurrences allowed for inloop reductions"); @@ -9069,18 +8673,25 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // visit each basic block after having visited its predecessor basic blocks. // --------------------------------------------------------------------------- - // Create initial VPlan skeleton, with separate header and latch blocks. - VPBasicBlock *HeaderVPBB = new VPBasicBlock(); + // Create initial VPlan skeleton, starting with a block for the pre-header, + // followed by a region for the vector loop, followed by the middle block. The + // skeleton vector loop region contains a header and latch block. 
+ VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); + auto Plan = std::make_unique<VPlan>(Preheader); + + VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); - auto Plan = std::make_unique<VPlan>(TopRegion); + VPBlockUtils::insertBlockAfter(TopRegion, Preheader); + VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); + VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); Instruction *DLInst = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DLInst ? DLInst->getDebugLoc() : DebugLoc(), - !CM.foldTailByMasking(), false); + !CM.foldTailByMasking()); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. @@ -9093,11 +8704,12 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // Relevant instructions from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. unsigned VPBBsForBB = 0; - VPBB->setName(BB->getName()); + if (VPBB != HeaderVPBB) + VPBB->setName(BB->getName()); Builder.setInsertPoint(VPBB); // Introduce each ingredient into VPlan. - // TODO: Model and preserve debug instrinsics in VPlan. + // TODO: Model and preserve debug intrinsics in VPlan. for (Instruction &I : BB->instructionsWithoutDebug()) { Instruction *Instr = &I; @@ -9115,6 +8727,14 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( auto OpRange = Plan->mapToVPValues(Instr->operands()); Operands = {OpRange.begin(), OpRange.end()}; } + + // Invariant stores inside loop will be deleted and a single store + // with the final reduction value will be added to the exit block + StoreInst *SI; + if ((SI = dyn_cast<StoreInst>(&I)) && + Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) + continue; + if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( Instr, Operands, Range, Plan)) { // If Instr can be simplified to an existing VPValue, use it. @@ -9165,14 +8785,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); } + HeaderVPBB->setName("vector.body"); + // Fold the last, empty block into its predecessor. VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); assert(VPBB && "expected to fold last (empty) block"); // After here, VPBB should not be used. VPBB = nullptr; - assert(isa<VPRegionBlock>(Plan->getEntry()) && - !Plan->getEntry()->getEntryBasicBlock()->empty() && + addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); + + assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && + !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && "entry block must be set to a VPRegionBlock having a non-empty entry " "VPBasicBlock"); RecipeBuilder.fixHeaderPhis(); @@ -9252,12 +8876,13 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); // Adjust the recipes for any inloop reductions. - adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan, + adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan, RecipeBuilder, Range.Start); // Introduce a recipe to combine the incoming and previous values of a // first-order recurrence. 
- for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { + for (VPRecipeBase &R : + Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); if (!RecurPhi) continue; @@ -9317,13 +8942,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } } - // From this point onwards, VPlan-to-VPlan transformations may change the plan - // in ways that accessing values using original IR values is incorrect. - Plan->disableValue2VPValue(); - - VPlanTransforms::sinkScalarOperands(*Plan); - VPlanTransforms::mergeReplicateRegions(*Plan); - std::string PlanName; raw_string_ostream RSO(PlanName); ElementCount VF = Range.Start; @@ -9337,10 +8955,20 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( RSO.flush(); Plan->setName(PlanName); + // From this point onwards, VPlan-to-VPlan transformations may change the plan + // in ways that accessing values using original IR values is incorrect. + Plan->disableValue2VPValue(); + + VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); + VPlanTransforms::sinkScalarOperands(*Plan); + VPlanTransforms::mergeReplicateRegions(*Plan); + VPlanTransforms::removeDeadRecipes(*Plan); + VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); + // Fold Exit block into its predecessor if possible. // TODO: Fold block earlier once all VPlan transforms properly maintain a // VPBasicBlock as exit. - VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit()); + VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting()); assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; @@ -9365,23 +8993,20 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { VF *= 2) Plan->addVF(VF); - if (EnableVPlanPredication) { - VPlanPredicator VPP(*Plan); - VPP.predicate(); - - // Avoid running transformation to recipes until masked code generation in - // VPlan-native path is in place. - return Plan; - } - SmallPtrSet<Instruction *, 1> DeadInstructions; VPlanTransforms::VPInstructionsToVPRecipes( OrigLoop, Plan, [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, DeadInstructions, *PSE.getSE()); + // Remove the existing terminator of the exiting block of the top-most region. + // A BranchOnCount will be added instead when adding the canonical IV recipes. + auto *Term = + Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator(); + Term->eraseFromParent(); + addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - true, true); + true); return Plan; } @@ -9433,7 +9058,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); - auto *CondOp = CM.foldTailByMasking() + auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent()) ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) : nullptr; @@ -9453,9 +9078,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); Plan->removeVPValueFor(R); Plan->addVPValue(R, RedRecipe); - // Append the recipe to the end of the VPBasicBlock because we need to - // ensure that it comes after all of it's inputs, including CondOp. 
- WidenRecipe->getParent()->appendRecipe(RedRecipe); + WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); WidenRecipe->eraseFromParent(); @@ -9477,7 +9100,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( // dedicated latch block. if (CM.foldTailByMasking()) { Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); - for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { + for (VPRecipeBase &R : + Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); if (!PhiR || PhiR->isInLoop()) continue; @@ -9529,7 +9153,7 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { void VPWidenSelectRecipe::execute(VPTransformState &State) { auto &I = *cast<SelectInst>(getUnderlyingInstr()); - State.ILV->setDebugLocFromInst(&I); + State.setDebugLocFromInst(&I); // The condition can be loop invariant but still defined inside the // loop. This means that we can't just use the original 'cond' value. @@ -9544,7 +9168,7 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) { Value *Op1 = State.get(getOperand(2), Part); Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); State.set(this, Sel, Part); - State.ILV->addMetadata(Sel, &I); + State.addMetadata(Sel, &I); } } @@ -9578,7 +9202,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { case Instruction::Or: case Instruction::Xor: { // Just widen unops and binops. - State.ILV->setDebugLocFromInst(&I); + State.setDebugLocFromInst(&I); for (unsigned Part = 0; Part < State.UF; ++Part) { SmallVector<Value *, 2> Ops; @@ -9601,17 +9225,28 @@ void VPWidenRecipe::execute(VPTransformState &State) { // Use this vector value for all users of the original instruction. State.set(this, V, Part); - State.ILV->addMetadata(V, &I); + State.addMetadata(V, &I); } break; } + case Instruction::Freeze: { + State.setDebugLocFromInst(&I); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *Op = State.get(getOperand(0), Part); + + Value *Freeze = Builder.CreateFreeze(Op); + State.set(this, Freeze, Part); + } + break; + } case Instruction::ICmp: case Instruction::FCmp: { // Widen compares. Generate vector compares. bool FCmp = (I.getOpcode() == Instruction::FCmp); auto *Cmp = cast<CmpInst>(&I); - State.ILV->setDebugLocFromInst(Cmp); + State.setDebugLocFromInst(Cmp); for (unsigned Part = 0; Part < State.UF; ++Part) { Value *A = State.get(getOperand(0), Part); Value *B = State.get(getOperand(1), Part); @@ -9625,7 +9260,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { C = Builder.CreateICmp(Cmp->getPredicate(), A, B); } State.set(this, C, Part); - State.ILV->addMetadata(C, &I); + State.addMetadata(C, &I); } break; @@ -9644,7 +9279,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { case Instruction::FPTrunc: case Instruction::BitCast: { auto *CI = cast<CastInst>(&I); - State.ILV->setDebugLocFromInst(CI); + State.setDebugLocFromInst(CI); /// Vectorize casts. 
Type *DestTy = (State.VF.isScalar()) @@ -9655,7 +9290,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { Value *A = State.get(getOperand(0), Part); Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); State.set(this, Cast, Part); - State.ILV->addMetadata(Cast, &I); + State.addMetadata(Cast, &I); } break; } @@ -9691,7 +9326,7 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { for (unsigned Part = 0; Part < State.UF; ++Part) { Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); State.set(this, EntryPart, Part); - State.ILV->addMetadata(EntryPart, GEP); + State.addMetadata(EntryPart, GEP); } } else { // If the GEP has at least one loop-varying operand, we are sure to @@ -9729,32 +9364,276 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { // Create the new GEP. Note that this GEP may be a scalar if VF == 1, // but it should be a vector, otherwise. - auto *NewGEP = IsInBounds - ? State.Builder.CreateInBoundsGEP( - GEP->getSourceElementType(), Ptr, Indices) - : State.Builder.CreateGEP(GEP->getSourceElementType(), - Ptr, Indices); + auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, + Indices, "", IsInBounds); assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && "NewGEP is not a pointer vector"); State.set(this, NewGEP, Part); - State.ILV->addMetadata(NewGEP, GEP); + State.addMetadata(NewGEP, GEP); } } } void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); - auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); - State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV); + + Value *Start = getStartValue()->getLiveInIRValue(); + const InductionDescriptor &ID = getInductionDescriptor(); + TruncInst *Trunc = getTruncInst(); + IRBuilderBase &Builder = State.Builder; + assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); + assert(State.VF.isVector() && "must have vector VF"); + + // The value from the original loop to which we are mapping the new induction + // variable. + Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; + + // Fast-math-flags propagate from the original induction instruction. + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) + Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); + + // Now do the actual transformations, and start with fetching the step value. 
+ Value *Step = State.get(getStepValue(), VPIteration(0, 0)); + + assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && + "Expected either an induction phi-node or a truncate of it!"); + + // Construct the initial value of the vector IV in the vector loop preheader + auto CurrIP = Builder.saveIP(); + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + Builder.SetInsertPoint(VectorPH->getTerminator()); + if (isa<TruncInst>(EntryVal)) { + assert(Start->getType()->isIntegerTy() && + "Truncation requires an integer type"); + auto *TruncType = cast<IntegerType>(EntryVal->getType()); + Step = Builder.CreateTrunc(Step, TruncType); + Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); + } + + Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); + Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); + Value *SteppedStart = getStepVector( + SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); + + // We create vector phi nodes for both integer and floating-point induction + // variables. Here, we determine the kind of arithmetic we will perform. + Instruction::BinaryOps AddOp; + Instruction::BinaryOps MulOp; + if (Step->getType()->isIntegerTy()) { + AddOp = Instruction::Add; + MulOp = Instruction::Mul; + } else { + AddOp = ID.getInductionOpcode(); + MulOp = Instruction::FMul; + } + + // Multiply the vectorization factor by the step using integer or + // floating-point arithmetic as appropriate. + Type *StepType = Step->getType(); + Value *RuntimeVF; + if (Step->getType()->isFloatingPointTy()) + RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); + else + RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); + Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); + + // Create a vector splat to use in the induction update. + // + // FIXME: If the step is non-constant, we create the vector splat with + // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't + // handle a constant vector splat. + Value *SplatVF = isa<Constant>(Mul) + ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) + : Builder.CreateVectorSplat(State.VF, Mul); + Builder.restoreIP(CurrIP); + + // We may need to add the step a number of times, depending on the unroll + // factor. The last of those goes into the PHI. + PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", + &*State.CFG.PrevBB->getFirstInsertionPt()); + VecInd->setDebugLoc(EntryVal->getDebugLoc()); + Instruction *LastInduction = VecInd; + for (unsigned Part = 0; Part < State.UF; ++Part) { + State.set(this, LastInduction, Part); + + if (isa<TruncInst>(EntryVal)) + State.addMetadata(LastInduction, EntryVal); + + LastInduction = cast<Instruction>( + Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); + LastInduction->setDebugLoc(EntryVal->getDebugLoc()); + } + + LastInduction->setName("vec.ind.next"); + VecInd->addIncoming(SteppedStart, VectorPH); + // Add induction update using an incorrect block temporarily. The phi node + // will be fixed after VPlan execution. Note that at this point the latch + // block cannot be used, as it does not exist yet. + // TODO: Model increment value in VPlan, by turning the recipe into a + // multi-def and a subclass of VPHeaderPHIRecipe. 
+ VecInd->addIncoming(LastInduction, VectorPH); +} + +void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { + assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && + "Not a pointer induction according to InductionDescriptor!"); + assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && + "Unexpected type."); + + auto *IVR = getParent()->getPlan()->getCanonicalIV(); + PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); + + if (onlyScalarsGenerated(State.VF)) { + // This is the normalized GEP that starts counting at zero. + Value *PtrInd = State.Builder.CreateSExtOrTrunc( + CanonicalIV, IndDesc.getStep()->getType()); + // Determine the number of scalars we need to generate for each unroll + // iteration. If the instruction is uniform, we only need to generate the + // first lane. Otherwise, we generate all VF values. + bool IsUniform = vputils::onlyFirstLaneUsed(this); + assert((IsUniform || !State.VF.isScalable()) && + "Cannot scalarize a scalable VF"); + unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *PartStart = + createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part); + + for (unsigned Lane = 0; Lane < Lanes; ++Lane) { + Value *Idx = State.Builder.CreateAdd( + PartStart, ConstantInt::get(PtrInd->getType(), Lane)); + Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); + + Value *Step = CreateStepValue(IndDesc.getStep(), SE, + State.CFG.PrevBB->getTerminator()); + Value *SclrGep = emitTransformedIndex( + State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); + SclrGep->setName("next.gep"); + State.set(this, SclrGep, VPIteration(Part, Lane)); + } + } + return; + } + + assert(isa<SCEVConstant>(IndDesc.getStep()) && + "Induction step not a SCEV constant!"); + Type *PhiType = IndDesc.getStep()->getType(); + + // Build a pointer phi + Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); + Type *ScStValueType = ScalarStartValue->getType(); + PHINode *NewPointerPhi = + PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); + + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); + + // A pointer induction, performed by using a gep + const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout(); + Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); + + const SCEV *ScalarStep = IndDesc.getStep(); + SCEVExpander Exp(SE, DL, "induction"); + Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); + Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); + Value *NumUnrolledElems = + State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); + Value *InductionGEP = GetElementPtrInst::Create( + IndDesc.getElementType(), NewPointerPhi, + State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", + InductionLoc); + // Add induction update using an incorrect block temporarily. The phi node + // will be fixed after VPlan execution. Note that at this point the latch + // block cannot be used, as it does not exist yet. + // TODO: Model increment value in VPlan, by turning the recipe into a + // multi-def and a subclass of VPHeaderPHIRecipe. + NewPointerPhi->addIncoming(InductionGEP, VectorPH); + + // Create UF many actual address geps that use the pointer + // phi as base and a vectorized version of the step value + // (<step*0, ..., step*N>) as offset. 
+ for (unsigned Part = 0; Part < State.UF; ++Part) { + Type *VecPhiType = VectorType::get(PhiType, State.VF); + Value *StartOffsetScalar = + State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); + Value *StartOffset = + State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); + // Create a vector of consecutive numbers from zero to VF. + StartOffset = State.Builder.CreateAdd( + StartOffset, State.Builder.CreateStepVector(VecPhiType)); + + Value *GEP = State.Builder.CreateGEP( + IndDesc.getElementType(), NewPointerPhi, + State.Builder.CreateMul( + StartOffset, + State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), + "vector.gep")); + State.set(this, GEP, Part); + } } -void VPWidenPHIRecipe::execute(VPTransformState &State) { - State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, - State); +void VPScalarIVStepsRecipe::execute(VPTransformState &State) { + assert(!State.Instance && "VPScalarIVStepsRecipe being replicated."); + + // Fast-math-flags propagate from the original induction instruction. + IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); + if (IndDesc.getInductionBinOp() && + isa<FPMathOperator>(IndDesc.getInductionBinOp())) + State.Builder.setFastMathFlags( + IndDesc.getInductionBinOp()->getFastMathFlags()); + + Value *Step = State.get(getStepValue(), VPIteration(0, 0)); + auto CreateScalarIV = [&](Value *&Step) -> Value * { + Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0)); + auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); + if (!isCanonical() || CanonicalIV->getType() != Ty) { + ScalarIV = + Ty->isIntegerTy() + ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty) + : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty); + ScalarIV = emitTransformedIndex(State.Builder, ScalarIV, + getStartValue()->getLiveInIRValue(), Step, + IndDesc); + ScalarIV->setName("offset.idx"); + } + if (TruncToTy) { + assert(Step->getType()->isIntegerTy() && + "Truncation requires an integer step"); + ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy); + Step = State.Builder.CreateTrunc(Step, TruncToTy); + } + return ScalarIV; + }; + + Value *ScalarIV = CreateScalarIV(Step); + if (State.VF.isVector()) { + buildScalarSteps(ScalarIV, Step, IndDesc, this, State); + return; + } + + for (unsigned Part = 0; Part < State.UF; ++Part) { + assert(!State.VF.isScalable() && "scalable vectors not yet supported."); + Value *EntryPart; + if (Step->getType()->isFloatingPointTy()) { + Value *StartIdx = + getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part); + // Floating-point operations inherit FMF via the builder's flags. + Value *MulOp = State.Builder.CreateFMul(StartIdx, Step); + EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(), + ScalarIV, MulOp); + } else { + Value *StartIdx = + getRuntimeVF(State.Builder, Step->getType(), State.VF * Part); + EntryPart = State.Builder.CreateAdd( + ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction"); + } + State.set(this, EntryPart, Part); + } } void VPBlendRecipe::execute(VPTransformState &State) { - State.ILV->setDebugLocFromInst(Phi, &State.Builder); + State.setDebugLocFromInst(Phi); // We know that all PHIs in non-header blocks are converted into // selects, so we don't have to worry about the insertion order and we // can just use the builder. 
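
A rough standalone sketch of what the VPWidenIntOrFpInductionRecipe::execute hunk above materializes for an integer induction, written in plain C++ without LLVM types; every name here is illustrative and not part of this change. Part 0 holds splat(start) + <0,1,...,VF-1>*step, and each further unrolled part is the previous one plus the "step.add" splat of VF*step.

// Standalone model (assumed names/types, no LLVM API) of the lane values of a
// widened integer induction for one vector iteration.
#include <cstdio>
#include <vector>

static std::vector<long> widenedIVPart(long Start, long Step, unsigned VF,
                                       unsigned Part) {
  std::vector<long> Lanes(VF);
  for (unsigned Lane = 0; Lane != VF; ++Lane)
    Lanes[Lane] = Start + static_cast<long>(Part * VF + Lane) * Step;
  return Lanes;
}

int main() {
  const unsigned VF = 4, UF = 2;
  for (unsigned Part = 0; Part != UF; ++Part) {
    std::printf("part %u:", Part);
    for (long V : widenedIVPart(/*Start=*/0, /*Step=*/3, VF, Part))
      std::printf(" %ld", V);
    std::printf("\n"); // part 0: 0 3 6 9    part 1: 12 15 18 21
  }
  // The loop-carried "vec.ind" PHI would then advance by UF * VF * Step
  // (here 24) on each vector iteration.
  return 0;
}
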
@@ -10015,7 +9894,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { // Handle Stores: if (SI) { - State.ILV->setDebugLocFromInst(SI); + State.setDebugLocFromInst(SI); for (unsigned Part = 0; Part < State.UF; ++Part) { Instruction *NewSI = nullptr; @@ -10041,14 +9920,14 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { else NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); } - State.ILV->addMetadata(NewSI, SI); + State.addMetadata(NewSI, SI); } return; } // Handle loads. assert(LI && "Must have a load instruction"); - State.ILV->setDebugLocFromInst(LI); + State.setDebugLocFromInst(LI); for (unsigned Part = 0; Part < State.UF; ++Part) { Value *NewLI; if (CreateGatherScatter) { @@ -10056,7 +9935,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { Value *VectorGep = State.get(getAddr(), Part); NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, nullptr, "wide.masked.gather"); - State.ILV->addMetadata(NewLI, LI); + State.addMetadata(NewLI, LI); } else { auto *VecPtr = CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); @@ -10069,12 +9948,12 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); // Add metadata to the load, but setVectorValue to the reverse shuffle. - State.ILV->addMetadata(NewLI, LI); + State.addMetadata(NewLI, LI); if (Reverse) NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); } - State.set(this, NewLI, Part); + State.set(getVPSingleValue(), NewLI, Part); } } @@ -10155,7 +10034,8 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) { // Check if there is a scalar value for the selected lane. if (!hasScalarValue(Def, {Part, LastLane})) { // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. - assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && + assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) || + isa<VPScalarIVStepsRecipe>(Def->getDef())) && "unexpected recipe found to be invariant"); IsUniform = true; LastLane = 0; @@ -10237,8 +10117,7 @@ static bool processLoopInVPlanNativePath( // If we are stress testing VPlan builds, do not attempt to generate vector // code. Masked vector code generation support will follow soon. // Also, do not attempt to vectorize if no vector code will be produced. - if (VPlanBuildStressTest || EnableVPlanPredication || - VectorizationFactor::Disabled() == VF) + if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) return false; VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); @@ -10250,7 +10129,7 @@ static bool processLoopInVPlanNativePath( &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); - LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); + LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); } // Mark the loop as already vectorized to avoid vectorizing again. 
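
A minimal sketch of the consecutive-but-reversed case handled by the VPWidenMemoryInstructionRecipe::execute hunk above, again in plain C++ with assumed names, a fixed VF and an int element type rather than LLVM IR: the wide load reads the VF contiguous elements that end at the current scalar address, then the lanes are reversed so lane 0 matches the first scalar iteration of the vector step.

// Standalone model (assumed names/types, no LLVM API) of a reversed
// consecutive access widened into one load plus a lane reversal.
#include <algorithm>
#include <array>
#include <cassert>

template <unsigned VF>
std::array<int, VF> wideReverseLoad(const int *ScalarPtr) {
  const int *VecPtr = ScalarPtr - (VF - 1);  // adjusted base of the wide load
  std::array<int, VF> V;
  std::copy(VecPtr, VecPtr + VF, V.begin()); // "wide.load"
  std::reverse(V.begin(), V.end());          // "reverse" shuffle
  return V;
}

int main() {
  int A[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  // A scalar loop walking A downwards from index 7 reads 7, 6, 5, 4 in its
  // first four iterations; the widened form yields the same values in order.
  auto V = wideReverseLoad<4>(&A[7]);
  assert(V[0] == 7 && V[1] == 6 && V[2] == 5 && V[3] == 4);
  return 0;
}
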
@@ -10318,8 +10197,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { const std::string DebugLocStr = getDebugLocString(L); #endif /* NDEBUG */ - LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" - << L->getHeader()->getParent()->getName() << "\" from " + LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" + << L->getHeader()->getParent()->getName() << "' from " << DebugLocStr << "\n"); LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); @@ -10474,10 +10353,30 @@ bool LoopVectorizePass::processLoop(Loop *L) { VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; + GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, + F->getParent()->getDataLayout()); if (MaybeVF) { + if (LVP.requiresTooManyRuntimeChecks()) { + ORE->emit([&]() { + return OptimizationRemarkAnalysisAliasing( + DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), + L->getHeader()) + << "loop not vectorized: cannot prove it is safe to reorder " + "memory operations"; + }); + LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); + Hints.emitRemarkWithHints(); + return false; + } VF = *MaybeVF; // Select the interleave count. IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); + + unsigned SelectedIC = std::max(IC, UserIC); + // Optimistically generate runtime checks if they are needed. Drop them if + // they turn out to not be profitable. + if (VF.Width.isVector() || SelectedIC > 1) + Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); } // Identify the diagnostic messages that should be produced. @@ -10565,14 +10464,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool DisableRuntimeUnroll = false; MDNode *OrigLoopID = L->getLoopID(); { - // Optimistically generate runtime checks. Drop them if they turn out to not - // be profitable. Limit the scope of Checks, so the cleanup happens - // immediately after vector codegeneration is done. - GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, - F->getParent()->getDataLayout()); - if (!VF.Width.isScalar() || IC > 1) - Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); - using namespace ore; if (!VectorizeLoop) { assert(IC > 1 && "interleave count should not be 1 or 0"); @@ -10582,7 +10473,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { &CM, BFI, PSI, Checks); VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); - LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); + LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); ORE->emit([&]() { return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), @@ -10607,12 +10498,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, - DT); + DT, true); ++LoopsVectorized; - simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); - formLCSSARecursively(*L, *DT, LI, SE); - // Second pass vectorizes the epilogue and adjusts the control flow // edges from the first pass. EPI.MainLoopVF = EPI.EpilogueVF; @@ -10622,23 +10510,24 @@ bool LoopVectorizePass::processLoop(Loop *L) { Checks); VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); + VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); + VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); + Header->setName("vec.epilog.vector.body"); // Ensure that the start values for any VPReductionPHIRecipes are // updated before vectorising the epilogue loop. 
- VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock(); for (VPRecipeBase &R : Header->phis()) { if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { if (auto *Resume = MainILV.getReductionResumeValue( ReductionPhi->getRecurrenceDescriptor())) { - VPValue *StartVal = new VPValue(Resume); - BestEpiPlan.addExternalDef(StartVal); + VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume); ReductionPhi->setOperand(0, StartVal); } } } LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, - DT); + DT, true); ++LoopsEpilogueVectorized; if (!MainILV.areSafetyChecksAdded()) @@ -10648,7 +10537,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { &LVL, &CM, BFI, PSI, Checks); VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); - LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); + LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); ++LoopsVectorized; // Add metadata to disable runtime unrolling a scalar loop when there @@ -10674,7 +10563,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { Optional<MDNode *> RemainderLoopID = makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupEpilogue}); - if (RemainderLoopID.hasValue()) { + if (RemainderLoopID) { L->setLoopID(RemainderLoopID.getValue()); } else { if (DisableRuntimeUnroll) @@ -10756,8 +10645,12 @@ LoopVectorizeResult LoopVectorizePass::runImpl( PreservedAnalyses LoopVectorizePass::run(Function &F, FunctionAnalysisManager &AM) { - auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); auto &LI = AM.getResult<LoopAnalysis>(F); + // There are no loops in the function. Return before computing other expensive + // analyses. + if (LI.empty()) + return PreservedAnalyses::all(); + auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); auto &TTI = AM.getResult<TargetIRAnalysis>(F); auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 644372483edd..019a09665a67 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -53,7 +53,6 @@ #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" @@ -64,7 +63,6 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" -#include "llvm/IR/NoFolder.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" @@ -72,8 +70,9 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#ifdef EXPENSIVE_CHECKS #include "llvm/IR/Verifier.h" -#include "llvm/InitializePasses.h" +#endif #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -87,6 +86,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/InjectTLIMappings.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Vectorize.h" #include <algorithm> @@ -164,13 +164,14 @@ static cl::opt<int> LookAheadMaxDepth( "slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores")); -// The Look-ahead heuristic goes through 
the users of the bundle to calculate -// the users cost in getExternalUsesCost(). To avoid compilation time increase -// we limit the number of users visited to this value. -static cl::opt<unsigned> LookAheadUsersBudget( - "slp-look-ahead-users-budget", cl::init(2), cl::Hidden, - cl::desc("The maximum number of users to visit while visiting the " - "predecessors. This prevents compilation time increase.")); +// The maximum depth that the look-ahead score heuristic will explore +// when it probing among candidates for vectorization tree roots. +// The higher this value, the higher the compilation time overhead but unlike +// similar limit for operands ordering this is less frequently used, hence +// impact of higher value is less noticeable. +static cl::opt<int> RootLookAheadMaxDepth( + "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, + cl::desc("The maximum look-ahead depth for searching best rooting option")); static cl::opt<bool> ViewSLPTree("view-slp-tree", cl::Hidden, @@ -471,17 +472,36 @@ static bool isValidForAlternation(unsigned Opcode) { return true; } +static InstructionsState getSameOpcode(ArrayRef<Value *> VL, + unsigned BaseIndex = 0); + +/// Checks if the provided operands of 2 cmp instructions are compatible, i.e. +/// compatible instructions or constants, or just some other regular values. +static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, + Value *Op1) { + return (isConstant(BaseOp0) && isConstant(Op0)) || + (isConstant(BaseOp1) && isConstant(Op1)) || + (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) && + !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) || + getSameOpcode({BaseOp0, Op0}).getOpcode() || + getSameOpcode({BaseOp1, Op1}).getOpcode(); +} + /// \returns analysis of the Instructions in \p VL described in /// InstructionsState, the Opcode that we suppose the whole list /// could be vectorized even if its structure is diverse. static InstructionsState getSameOpcode(ArrayRef<Value *> VL, - unsigned BaseIndex = 0) { + unsigned BaseIndex) { // Make sure these are all Instructions. if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); })) return InstructionsState(VL[BaseIndex], nullptr, nullptr); bool IsCastOp = isa<CastInst>(VL[BaseIndex]); bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]); + bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]); + CmpInst::Predicate BasePred = + IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate() + : CmpInst::BAD_ICMP_PREDICATE; unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode(); unsigned AltOpcode = Opcode; unsigned AltIndex = BaseIndex; @@ -514,6 +534,57 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL, continue; } } + } else if (IsCmpOp && isa<CmpInst>(VL[Cnt])) { + auto *BaseInst = cast<Instruction>(VL[BaseIndex]); + auto *Inst = cast<Instruction>(VL[Cnt]); + Type *Ty0 = BaseInst->getOperand(0)->getType(); + Type *Ty1 = Inst->getOperand(0)->getType(); + if (Ty0 == Ty1) { + Value *BaseOp0 = BaseInst->getOperand(0); + Value *BaseOp1 = BaseInst->getOperand(1); + Value *Op0 = Inst->getOperand(0); + Value *Op1 = Inst->getOperand(1); + CmpInst::Predicate CurrentPred = + cast<CmpInst>(VL[Cnt])->getPredicate(); + CmpInst::Predicate SwappedCurrentPred = + CmpInst::getSwappedPredicate(CurrentPred); + // Check for compatible operands. If the corresponding operands are not + // compatible - need to perform alternate vectorization. 
+ if (InstOpcode == Opcode) { + if (BasePred == CurrentPred && + areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1)) + continue; + if (BasePred == SwappedCurrentPred && + areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0)) + continue; + if (E == 2 && + (BasePred == CurrentPred || BasePred == SwappedCurrentPred)) + continue; + auto *AltInst = cast<CmpInst>(VL[AltIndex]); + CmpInst::Predicate AltPred = AltInst->getPredicate(); + Value *AltOp0 = AltInst->getOperand(0); + Value *AltOp1 = AltInst->getOperand(1); + // Check if operands are compatible with alternate operands. + if (AltPred == CurrentPred && + areCompatibleCmpOps(AltOp0, AltOp1, Op0, Op1)) + continue; + if (AltPred == SwappedCurrentPred && + areCompatibleCmpOps(AltOp0, AltOp1, Op1, Op0)) + continue; + } + if (BaseIndex == AltIndex && BasePred != CurrentPred) { + assert(isValidForAlternation(Opcode) && + isValidForAlternation(InstOpcode) && + "Cast isn't safe for alternation, logic needs to be updated!"); + AltIndex = Cnt; + continue; + } + auto *AltInst = cast<CmpInst>(VL[AltIndex]); + CmpInst::Predicate AltPred = AltInst->getPredicate(); + if (BasePred == CurrentPred || BasePred == SwappedCurrentPred || + AltPred == CurrentPred || AltPred == SwappedCurrentPred) + continue; + } } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; return InstructionsState(VL[BaseIndex], nullptr, nullptr); @@ -570,7 +641,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, CallInst *CI = cast<CallInst>(UserInst); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) { - if (hasVectorInstrinsicScalarOpd(ID, i)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, i)) return (CI->getArgOperand(i) == Scalar); } LLVM_FALLTHROUGH; @@ -666,11 +737,11 @@ static void inversePermutation(ArrayRef<unsigned> Indices, /// \returns inserting index of InsertElement or InsertValue instruction, /// using Offset as base offset for index. -static Optional<unsigned> getInsertIndex(Value *InsertInst, +static Optional<unsigned> getInsertIndex(const Value *InsertInst, unsigned Offset = 0) { int Index = Offset; - if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) { - if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) { + if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) { + if (const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) { auto *VT = cast<FixedVectorType>(IE->getType()); if (CI->getValue().uge(VT->getNumElements())) return None; @@ -681,13 +752,13 @@ static Optional<unsigned> getInsertIndex(Value *InsertInst, return None; } - auto *IV = cast<InsertValueInst>(InsertInst); + const auto *IV = cast<InsertValueInst>(InsertInst); Type *CurrentType = IV->getType(); for (unsigned I : IV->indices()) { - if (auto *ST = dyn_cast<StructType>(CurrentType)) { + if (const auto *ST = dyn_cast<StructType>(CurrentType)) { Index *= ST->getNumElements(); CurrentType = ST->getElementType(I); - } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) { + } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) { Index *= AT->getNumElements(); CurrentType = AT->getElementType(); } else { @@ -698,11 +769,7 @@ static Optional<unsigned> getInsertIndex(Value *InsertInst, return Index; } -/// Reorders the list of scalars in accordance with the given \p Order and then -/// the \p Mask. \p Order - is the original order of the scalars, need to -/// reorder scalars into an unordered state at first according to the given -/// order. 
Then the ordered scalars are shuffled once again in accordance with -/// the provided mask. +/// Reorders the list of scalars in accordance with the given \p Mask. static void reorderScalars(SmallVectorImpl<Value *> &Scalars, ArrayRef<int> Mask) { assert(!Mask.empty() && "Expected non-empty mask."); @@ -714,6 +781,58 @@ static void reorderScalars(SmallVectorImpl<Value *> &Scalars, Scalars[Mask[I]] = Prev[I]; } +/// Checks if the provided value does not require scheduling. It does not +/// require scheduling if this is not an instruction or it is an instruction +/// that does not read/write memory and all operands are either not instructions +/// or phi nodes or instructions from different blocks. +static bool areAllOperandsNonInsts(Value *V) { + auto *I = dyn_cast<Instruction>(V); + if (!I) + return true; + return !mayHaveNonDefUseDependency(*I) && + all_of(I->operands(), [I](Value *V) { + auto *IO = dyn_cast<Instruction>(V); + if (!IO) + return true; + return isa<PHINode>(IO) || IO->getParent() != I->getParent(); + }); +} + +/// Checks if the provided value does not require scheduling. It does not +/// require scheduling if this is not an instruction or it is an instruction +/// that does not read/write memory and all users are phi nodes or instructions +/// from the different blocks. +static bool isUsedOutsideBlock(Value *V) { + auto *I = dyn_cast<Instruction>(V); + if (!I) + return true; + // Limits the number of uses to save compile time. + constexpr int UsesLimit = 8; + return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) && + all_of(I->users(), [I](User *U) { + auto *IU = dyn_cast<Instruction>(U); + if (!IU) + return true; + return IU->getParent() != I->getParent() || isa<PHINode>(IU); + }); +} + +/// Checks if the specified value does not require scheduling. It does not +/// require scheduling if all operands and all users do not need to be scheduled +/// in the current basic block. +static bool doesNotNeedToBeScheduled(Value *V) { + return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V); +} + +/// Checks if the specified array of instructions does not require scheduling. +/// It is so if all either instructions have operands that do not require +/// scheduling or their users do not require scheduling since they are phis or +/// in other basic blocks. +static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) { + return !VL.empty() && + (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts)); +} + namespace slpvectorizer { /// Bottom Up SLP Vectorizer. @@ -734,8 +853,8 @@ public: TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE) - : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), - DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) { + : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), + DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) { CodeMetrics::collectEphemeralValues(F, AC, EphValues); // Use the vector register size specified by the target unless overridden // by a command-line option. @@ -776,7 +895,10 @@ public: /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst. void buildTree(ArrayRef<Value *> Roots, - ArrayRef<Value *> UserIgnoreLst = None); + const SmallDenseSet<Value *> &UserIgnoreLst); + + /// Construct a vectorizable tree that starts at \p Roots. 
+ void buildTree(ArrayRef<Value *> Roots); /// Builds external uses of the vectorized scalars, i.e. the list of /// vectorized scalars to be extracted, their lanes and their scalar users. \p @@ -797,6 +919,7 @@ public: } MinBWs.clear(); InstrElementSize.clear(); + UserIgnoreList = nullptr; } unsigned getTreeSize() const { return VectorizableTree.size(); } @@ -810,6 +933,9 @@ public: /// ExtractElement, ExtractValue), which can be part of the graph. Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE); + /// Sort loads into increasing pointers offsets to allow greater clustering. + Optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE); + /// Gets reordering data for the given tree entry. If the entry is vectorized /// - just return ReorderIndices, otherwise check if the scalars can be /// reordered and return the most optimal order. @@ -924,96 +1050,18 @@ public: #endif }; - /// A helper data structure to hold the operands of a vector of instructions. - /// This supports a fixed vector length for all operand vectors. - class VLOperands { - /// For each operand we need (i) the value, and (ii) the opcode that it - /// would be attached to if the expression was in a left-linearized form. - /// This is required to avoid illegal operand reordering. - /// For example: - /// \verbatim - /// 0 Op1 - /// |/ - /// Op1 Op2 Linearized + Op2 - /// \ / ----------> |/ - /// - - - /// - /// Op1 - Op2 (0 + Op1) - Op2 - /// \endverbatim - /// - /// Value Op1 is attached to a '+' operation, and Op2 to a '-'. - /// - /// Another way to think of this is to track all the operations across the - /// path from the operand all the way to the root of the tree and to - /// calculate the operation that corresponds to this path. For example, the - /// path from Op2 to the root crosses the RHS of the '-', therefore the - /// corresponding operation is a '-' (which matches the one in the - /// linearized tree, as shown above). - /// - /// For lack of a better term, we refer to this operation as Accumulated - /// Path Operation (APO). - struct OperandData { - OperandData() = default; - OperandData(Value *V, bool APO, bool IsUsed) - : V(V), APO(APO), IsUsed(IsUsed) {} - /// The operand value. - Value *V = nullptr; - /// TreeEntries only allow a single opcode, or an alternate sequence of - /// them (e.g, +, -). Therefore, we can safely use a boolean value for the - /// APO. It is set to 'true' if 'V' is attached to an inverse operation - /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise - /// (e.g., Add/Mul) - bool APO = false; - /// Helper data for the reordering function. - bool IsUsed = false; - }; - - /// During operand reordering, we are trying to select the operand at lane - /// that matches best with the operand at the neighboring lane. Our - /// selection is based on the type of value we are looking for. For example, - /// if the neighboring lane has a load, we need to look for a load that is - /// accessing a consecutive address. These strategies are summarized in the - /// 'ReorderingMode' enumerator. - enum class ReorderingMode { - Load, ///< Matching loads to consecutive memory addresses - Opcode, ///< Matching instructions based on opcode (same or alternate) - Constant, ///< Matching constants - Splat, ///< Matching the same instruction multiple times (broadcast) - Failed, ///< We failed to create a vectorizable group - }; - - using OperandDataVec = SmallVector<OperandData, 2>; - - /// A vector of operand vectors. 
- SmallVector<OperandDataVec, 4> OpsVec; - + /// A helper class used for scoring candidates for two consecutive lanes. + class LookAheadHeuristics { const DataLayout &DL; ScalarEvolution &SE; const BoUpSLP &R; + int NumLanes; // Total number of lanes (aka vectorization factor). + int MaxLevel; // The maximum recursion depth for accumulating score. - /// \returns the operand data at \p OpIdx and \p Lane. - OperandData &getData(unsigned OpIdx, unsigned Lane) { - return OpsVec[OpIdx][Lane]; - } - - /// \returns the operand data at \p OpIdx and \p Lane. Const version. - const OperandData &getData(unsigned OpIdx, unsigned Lane) const { - return OpsVec[OpIdx][Lane]; - } - - /// Clears the used flag for all entries. - void clearUsed() { - for (unsigned OpIdx = 0, NumOperands = getNumOperands(); - OpIdx != NumOperands; ++OpIdx) - for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; - ++Lane) - OpsVec[OpIdx][Lane].IsUsed = false; - } - - /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2. - void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) { - std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); - } + public: + LookAheadHeuristics(const DataLayout &DL, ScalarEvolution &SE, + const BoUpSLP &R, int NumLanes, int MaxLevel) + : DL(DL), SE(SE), R(R), NumLanes(NumLanes), MaxLevel(MaxLevel) {} // The hard-coded scores listed here are not very important, though it shall // be higher for better matches to improve the resulting cost. When @@ -1028,6 +1076,11 @@ public: /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). static const int ScoreConsecutiveLoads = 4; + /// The same load multiple times. This should have a better score than + /// `ScoreSplat` because it in x86 for a 2-lane vector we can represent it + /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for + /// a vector load and 1.0 for a broadcast. + static const int ScoreSplatLoads = 3; /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]). static const int ScoreReversedLoads = 3; /// ExtractElementInst from same vector and consecutive indexes. @@ -1046,43 +1099,67 @@ public: static const int ScoreUndef = 1; /// Score for failing to find a decent match. static const int ScoreFail = 0; - /// User exteranl to the vectorized code. - static const int ExternalUseCost = 1; - /// The user is internal but in a different lane. - static const int UserInDiffLaneCost = ExternalUseCost; + /// Score if all users are vectorized. + static const int ScoreAllUserVectorized = 1; /// \returns the score of placing \p V1 and \p V2 in consecutive lanes. - static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL, - ScalarEvolution &SE, int NumLanes) { - if (V1 == V2) - return VLOperands::ScoreSplat; + /// \p U1 and \p U2 are the users of \p V1 and \p V2. + /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p + /// MainAltOps. + int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, + ArrayRef<Value *> MainAltOps) const { + if (V1 == V2) { + if (isa<LoadInst>(V1)) { + // Retruns true if the users of V1 and V2 won't need to be extracted. + auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) { + // Bail out if we have too many uses to save compilation time. 
+ static constexpr unsigned Limit = 8; + if (V1->hasNUsesOrMore(Limit) || V2->hasNUsesOrMore(Limit)) + return false; + + auto AllUsersVectorized = [U1, U2, this](Value *V) { + return llvm::all_of(V->users(), [U1, U2, this](Value *U) { + return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr; + }); + }; + return AllUsersVectorized(V1) && AllUsersVectorized(V2); + }; + // A broadcast of a load can be cheaper on some targets. + if (R.TTI->isLegalBroadcastLoad(V1->getType(), + ElementCount::getFixed(NumLanes)) && + ((int)V1->getNumUses() == NumLanes || + AllUsersAreInternal(V1, V2))) + return LookAheadHeuristics::ScoreSplatLoads; + } + return LookAheadHeuristics::ScoreSplat; + } auto *LI1 = dyn_cast<LoadInst>(V1); auto *LI2 = dyn_cast<LoadInst>(V2); if (LI1 && LI2) { if (LI1->getParent() != LI2->getParent()) - return VLOperands::ScoreFail; + return LookAheadHeuristics::ScoreFail; Optional<int> Dist = getPointersDiff( LI1->getType(), LI1->getPointerOperand(), LI2->getType(), LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true); - if (!Dist) - return VLOperands::ScoreFail; + if (!Dist || *Dist == 0) + return LookAheadHeuristics::ScoreFail; // The distance is too large - still may be profitable to use masked // loads/gathers. if (std::abs(*Dist) > NumLanes / 2) - return VLOperands::ScoreAltOpcodes; + return LookAheadHeuristics::ScoreAltOpcodes; // This still will detect consecutive loads, but we might have "holes" // in some cases. It is ok for non-power-2 vectorization and may produce // better results. It should not affect current vectorization. - return (*Dist > 0) ? VLOperands::ScoreConsecutiveLoads - : VLOperands::ScoreReversedLoads; + return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads + : LookAheadHeuristics::ScoreReversedLoads; } auto *C1 = dyn_cast<Constant>(V1); auto *C2 = dyn_cast<Constant>(V2); if (C1 && C2) - return VLOperands::ScoreConstants; + return LookAheadHeuristics::ScoreConstants; // Extracts from consecutive indexes of the same vector better score as // the extracts could be optimized away. @@ -1091,7 +1168,7 @@ public: if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) { // Undefs are always profitable for extractelements. if (isa<UndefValue>(V2)) - return VLOperands::ScoreConsecutiveExtracts; + return LookAheadHeuristics::ScoreConsecutiveExtracts; Value *EV2 = nullptr; ConstantInt *Ex2Idx = nullptr; if (match(V2, @@ -1099,108 +1176,62 @@ public: m_Undef())))) { // Undefs are always profitable for extractelements. if (!Ex2Idx) - return VLOperands::ScoreConsecutiveExtracts; + return LookAheadHeuristics::ScoreConsecutiveExtracts; if (isUndefVector(EV2) && EV2->getType() == EV1->getType()) - return VLOperands::ScoreConsecutiveExtracts; + return LookAheadHeuristics::ScoreConsecutiveExtracts; if (EV2 == EV1) { int Idx1 = Ex1Idx->getZExtValue(); int Idx2 = Ex2Idx->getZExtValue(); int Dist = Idx2 - Idx1; // The distance is too large - still may be profitable to use // shuffles. + if (std::abs(Dist) == 0) + return LookAheadHeuristics::ScoreSplat; if (std::abs(Dist) > NumLanes / 2) - return VLOperands::ScoreAltOpcodes; - return (Dist > 0) ? VLOperands::ScoreConsecutiveExtracts - : VLOperands::ScoreReversedExtracts; + return LookAheadHeuristics::ScoreSameOpcode; + return (Dist > 0) ? 
LookAheadHeuristics::ScoreConsecutiveExtracts + : LookAheadHeuristics::ScoreReversedExtracts; } + return LookAheadHeuristics::ScoreAltOpcodes; } + return LookAheadHeuristics::ScoreFail; } auto *I1 = dyn_cast<Instruction>(V1); auto *I2 = dyn_cast<Instruction>(V2); if (I1 && I2) { if (I1->getParent() != I2->getParent()) - return VLOperands::ScoreFail; - InstructionsState S = getSameOpcode({I1, I2}); + return LookAheadHeuristics::ScoreFail; + SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end()); + Ops.push_back(I1); + Ops.push_back(I2); + InstructionsState S = getSameOpcode(Ops); // Note: Only consider instructions with <= 2 operands to avoid // complexity explosion. - if (S.getOpcode() && S.MainOp->getNumOperands() <= 2) - return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes - : VLOperands::ScoreSameOpcode; + if (S.getOpcode() && + (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() || + !S.isAltShuffle()) && + all_of(Ops, [&S](Value *V) { + return cast<Instruction>(V)->getNumOperands() == + S.MainOp->getNumOperands(); + })) + return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes + : LookAheadHeuristics::ScoreSameOpcode; } if (isa<UndefValue>(V2)) - return VLOperands::ScoreUndef; - - return VLOperands::ScoreFail; - } - - /// Holds the values and their lanes that are taking part in the look-ahead - /// score calculation. This is used in the external uses cost calculation. - /// Need to hold all the lanes in case of splat/broadcast at least to - /// correctly check for the use in the different lane. - SmallDenseMap<Value *, SmallSet<int, 4>> InLookAheadValues; - - /// \returns the additional cost due to uses of \p LHS and \p RHS that are - /// either external to the vectorized code, or require shuffling. - int getExternalUsesCost(const std::pair<Value *, int> &LHS, - const std::pair<Value *, int> &RHS) { - int Cost = 0; - std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}}; - for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) { - Value *V = Values[Idx].first; - if (isa<Constant>(V)) { - // Since this is a function pass, it doesn't make semantic sense to - // walk the users of a subclass of Constant. The users could be in - // another function, or even another module that happens to be in - // the same LLVMContext. - continue; - } + return LookAheadHeuristics::ScoreUndef; - // Calculate the absolute lane, using the minimum relative lane of LHS - // and RHS as base and Idx as the offset. - int Ln = std::min(LHS.second, RHS.second) + Idx; - assert(Ln >= 0 && "Bad lane calculation"); - unsigned UsersBudget = LookAheadUsersBudget; - for (User *U : V->users()) { - if (const TreeEntry *UserTE = R.getTreeEntry(U)) { - // The user is in the VectorizableTree. Check if we need to insert. - int UserLn = UserTE->findLaneForValue(U); - assert(UserLn >= 0 && "Bad lane"); - // If the values are different, check just the line of the current - // value. If the values are the same, need to add UserInDiffLaneCost - // only if UserLn does not match both line numbers. - if ((LHS.first != RHS.first && UserLn != Ln) || - (LHS.first == RHS.first && UserLn != LHS.second && - UserLn != RHS.second)) { - Cost += UserInDiffLaneCost; - break; - } - } else { - // Check if the user is in the look-ahead code. - auto It2 = InLookAheadValues.find(U); - if (It2 != InLookAheadValues.end()) { - // The user is in the look-ahead code. Check the lane. 
- if (!It2->getSecond().contains(Ln)) { - Cost += UserInDiffLaneCost; - break; - } - } else { - // The user is neither in SLP tree nor in the look-ahead code. - Cost += ExternalUseCost; - break; - } - } - // Limit the number of visited uses to cap compilation time. - if (--UsersBudget == 0) - break; - } - } - return Cost; + return LookAheadHeuristics::ScoreFail; } - /// Go through the operands of \p LHS and \p RHS recursively until \p - /// MaxLevel, and return the cummulative score. For example: + /// Go through the operands of \p LHS and \p RHS recursively until + /// MaxLevel, and return the cummulative score. \p U1 and \p U2 are + /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands + /// of \p U1 and \p U2), except at the beginning of the recursion where + /// these are set to nullptr. + /// + /// For example: /// \verbatim /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1] /// \ / \ / \ / \ / @@ -1211,8 +1242,8 @@ public: /// each level recursively, accumulating the score. It starts from matching /// the additions at level 0, then moves on to the loads (level 1). The /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and - /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while - /// {A[0],C[0]} has a score of VLOperands::ScoreFail. + /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while + /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail. /// Please note that the order of the operands does not matter, as we /// evaluate the score of all profitable combinations of operands. In /// other words the score of G1 and G4 is the same as G1 and G2. This @@ -1220,18 +1251,13 @@ public: /// Look-ahead SLP: Auto-vectorization in the presence of commutative /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, /// LuÃs F. W. Góes - int getScoreAtLevelRec(const std::pair<Value *, int> &LHS, - const std::pair<Value *, int> &RHS, int CurrLevel, - int MaxLevel) { + int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, + Instruction *U2, int CurrLevel, + ArrayRef<Value *> MainAltOps) const { - Value *V1 = LHS.first; - Value *V2 = RHS.first; // Get the shallow score of V1 and V2. - int ShallowScoreAtThisLevel = std::max( - (int)ScoreFail, getShallowScore(V1, V2, DL, SE, getNumLanes()) - - getExternalUsesCost(LHS, RHS)); - int Lane1 = LHS.second; - int Lane2 = RHS.second; + int ShallowScoreAtThisLevel = + getShallowScore(LHS, RHS, U1, U2, MainAltOps); // If reached MaxLevel, // or if V1 and V2 are not instructions, @@ -1239,20 +1265,17 @@ public: // or if they are not consecutive, // or if profitable to vectorize loads or extractelements, early return // the current cost. - auto *I1 = dyn_cast<Instruction>(V1); - auto *I2 = dyn_cast<Instruction>(V2); + auto *I1 = dyn_cast<Instruction>(LHS); + auto *I2 = dyn_cast<Instruction>(RHS); if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 || - ShallowScoreAtThisLevel == VLOperands::ScoreFail || + ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail || (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) || + (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) || (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) && ShallowScoreAtThisLevel)) return ShallowScoreAtThisLevel; assert(I1 && I2 && "Should have early exited."); - // Keep track of in-tree values for determining the external-use cost. - InLookAheadValues[V1].insert(Lane1); - InLookAheadValues[V2].insert(Lane2); - // Contains the I2 operand indexes that got matched with I1 operands. 
SmallSet<unsigned, 4> Op2Used; @@ -1275,11 +1298,12 @@ public: if (Op2Used.count(OpIdx2)) continue; // Recursively calculate the cost at each level - int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1}, - {I2->getOperand(OpIdx2), Lane2}, - CurrLevel + 1, MaxLevel); + int TmpScore = + getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2), + I1, I2, CurrLevel + 1, None); // Look for the best score. - if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) { + if (TmpScore > LookAheadHeuristics::ScoreFail && + TmpScore > MaxTmpScore) { MaxTmpScore = TmpScore; MaxOpIdx2 = OpIdx2; FoundBest = true; @@ -1293,24 +1317,213 @@ public: } return ShallowScoreAtThisLevel; } + }; + /// A helper data structure to hold the operands of a vector of instructions. + /// This supports a fixed vector length for all operand vectors. + class VLOperands { + /// For each operand we need (i) the value, and (ii) the opcode that it + /// would be attached to if the expression was in a left-linearized form. + /// This is required to avoid illegal operand reordering. + /// For example: + /// \verbatim + /// 0 Op1 + /// |/ + /// Op1 Op2 Linearized + Op2 + /// \ / ----------> |/ + /// - - + /// + /// Op1 - Op2 (0 + Op1) - Op2 + /// \endverbatim + /// + /// Value Op1 is attached to a '+' operation, and Op2 to a '-'. + /// + /// Another way to think of this is to track all the operations across the + /// path from the operand all the way to the root of the tree and to + /// calculate the operation that corresponds to this path. For example, the + /// path from Op2 to the root crosses the RHS of the '-', therefore the + /// corresponding operation is a '-' (which matches the one in the + /// linearized tree, as shown above). + /// + /// For lack of a better term, we refer to this operation as Accumulated + /// Path Operation (APO). + struct OperandData { + OperandData() = default; + OperandData(Value *V, bool APO, bool IsUsed) + : V(V), APO(APO), IsUsed(IsUsed) {} + /// The operand value. + Value *V = nullptr; + /// TreeEntries only allow a single opcode, or an alternate sequence of + /// them (e.g, +, -). Therefore, we can safely use a boolean value for the + /// APO. It is set to 'true' if 'V' is attached to an inverse operation + /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise + /// (e.g., Add/Mul) + bool APO = false; + /// Helper data for the reordering function. + bool IsUsed = false; + }; + + /// During operand reordering, we are trying to select the operand at lane + /// that matches best with the operand at the neighboring lane. Our + /// selection is based on the type of value we are looking for. For example, + /// if the neighboring lane has a load, we need to look for a load that is + /// accessing a consecutive address. These strategies are summarized in the + /// 'ReorderingMode' enumerator. + enum class ReorderingMode { + Load, ///< Matching loads to consecutive memory addresses + Opcode, ///< Matching instructions based on opcode (same or alternate) + Constant, ///< Matching constants + Splat, ///< Matching the same instruction multiple times (broadcast) + Failed, ///< We failed to create a vectorizable group + }; + + using OperandDataVec = SmallVector<OperandData, 2>; + + /// A vector of operand vectors. + SmallVector<OperandDataVec, 4> OpsVec; + + const DataLayout &DL; + ScalarEvolution &SE; + const BoUpSLP &R; + + /// \returns the operand data at \p OpIdx and \p Lane. 
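// Illustrative sketch (not part of the diff above): a stripped-down model of
// the recursive look-ahead scoring done by getScoreAtLevelRec() in this hunk.
// It pairs two expression trees, scores the roots, then greedily matches each
// operand of the left tree with the best-scoring unused operand of the right
// tree, accumulating the result up to a maximum depth.  The Expr struct and
// the score constants are assumptions made only for this example; the real
// code scores llvm::Value operands with target-aware heuristics.
#include <cstddef>
#include <set>
#include <string>
#include <vector>

struct Expr {
  std::string Op;                     // e.g. "add", "load:A[0]"
  std::vector<const Expr *> Operands; // children, if any
};

// Score for placing L and R in the same pair of vector lanes.
static int shallowScore(const Expr &L, const Expr &R) {
  if (L.Op == R.Op && L.Operands.empty())
    return 3; // identical leaves (splat-like)
  if (L.Op == R.Op)
    return 2; // same opcode
  return 0;   // no match
}

static int scoreAtLevel(const Expr &L, const Expr &R, int Level, int MaxLevel) {
  int Score = shallowScore(L, R);
  if (Level == MaxLevel || Score == 0 || L.Operands.empty() ||
      R.Operands.empty())
    return Score;
  std::set<std::size_t> UsedR; // right-hand operands already matched
  for (const Expr *LOp : L.Operands) {
    int Best = 0;
    std::size_t BestIdx = 0;
    bool Found = false;
    for (std::size_t J = 0; J < R.Operands.size(); ++J) {
      if (UsedR.count(J))
        continue;
      int Tmp = scoreAtLevel(*LOp, *R.Operands[J], Level + 1, MaxLevel);
      if (Tmp > Best) {
        Best = Tmp;
        BestIdx = J;
        Found = true;
      }
    }
    if (Found) { // keep the best pairing and accumulate its score
      UsedR.insert(BestIdx);
      Score += Best;
    }
  }
  return Score;
}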
+ OperandData &getData(unsigned OpIdx, unsigned Lane) { + return OpsVec[OpIdx][Lane]; + } + + /// \returns the operand data at \p OpIdx and \p Lane. Const version. + const OperandData &getData(unsigned OpIdx, unsigned Lane) const { + return OpsVec[OpIdx][Lane]; + } + + /// Clears the used flag for all entries. + void clearUsed() { + for (unsigned OpIdx = 0, NumOperands = getNumOperands(); + OpIdx != NumOperands; ++OpIdx) + for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; + ++Lane) + OpsVec[OpIdx][Lane].IsUsed = false; + } + + /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2. + void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) { + std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); + } + + /// \param Lane lane of the operands under analysis. + /// \param OpIdx operand index in \p Lane lane we're looking the best + /// candidate for. + /// \param Idx operand index of the current candidate value. + /// \returns The additional score due to possible broadcasting of the + /// elements in the lane. It is more profitable to have power-of-2 unique + /// elements in the lane, it will be vectorized with higher probability + /// after removing duplicates. Currently the SLP vectorizer supports only + /// vectorization of the power-of-2 number of unique scalars. + int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const { + Value *IdxLaneV = getData(Idx, Lane).V; + if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V) + return 0; + SmallPtrSet<Value *, 4> Uniques; + for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) { + if (Ln == Lane) + continue; + Value *OpIdxLnV = getData(OpIdx, Ln).V; + if (!isa<Instruction>(OpIdxLnV)) + return 0; + Uniques.insert(OpIdxLnV); + } + int UniquesCount = Uniques.size(); + int UniquesCntWithIdxLaneV = + Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1; + Value *OpIdxLaneV = getData(OpIdx, Lane).V; + int UniquesCntWithOpIdxLaneV = + Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1; + if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV) + return 0; + return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) - + UniquesCntWithOpIdxLaneV) - + (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV); + } + + /// \param Lane lane of the operands under analysis. + /// \param OpIdx operand index in \p Lane lane we're looking the best + /// candidate for. + /// \param Idx operand index of the current candidate value. + /// \returns The additional score for the scalar which users are all + /// vectorized. + int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const { + Value *IdxLaneV = getData(Idx, Lane).V; + Value *OpIdxLaneV = getData(OpIdx, Lane).V; + // Do not care about number of uses for vector-like instructions + // (extractelement/extractvalue with constant indices), they are extracts + // themselves and already externally used. Vectorization of such + // instructions does not add extra extractelement instruction, just may + // remove it. + if (isVectorLikeInstWithConstOps(IdxLaneV) && + isVectorLikeInstWithConstOps(OpIdxLaneV)) + return LookAheadHeuristics::ScoreAllUserVectorized; + auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV); + if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV)) + return 0; + return R.areAllUsersVectorized(IdxLaneI, None) + ? LookAheadHeuristics::ScoreAllUserVectorized + : 0; + } + + /// Score scaling factor for fully compatible instructions but with + /// different number of external uses. 
Allows better selection of the + /// instructions with less external uses. + static const int ScoreScaleFactor = 10; /// \Returns the look-ahead score, which tells us how much the sub-trees /// rooted at \p LHS and \p RHS match, the more they match the higher the /// score. This helps break ties in an informed way when we cannot decide on /// the order of the operands by just considering the immediate /// predecessors. - int getLookAheadScore(const std::pair<Value *, int> &LHS, - const std::pair<Value *, int> &RHS) { - InLookAheadValues.clear(); - return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth); + int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps, + int Lane, unsigned OpIdx, unsigned Idx, + bool &IsUsed) { + LookAheadHeuristics LookAhead(DL, SE, R, getNumLanes(), + LookAheadMaxDepth); + // Keep track of the instruction stack as we recurse into the operands + // during the look-ahead score exploration. + int Score = + LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr, + /*CurrLevel=*/1, MainAltOps); + if (Score) { + int SplatScore = getSplatScore(Lane, OpIdx, Idx); + if (Score <= -SplatScore) { + // Set the minimum score for splat-like sequence to avoid setting + // failed state. + Score = 1; + } else { + Score += SplatScore; + // Scale score to see the difference between different operands + // and similar operands but all vectorized/not all vectorized + // uses. It does not affect actual selection of the best + // compatible operand in general, just allows to select the + // operand with all vectorized uses. + Score *= ScoreScaleFactor; + Score += getExternalUseScore(Lane, OpIdx, Idx); + IsUsed = true; + } + } + return Score; } + /// Best defined scores per lanes between the passes. Used to choose the + /// best operand (with the highest score) between the passes. + /// The key - {Operand Index, Lane}. + /// The value - the best score between the passes for the lane and the + /// operand. + SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8> + BestScoresPerLanes; + // Search all operands in Ops[*][Lane] for the one that matches best // Ops[OpIdx][LastLane] and return its opreand index. // If no good match can be found, return None. - Optional<unsigned> - getBestOperand(unsigned OpIdx, int Lane, int LastLane, - ArrayRef<ReorderingMode> ReorderingModes) { + Optional<unsigned> getBestOperand(unsigned OpIdx, int Lane, int LastLane, + ArrayRef<ReorderingMode> ReorderingModes, + ArrayRef<Value *> MainAltOps) { unsigned NumOperands = getNumOperands(); // The operand of the previous lane at OpIdx. @@ -1318,6 +1531,8 @@ public: // Our strategy mode for OpIdx. ReorderingMode RMode = ReorderingModes[OpIdx]; + if (RMode == ReorderingMode::Failed) + return None; // The linearized opcode of the operand at OpIdx, Lane. bool OpIdxAPO = getData(OpIdx, Lane).APO; @@ -1329,7 +1544,15 @@ public: Optional<unsigned> Idx = None; unsigned Score = 0; } BestOp; - + BestOp.Score = + BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0) + .first->second; + + // Track if the operand must be marked as used. If the operand is set to + // Score 1 explicitly (because of non power-of-2 unique scalars, we may + // want to reestimate the operands again on the following iterations). + bool IsUsed = + RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant; // Iterate through all unused operands and look for the best. for (unsigned Idx = 0; Idx != NumOperands; ++Idx) { // Get the operand at Idx and Lane. 
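// Illustrative sketch (not part of the diff above): the gap-to-power-of-2
// bonus computed by getSplatScore() in this hunk.  A column of operands
// vectorizes best when its number of *unique* scalars is a power of two, so a
// candidate that leaves a smaller padding gap to the next power of two gets a
// higher bonus.  The helpers below are a standalone model; powerOf2Ceil
// mirrors llvm::PowerOf2Ceil for small positive counts.
#include <cstdint>

static std::uint64_t powerOf2Ceil(std::uint64_t N) {
  std::uint64_t P = 1;
  while (P < N)
    P <<= 1;
  return P;
}

// Padding slots wasted to round Uniques up to a power of two.
static std::int64_t paddingGap(std::uint64_t Uniques) {
  return static_cast<std::int64_t>(powerOf2Ceil(Uniques) - Uniques);
}

// Positive result: the candidate leaves a smaller gap than the value currently
// occupying the lane, so preferring it should help vectorization.
static std::int64_t splatBonus(std::uint64_t UniquesWithCurrent,
                               std::uint64_t UniquesWithCandidate) {
  return paddingGap(UniquesWithCurrent) - paddingGap(UniquesWithCandidate);
}
// Example: 5 unique scalars pad to 8 (gap 3); a candidate that keeps the count
// at 4 pads to 4 (gap 0), so it receives a bonus of 3.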
@@ -1355,11 +1578,12 @@ public: bool LeftToRight = Lane > LastLane; Value *OpLeft = (LeftToRight) ? OpLastLane : Op; Value *OpRight = (LeftToRight) ? Op : OpLastLane; - unsigned Score = - getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane}); - if (Score > BestOp.Score) { + int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane, + OpIdx, Idx, IsUsed); + if (Score > static_cast<int>(BestOp.Score)) { BestOp.Idx = Idx; BestOp.Score = Score; + BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score; } break; } @@ -1368,12 +1592,12 @@ public: BestOp.Idx = Idx; break; case ReorderingMode::Failed: - return None; + llvm_unreachable("Not expected Failed reordering mode."); } } if (BestOp.Idx) { - getData(BestOp.Idx.getValue(), Lane).IsUsed = true; + getData(*BestOp.Idx, Lane).IsUsed = IsUsed; return BestOp.Idx; } // If we could not find a good match return None. @@ -1690,6 +1914,10 @@ public: // rest of the lanes. We are visiting the nodes in a circular fashion, // using FirstLane as the center point and increasing the radius // distance. + SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands); + for (unsigned I = 0; I < NumOperands; ++I) + MainAltOps[I].push_back(getData(I, FirstLane).V); + for (unsigned Distance = 1; Distance != NumLanes; ++Distance) { // Visit the lane on the right and then the lane on the left. for (int Direction : {+1, -1}) { @@ -1702,21 +1930,29 @@ public: // Look for a good match for each operand. for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { // Search for the operand that matches SortedOps[OpIdx][Lane-1]. - Optional<unsigned> BestIdx = - getBestOperand(OpIdx, Lane, LastLane, ReorderingModes); + Optional<unsigned> BestIdx = getBestOperand( + OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]); // By not selecting a value, we allow the operands that follow to // select a better matching value. We will get a non-null value in // the next run of getBestOperand(). if (BestIdx) { // Swap the current operand with the one returned by // getBestOperand(). - swap(OpIdx, BestIdx.getValue(), Lane); + swap(OpIdx, *BestIdx, Lane); } else { // We failed to find a best operand, set mode to 'Failed'. ReorderingModes[OpIdx] = ReorderingMode::Failed; // Enable the second pass. StrategyFailed = true; } + // Try to get the alternate opcode and follow it during analysis. + if (MainAltOps[OpIdx].size() != 2) { + OperandData &AltOp = getData(OpIdx, Lane); + InstructionsState OpS = + getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}); + if (OpS.getOpcode() && OpS.isAltShuffle()) + MainAltOps[OpIdx].push_back(AltOp.V); + } } } } @@ -1780,15 +2016,109 @@ public: #endif }; + /// Evaluate each pair in \p Candidates and return index into \p Candidates + /// for a pair which have highest score deemed to have best chance to form + /// root of profitable tree to vectorize. Return None if no candidate scored + /// above the LookAheadHeuristics::ScoreFail. + /// \param Limit Lower limit of the cost, considered to be good enough score. 
+ Optional<int> + findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates, + int Limit = LookAheadHeuristics::ScoreFail) { + LookAheadHeuristics LookAhead(*DL, *SE, *this, /*NumLanes=*/2, + RootLookAheadMaxDepth); + int BestScore = Limit; + Optional<int> Index = None; + for (int I : seq<int>(0, Candidates.size())) { + int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first, + Candidates[I].second, + /*U1=*/nullptr, /*U2=*/nullptr, + /*Level=*/1, None); + if (Score > BestScore) { + BestScore = Score; + Index = I; + } + } + return Index; + } + /// Checks if the instruction is marked for deletion. bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); } - /// Marks values operands for later deletion by replacing them with Undefs. - void eraseInstructions(ArrayRef<Value *> AV); + /// Removes an instruction from its block and eventually deletes it. + /// It's like Instruction::eraseFromParent() except that the actual deletion + /// is delayed until BoUpSLP is destructed. + void eraseInstruction(Instruction *I) { + DeletedInstructions.insert(I); + } + + /// Checks if the instruction was already analyzed for being possible + /// reduction root. + bool isAnalyzedReductionRoot(Instruction *I) const { + return AnalyzedReductionsRoots.count(I); + } + /// Register given instruction as already analyzed for being possible + /// reduction root. + void analyzedReductionRoot(Instruction *I) { + AnalyzedReductionsRoots.insert(I); + } + /// Checks if the provided list of reduced values was checked already for + /// vectorization. + bool areAnalyzedReductionVals(ArrayRef<Value *> VL) { + return AnalyzedReductionVals.contains(hash_value(VL)); + } + /// Adds the list of reduced values to list of already checked values for the + /// vectorization. + void analyzedReductionVals(ArrayRef<Value *> VL) { + AnalyzedReductionVals.insert(hash_value(VL)); + } + /// Clear the list of the analyzed reduction root instructions. + void clearReductionData() { + AnalyzedReductionsRoots.clear(); + AnalyzedReductionVals.clear(); + } + /// Checks if the given value is gathered in one of the nodes. + bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const { + return any_of(MustGather, [&](Value *V) { return Vals.contains(V); }); + } ~BoUpSLP(); private: + /// Check if the operands on the edges \p Edges of the \p UserTE allows + /// reordering (i.e. the operands can be reordered because they have only one + /// user and reordarable). + /// \param ReorderableGathers List of all gather nodes that require reordering + /// (e.g., gather of extractlements or partially vectorizable loads). + /// \param GatherOps List of gather operand nodes for \p UserTE that require + /// reordering, subset of \p NonVectorized. + bool + canReorderOperands(TreeEntry *UserTE, + SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges, + ArrayRef<TreeEntry *> ReorderableGathers, + SmallVectorImpl<TreeEntry *> &GatherOps); + + /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph, + /// if any. If it is not vectorized (gather node), returns nullptr. + TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) { + ArrayRef<Value *> VL = UserTE->getOperand(OpIdx); + TreeEntry *TE = nullptr; + const auto *It = find_if(VL, [this, &TE](Value *V) { + TE = getTreeEntry(V); + return TE; + }); + if (It != VL.end() && TE->isSame(VL)) + return TE; + return nullptr; + } + + /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph, + /// if any. 
If it is not vectorized (gather node), returns nullptr. + const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE, + unsigned OpIdx) const { + return const_cast<BoUpSLP *>(this)->getVectorizedOperand( + const_cast<TreeEntry *>(UserTE), OpIdx); + } + /// Checks if all users of \p I are the part of the vectorization tree. bool areAllUsersVectorized(Instruction *I, ArrayRef<Value *> VectorizedVals) const; @@ -1815,12 +2145,17 @@ private: /// Vectorize a single entry in the tree, starting in \p VL. Value *vectorizeTree(ArrayRef<Value *> VL); + /// Create a new vector from a list of scalar values. Produces a sequence + /// which exploits values reused across lanes, and arranges the inserts + /// for ease of later optimization. + Value *createBuildVector(ArrayRef<Value *> VL); + /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. If \p /// NeedToShuffle is true, need to add a cost of reshuffling some of the /// vector elements. InstructionCost getGatherCost(FixedVectorType *Ty, - const DenseSet<unsigned> &ShuffledIndices, + const APInt &ShuffledIndices, bool NeedToShuffle) const; /// Checks if the gathered \p VL can be represented as shuffle(s) of previous @@ -1855,6 +2190,29 @@ private: const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R); + + /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the + /// users of \p TE and collects the stores. It returns the map from the store + /// pointers to the collected stores. + DenseMap<Value *, SmallVector<StoreInst *, 4>> + collectUserStores(const BoUpSLP::TreeEntry *TE) const; + + /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the + /// stores in \p StoresVec can for a vector instruction. If so it returns true + /// and populates \p ReorderIndices with the shuffle indices of the the stores + /// when compared to the sorted vector. + bool CanFormVector(const SmallVector<StoreInst *, 4> &StoresVec, + OrdersType &ReorderIndices) const; + + /// Iterates through the users of \p TE, looking for scalar stores that can be + /// potentially vectorized in a future SLP-tree. If found, it keeps track of + /// their order and builds an order index vector for each store bundle. It + /// returns all these order vectors found. + /// We run this after the tree has formed, otherwise we may come across user + /// instructions that are not yet in the tree. + SmallVector<OrdersType, 1> + findExternalStoreUsersReorderIndices(TreeEntry *TE) const; + struct TreeEntry { using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>; TreeEntry(VecTreeTy &Container) : Container(Container) {} @@ -2199,15 +2557,21 @@ private: ScalarToTreeEntry[V] = Last; } // Update the scheduler bundle to point to this TreeEntry. 
- unsigned Lane = 0; - for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember; - BundleMember = BundleMember->NextInBundle) { - BundleMember->TE = Last; - BundleMember->Lane = Lane; - ++Lane; - } - assert((!Bundle.getValue() || Lane == VL.size()) && + ScheduleData *BundleMember = *Bundle; + assert((BundleMember || isa<PHINode>(S.MainOp) || + isVectorLikeInstWithConstOps(S.MainOp) || + doesNotNeedToSchedule(VL)) && "Bundle and VL out of sync"); + if (BundleMember) { + for (Value *V : VL) { + if (doesNotNeedToBeScheduled(V)) + continue; + assert(BundleMember && "Unexpected end of bundle."); + BundleMember->TE = Last; + BundleMember = BundleMember->NextInBundle; + } + } + assert(!BundleMember && "Bundle and VL out of sync"); } else { MustGather.insert(VL.begin(), VL.end()); } @@ -2241,7 +2605,7 @@ private: /// Maps a specific scalar to its tree entry. SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry; - /// Maps a value to the proposed vectorizable size. + /// Maps a value to the proposed vectorizable size. SmallDenseMap<Value *, unsigned> InstrElementSize; /// A list of scalars that we found that we need to keep as scalars. @@ -2272,12 +2636,12 @@ private: // First check if the result is already in the cache. AliasCacheKey key = std::make_pair(Inst1, Inst2); Optional<bool> &result = AliasCache[key]; - if (result.hasValue()) { + if (result) { return result.getValue(); } bool aliased = true; if (Loc1.Ptr && isSimple(Inst1)) - aliased = isModOrRefSet(AA->getModRefInfo(Inst2, Loc1)); + aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1)); // Store the result in the cache. result = aliased; return aliased; @@ -2289,20 +2653,23 @@ private: /// TODO: consider moving this to the AliasAnalysis itself. DenseMap<AliasCacheKey, Optional<bool>> AliasCache; - /// Removes an instruction from its block and eventually deletes it. - /// It's like Instruction::eraseFromParent() except that the actual deletion - /// is delayed until BoUpSLP is destructed. - /// This is required to ensure that there are no incorrect collisions in the - /// AliasCache, which can happen if a new instruction is allocated at the - /// same address as a previously deleted instruction. - void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) { - auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first; - It->getSecond() = It->getSecond() && ReplaceOpsWithUndef; - } + // Cache for pointerMayBeCaptured calls inside AA. This is preserved + // globally through SLP because we don't perform any action which + // invalidates capture results. + BatchAAResults BatchAA; /// Temporary store for deleted instructions. Instructions will be deleted - /// eventually when the BoUpSLP is destructed. - DenseMap<Instruction *, bool> DeletedInstructions; + /// eventually when the BoUpSLP is destructed. The deferral is required to + /// ensure that there are no incorrect collisions in the AliasCache, which + /// can happen if a new instruction is allocated at the same address as a + /// previously deleted instruction. + DenseSet<Instruction *> DeletedInstructions; + + /// Set of the instruction, being analyzed already for reductions. + SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots; + + /// Set of hashes for the list of reduction values already being analyzed. + DenseSet<size_t> AnalyzedReductionVals; /// A list of values that need to extracted out of the tree. /// This list holds pairs of (Internal Scalar : External User). 
External User @@ -2336,14 +2703,39 @@ private: NextLoadStore = nullptr; IsScheduled = false; SchedulingRegionID = BlockSchedulingRegionID; - UnscheduledDepsInBundle = UnscheduledDeps; clearDependencies(); OpValue = OpVal; TE = nullptr; - Lane = -1; + } + + /// Verify basic self consistency properties + void verify() { + if (hasValidDependencies()) { + assert(UnscheduledDeps <= Dependencies && "invariant"); + } else { + assert(UnscheduledDeps == Dependencies && "invariant"); + } + + if (IsScheduled) { + assert(isSchedulingEntity() && + "unexpected scheduled state"); + for (const ScheduleData *BundleMember = this; BundleMember; + BundleMember = BundleMember->NextInBundle) { + assert(BundleMember->hasValidDependencies() && + BundleMember->UnscheduledDeps == 0 && + "unexpected scheduled state"); + assert((BundleMember == this || !BundleMember->IsScheduled) && + "only bundle is marked scheduled"); + } + } + + assert(Inst->getParent() == FirstInBundle->Inst->getParent() && + "all bundle members must be in same basic block"); } /// Returns true if the dependency information has been calculated. + /// Note that depenendency validity can vary between instructions within + /// a single bundle. bool hasValidDependencies() const { return Dependencies != InvalidDeps; } /// Returns true for single instructions and for bundle representatives @@ -2353,7 +2745,7 @@ private: /// Returns true if it represents an instruction bundle and not only a /// single instruction. bool isPartOfBundle() const { - return NextInBundle != nullptr || FirstInBundle != this; + return NextInBundle != nullptr || FirstInBundle != this || TE; } /// Returns true if it is ready for scheduling, i.e. it has no more @@ -2361,20 +2753,23 @@ private: bool isReady() const { assert(isSchedulingEntity() && "can't consider non-scheduling entity for ready list"); - return UnscheduledDepsInBundle == 0 && !IsScheduled; + return unscheduledDepsInBundle() == 0 && !IsScheduled; } - /// Modifies the number of unscheduled dependencies, also updating it for - /// the whole bundle. + /// Modifies the number of unscheduled dependencies for this instruction, + /// and returns the number of remaining dependencies for the containing + /// bundle. int incrementUnscheduledDeps(int Incr) { + assert(hasValidDependencies() && + "increment of unscheduled deps would be meaningless"); UnscheduledDeps += Incr; - return FirstInBundle->UnscheduledDepsInBundle += Incr; + return FirstInBundle->unscheduledDepsInBundle(); } /// Sets the number of unscheduled dependencies to the number of /// dependencies. void resetUnscheduledDeps() { - incrementUnscheduledDeps(Dependencies - UnscheduledDeps); + UnscheduledDeps = Dependencies; } /// Clears all dependency information. @@ -2382,6 +2777,19 @@ private: Dependencies = InvalidDeps; resetUnscheduledDeps(); MemoryDependencies.clear(); + ControlDependencies.clear(); + } + + int unscheduledDepsInBundle() const { + assert(isSchedulingEntity() && "only meaningful on the bundle"); + int Sum = 0; + for (const ScheduleData *BundleMember = this; BundleMember; + BundleMember = BundleMember->NextInBundle) { + if (BundleMember->UnscheduledDeps == InvalidDeps) + return InvalidDeps; + Sum += BundleMember->UnscheduledDeps; + } + return Sum; } void dump(raw_ostream &os) const { @@ -2402,6 +2810,12 @@ private: Instruction *Inst = nullptr; + /// Opcode of the current instruction in the schedule data. + Value *OpValue = nullptr; + + /// The TreeEntry that this instruction corresponds to. 
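// Illustrative sketch (not part of the diff above): with this change the
// bundle-level dependency count is no longer cached; unscheduledDepsInBundle()
// walks the bundle and sums the per-member counters, reporting "unknown" if
// any member has not had its dependencies calculated yet.  Node below is a toy
// stand-in for ScheduleData, with an assumed sentinel value.
#include <cstddef>

struct Node {
  static constexpr int InvalidDeps = -1; // dependencies not yet computed
  int UnscheduledDeps = InvalidDeps;     // per-instruction counter
  Node *NextInBundle = nullptr;          // singly linked bundle members
};

// Sum of the members' unscheduled dependencies, or InvalidDeps if any member
// is still missing dependency information.
static int unscheduledDepsInBundle(const Node *Head) {
  int Sum = 0;
  for (const Node *N = Head; N; N = N->NextInBundle) {
    if (N->UnscheduledDeps == Node::InvalidDeps)
      return Node::InvalidDeps;
    Sum += N->UnscheduledDeps;
  }
  return Sum;
}
// A bundle becomes ready to schedule once this sum reaches zero.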
+ TreeEntry *TE = nullptr; + /// Points to the head in an instruction bundle (and always to this for /// single instructions). ScheduleData *FirstInBundle = nullptr; @@ -2418,6 +2832,12 @@ private: /// This list is derived on demand in calculateDependencies(). SmallVector<ScheduleData *, 4> MemoryDependencies; + /// List of instructions which this instruction could be control dependent + /// on. Allowing such nodes to be scheduled below this one could introduce + /// a runtime fault which didn't exist in the original program. + /// ex: this is a load or udiv following a readonly call which inf loops + SmallVector<ScheduleData *, 4> ControlDependencies; + /// This ScheduleData is in the current scheduling region if this matches /// the current SchedulingRegionID of BlockScheduling. int SchedulingRegionID = 0; @@ -2437,22 +2857,9 @@ private: /// Note that this is negative as long as Dependencies is not calculated. int UnscheduledDeps = InvalidDeps; - /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for - /// single instructions. - int UnscheduledDepsInBundle = InvalidDeps; - /// True if this instruction is scheduled (or considered as scheduled in the /// dry-run). bool IsScheduled = false; - - /// Opcode of the current instruction in the schedule data. - Value *OpValue = nullptr; - - /// The TreeEntry that this instruction corresponds to. - TreeEntry *TE = nullptr; - - /// The lane of this node in the TreeEntry. - int Lane = -1; }; #ifndef NDEBUG @@ -2467,6 +2874,21 @@ private: friend struct DOTGraphTraits<BoUpSLP *>; /// Contains all scheduling data for a basic block. + /// It does not schedules instructions, which are not memory read/write + /// instructions and their operands are either constants, or arguments, or + /// phis, or instructions from others blocks, or their users are phis or from + /// the other blocks. The resulting vector instructions can be placed at the + /// beginning of the basic block without scheduling (if operands does not need + /// to be scheduled) or at the end of the block (if users are outside of the + /// block). It allows to save some compile time and memory used by the + /// compiler. + /// ScheduleData is assigned for each instruction in between the boundaries of + /// the tree entry, even for those, which are not part of the graph. It is + /// required to correctly follow the dependencies between the instructions and + /// their correct scheduling. The ScheduleData is not allocated for the + /// instructions, which do not require scheduling, like phis, nodes with + /// extractelements/insertelements only or nodes with instructions, with + /// uses/operands outside of the block. struct BlockScheduling { BlockScheduling(BasicBlock *BB) : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {} @@ -2477,6 +2899,7 @@ private: ScheduleEnd = nullptr; FirstLoadStoreInRegion = nullptr; LastLoadStoreInRegion = nullptr; + RegionHasStackSave = false; // Reduce the maximum schedule region size by the size of the // previous scheduling run. @@ -2490,20 +2913,29 @@ private: ++SchedulingRegionID; } - ScheduleData *getScheduleData(Value *V) { - ScheduleData *SD = ScheduleDataMap[V]; - if (SD && SD->SchedulingRegionID == SchedulingRegionID) + ScheduleData *getScheduleData(Instruction *I) { + if (BB != I->getParent()) + // Avoid lookup if can't possibly be in map. 
+ return nullptr; + ScheduleData *SD = ScheduleDataMap.lookup(I); + if (SD && isInSchedulingRegion(SD)) return SD; return nullptr; } + ScheduleData *getScheduleData(Value *V) { + if (auto *I = dyn_cast<Instruction>(V)) + return getScheduleData(I); + return nullptr; + } + ScheduleData *getScheduleData(Value *V, Value *Key) { if (V == Key) return getScheduleData(V); auto I = ExtraScheduleDataMap.find(V); if (I != ExtraScheduleDataMap.end()) { - ScheduleData *SD = I->second[Key]; - if (SD && SD->SchedulingRegionID == SchedulingRegionID) + ScheduleData *SD = I->second.lookup(Key); + if (SD && isInSchedulingRegion(SD)) return SD; } return nullptr; @@ -2524,7 +2956,7 @@ private: BundleMember = BundleMember->NextInBundle) { if (BundleMember->Inst != BundleMember->OpValue) continue; - + // Handle the def-use chain dependencies. // Decrement the unscheduled counter and insert to ready list if ready. @@ -2546,10 +2978,12 @@ private: }; // If BundleMember is a vector bundle, its operands may have been - // reordered duiring buildTree(). We therefore need to get its operands + // reordered during buildTree(). We therefore need to get its operands // through the TreeEntry. if (TreeEntry *TE = BundleMember->TE) { - int Lane = BundleMember->Lane; + // Need to search for the lane since the tree entry can be reordered. + int Lane = std::distance(TE->Scalars.begin(), + find(TE->Scalars, BundleMember->Inst)); assert(Lane >= 0 && "Lane not set"); // Since vectorization tree is being built recursively this assertion @@ -2558,7 +2992,7 @@ private: // where their second (immediate) operand is not added. Since // immediates do not affect scheduler behavior this is considered // okay. - auto *In = TE->getMainOp(); + auto *In = BundleMember->Inst; assert(In && (isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) || In->getNumOperands() == TE->getNumOperands()) && @@ -2578,7 +3012,8 @@ private: } // Handle the memory dependencies. for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { - if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { + if (MemoryDepSD->hasValidDependencies() && + MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { // There are no more unscheduled dependencies after decrementing, // so we can put the dependent instruction into the ready list. ScheduleData *DepBundle = MemoryDepSD->FirstInBundle; @@ -2589,6 +3024,48 @@ private: << "SLP: gets ready (mem): " << *DepBundle << "\n"); } } + // Handle the control dependencies. + for (ScheduleData *DepSD : BundleMember->ControlDependencies) { + if (DepSD->incrementUnscheduledDeps(-1) == 0) { + // There are no more unscheduled dependencies after decrementing, + // so we can put the dependent instruction into the ready list. + ScheduleData *DepBundle = DepSD->FirstInBundle; + assert(!DepBundle->IsScheduled && + "already scheduled bundle gets ready"); + ReadyList.insert(DepBundle); + LLVM_DEBUG(dbgs() + << "SLP: gets ready (ctl): " << *DepBundle << "\n"); + } + } + + } + } + + /// Verify basic self consistency properties of the data structure. 
+ void verify() { + if (!ScheduleStart) + return; + + assert(ScheduleStart->getParent() == ScheduleEnd->getParent() && + ScheduleStart->comesBefore(ScheduleEnd) && + "Not a valid scheduling region?"); + + for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { + auto *SD = getScheduleData(I); + if (!SD) + continue; + assert(isInSchedulingRegion(SD) && + "primary schedule data not in window?"); + assert(isInSchedulingRegion(SD->FirstInBundle) && + "entire bundle in window!"); + (void)SD; + doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); }); + } + + for (auto *SD : ReadyInsts) { + assert(SD->isSchedulingEntity() && SD->isReady() && + "item in ready list not ready?"); + (void)SD; } } @@ -2599,7 +3076,7 @@ private: auto I = ExtraScheduleDataMap.find(V); if (I != ExtraScheduleDataMap.end()) for (auto &P : I->second) - if (P.second->SchedulingRegionID == SchedulingRegionID) + if (isInSchedulingRegion(P.second)) Action(P.second); } @@ -2608,10 +3085,11 @@ private: void initialFillReadyList(ReadyListType &ReadyList) { for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { doForAllOpcodes(I, [&](ScheduleData *SD) { - if (SD->isSchedulingEntity() && SD->isReady()) { + if (SD->isSchedulingEntity() && SD->hasValidDependencies() && + SD->isReady()) { ReadyList.insert(SD); LLVM_DEBUG(dbgs() - << "SLP: initially in ready list: " << *I << "\n"); + << "SLP: initially in ready list: " << *SD << "\n"); } }); } @@ -2669,18 +3147,14 @@ private: /// Attaches ScheduleData to Instruction. /// Note that the mapping survives during all vectorization iterations, i.e. /// ScheduleData structures are recycled. - DenseMap<Value *, ScheduleData *> ScheduleDataMap; + DenseMap<Instruction *, ScheduleData *> ScheduleDataMap; /// Attaches ScheduleData to Instruction with the leading key. DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>> ExtraScheduleDataMap; - struct ReadyList : SmallVector<ScheduleData *, 8> { - void insert(ScheduleData *SD) { push_back(SD); } - }; - /// The ready-list for scheduling (only used for the dry-run). - ReadyList ReadyInsts; + SetVector<ScheduleData *> ReadyInsts; /// The first instruction of the scheduling region. Instruction *ScheduleStart = nullptr; @@ -2696,6 +3170,11 @@ private: /// (can be null). ScheduleData *LastLoadStoreInRegion = nullptr; + /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling + /// region? Used to optimize the dependence calculation for the + /// common case where there isn't. + bool RegionHasStackSave = false; + /// The current size of the scheduling region. int ScheduleRegionSize = 0; @@ -2704,8 +3183,8 @@ private: /// The ID of the scheduling region. For a new vectorization iteration this /// is incremented which "removes" all ScheduleData from the region. - // Make sure that the initial SchedulingRegionID is greater than the - // initial SchedulingRegionID in ScheduleData (which is 0). + /// Make sure that the initial SchedulingRegionID is greater than the + /// initial SchedulingRegionID in ScheduleData (which is 0). int SchedulingRegionID = 1; }; @@ -2717,7 +3196,7 @@ private: void scheduleBlock(BlockScheduling *BS); /// List of users to ignore during scheduling and that don't need extracting. - ArrayRef<Value *> UserIgnoreList; + const SmallDenseSet<Value *> *UserIgnoreList = nullptr; /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of /// sorted SmallVectors of unsigned. 
@@ -2748,7 +3227,6 @@ private: ScalarEvolution *SE; TargetTransformInfo *TTI; TargetLibraryInfo *TLI; - AAResults *AA; LoopInfo *LI; DominatorTree *DT; AssumptionCache *AC; @@ -2865,20 +3343,25 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { } // end namespace llvm BoUpSLP::~BoUpSLP() { - for (const auto &Pair : DeletedInstructions) { - // Replace operands of ignored instructions with Undefs in case if they were - // marked for deletion. - if (Pair.getSecond()) { - Value *Undef = UndefValue::get(Pair.getFirst()->getType()); - Pair.getFirst()->replaceAllUsesWith(Undef); - } - Pair.getFirst()->dropAllReferences(); - } - for (const auto &Pair : DeletedInstructions) { - assert(Pair.getFirst()->use_empty() && + SmallVector<WeakTrackingVH> DeadInsts; + for (auto *I : DeletedInstructions) { + for (Use &U : I->operands()) { + auto *Op = dyn_cast<Instruction>(U.get()); + if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() && + wouldInstructionBeTriviallyDead(Op, TLI)) + DeadInsts.emplace_back(Op); + } + I->dropAllReferences(); + } + for (auto *I : DeletedInstructions) { + assert(I->use_empty() && "trying to erase instruction with users."); - Pair.getFirst()->eraseFromParent(); + I->eraseFromParent(); } + + // Cleanup any dead scalar code feeding the vectorized instructions + RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI); + #ifdef EXPENSIVE_CHECKS // If we could guarantee that this call is not extremely slow, we could // remove the ifdef limitation (see PR47712). @@ -2886,13 +3369,6 @@ BoUpSLP::~BoUpSLP() { #endif } -void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) { - for (auto *V : AV) { - if (auto *I = dyn_cast<Instruction>(V)) - eraseInstruction(I, /*ReplaceOpsWithUndef=*/true); - }; -} - /// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses /// contains original mask for the scalars reused in the node. Procedure /// transform this mask in accordance with the given \p Mask. @@ -2997,6 +3473,189 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { return None; } +namespace { +/// Tracks the state we can represent the loads in the given sequence. +enum class LoadsState { Gather, Vectorize, ScatterVectorize }; +} // anonymous namespace + +/// Checks if the given array of loads can be represented as a vectorized, +/// scatter or just simple gather. +static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, + const TargetTransformInfo &TTI, + const DataLayout &DL, ScalarEvolution &SE, + LoopInfo &LI, + SmallVectorImpl<unsigned> &Order, + SmallVectorImpl<Value *> &PointerOps) { + // Check that a vectorized load would load the same memory as a scalar + // load. For example, we don't want to vectorize loads that are smaller + // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM + // treats loading/storing it as an i8 struct. If we vectorize loads/stores + // from such a struct, we read/write packed bits disagreeing with the + // unvectorized version. + Type *ScalarTy = VL0->getType(); + + if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy)) + return LoadsState::Gather; + + // Make sure all loads in the bundle are simple - we can't vectorize + // atomic or volatile loads. 
+ PointerOps.clear(); + PointerOps.resize(VL.size()); + auto *POIter = PointerOps.begin(); + for (Value *V : VL) { + auto *L = cast<LoadInst>(V); + if (!L->isSimple()) + return LoadsState::Gather; + *POIter = L->getPointerOperand(); + ++POIter; + } + + Order.clear(); + // Check the order of pointer operands or that all pointers are the same. + bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order); + if (IsSorted || all_of(PointerOps, [&PointerOps](Value *P) { + if (getUnderlyingObject(P) != getUnderlyingObject(PointerOps.front())) + return false; + auto *GEP = dyn_cast<GetElementPtrInst>(P); + if (!GEP) + return false; + auto *GEP0 = cast<GetElementPtrInst>(PointerOps.front()); + return GEP->getNumOperands() == 2 && + ((isConstant(GEP->getOperand(1)) && + isConstant(GEP0->getOperand(1))) || + getSameOpcode({GEP->getOperand(1), GEP0->getOperand(1)}) + .getOpcode()); + })) { + if (IsSorted) { + Value *Ptr0; + Value *PtrN; + if (Order.empty()) { + Ptr0 = PointerOps.front(); + PtrN = PointerOps.back(); + } else { + Ptr0 = PointerOps[Order.front()]; + PtrN = PointerOps[Order.back()]; + } + Optional<int> Diff = + getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE); + // Check that the sorted loads are consecutive. + if (static_cast<unsigned>(*Diff) == VL.size() - 1) + return LoadsState::Vectorize; + } + // TODO: need to improve analysis of the pointers, if not all of them are + // GEPs or have > 2 operands, we end up with a gather node, which just + // increases the cost. + Loop *L = LI.getLoopFor(cast<LoadInst>(VL0)->getParent()); + bool ProfitableGatherPointers = + static_cast<unsigned>(count_if(PointerOps, [L](Value *V) { + return L && L->isLoopInvariant(V); + })) <= VL.size() / 2 && VL.size() > 2; + if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) { + auto *GEP = dyn_cast<GetElementPtrInst>(P); + return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) || + (GEP && GEP->getNumOperands() == 2); + })) { + Align CommonAlignment = cast<LoadInst>(VL0)->getAlign(); + for (Value *V : VL) + CommonAlignment = + std::min(CommonAlignment, cast<LoadInst>(V)->getAlign()); + auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); + if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) && + !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment)) + return LoadsState::ScatterVectorize; + } + } + + return LoadsState::Gather; +} + +bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy, + const DataLayout &DL, ScalarEvolution &SE, + SmallVectorImpl<unsigned> &SortedIndices) { + assert(llvm::all_of( + VL, [](const Value *V) { return V->getType()->isPointerTy(); }) && + "Expected list of pointer operands."); + // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each + // Ptr into, sort and return the sorted indices with values next to one + // another. + MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases; + Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U)); + + unsigned Cnt = 1; + for (Value *Ptr : VL.drop_front()) { + bool Found = any_of(Bases, [&](auto &Base) { + Optional<int> Diff = + getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE, + /*StrictCheck=*/true); + if (!Diff) + return false; + + Base.second.emplace_back(Ptr, *Diff, Cnt++); + return true; + }); + + if (!Found) { + // If we haven't found enough to usefully cluster, return early. 
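Stripped of the pointer analysis, canVectorizeLoads above is a three-way decision: once the pointer operands are collected and sorted, a consecutive run becomes a plain vector load, a non-consecutive but cheap-to-address run may become a ScatterVectorize (masked gather) node if the target supports it, and anything else stays a gather of scalars. A minimal sketch of that decision over plain element offsets; classifyLoads and the GatherIsLegal flag are illustrative stand-ins for the real getPointersDiff/SCEV and TargetTransformInfo legality queries:

#include <algorithm>
#include <cstdio>
#include <vector>

enum class LoadsState { Gather, Vectorize, ScatterVectorize };

// Offsets are element distances from the first pointer, in program order.
LoadsState classifyLoads(std::vector<int> Offsets, bool GatherIsLegal) {
  std::vector<int> Sorted = Offsets;
  std::sort(Sorted.begin(), Sorted.end());
  // Consecutive after sorting: each offset follows its predecessor by exactly
  // one element, so a single wide load covers the whole bundle.
  bool Consecutive = true;
  for (size_t I = 1; I < Sorted.size(); ++I)
    Consecutive &= Sorted[I] == Sorted[I - 1] + 1;
  if (Consecutive)
    return LoadsState::Vectorize;
  // Non-consecutive but addressable: fall back to a masked gather if the
  // target supports it; otherwise keep the scalars and gather them.
  return GatherIsLegal ? LoadsState::ScatterVectorize : LoadsState::Gather;
}

int main() {
  std::printf("%d\n", (int)classifyLoads({2, 0, 1, 3}, false)); // 1: Vectorize
  std::printf("%d\n", (int)classifyLoads({0, 4, 8, 12}, true)); // 2: ScatterVectorize
  std::printf("%d\n", (int)classifyLoads({0, 4, 8, 12}, false)); // 0: Gather
}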
+ if (Bases.size() > VL.size() / 2 - 1) + return false; + + // Not found already - add a new Base + Bases[Ptr].emplace_back(Ptr, 0, Cnt++); + } + } + + // For each of the bases sort the pointers by Offset and check if any of the + // base become consecutively allocated. + bool AnyConsecutive = false; + for (auto &Base : Bases) { + auto &Vec = Base.second; + if (Vec.size() > 1) { + llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X, + const std::tuple<Value *, int, unsigned> &Y) { + return std::get<1>(X) < std::get<1>(Y); + }); + int InitialOffset = std::get<1>(Vec[0]); + AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](auto &P) { + return std::get<1>(P.value()) == int(P.index()) + InitialOffset; + }); + } + } + + // Fill SortedIndices array only if it looks worth-while to sort the ptrs. + SortedIndices.clear(); + if (!AnyConsecutive) + return false; + + for (auto &Base : Bases) { + for (auto &T : Base.second) + SortedIndices.push_back(std::get<2>(T)); + } + + assert(SortedIndices.size() == VL.size() && + "Expected SortedIndices to be the size of VL"); + return true; +} + +Optional<BoUpSLP::OrdersType> +BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) { + assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only."); + Type *ScalarTy = TE.Scalars[0]->getType(); + + SmallVector<Value *> Ptrs; + Ptrs.reserve(TE.Scalars.size()); + for (Value *V : TE.Scalars) { + auto *L = dyn_cast<LoadInst>(V); + if (!L || !L->isSimple()) + return None; + Ptrs.push_back(L->getPointerOperand()); + } + + BoUpSLP::OrdersType Order; + if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order)) + return Order; + return None; +} + Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // No need to reorder if need to shuffle reuses, still need to shuffle the @@ -3037,6 +3696,9 @@ Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE, } if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE)) return CurrentOrder; + if (TE.Scalars.size() >= 4) + if (Optional<OrdersType> Order = findPartiallyOrderedLoads(TE)) + return Order; } return None; } @@ -3047,13 +3709,55 @@ void BoUpSLP::reorderTopToBottom() { // ExtractElement gather nodes which can be vectorized and need to handle // their ordering. DenseMap<const TreeEntry *, OrdersType> GathersToOrders; + + // AltShuffles can also have a preferred ordering that leads to fewer + // instructions, e.g., the addsub instruction in x86. + DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders; + + // Maps a TreeEntry to the reorder indices of external users. + DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>> + ExternalUserReorderMap; + // FIXME: Workaround for syntax error reported by MSVC buildbots. + TargetTransformInfo &TTIRef = *TTI; // Find all reorderable nodes with the given VF. // Currently the are vectorized stores,loads,extracts + some gathering of // extracts. - for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders]( + for_each(VectorizableTree, [this, &TTIRef, &VFToOrderedEntries, + &GathersToOrders, &ExternalUserReorderMap, + &AltShufflesToOrders]( const std::unique_ptr<TreeEntry> &TE) { + // Look for external users that will probably be vectorized. 
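clusterSortPtrAccesses, completed in this hunk, groups the pointers by their underlying base object, records each pointer's offset together with its original position, sorts every group by offset, and only reports an order when at least one group turns out to be consecutively allocated; the result is the original indices re-emitted group by group. A standalone sketch over (base id, offset) pairs; PtrInfo and the toy inputs are invented, and a std::map keyed by an integer base id replaces the real code's MapVector keyed by getUnderlyingObject plus its early bail-out when too many distinct bases appear:

#include <algorithm>
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

struct PtrInfo { int Base; int Offset; };  // stand-in for (underlying object, pointer diff)

// Returns true and fills SortedIndices if clustering by base finds at least
// one group of consecutive offsets; mirrors clusterSortPtrAccesses' contract.
bool clusterSort(const std::vector<PtrInfo> &Ptrs,
                 std::vector<unsigned> &SortedIndices) {
  // Base -> list of (offset, original index).
  std::map<int, std::vector<std::pair<int, unsigned>>> Bases;
  for (unsigned I = 0; I < Ptrs.size(); ++I)
    Bases[Ptrs[I].Base].push_back({Ptrs[I].Offset, I});

  bool AnyConsecutive = false;
  for (auto &B : Bases) {
    auto &Vec = B.second;
    std::stable_sort(Vec.begin(), Vec.end());      // sort each group by offset
    bool Consecutive = Vec.size() > 1;
    for (size_t I = 1; I < Vec.size(); ++I)
      Consecutive &= Vec[I].first == Vec[I - 1].first + 1;
    AnyConsecutive |= Consecutive;
  }
  if (!AnyConsecutive)
    return false;                                  // not worth reordering

  SortedIndices.clear();
  for (auto &B : Bases)                            // emit group by group
    for (auto &P : B.second)
      SortedIndices.push_back(P.second);
  return true;
}

int main() {
  // Two bases: base 0 at offsets {1,0} (consecutive), base 1 at {4,7} (not).
  std::vector<PtrInfo> Ptrs = {{0, 1}, {1, 4}, {0, 0}, {1, 7}};
  std::vector<unsigned> Order;
  if (clusterSort(Ptrs, Order))
    for (unsigned Idx : Order)
      std::printf("%u ", Idx);                     // prints 2 0 1 3
  std::printf("\n");
}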
+ SmallVector<OrdersType, 1> ExternalUserReorderIndices = + findExternalStoreUsersReorderIndices(TE.get()); + if (!ExternalUserReorderIndices.empty()) { + VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + ExternalUserReorderMap.try_emplace(TE.get(), + std::move(ExternalUserReorderIndices)); + } + + // Patterns like [fadd,fsub] can be combined into a single instruction in + // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need + // to take into account their order when looking for the most used order. + if (TE->isAltShuffle()) { + VectorType *VecTy = + FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size()); + unsigned Opcode0 = TE->getOpcode(); + unsigned Opcode1 = TE->getAltOpcode(); + // The opcode mask selects between the two opcodes. + SmallBitVector OpcodeMask(TE->Scalars.size(), 0); + for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) + if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1) + OpcodeMask.set(Lane); + // If this pattern is supported by the target then we consider the order. + if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) { + VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + AltShufflesToOrders.try_emplace(TE.get(), OrdersType()); + } + // TODO: Check the reverse order too. + } + if (Optional<OrdersType> CurrentOrder = - getReorderingData(*TE.get(), /*TopToBottom=*/true)) { + getReorderingData(*TE, /*TopToBottom=*/true)) { // Do not include ordering for nodes used in the alt opcode vectorization, // better to reorder them during bottom-to-top stage. If follow the order // here, it causes reordering of the whole graph though actually it is @@ -3071,10 +3775,7 @@ void BoUpSLP::reorderTopToBottom() { EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0; })) return; - if (UserTE->UserTreeIndices.empty()) - UserTE = nullptr; - else - UserTE = UserTE->UserTreeIndices.back().UserTE; + UserTE = UserTE->UserTreeIndices.back().UserTE; ++Cnt; } VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); @@ -3105,11 +3806,30 @@ void BoUpSLP::reorderTopToBottom() { if (!OpTE->ReuseShuffleIndices.empty()) continue; // Count number of orders uses. - const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & { - if (OpTE->State == TreeEntry::NeedToGather) - return GathersToOrders.find(OpTE)->second; + const auto &Order = [OpTE, &GathersToOrders, + &AltShufflesToOrders]() -> const OrdersType & { + if (OpTE->State == TreeEntry::NeedToGather) { + auto It = GathersToOrders.find(OpTE); + if (It != GathersToOrders.end()) + return It->second; + } + if (OpTE->isAltShuffle()) { + auto It = AltShufflesToOrders.find(OpTE); + if (It != AltShufflesToOrders.end()) + return It->second; + } return OpTE->ReorderIndices; }(); + // First consider the order of the external scalar users. + auto It = ExternalUserReorderMap.find(OpTE); + if (It != ExternalUserReorderMap.end()) { + const auto &ExternalUserReorderIndices = It->second; + for (const OrdersType &ExtOrder : ExternalUserReorderIndices) + ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second; + // No other useful reorder data in this entry. + if (Order.empty()) + continue; + } // Stores actually store the mask, not the order, need to invert. 
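The top-to-bottom reordering now gathers candidate orders from several sources per vectorization factor: orders implied by external store users, orders for alternate-shuffle nodes whose opcode pattern the target supports (the SmallBitVector opcode mask above), and the usual gather and load orders. All of them are tallied in a map keyed by the order so the pass can later pick the most frequently requested one, with identity modeled as an empty order. A compact standalone model of that vote counting; OrdersType as std::vector<unsigned> follows the real typedef in spirit, the request values are made up, and the real code additionally special-cases stores (which carry a mask rather than an order) and tie-breaking:

#include <cstdio>
#include <map>
#include <vector>

using OrdersType = std::vector<unsigned>;

// Count how often each candidate order is requested and return the winner.
// An empty OrdersType plays the role of the identity order, as in
// reorderTopToBottom().
OrdersType pickMostPopularOrder(const std::vector<OrdersType> &Requests) {
  std::map<OrdersType, unsigned> OrdersUses;
  for (const OrdersType &O : Requests)
    ++OrdersUses[O];                       // same idea as insert({O,0}) + bump
  OrdersType Best;
  unsigned BestCount = 0;
  for (const auto &P : OrdersUses)
    if (P.second > BestCount) {
      Best = P.first;
      BestCount = P.second;
    }
  return Best;
}

int main() {
  // Two users ask for {1,0,3,2}, one asks for identity, one for {3,2,1,0}.
  std::vector<OrdersType> Requests = {{1, 0, 3, 2}, {}, {3, 2, 1, 0}, {1, 0, 3, 2}};
  for (unsigned I : pickMostPopularOrder(Requests))
    std::printf("%u ", I);                 // prints 1 0 3 2
  std::printf("\n");
}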
if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && OpTE->getOpcode() == Instruction::Store && !Order.empty()) { @@ -3199,6 +3919,57 @@ void BoUpSLP::reorderTopToBottom() { } } +bool BoUpSLP::canReorderOperands( + TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges, + ArrayRef<TreeEntry *> ReorderableGathers, + SmallVectorImpl<TreeEntry *> &GatherOps) { + for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) { + if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) { + return OpData.first == I && + OpData.second->State == TreeEntry::Vectorize; + })) + continue; + if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) { + // Do not reorder if operand node is used by many user nodes. + if (any_of(TE->UserTreeIndices, + [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; })) + return false; + // Add the node to the list of the ordered nodes with the identity + // order. + Edges.emplace_back(I, TE); + // Add ScatterVectorize nodes to the list of operands, where just + // reordering of the scalars is required. Similar to the gathers, so + // simply add to the list of gathered ops. + // If there are reused scalars, process this node as a regular vectorize + // node, just reorder reuses mask. + if (TE->State != TreeEntry::Vectorize && TE->ReuseShuffleIndices.empty()) + GatherOps.push_back(TE); + continue; + } + TreeEntry *Gather = nullptr; + if (count_if(ReorderableGathers, + [&Gather, UserTE, I](TreeEntry *TE) { + assert(TE->State != TreeEntry::Vectorize && + "Only non-vectorized nodes are expected."); + if (any_of(TE->UserTreeIndices, + [UserTE, I](const EdgeInfo &EI) { + return EI.UserTE == UserTE && EI.EdgeIdx == I; + })) { + assert(TE->isSame(UserTE->getOperand(I)) && + "Operand entry does not match operands."); + Gather = TE; + return true; + } + return false; + }) > 1 && + !all_of(UserTE->getOperand(I), isConstant)) + return false; + if (Gather) + GatherOps.push_back(Gather); + } + return true; +} + void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { SetVector<TreeEntry *> OrderedEntries; DenseMap<const TreeEntry *, OrdersType> GathersToOrders; @@ -3212,49 +3983,13 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { if (TE->State != TreeEntry::Vectorize) NonVectorized.push_back(TE.get()); if (Optional<OrdersType> CurrentOrder = - getReorderingData(*TE.get(), /*TopToBottom=*/false)) { + getReorderingData(*TE, /*TopToBottom=*/false)) { OrderedEntries.insert(TE.get()); if (TE->State != TreeEntry::Vectorize) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); } }); - // Checks if the operands of the users are reordarable and have only single - // use. 
- auto &&CheckOperands = - [this, &NonVectorized](const auto &Data, - SmallVectorImpl<TreeEntry *> &GatherOps) { - for (unsigned I = 0, E = Data.first->getNumOperands(); I < E; ++I) { - if (any_of(Data.second, - [I](const std::pair<unsigned, TreeEntry *> &OpData) { - return OpData.first == I && - OpData.second->State == TreeEntry::Vectorize; - })) - continue; - ArrayRef<Value *> VL = Data.first->getOperand(I); - const TreeEntry *TE = nullptr; - const auto *It = find_if(VL, [this, &TE](Value *V) { - TE = getTreeEntry(V); - return TE; - }); - if (It != VL.end() && TE->isSame(VL)) - return false; - TreeEntry *Gather = nullptr; - if (count_if(NonVectorized, [VL, &Gather](TreeEntry *TE) { - assert(TE->State != TreeEntry::Vectorize && - "Only non-vectorized nodes are expected."); - if (TE->isSame(VL)) { - Gather = TE; - return true; - } - return false; - }) > 1) - return false; - if (Gather) - GatherOps.push_back(Gather); - } - return true; - }; // 1. Propagate order to the graph nodes, which use only reordered nodes. // I.e., if the node has operands, that are reordered, try to make at least // one operand order in the natural order and reorder others + reorder the @@ -3263,7 +3998,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { while (!OrderedEntries.empty()) { // 1. Filter out only reordered nodes. // 2. If the entry has multiple uses - skip it and jump to the next node. - MapVector<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users; + DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users; SmallVector<TreeEntry *> Filtered; for (TreeEntry *TE : OrderedEntries) { if (!(TE->State == TreeEntry::Vectorize || @@ -3291,10 +4026,17 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { // Erase filtered entries. for_each(Filtered, [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); }); - for (const auto &Data : Users) { + SmallVector< + std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>> + UsersVec(Users.begin(), Users.end()); + sort(UsersVec, [](const auto &Data1, const auto &Data2) { + return Data1.first->Idx > Data2.first->Idx; + }); + for (auto &Data : UsersVec) { // Check that operands are used only in the User node. SmallVector<TreeEntry *> GatherOps; - if (!CheckOperands(Data, GatherOps)) { + if (!canReorderOperands(Data.first, Data.second, NonVectorized, + GatherOps)) { for_each(Data.second, [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) { OrderedEntries.remove(Op.second); @@ -3310,18 +4052,22 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { // the same node my be considered several times, though might be not // profitable. SmallPtrSet<const TreeEntry *, 4> VisitedOps; + SmallPtrSet<const TreeEntry *, 4> VisitedUsers; for (const auto &Op : Data.second) { TreeEntry *OpTE = Op.second; if (!VisitedOps.insert(OpTE).second) continue; - if (!OpTE->ReuseShuffleIndices.empty() || - (IgnoreReorder && OpTE == VectorizableTree.front().get())) + if (!OpTE->ReuseShuffleIndices.empty()) continue; const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & { if (OpTE->State == TreeEntry::NeedToGather) return GathersToOrders.find(OpTE)->second; return OpTE->ReorderIndices; }(); + unsigned NumOps = count_if( + Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) { + return P.second == OpTE; + }); // Stores actually store the mask, not the order, need to invert. 
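Note the container change in this hunk: the per-iteration Users map becomes a DenseMap, and before processing it is copied into UsersVec and sorted by tree-entry index, largest Idx first. Since DenseMap, like any hash map, has no specified iteration order, the copy-and-sort step is what keeps the bottom-to-top walk deterministic and lets later-created (deeper) entries be handled before earlier ones. A tiny illustration of the pattern with std::unordered_map standing in for DenseMap; the keys and payload strings are made up:

#include <algorithm>
#include <cstdio>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

int main() {
  // Key is a tree-entry index, value is whatever per-user state was collected.
  std::unordered_map<int, std::string> Users = {
      {4, "store node"}, {7, "load node"}, {2, "phi node"}};

  // Iterating the hash map directly would give an unspecified order; copy it
  // out and sort by index, descending, so the traversal is reproducible.
  std::vector<std::pair<int, std::string>> UsersVec(Users.begin(), Users.end());
  std::sort(UsersVec.begin(), UsersVec.end(),
            [](const auto &A, const auto &B) { return A.first > B.first; });

  for (const auto &P : UsersVec)
    std::printf("visit entry %d (%s)\n", P.first, P.second.c_str());
  // Always prints entries 7, 4, 2 regardless of hashing.
}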
if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && OpTE->getOpcode() == Instruction::Store && !Order.empty()) { @@ -3333,14 +4079,52 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx); }); fixupOrderingIndices(CurrentOrder); - ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second; + OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second += + NumOps; } else { - ++OrdersUses.insert(std::make_pair(Order, 0)).first->second; + OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps; + } + auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0)); + const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders]( + const TreeEntry *TE) { + if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() || + (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) || + (IgnoreReorder && TE->Idx == 0)) + return true; + if (TE->State == TreeEntry::NeedToGather) { + auto It = GathersToOrders.find(TE); + if (It != GathersToOrders.end()) + return !It->second.empty(); + return true; + } + return false; + }; + for (const EdgeInfo &EI : OpTE->UserTreeIndices) { + TreeEntry *UserTE = EI.UserTE; + if (!VisitedUsers.insert(UserTE).second) + continue; + // May reorder user node if it requires reordering, has reused + // scalars, is an alternate op vectorize node or its op nodes require + // reordering. + if (AllowsReordering(UserTE)) + continue; + // Check if users allow reordering. + // Currently look up just 1 level of operands to avoid increase of + // the compile time. + // Profitable to reorder if definitely more operands allow + // reordering rather than those with natural order. + ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE]; + if (static_cast<unsigned>(count_if( + Ops, [UserTE, &AllowsReordering]( + const std::pair<unsigned, TreeEntry *> &Op) { + return AllowsReordering(Op.second) && + all_of(Op.second->UserTreeIndices, + [UserTE](const EdgeInfo &EI) { + return EI.UserTE == UserTE; + }); + })) <= Ops.size() / 2) + ++Res.first->second; } - OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second += - OpTE->UserTreeIndices.size(); - assert(OrdersUses[{}] > 0 && "Counter cannot be less than 0."); - --OrdersUses[{}]; } // If no orders - skip current nodes and jump to the next one, if any. if (OrdersUses.empty()) { @@ -3381,7 +4165,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { OrderedEntries.remove(TE); if (!VisitedOps.insert(TE).second) continue; - if (!TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) { + if (TE->ReuseShuffleIndices.size() == BestOrder.size()) { // Just reorder reuses indices. reorderReuses(TE->ReuseShuffleIndices, Mask); continue; @@ -3393,6 +4177,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { TE->ReorderIndices.empty()) && "Non-matching sizes of user/operand entries."); reorderOrder(TE->ReorderIndices, Mask); + if (IgnoreReorder && TE == VectorizableTree.front().get()) + IgnoreReorder = false; } // For gathers just need to reorder its scalars. for (TreeEntry *Gather : GatherOps) { @@ -3484,7 +4270,7 @@ void BoUpSLP::buildExternalUses( } // Ignore users in the user ignore list. 
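The new voting logic in this hunk also asks whether the users of an operand entry would tolerate a reorder at all: for a user that does not itself allow reordering, it counts how many of that user's operand entries do, and if that is at most half of them it adds a vote for the identity order instead (the Res entry keyed by the empty OrdersType). Roughly, reordering only wins when a majority of the operands feeding such a user can absorb it. A small sketch of just that majority test; the boolean inputs are illustrative, and the real predicate (AllowsReordering) also inspects reuse indices, alternate shuffles and gather orders:

#include <algorithm>
#include <cstdio>
#include <vector>

// One entry per operand of a user node: true if that operand either already
// carries an order or could absorb one.
bool shouldKeepIdentityOrder(const std::vector<bool> &OperandAllowsReordering) {
  unsigned Allowing = std::count(OperandAllowsReordering.begin(),
                                 OperandAllowsReordering.end(), true);
  // Mirrors "count_if(...) <= Ops.size() / 2": too few cooperative operands,
  // so vote for keeping the natural (identity) order.
  return Allowing <= OperandAllowsReordering.size() / 2;
}

int main() {
  std::printf("%d\n", shouldKeepIdentityOrder({true, false, false})); // 1: keep identity
  std::printf("%d\n", shouldKeepIdentityOrder({true, true, false}));  // 0: reorder
}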
- if (is_contained(UserIgnoreList, UserInst)) + if (UserIgnoreList && UserIgnoreList->contains(UserInst)) continue; LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " @@ -3495,78 +4281,270 @@ void BoUpSLP::buildExternalUses( } } +DenseMap<Value *, SmallVector<StoreInst *, 4>> +BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const { + DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap; + for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) { + Value *V = TE->Scalars[Lane]; + // To save compilation time we don't visit if we have too many users. + static constexpr unsigned UsersLimit = 4; + if (V->hasNUsesOrMore(UsersLimit)) + break; + + // Collect stores per pointer object. + for (User *U : V->users()) { + auto *SI = dyn_cast<StoreInst>(U); + if (SI == nullptr || !SI->isSimple() || + !isValidElementType(SI->getValueOperand()->getType())) + continue; + // Skip entry if already + if (getTreeEntry(U)) + continue; + + Value *Ptr = getUnderlyingObject(SI->getPointerOperand()); + auto &StoresVec = PtrToStoresMap[Ptr]; + // For now just keep one store per pointer object per lane. + // TODO: Extend this to support multiple stores per pointer per lane + if (StoresVec.size() > Lane) + continue; + // Skip if in different BBs. + if (!StoresVec.empty() && + SI->getParent() != StoresVec.back()->getParent()) + continue; + // Make sure that the stores are of the same type. + if (!StoresVec.empty() && + SI->getValueOperand()->getType() != + StoresVec.back()->getValueOperand()->getType()) + continue; + StoresVec.push_back(SI); + } + } + return PtrToStoresMap; +} + +bool BoUpSLP::CanFormVector(const SmallVector<StoreInst *, 4> &StoresVec, + OrdersType &ReorderIndices) const { + // We check whether the stores in StoreVec can form a vector by sorting them + // and checking whether they are consecutive. + + // To avoid calling getPointersDiff() while sorting we create a vector of + // pairs {store, offset from first} and sort this instead. + SmallVector<std::pair<StoreInst *, int>, 4> StoreOffsetVec(StoresVec.size()); + StoreInst *S0 = StoresVec[0]; + StoreOffsetVec[0] = {S0, 0}; + Type *S0Ty = S0->getValueOperand()->getType(); + Value *S0Ptr = S0->getPointerOperand(); + for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) { + StoreInst *SI = StoresVec[Idx]; + Optional<int> Diff = + getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(), + SI->getPointerOperand(), *DL, *SE, + /*StrictCheck=*/true); + // We failed to compare the pointers so just abandon this StoresVec. + if (!Diff) + return false; + StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff}; + } + + // Sort the vector based on the pointers. We create a copy because we may + // need the original later for calculating the reorder (shuffle) indices. + stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1, + const std::pair<StoreInst *, int> &Pair2) { + int Offset1 = Pair1.second; + int Offset2 = Pair2.second; + return Offset1 < Offset2; + }); + + // Check if the stores are consecutive by checking if their difference is 1. + for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size())) + if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx-1].second + 1) + return false; + + // Calculate the shuffle indices according to their offset against the sorted + // StoreOffsetVec. 
+ ReorderIndices.reserve(StoresVec.size()); + for (StoreInst *SI : StoresVec) { + unsigned Idx = find_if(StoreOffsetVec, + [SI](const std::pair<StoreInst *, int> &Pair) { + return Pair.first == SI; + }) - + StoreOffsetVec.begin(); + ReorderIndices.push_back(Idx); + } + // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in + // reorderTopToBottom() and reorderBottomToTop(), so we are following the + // same convention here. + auto IsIdentityOrder = [](const OrdersType &Order) { + for (unsigned Idx : seq<unsigned>(0, Order.size())) + if (Idx != Order[Idx]) + return false; + return true; + }; + if (IsIdentityOrder(ReorderIndices)) + ReorderIndices.clear(); + + return true; +} + +#ifndef NDEBUG +LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) { + for (unsigned Idx : Order) + dbgs() << Idx << ", "; + dbgs() << "\n"; +} +#endif + +SmallVector<BoUpSLP::OrdersType, 1> +BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const { + unsigned NumLanes = TE->Scalars.size(); + + DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap = + collectUserStores(TE); + + // Holds the reorder indices for each candidate store vector that is a user of + // the current TreeEntry. + SmallVector<OrdersType, 1> ExternalReorderIndices; + + // Now inspect the stores collected per pointer and look for vectorization + // candidates. For each candidate calculate the reorder index vector and push + // it into `ExternalReorderIndices` + for (const auto &Pair : PtrToStoresMap) { + auto &StoresVec = Pair.second; + // If we have fewer than NumLanes stores, then we can't form a vector. + if (StoresVec.size() != NumLanes) + continue; + + // If the stores are not consecutive then abandon this StoresVec. + OrdersType ReorderIndices; + if (!CanFormVector(StoresVec, ReorderIndices)) + continue; + + // We now know that the scalars in StoresVec can form a vector instruction, + // so set the reorder indices. + ExternalReorderIndices.push_back(ReorderIndices); + } + return ExternalReorderIndices; +} + void BoUpSLP::buildTree(ArrayRef<Value *> Roots, - ArrayRef<Value *> UserIgnoreLst) { + const SmallDenseSet<Value *> &UserIgnoreLst) { deleteTree(); - UserIgnoreList = UserIgnoreLst; + UserIgnoreList = &UserIgnoreLst; if (!allSameType(Roots)) return; buildTree_rec(Roots, 0, EdgeInfo()); } -namespace { -/// Tracks the state we can represent the loads in the given sequence. -enum class LoadsState { Gather, Vectorize, ScatterVectorize }; -} // anonymous namespace - -/// Checks if the given array of loads can be represented as a vectorized, -/// scatter or just simple gather. -static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, - const TargetTransformInfo &TTI, - const DataLayout &DL, ScalarEvolution &SE, - SmallVectorImpl<unsigned> &Order, - SmallVectorImpl<Value *> &PointerOps) { - // Check that a vectorized load would load the same memory as a scalar - // load. For example, we don't want to vectorize loads that are smaller - // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM - // treats loading/storing it as an i8 struct. If we vectorize loads/stores - // from such a struct, we read/write packed bits disagreeing with the - // unvectorized version. 
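CanFormVector, whose tail is in this hunk, expresses every collected store as an offset from the first one, sorts a copy by offset, rejects the group unless the sorted offsets are consecutive, and then records for each original store the lane it should occupy in the sorted sequence; an identity result is canonicalised to an empty order, matching the convention of the reordering passes. The same steps on plain integer offsets; canFormVector below is a standalone re-implementation for illustration, not the member function, and the offsets replace the real getPointersDiff computation:

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

// Offsets[i] is the distance (in elements) of store i from store 0, in the
// original program order. On success ReorderIndices[i] is the lane the i-th
// store should occupy in the vectorized store.
bool canFormVector(const std::vector<int> &Offsets,
                   std::vector<unsigned> &ReorderIndices) {
  std::vector<std::pair<int, unsigned>> Sorted;    // (offset, original index)
  for (unsigned I = 0; I < Offsets.size(); ++I)
    Sorted.push_back({Offsets[I], I});
  std::stable_sort(Sorted.begin(), Sorted.end());

  // The stores only form a legal vector store if they are consecutive.
  for (unsigned I = 1; I < Sorted.size(); ++I)
    if (Sorted[I].first != Sorted[I - 1].first + 1)
      return false;

  // Lane of store I = its position in the sorted sequence.
  ReorderIndices.assign(Offsets.size(), 0);
  for (unsigned Lane = 0; Lane < Sorted.size(); ++Lane)
    ReorderIndices[Sorted[Lane].second] = Lane;

  // Identity orders are represented as "no order needed".
  bool Identity = true;
  for (unsigned I = 0; I < ReorderIndices.size(); ++I)
    Identity &= ReorderIndices[I] == I;
  if (Identity)
    ReorderIndices.clear();
  return true;
}

int main() {
  std::vector<unsigned> Order;
  if (canFormVector({0, 2, 1, 3}, Order))          // stores at +0, +2, +1, +3
    for (unsigned Lane : Order)
      std::printf("%u ", Lane);                    // prints 0 2 1 3
  std::printf("\n");
}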
- Type *ScalarTy = VL0->getType(); - - if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy)) - return LoadsState::Gather; +void BoUpSLP::buildTree(ArrayRef<Value *> Roots) { + deleteTree(); + if (!allSameType(Roots)) + return; + buildTree_rec(Roots, 0, EdgeInfo()); +} - // Make sure all loads in the bundle are simple - we can't vectorize - // atomic or volatile loads. - PointerOps.clear(); - PointerOps.resize(VL.size()); - auto *POIter = PointerOps.begin(); +/// \return true if the specified list of values has only one instruction that +/// requires scheduling, false otherwise. +#ifndef NDEBUG +static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) { + Value *NeedsScheduling = nullptr; for (Value *V : VL) { - auto *L = cast<LoadInst>(V); - if (!L->isSimple()) - return LoadsState::Gather; - *POIter = L->getPointerOperand(); - ++POIter; + if (doesNotNeedToBeScheduled(V)) + continue; + if (!NeedsScheduling) { + NeedsScheduling = V; + continue; + } + return false; } + return NeedsScheduling; +} +#endif - Order.clear(); - // Check the order of pointer operands. - if (llvm::sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order)) { - Value *Ptr0; - Value *PtrN; - if (Order.empty()) { - Ptr0 = PointerOps.front(); - PtrN = PointerOps.back(); +/// Generates key/subkey pair for the given value to provide effective sorting +/// of the values and better detection of the vectorizable values sequences. The +/// keys/subkeys can be used for better sorting of the values themselves (keys) +/// and in values subgroups (subkeys). +static std::pair<size_t, size_t> generateKeySubkey( + Value *V, const TargetLibraryInfo *TLI, + function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, + bool AllowAlternate) { + hash_code Key = hash_value(V->getValueID() + 2); + hash_code SubKey = hash_value(0); + // Sort the loads by the distance between the pointers. + if (auto *LI = dyn_cast<LoadInst>(V)) { + Key = hash_combine(hash_value(Instruction::Load), Key); + if (LI->isSimple()) + SubKey = hash_value(LoadsSubkeyGenerator(Key, LI)); + else + SubKey = hash_value(LI); + } else if (isVectorLikeInstWithConstOps(V)) { + // Sort extracts by the vector operands. + if (isa<ExtractElementInst, UndefValue>(V)) + Key = hash_value(Value::UndefValueVal + 1); + if (auto *EI = dyn_cast<ExtractElementInst>(V)) { + if (!isUndefVector(EI->getVectorOperand()) && + !isa<UndefValue>(EI->getIndexOperand())) + SubKey = hash_value(EI->getVectorOperand()); + } + } else if (auto *I = dyn_cast<Instruction>(V)) { + // Sort other instructions just by the opcodes except for CMPInst. + // For CMP also sort by the predicate kind. + if ((isa<BinaryOperator>(I) || isa<CastInst>(I)) && + isValidForAlternation(I->getOpcode())) { + if (AllowAlternate) + Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0); + else + Key = hash_combine(hash_value(I->getOpcode()), Key); + SubKey = hash_combine( + hash_value(I->getOpcode()), hash_value(I->getType()), + hash_value(isa<BinaryOperator>(I) + ? I->getType() + : cast<CastInst>(I)->getOperand(0)->getType())); + // For casts, look through the only operand to improve compile time. 
+ if (isa<CastInst>(I)) { + std::pair<size_t, size_t> OpVals = + generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator, + /*=AllowAlternate*/ true); + Key = hash_combine(OpVals.first, Key); + SubKey = hash_combine(OpVals.first, SubKey); + } + } else if (auto *CI = dyn_cast<CmpInst>(I)) { + CmpInst::Predicate Pred = CI->getPredicate(); + if (CI->isCommutative()) + Pred = std::min(Pred, CmpInst::getInversePredicate(Pred)); + CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred); + SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred), + hash_value(SwapPred), + hash_value(CI->getOperand(0)->getType())); + } else if (auto *Call = dyn_cast<CallInst>(I)) { + Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI); + if (isTriviallyVectorizable(ID)) { + SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID)); + } else if (!VFDatabase(*Call).getMappings(*Call).empty()) { + SubKey = hash_combine(hash_value(I->getOpcode()), + hash_value(Call->getCalledFunction())); + } else { + Key = hash_combine(hash_value(Call), Key); + SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call)); + } + for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos()) + SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End), + hash_value(Op.Tag), SubKey); + } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) { + if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1))) + SubKey = hash_value(Gep->getPointerOperand()); + else + SubKey = hash_value(Gep); + } else if (BinaryOperator::isIntDivRem(I->getOpcode()) && + !isa<ConstantInt>(I->getOperand(1))) { + // Do not try to vectorize instructions with potentially high cost. + SubKey = hash_value(I); } else { - Ptr0 = PointerOps[Order.front()]; - PtrN = PointerOps[Order.back()]; + SubKey = hash_value(I->getOpcode()); } - Optional<int> Diff = - getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE); - // Check that the sorted loads are consecutive. - if (static_cast<unsigned>(*Diff) == VL.size() - 1) - return LoadsState::Vectorize; - Align CommonAlignment = cast<LoadInst>(VL0)->getAlign(); - for (Value *V : VL) - CommonAlignment = - commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); - if (TTI.isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()), - CommonAlignment)) - return LoadsState::ScatterVectorize; + Key = hash_combine(hash_value(I->getParent()), Key); } - - return LoadsState::Gather; + return std::make_pair(Key, SubKey); } void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, @@ -3651,10 +4629,84 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // If all of the operands are identical or constant we have a simple solution. // If we deal with insert/extract instructions, they all must have constant // indices, otherwise we should gather them, not try to vectorize. - if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode() || - (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(S.MainOp) && - !all_of(VL, isVectorLikeInstWithConstOps))) { - LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); + // If alternate op node with 2 elements with gathered operands - do not + // vectorize. 
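generateKeySubkey, completed here, buckets candidate scalars by a coarse key (roughly the value kind and opcode) and a finer subkey (predicate, callee, pointer base, cast source type and so on), so that later sorting can place probably-vectorizable-together values next to each other. The mechanics are just combining hashes into a pair; below is a heavily reduced standalone version for compare-like records only. MiniCmp, hashCombine and the made-up opcode id are all invented for the sketch, and the real function also handles loads, calls, GEPs, casts and integer div/rem specially:

#include <cstddef>
#include <cstdio>
#include <functional>
#include <utility>

// Boost-style hash combiner; the real code uses llvm::hash_combine instead.
static size_t hashCombine(size_t Seed, size_t V) {
  return Seed ^ (V + 0x9e3779b97f4a7c15ULL + (Seed << 6) + (Seed >> 2));
}

enum Predicate { SLT, SGT, EQ };

struct MiniCmp { unsigned Opcode; Predicate Pred; size_t TypeId; };

std::pair<size_t, size_t> keySubkey(const MiniCmp &C) {
  // Coarse key: every compare of the same opcode shares it.
  size_t Key = std::hash<unsigned>{}(C.Opcode);
  // Finer subkey: also distinguishes predicate and operand type.
  size_t SubKey = hashCombine(std::hash<int>{}(C.Pred),
                              std::hash<size_t>{}(C.TypeId));
  return {Key, SubKey};
}

int main() {
  MiniCmp A{7, SLT, 1}, B{7, SLT, 1}, C{7, SGT, 1};  // 7 is a made-up opcode id
  std::printf("A,B same subkey: %d\n", keySubkey(A) == keySubkey(B));          // 1
  std::printf("A,C same key:    %d\n", keySubkey(A).first == keySubkey(C).first); // 1
  std::printf("A,C same subkey: %d\n", keySubkey(A) == keySubkey(C));          // 0 (different predicate)
}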
+ auto &&NotProfitableForVectorization = [&S, this, + Depth](ArrayRef<Value *> VL) { + if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2) + return false; + if (VectorizableTree.size() < MinTreeSize) + return false; + if (Depth >= RecursionMaxDepth - 1) + return true; + // Check if all operands are extracts, part of vector node or can build a + // regular vectorize node. + SmallVector<unsigned, 2> InstsCount(VL.size(), 0); + for (Value *V : VL) { + auto *I = cast<Instruction>(V); + InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) { + return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op); + })); + } + bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp); + if ((IsCommutative && + std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) || + (!IsCommutative && + all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; }))) + return true; + assert(VL.size() == 2 && "Expected only 2 alternate op instructions."); + SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates; + auto *I1 = cast<Instruction>(VL.front()); + auto *I2 = cast<Instruction>(VL.back()); + for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op) + Candidates.emplace_back().emplace_back(I1->getOperand(Op), + I2->getOperand(Op)); + if (static_cast<unsigned>(count_if( + Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) { + return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat); + })) >= S.MainOp->getNumOperands() / 2) + return false; + if (S.MainOp->getNumOperands() > 2) + return true; + if (IsCommutative) { + // Check permuted operands. + Candidates.clear(); + for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op) + Candidates.emplace_back().emplace_back(I1->getOperand(Op), + I2->getOperand((Op + 1) % E)); + if (any_of( + Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) { + return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat); + })) + return false; + } + return true; + }; + SmallVector<unsigned> SortedIndices; + BasicBlock *BB = nullptr; + bool AreAllSameInsts = + (S.getOpcode() && allSameBlock(VL)) || + (S.OpValue->getType()->isPointerTy() && UserTreeIdx.UserTE && + UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize && + VL.size() > 2 && + all_of(VL, + [&BB](Value *V) { + auto *I = dyn_cast<GetElementPtrInst>(V); + if (!I) + return doesNotNeedToBeScheduled(V); + if (!BB) + BB = I->getParent(); + return BB == I->getParent() && I->getNumOperands() == 2; + }) && + BB && + sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE, + SortedIndices)); + if (allConstant(VL) || isSplat(VL) || !AreAllSameInsts || + (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>( + S.OpValue) && + !all_of(VL, isVectorLikeInstWithConstOps)) || + NotProfitableForVectorization(VL)) { + LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); @@ -3665,12 +4717,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // the same block. // Don't vectorize ephemeral values. 
- for (Value *V : VL) { - if (EphValues.count(V)) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V - << ") is ephemeral.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); - return; + if (!EphValues.empty()) { + for (Value *V : VL) { + if (EphValues.count(V)) { + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V + << ") is ephemeral.\n"); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + return; + } } } @@ -3708,20 +4762,37 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } // The reduction nodes (stored in UserIgnoreList) also should stay scalar. - for (Value *V : VL) { - if (is_contained(UserIgnoreList, V)) { - LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - if (TryToFindDuplicates(S)) - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - return; + if (UserIgnoreList && !UserIgnoreList->empty()) { + for (Value *V : VL) { + if (UserIgnoreList && UserIgnoreList->contains(V)) { + LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + return; + } } } + // Special processing for sorted pointers for ScatterVectorize node with + // constant indeces only. + if (AreAllSameInsts && !(S.getOpcode() && allSameBlock(VL)) && + UserTreeIdx.UserTE && + UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize) { + assert(S.OpValue->getType()->isPointerTy() && + count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >= + 2 && + "Expected pointers only."); + // Reset S to make it GetElementPtr kind of node. + const auto *It = find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }); + assert(It != VL.end() && "Expected at least one GEP."); + S = getSameOpcode(*It); + } + // Check that all of the users of the scalars that we want to vectorize are // schedulable. auto *VL0 = cast<Instruction>(S.OpValue); - BasicBlock *BB = VL0->getParent(); + BB = VL0->getParent(); if (!DT->isReachableFromEntry(BB)) { // Don't go into unreachable blocks. They may contain instructions with @@ -3739,9 +4810,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (!BSRef) BSRef = std::make_unique<BlockScheduling>(BB); - BlockScheduling &BS = *BSRef.get(); + BlockScheduling &BS = *BSRef; Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S); +#ifdef EXPENSIVE_CHECKS + // Make sure we didn't break any internal invariants + BS.verify(); +#endif if (!Bundle) { LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); assert((!BS.getScheduleData(VL0) || @@ -3761,10 +4836,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Check for terminator values (e.g. invoke). 
for (Value *V : VL) - for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) { - Instruction *Term = dyn_cast<Instruction>( - cast<PHINode>(V)->getIncomingValueForBlock( - PH->getIncomingBlock(I))); + for (Value *Incoming : cast<PHINode>(V)->incoming_values()) { + Instruction *Term = dyn_cast<Instruction>(Incoming); if (Term && Term->isTerminator()) { LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); @@ -3908,7 +4981,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, SmallVector<Value *> PointerOps; OrdersType CurrentOrder; TreeEntry *TE = nullptr; - switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, CurrentOrder, + switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, CurrentOrder, PointerOps)) { case LoadsState::Vectorize: if (CurrentOrder.empty()) { @@ -4089,7 +5162,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. for (Value *V : VL) { - if (cast<Instruction>(V)->getNumOperands() != 2) { + auto *I = dyn_cast<GetElementPtrInst>(V); + if (!I) + continue; + if (I->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, @@ -4100,9 +5176,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // We can't combine several GEPs into one vector if they operate on // different types. - Type *Ty0 = VL0->getOperand(0)->getType(); + Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType(); for (Value *V : VL) { - Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType(); + auto *GEP = dyn_cast<GEPOperator>(V); + if (!GEP) + continue; + Type *CurTy = GEP->getSourceElementType(); if (Ty0 != CurTy) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); @@ -4113,15 +5192,22 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } } + bool IsScatterUser = + UserTreeIdx.UserTE && + UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize; // We don't combine GEPs with non-constant indexes. Type *Ty1 = VL0->getOperand(1)->getType(); for (Value *V : VL) { - auto Op = cast<Instruction>(V)->getOperand(1); - if (!isa<ConstantInt>(Op) || + auto *I = dyn_cast<GetElementPtrInst>(V); + if (!I) + continue; + auto *Op = I->getOperand(1); + if ((!IsScatterUser && !isa<ConstantInt>(Op)) || (Op->getType() != Ty1 && - Op->getType()->getScalarSizeInBits() > - DL->getIndexSizeInBits( - V->getType()->getPointerAddressSpace()))) { + ((IsScatterUser && !isa<ConstantInt>(Op)) || + Op->getType()->getScalarSizeInBits() > + DL->getIndexSizeInBits( + V->getType()->getPointerAddressSpace())))) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); @@ -4136,9 +5222,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); SmallVector<ValueList, 2> Operands(2); // Prepare the operand vector for pointer operands. - for (Value *V : VL) - Operands.front().push_back( - cast<GetElementPtrInst>(V)->getPointerOperand()); + for (Value *V : VL) { + auto *GEP = dyn_cast<GetElementPtrInst>(V); + if (!GEP) { + Operands.front().push_back(V); + continue; + } + Operands.front().push_back(GEP->getPointerOperand()); + } TE->setOperand(0, Operands.front()); // Need to cast all indices to the same type before vectorization to // avoid crash. 
@@ -4149,9 +5240,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Type *VL0Ty = VL0->getOperand(IndexIdx)->getType(); Type *Ty = all_of(VL, [VL0Ty, IndexIdx](Value *V) { - return VL0Ty == cast<GetElementPtrInst>(V) - ->getOperand(IndexIdx) - ->getType(); + auto *GEP = dyn_cast<GetElementPtrInst>(V); + if (!GEP) + return true; + return VL0Ty == GEP->getOperand(IndexIdx)->getType(); }) ? VL0Ty : DL->getIndexType(cast<GetElementPtrInst>(VL0) @@ -4159,10 +5251,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, ->getScalarType()); // Prepare the operand vector. for (Value *V : VL) { - auto *Op = cast<Instruction>(V)->getOperand(IndexIdx); - auto *CI = cast<ConstantInt>(Op); - Operands.back().push_back(ConstantExpr::getIntegerCast( - CI, Ty, CI->getValue().isSignBitSet())); + auto *I = dyn_cast<GetElementPtrInst>(V); + if (!I) { + Operands.back().push_back( + ConstantInt::get(Ty, 0, /*isSigned=*/false)); + continue; + } + auto *Op = I->getOperand(IndexIdx); + auto *CI = dyn_cast<ConstantInt>(Op); + if (!CI) + Operands.back().push_back(Op); + else + Operands.back().push_back(ConstantExpr::getIntegerCast( + CI, Ty, CI->getValue().isSignBitSet())); } TE->setOperand(IndexIdx, Operands.back()); @@ -4268,7 +5369,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, unsigned NumArgs = CI->arg_size(); SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr); for (unsigned j = 0; j != NumArgs; ++j) - if (hasVectorInstrinsicScalarOpd(ID, j)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, j)) ScalarArgs[j] = CI->getArgOperand(j); for (Value *V : VL) { CallInst *CI2 = dyn_cast<CallInst>(V); @@ -4287,7 +5388,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Some intrinsics have scalar arguments and should be same in order for // them to be vectorized. for (unsigned j = 0; j != NumArgs; ++j) { - if (hasVectorInstrinsicScalarOpd(ID, j)) { + if (isVectorIntrinsicWithScalarOpAtArg(ID, j)) { Value *A1J = CI2->getArgOperand(j); if (ScalarArgs[j] != A1J) { BS.cancelScheduling(VL, VL0); @@ -4320,7 +5421,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) { // For scalar operands no need to to create an entry since no need to // vectorize it. - if (hasVectorInstrinsicScalarOpd(ID, i)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, i)) continue; ValueList Operands; // Prepare the operand vector. @@ -4347,9 +5448,42 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. - if (isa<BinaryOperator>(VL0)) { + auto *CI = dyn_cast<CmpInst>(VL0); + if (isa<BinaryOperator>(VL0) || CI) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + if (!CI || all_of(VL, [](Value *V) { + return cast<CmpInst>(V)->isCommutative(); + })) { + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + } else { + CmpInst::Predicate P0 = CI->getPredicate(); + CmpInst::Predicate AltP0 = cast<CmpInst>(S.AltOp)->getPredicate(); + assert(P0 != AltP0 && + "Expected different main/alternate predicates."); + CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0); + Value *BaseOp0 = VL0->getOperand(0); + Value *BaseOp1 = VL0->getOperand(1); + // Collect operands - commute if it uses the swapped predicate or + // alternate operation. 
+ for (Value *V : VL) { + auto *Cmp = cast<CmpInst>(V); + Value *LHS = Cmp->getOperand(0); + Value *RHS = Cmp->getOperand(1); + CmpInst::Predicate CurrentPred = Cmp->getPredicate(); + if (P0 == AltP0Swapped) { + if (CI != Cmp && S.AltOp != Cmp && + ((P0 == CurrentPred && + !areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) || + (AltP0 == CurrentPred && + areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)))) + std::swap(LHS, RHS); + } else if (P0 != CurrentPred && AltP0 != CurrentPred) { + std::swap(LHS, RHS); + } + Left.push_back(LHS); + Right.push_back(RHS); + } + } TE->setOperand(0, Left); TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); @@ -4493,7 +5627,9 @@ bool BoUpSLP::areAllUsersVectorized(Instruction *I, ArrayRef<Value *> VectorizedVals) const { return (I->hasOneUse() && is_contained(VectorizedVals, I)) || all_of(I->users(), [this](User *U) { - return ScalarToTreeEntry.count(U) > 0 || MustGather.contains(U); + return ScalarToTreeEntry.count(U) > 0 || + isVectorLikeInstWithConstOps(U) || + (isa<ExtractElementInst>(U) && MustGather.contains(U)); }); } @@ -4550,19 +5686,21 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, // Process extracts in blocks of EltsPerVector to check if the source vector // operand can be re-used directly. If not, add the cost of creating a shuffle // to extract the values into a vector register. + SmallVector<int> RegMask(EltsPerVector, UndefMaskElem); for (auto *V : VL) { ++Idx; - // Need to exclude undefs from analysis. - if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem) - continue; - // Reached the start of a new vector registers. if (Idx % EltsPerVector == 0) { + RegMask.assign(EltsPerVector, UndefMaskElem); AllConsecutive = true; continue; } + // Need to exclude undefs from analysis. + if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem) + continue; + // Check all extracts for a vector register on the target directly // extract values in order. unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V)); @@ -4570,6 +5708,7 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1])); AllConsecutive &= PrevIdx + 1 == CurrentIdx && CurrentIdx % EltsPerVector == Idx % EltsPerVector; + RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector; } if (AllConsecutive) @@ -4581,10 +5720,10 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, // If we have a series of extracts which are not consecutive and hence // cannot re-use the source vector register directly, compute the shuffle - // cost to extract the a vector with EltsPerVector elements. + // cost to extract the vector with EltsPerVector elements. Cost += TTI.getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, - FixedVectorType::get(VecTy->getElementType(), EltsPerVector)); + FixedVectorType::get(VecTy->getElementType(), EltsPerVector), RegMask); } return Cost; } @@ -4592,12 +5731,12 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, /// Build shuffle mask for shuffle graph entries and lists of main and alternate /// operations operands. 
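For an alternate-opcode compare bundle, the operand collection above may commute an individual compare so that its operand order lines up with either the main predicate or the alternate predicate of the bundle, exploiting that a < b is the same as b > a. A reduced standalone model of that per-lane decision; Pred, Cmp and pickOperands are invented, and the real code additionally distinguishes the swapped-alternate case and consults areCompatibleCmpOps on the actual operand values before commuting:

#include <cstdio>
#include <utility>

enum Pred { LT, GT, EQ };

Pred swapped(Pred P) { return P == LT ? GT : P == GT ? LT : EQ; }

struct Cmp { Pred P; int LHS, RHS; };

// Return the (LHS, RHS) to use for this lane so that every lane either
// matches the main predicate P0 or the alternate predicate AltP0 without
// introducing a third operand order.
std::pair<int, int> pickOperands(const Cmp &C, Pred P0, Pred AltP0) {
  if (C.P == P0 || C.P == AltP0)
    return {C.LHS, C.RHS};               // already compatible as written
  if (swapped(C.P) == P0 || swapped(C.P) == AltP0)
    return {C.RHS, C.LHS};               // commute: a < b becomes b > a
  return {C.LHS, C.RHS};                 // should not happen for a valid bundle
}

int main() {
  Pred P0 = LT, AltP0 = EQ;
  Cmp Lanes[] = {{LT, 1, 2}, {GT, 3, 4}, {EQ, 5, 6}};
  for (const Cmp &C : Lanes) {
    std::pair<int, int> Ops = pickOperands(C, P0, AltP0);
    std::printf("lhs=%d rhs=%d\n", Ops.first, Ops.second);
  }
  // The middle lane (3 > 4) is commuted to (4 < 3) so it matches P0.
}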
static void -buildSuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices, - ArrayRef<int> ReusesIndices, - const function_ref<bool(Instruction *)> IsAltOp, - SmallVectorImpl<int> &Mask, - SmallVectorImpl<Value *> *OpScalars = nullptr, - SmallVectorImpl<Value *> *AltScalars = nullptr) { +buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices, + ArrayRef<int> ReusesIndices, + const function_ref<bool(Instruction *)> IsAltOp, + SmallVectorImpl<int> &Mask, + SmallVectorImpl<Value *> *OpScalars = nullptr, + SmallVectorImpl<Value *> *AltScalars = nullptr) { unsigned Sz = VL.size(); Mask.assign(Sz, UndefMaskElem); SmallVector<int> OrderMask; @@ -4627,6 +5766,29 @@ buildSuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices, } } +/// Checks if the specified instruction \p I is an alternate operation for the +/// given \p MainOp and \p AltOp instructions. +static bool isAlternateInstruction(const Instruction *I, + const Instruction *MainOp, + const Instruction *AltOp) { + if (auto *CI0 = dyn_cast<CmpInst>(MainOp)) { + auto *AltCI0 = cast<CmpInst>(AltOp); + auto *CI = cast<CmpInst>(I); + CmpInst::Predicate P0 = CI0->getPredicate(); + CmpInst::Predicate AltP0 = AltCI0->getPredicate(); + assert(P0 != AltP0 && "Expected different main/alternate predicates."); + CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0); + CmpInst::Predicate CurrentPred = CI->getPredicate(); + if (P0 == AltP0Swapped) + return I == AltCI0 || + (I != MainOp && + !areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1), + CI->getOperand(0), CI->getOperand(1))); + return AltP0 == CurrentPred || AltP0Swapped == CurrentPred; + } + return I->getOpcode() == AltOp->getOpcode(); +} + InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals) { ArrayRef<Value*> VL = E->Scalars; @@ -4740,7 +5902,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, SmallVector<const TreeEntry *> Entries; Optional<TargetTransformInfo::ShuffleKind> Shuffle = isGatherShuffledEntry(E, Mask, Entries); - if (Shuffle.hasValue()) { + if (Shuffle) { InstructionCost GatherCost = 0; if (ShuffleVectorInst::isIdentityMask(Mask)) { // Perfect match in the graph, will reuse the previously vectorized @@ -4776,7 +5938,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, SmallVector<int> Mask; Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isFixedVectorShuffle(VL, Mask); - if (ShuffleKind.hasValue()) { + if (ShuffleKind) { // Found the bunch of extractelement instructions that must be gathered // into a vector and can be represented as a permutation elements in a // single input vector or of 2 input vectors. @@ -4794,7 +5956,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // broadcast. 
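Once both halves of an alternate-opcode bundle are vectorized, they are blended with a shuffle whose mask takes lane I from the main-opcode vector as I or from the alternate vector as VF + I; that is what the renamed buildShuffleEntryMask produces, and the cost code above now charges either a plain SK_Select or, when reuse indices are involved, an explicit two-source permute for it. The mask construction in isolation; buildAltMask is a standalone helper, and IsAltOp is whatever per-lane predicate the caller supplies (isAlternateInstruction in the real code):

#include <cstdio>
#include <functional>
#include <vector>

// Build a two-source shuffle mask: lane I comes from source 0 (main opcode)
// as I, or from source 1 (alternate opcode) as VF + I.
std::vector<int> buildAltMask(unsigned VF,
                              const std::function<bool(unsigned)> &IsAltOp) {
  std::vector<int> Mask(VF);
  for (unsigned I = 0; I < VF; ++I)
    Mask[I] = IsAltOp(I) ? static_cast<int>(VF + I) : static_cast<int>(I);
  return Mask;
}

int main() {
  // A 4-wide fadd/fsub bundle where the odd lanes use the alternate opcode.
  std::vector<int> Mask = buildAltMask(4, [](unsigned I) { return I % 2 == 1; });
  for (int M : Mask)
    std::printf("%d ", M);   // prints 0 5 2 7
  std::printf("\n");
}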
assert(VecTy == FinalVecTy && "No reused scalars expected for broadcast."); - return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, + /*Mask=*/None, /*Index=*/0, + /*SubTp=*/nullptr, /*Args=*/VL[0]); } InstructionCost ReuseShuffleCost = 0; if (NeedToShuffleReuses) @@ -4818,8 +5982,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { SmallVector<Value *> PointerOps; OrdersType CurrentOrder; - LoadsState LS = canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, - *SE, CurrentOrder, PointerOps); + LoadsState LS = + canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, *SE, *LI, + CurrentOrder, PointerOps); switch (LS) { case LoadsState::Vectorize: case LoadsState::ScatterVectorize: @@ -4909,7 +6074,11 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, assert((E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize) && "Unhandled state"); - assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); + assert(E->getOpcode() && + ((allSameType(VL) && allSameBlock(VL)) || + (E->getOpcode() == Instruction::GetElementPtr && + E->getMainOp()->getType()->isPointerTy())) && + "Invalid VL"); Instruction *VL0 = E->getMainOp(); unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); @@ -4981,28 +6150,60 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, assert(E->ReuseShuffleIndices.empty() && "Unique insertelements only are expected."); auto *SrcVecTy = cast<FixedVectorType>(VL0->getType()); - unsigned const NumElts = SrcVecTy->getNumElements(); unsigned const NumScalars = VL.size(); + + unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy); + + unsigned OffsetBeg = *getInsertIndex(VL.front()); + unsigned OffsetEnd = OffsetBeg; + for (Value *V : VL.drop_front()) { + unsigned Idx = *getInsertIndex(V); + if (OffsetBeg > Idx) + OffsetBeg = Idx; + else if (OffsetEnd < Idx) + OffsetEnd = Idx; + } + unsigned VecScalarsSz = PowerOf2Ceil(NumElts); + if (NumOfParts > 0) + VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts); + unsigned VecSz = + (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) * + VecScalarsSz; + unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz); + unsigned InsertVecSz = std::min<unsigned>( + PowerOf2Ceil(OffsetEnd - OffsetBeg + 1), + ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * + VecScalarsSz); + bool IsWholeSubvector = + OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0); + // Check if we can safely insert a subvector. If it is not possible, just + // generate a whole-sized vector and shuffle the source vector and the new + // subvector. + if (OffsetBeg + InsertVecSz > VecSz) { + // Align OffsetBeg to generate correct mask. + OffsetBeg = alignDown(OffsetBeg, VecSz, Offset); + InsertVecSz = VecSz; + } + APInt DemandedElts = APInt::getZero(NumElts); // TODO: Add support for Instruction::InsertValue. 
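The insertelement cost modelling above first derives the geometry of the region being written: the smallest and largest insert index (OffsetBeg, OffsetEnd), the number of scalars per register part (VecScalarsSz), the whole-part span touched by the inserts and where it starts (VecSz, Offset), and the power-of-two sized subvector that actually holds the inserted scalars (InsertVecSz), plus whether that subvector covers whole parts exactly. A standalone recomputation of those quantities with the same formulas for one hypothetical configuration (16 source elements split over 2 register parts, inserts at indices 6 through 9); the final realignment step, taken when the subvector would spill past VecSz, is omitted here:

#include <cstdio>

unsigned powerOf2Ceil(unsigned V) {
  unsigned R = 1;
  while (R < V)
    R *= 2;
  return R;
}

int main() {
  unsigned NumElts = 16, NumOfParts = 2;
  unsigned OffsetBeg = 6, OffsetEnd = 9;            // min/max insert index

  unsigned VecScalarsSz = powerOf2Ceil(NumElts);
  if (NumOfParts > 0)                               // elements per register part
    VecScalarsSz = powerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
  // Whole register-part span touched by the inserts, and where it starts.
  unsigned VecSz =
      (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) * VecScalarsSz;
  unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
  // Power-of-two sized subvector that actually holds the inserted scalars.
  unsigned InsertVecSz = powerOf2Ceil(OffsetEnd - OffsetBeg + 1);
  unsigned UpperBound =
      ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz;
  if (InsertVecSz > UpperBound)
    InsertVecSz = UpperBound;
  bool IsWholeSubvector =
      OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);

  std::printf("VecScalarsSz=%u VecSz=%u Offset=%u InsertVecSz=%u whole=%d\n",
              VecScalarsSz, VecSz, Offset, InsertVecSz, IsWholeSubvector);
  // Prints: VecScalarsSz=8 VecSz=16 Offset=0 InsertVecSz=4 whole=0
}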
SmallVector<int> Mask; if (!E->ReorderIndices.empty()) { inversePermutation(E->ReorderIndices, Mask); - Mask.append(NumElts - NumScalars, UndefMaskElem); + Mask.append(InsertVecSz - Mask.size(), UndefMaskElem); } else { - Mask.assign(NumElts, UndefMaskElem); - std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); + Mask.assign(VecSz, UndefMaskElem); + std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0); } - unsigned Offset = *getInsertIndex(VL0, 0); bool IsIdentity = true; - SmallVector<int> PrevMask(NumElts, UndefMaskElem); + SmallVector<int> PrevMask(InsertVecSz, UndefMaskElem); Mask.swap(PrevMask); for (unsigned I = 0; I < NumScalars; ++I) { unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]); DemandedElts.setBit(InsertIdx); - IsIdentity &= InsertIdx - Offset == I; - Mask[InsertIdx - Offset] = I; + IsIdentity &= InsertIdx - OffsetBeg == I; + Mask[InsertIdx - OffsetBeg] = I; } assert(Offset < NumElts && "Failed to find vector index offset"); @@ -5010,32 +6211,41 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts, /*Insert*/ true, /*Extract*/ false); - if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0) { - // FIXME: Replace with SK_InsertSubvector once it is properly supported. - unsigned Sz = PowerOf2Ceil(Offset + NumScalars); - Cost += TTI->getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, - FixedVectorType::get(SrcVecTy->getElementType(), Sz)); - } else if (!IsIdentity) { - auto *FirstInsert = - cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { - return !is_contained(E->Scalars, - cast<Instruction>(V)->getOperand(0)); - })); - if (isUndefVector(FirstInsert->getOperand(0))) { - Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask); + // First cost - resize to actual vector size if not identity shuffle or + // need to shift the vector. + // Do not calculate the cost if the actual size is the register size and + // we can merge this shuffle with the following SK_Select. + auto *InsertVecTy = + FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz); + if (!IsIdentity) + Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, + InsertVecTy, Mask); + auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { + return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0)); + })); + // Second cost - permutation with subvector, if some elements are from the + // initial vector or inserting a subvector. + // TODO: Implement the analysis of the FirstInsert->getOperand(0) + // subvector of ActualVecTy. 
+ if (!isUndefVector(FirstInsert->getOperand(0)) && NumScalars != NumElts && + !IsWholeSubvector) { + if (InsertVecSz != VecSz) { + auto *ActualVecTy = + FixedVectorType::get(SrcVecTy->getElementType(), VecSz); + Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy, + None, OffsetBeg - Offset, InsertVecTy); } else { - SmallVector<int> InsertMask(NumElts); - std::iota(InsertMask.begin(), InsertMask.end(), 0); - for (unsigned I = 0; I < NumElts; I++) { + for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I) + Mask[I] = I; + for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset; + I <= End; ++I) if (Mask[I] != UndefMaskElem) - InsertMask[Offset + I] = NumElts + I; - } - Cost += - TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVecTy, InsertMask); + Mask[I] = I + VecSz; + for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I) + Mask[I] = I; + Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask); } } - return Cost; } case Instruction::ZExt: @@ -5116,9 +6326,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // If the selects are the only uses of the compares, they will be dead // and we can adjust the cost by removing their cost. if (IntrinsicAndUse.second) - IntrinsicCost -= - TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, MaskTy, - CmpInst::BAD_ICMP_PREDICATE, CostKind); + IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, + MaskTy, VecPred, CostKind); VecCost = std::min(VecCost, IntrinsicCost); } LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); @@ -5198,7 +6407,14 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_UniformConstantValue; + any_of(VL, + [](Value *V) { + return isa<GetElementPtrInst>(V) && + !isConstant( + cast<GetElementPtrInst>(V)->getOperand(1)); + }) + ? 
TargetTransformInfo::OK_AnyValue + : TargetTransformInfo::OK_UniformConstantValue; InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost( Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK); @@ -5229,7 +6445,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, Align CommonAlignment = Alignment; for (Value *V : VL) CommonAlignment = - commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); + std::min(CommonAlignment, cast<LoadInst>(V)->getAlign()); VecLdCost = TTI->getGatherScatterOpCost( Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(), /*VariableMask=*/false, CommonAlignment, CostKind, VL0); @@ -5279,7 +6495,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ((Instruction::isBinaryOp(E->getOpcode()) && Instruction::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E->getOpcode()) && - Instruction::isCast(E->getAltOpcode()))) && + Instruction::isCast(E->getAltOpcode())) || + (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && "Invalid Shuffle Vector Operand"); InstructionCost ScalarCost = 0; if (NeedToShuffleReuses) { @@ -5327,6 +6544,14 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind); + } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) { + VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, + Builder.getInt1Ty(), + CI0->getPredicate(), CostKind, VL0); + VecCost += TTI->getCmpSelInstrCost( + E->getOpcode(), ScalarTy, Builder.getInt1Ty(), + cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind, + E->getAltOp()); } else { Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType(); Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType(); @@ -5338,16 +6563,21 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TTI::CastContextHint::None, CostKind); } - SmallVector<int> Mask; - buildSuffleEntryMask( - E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, - [E](Instruction *I) { - assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - return I->getOpcode() == E->getAltOpcode(); - }, - Mask); - CommonCost = - TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy, Mask); + if (E->ReuseShuffleIndices.empty()) { + CommonCost = + TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy); + } else { + SmallVector<int> Mask; + buildShuffleEntryMask( + E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, + [E](Instruction *I) { + assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + return I->getOpcode() == E->getAltOpcode(); + }, + Mask); + CommonCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, + FinalVecTy, Mask); + } LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); return CommonCost + VecCost - ScalarCost; } @@ -5475,7 +6705,10 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { // No need to vectorize inserts of gathered values. 
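Illustrative sketch (not part of the diff): two hunks in this patch fold the alignments of the gathered loads with std::min instead of commonAlignment(). A tiny standalone model of that fold, approximating llvm::Align with a plain byte count:

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Alignments (in bytes) of the scalar loads that feed one masked gather.
  std::vector<unsigned> LoadAligns = {16, 8, 4, 8};
  unsigned CommonAlignment = LoadAligns.front();
  for (unsigned A : LoadAligns)
    CommonAlignment = std::min(CommonAlignment, A); // the weakest guarantee wins
  std::printf("common alignment = %u\n", CommonAlignment); // prints 4
  return 0;
}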
if (VectorizableTree.size() == 2 && isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) && - VectorizableTree[1]->State == TreeEntry::NeedToGather) + VectorizableTree[1]->State == TreeEntry::NeedToGather && + (VectorizableTree[1]->getVectorFactor() <= 2 || + !(isSplat(VectorizableTree[1]->Scalars) || + allConstant(VectorizableTree[1]->Scalars)))) return true; // We can vectorize the tree if its size is greater than or equal to the @@ -5605,20 +6838,26 @@ static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, return false; auto *IE1 = VU; auto *IE2 = V; + unsigned Idx1 = *getInsertIndex(IE1); + unsigned Idx2 = *getInsertIndex(IE2); // Go through the vector operand of insertelement instructions trying to find // either VU as the original vector for IE2 or V as the original vector for // IE1. do { - if (IE2 == VU || IE1 == V) - return true; + if (IE2 == VU) + return VU->hasOneUse(); + if (IE1 == V) + return V->hasOneUse(); if (IE1) { - if (IE1 != VU && !IE1->hasOneUse()) + if ((IE1 != VU && !IE1->hasOneUse()) || + getInsertIndex(IE1).value_or(Idx2) == Idx2) IE1 = nullptr; else IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0)); } if (IE2) { - if (IE2 != V && !IE2->hasOneUse()) + if ((IE2 != V && !IE2->hasOneUse()) || + getInsertIndex(IE2).value_or(Idx1) == Idx1) IE2 = nullptr; else IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0)); @@ -5627,6 +6866,153 @@ static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, return false; } +/// Checks if the \p IE1 instructions is followed by \p IE2 instruction in the +/// buildvector sequence. +static bool isFirstInsertElement(const InsertElementInst *IE1, + const InsertElementInst *IE2) { + if (IE1 == IE2) + return false; + const auto *I1 = IE1; + const auto *I2 = IE2; + const InsertElementInst *PrevI1; + const InsertElementInst *PrevI2; + unsigned Idx1 = *getInsertIndex(IE1); + unsigned Idx2 = *getInsertIndex(IE2); + do { + if (I2 == IE1) + return true; + if (I1 == IE2) + return false; + PrevI1 = I1; + PrevI2 = I2; + if (I1 && (I1 == IE1 || I1->hasOneUse()) && + getInsertIndex(I1).value_or(Idx2) != Idx2) + I1 = dyn_cast<InsertElementInst>(I1->getOperand(0)); + if (I2 && ((I2 == IE2 || I2->hasOneUse())) && + getInsertIndex(I2).value_or(Idx1) != Idx1) + I2 = dyn_cast<InsertElementInst>(I2->getOperand(0)); + } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2)); + llvm_unreachable("Two different buildvectors not expected."); +} + +namespace { +/// Returns incoming Value *, if the requested type is Value * too, or a default +/// value, otherwise. +struct ValueSelect { + template <typename U> + static typename std::enable_if<std::is_same<Value *, U>::value, Value *>::type + get(Value *V) { + return V; + } + template <typename U> + static typename std::enable_if<!std::is_same<Value *, U>::value, U>::type + get(Value *) { + return U(); + } +}; +} // namespace + +/// Does the analysis of the provided shuffle masks and performs the requested +/// actions on the vectors with the given shuffle masks. It tries to do it in +/// several steps. +/// 1. If the Base vector is not undef vector, resizing the very first mask to +/// have common VF and perform action for 2 input vectors (including non-undef +/// Base). Other shuffle masks are combined with the resulting after the 1 stage +/// and processed as a shuffle of 2 elements. +/// 2. If the Base is undef vector and have only 1 shuffle mask, perform the +/// action only for 1 vector with the given mask, if it is not the identity +/// mask. +/// 3. 
If > 2 masks are used, perform the remaining shuffle actions for 2 +/// vectors, combing the masks properly between the steps. +template <typename T> +static T *performExtractsShuffleAction( + MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base, + function_ref<unsigned(T *)> GetVF, + function_ref<std::pair<T *, bool>(T *, ArrayRef<int>)> ResizeAction, + function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) { + assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts."); + SmallVector<int> Mask(ShuffleMask.begin()->second); + auto VMIt = std::next(ShuffleMask.begin()); + T *Prev = nullptr; + bool IsBaseNotUndef = !isUndefVector(Base); + if (IsBaseNotUndef) { + // Base is not undef, need to combine it with the next subvectors. + std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask); + for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) { + if (Mask[Idx] == UndefMaskElem) + Mask[Idx] = Idx; + else + Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF; + } + auto *V = ValueSelect::get<T *>(Base); + (void)V; + assert((!V || GetVF(V) == Mask.size()) && + "Expected base vector of VF number of elements."); + Prev = Action(Mask, {nullptr, Res.first}); + } else if (ShuffleMask.size() == 1) { + // Base is undef and only 1 vector is shuffled - perform the action only for + // single vector, if the mask is not the identity mask. + std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask); + if (Res.second) + // Identity mask is found. + Prev = Res.first; + else + Prev = Action(Mask, {ShuffleMask.begin()->first}); + } else { + // Base is undef and at least 2 input vectors shuffled - perform 2 vectors + // shuffles step by step, combining shuffle between the steps. + unsigned Vec1VF = GetVF(ShuffleMask.begin()->first); + unsigned Vec2VF = GetVF(VMIt->first); + if (Vec1VF == Vec2VF) { + // No need to resize the input vectors since they are of the same size, we + // can shuffle them directly. + ArrayRef<int> SecMask = VMIt->second; + for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { + if (SecMask[I] != UndefMaskElem) { + assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars."); + Mask[I] = SecMask[I] + Vec1VF; + } + } + Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first}); + } else { + // Vectors of different sizes - resize and reshuffle. + std::pair<T *, bool> Res1 = + ResizeAction(ShuffleMask.begin()->first, Mask); + std::pair<T *, bool> Res2 = ResizeAction(VMIt->first, VMIt->second); + ArrayRef<int> SecMask = VMIt->second; + for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { + if (Mask[I] != UndefMaskElem) { + assert(SecMask[I] == UndefMaskElem && "Multiple uses of scalars."); + if (Res1.second) + Mask[I] = I; + } else if (SecMask[I] != UndefMaskElem) { + assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars."); + Mask[I] = (Res2.second ? I : SecMask[I]) + VF; + } + } + Prev = Action(Mask, {Res1.first, Res2.first}); + } + VMIt = std::next(VMIt); + } + // Perform requested actions for the remaining masks/vectors. + for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) { + // Shuffle other input vectors, if any. + std::pair<T *, bool> Res = ResizeAction(VMIt->first, VMIt->second); + ArrayRef<int> SecMask = VMIt->second; + for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { + if (SecMask[I] != UndefMaskElem) { + assert((Mask[I] == UndefMaskElem || IsBaseNotUndef) && + "Multiple uses of scalars."); + Mask[I] = (Res.second ? 
I : SecMask[I]) + VF; + } else if (Mask[I] != UndefMaskElem) { + Mask[I] = I; + } + } + Prev = Action(Mask, {Prev, Res.first}); + } + return Prev; +} + InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { InstructionCost Cost = 0; LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " @@ -5635,7 +7021,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { unsigned BundleWidth = VectorizableTree[0]->Scalars.size(); for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { - TreeEntry &TE = *VectorizableTree[I].get(); + TreeEntry &TE = *VectorizableTree[I]; InstructionCost C = getEntryCost(&TE, VectorizedVals); Cost += C; @@ -5647,9 +7033,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { SmallPtrSet<Value *, 16> ExtractCostCalculated; InstructionCost ExtractCost = 0; - SmallVector<unsigned> VF; - SmallVector<SmallVector<int>> ShuffleMask; - SmallVector<Value *> FirstUsers; + SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks; + SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers; SmallVector<APInt> DemandedElts; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. @@ -5678,37 +7063,55 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) { Optional<unsigned> InsertIdx = getInsertIndex(VU); if (InsertIdx) { - auto *It = find_if(FirstUsers, [VU](Value *V) { - return areTwoInsertFromSameBuildVector(VU, - cast<InsertElementInst>(V)); - }); + const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar); + auto *It = + find_if(FirstUsers, + [VU](const std::pair<Value *, const TreeEntry *> &Pair) { + return areTwoInsertFromSameBuildVector( + VU, cast<InsertElementInst>(Pair.first)); + }); int VecId = -1; if (It == FirstUsers.end()) { - VF.push_back(FTy->getNumElements()); - ShuffleMask.emplace_back(VF.back(), UndefMaskElem); + (void)ShuffleMasks.emplace_back(); + SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE]; + if (Mask.empty()) + Mask.assign(FTy->getNumElements(), UndefMaskElem); // Find the insertvector, vectorized in tree, if any. Value *Base = VU; - while (isa<InsertElementInst>(Base)) { + while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) { + if (IEBase != EU.User && + (!IEBase->hasOneUse() || + getInsertIndex(IEBase).value_or(*InsertIdx) == *InsertIdx)) + break; // Build the mask for the vectorized insertelement instructions. 
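Illustrative sketch (not part of the diff): the performExtractsShuffleAction helper introduced above combines per-vector masks step by step; in the equal-VF case each defined lane of the second mask is folded into the first with its index offset by the first vector's width. A standalone C++ model of that single step (kUndef and foldSecondSource are illustrative names):

#include <cassert>
#include <cstdio>
#include <vector>

constexpr int kUndef = -1; // stand-in for UndefMaskElem

// Fold the second mask into the first, offsetting its indices by the first
// vector's width so the combined mask selects from two sources.
static void foldSecondSource(std::vector<int> &Mask,
                             const std::vector<int> &SecMask, unsigned Vec1VF) {
  for (unsigned I = 0; I < Mask.size(); ++I) {
    if (SecMask[I] == kUndef)
      continue;
    assert(Mask[I] == kUndef && "each lane must come from exactly one source");
    Mask[I] = SecMask[I] + static_cast<int>(Vec1VF);
  }
}

int main() {
  std::vector<int> Mask    = {0, kUndef, 2, kUndef};
  std::vector<int> SecMask = {kUndef, 1, kUndef, 3};
  foldSecondSource(Mask, SecMask, /*Vec1VF=*/4);
  for (int V : Mask) std::printf("%d ", V); // prints 0 5 2 7
  std::printf("\n");
  return 0;
}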
- if (const TreeEntry *E = getTreeEntry(Base)) { - VU = cast<InsertElementInst>(Base); + if (const TreeEntry *E = getTreeEntry(IEBase)) { + VU = IEBase; do { - int Idx = E->findLaneForValue(Base); - ShuffleMask.back()[Idx] = Idx; - Base = cast<InsertElementInst>(Base)->getOperand(0); + IEBase = cast<InsertElementInst>(Base); + int Idx = *getInsertIndex(IEBase); + assert(Mask[Idx] == UndefMaskElem && + "InsertElementInstruction used already."); + Mask[Idx] = Idx; + Base = IEBase->getOperand(0); } while (E == getTreeEntry(Base)); break; } Base = cast<InsertElementInst>(Base)->getOperand(0); } - FirstUsers.push_back(VU); - DemandedElts.push_back(APInt::getZero(VF.back())); + FirstUsers.emplace_back(VU, ScalarTE); + DemandedElts.push_back(APInt::getZero(FTy->getNumElements())); VecId = FirstUsers.size() - 1; } else { + if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first))) + It->first = VU; VecId = std::distance(FirstUsers.begin(), It); } - ShuffleMask[VecId][*InsertIdx] = EU.Lane; - DemandedElts[VecId].setBit(*InsertIdx); + int InIdx = *InsertIdx; + SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE]; + if (Mask.empty()) + Mask.assign(FTy->getNumElements(), UndefMaskElem); + Mask[InIdx] = EU.Lane; + DemandedElts[VecId].setBit(InIdx); continue; } } @@ -5734,86 +7137,75 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { InstructionCost SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; - if (FirstUsers.size() == 1) { - int Limit = ShuffleMask.front().size() * 2; - if (all_of(ShuffleMask.front(), [Limit](int Idx) { return Idx < Limit; }) && - !ShuffleVectorInst::isIdentityMask(ShuffleMask.front())) { - InstructionCost C = TTI->getShuffleCost( + auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask) { + InstructionCost C = 0; + unsigned VF = Mask.size(); + unsigned VecVF = TE->getVectorFactor(); + if (VF != VecVF && + (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) || + (all_of(Mask, + [VF](int Idx) { return Idx < 2 * static_cast<int>(VF); }) && + !ShuffleVectorInst::isIdentityMask(Mask)))) { + SmallVector<int> OrigMask(VecVF, UndefMaskElem); + std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)), + OrigMask.begin()); + C = TTI->getShuffleCost( TTI::SK_PermuteSingleSrc, - cast<FixedVectorType>(FirstUsers.front()->getType()), - ShuffleMask.front()); - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C - << " for final shuffle of insertelement external users " - << *VectorizableTree.front()->Scalars.front() << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); + FixedVectorType::get(TE->getMainOp()->getType(), VecVF), OrigMask); + LLVM_DEBUG( + dbgs() << "SLP: Adding cost " << C + << " for final shuffle of insertelement external users.\n"; + TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); Cost += C; + return std::make_pair(TE, true); } + return std::make_pair(TE, false); + }; + // Calculate the cost of the reshuffled vectors, if any. 
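Illustrative sketch (not part of the diff): the ResizeToVF lambda above widens an external-user mask to the tree entry's vector factor before pricing a single-source permute. A standalone model of the mask resize (resizeMaskToVF is an illustrative name):

#include <algorithm>
#include <cstdio>
#include <vector>

constexpr int kUndef = -1; // stand-in for UndefMaskElem

// Widen (or truncate) an external-user mask to the entry's vector factor.
static std::vector<int> resizeMaskToVF(const std::vector<int> &Mask, unsigned VecVF) {
  std::vector<int> OrigMask(VecVF, kUndef);
  std::copy(Mask.begin(),
            Mask.begin() + std::min<size_t>(Mask.size(), VecVF),
            OrigMask.begin());
  return OrigMask;
}

int main() {
  std::vector<int> Mask = {0, 1}; // external users only touch two lanes
  auto OrigMask = resizeMaskToVF(Mask, /*VecVF=*/4);
  for (int V : OrigMask) std::printf("%d ", V); // prints 0 1 -1 -1
  std::printf("\n");
  return 0;
}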
+ for (int I = 0, E = FirstUsers.size(); I < E; ++I) { + Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0); + unsigned VF = ShuffleMasks[I].begin()->second.size(); + auto *FTy = FixedVectorType::get( + cast<VectorType>(FirstUsers[I].first->getType())->getElementType(), VF); + auto Vector = ShuffleMasks[I].takeVector(); + auto &&EstimateShufflesCost = [this, FTy, + &Cost](ArrayRef<int> Mask, + ArrayRef<const TreeEntry *> TEs) { + assert((TEs.size() == 1 || TEs.size() == 2) && + "Expected exactly 1 or 2 tree entries."); + if (TEs.size() == 1) { + int Limit = 2 * Mask.size(); + if (!all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) || + !ShuffleVectorInst::isIdentityMask(Mask)) { + InstructionCost C = + TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of insertelement " + "external users.\n"; + TEs.front()->dump(); + dbgs() << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + } + } else { + InstructionCost C = + TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, FTy, Mask); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of vector node and external " + "insertelement users.\n"; + if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump(); + dbgs() << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + } + return TEs.back(); + }; + (void)performExtractsShuffleAction<const TreeEntry>( + makeMutableArrayRef(Vector.data(), Vector.size()), Base, + [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF, + EstimateShufflesCost); InstructionCost InsertCost = TTI->getScalarizationOverhead( - cast<FixedVectorType>(FirstUsers.front()->getType()), - DemandedElts.front(), /*Insert*/ true, /*Extract*/ false); - LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost - << " for insertelements gather.\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost -= InsertCost; - } else if (FirstUsers.size() >= 2) { - unsigned MaxVF = *std::max_element(VF.begin(), VF.end()); - // Combined masks of the first 2 vectors. - SmallVector<int> CombinedMask(MaxVF, UndefMaskElem); - copy(ShuffleMask.front(), CombinedMask.begin()); - APInt CombinedDemandedElts = DemandedElts.front().zextOrSelf(MaxVF); - auto *VecTy = FixedVectorType::get( - cast<VectorType>(FirstUsers.front()->getType())->getElementType(), - MaxVF); - for (int I = 0, E = ShuffleMask[1].size(); I < E; ++I) { - if (ShuffleMask[1][I] != UndefMaskElem) { - CombinedMask[I] = ShuffleMask[1][I] + MaxVF; - CombinedDemandedElts.setBit(I); - } - } - InstructionCost C = - TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C - << " for final shuffle of vector node and external " - "insertelement users " - << *VectorizableTree.front()->Scalars.front() << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost += C; - InstructionCost InsertCost = TTI->getScalarizationOverhead( - VecTy, CombinedDemandedElts, /*Insert*/ true, /*Extract*/ false); - LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost - << " for insertelements gather.\n" - << "SLP: Current total cost = " << Cost << "\n"); + cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I], + /*Insert*/ true, /*Extract*/ false); Cost -= InsertCost; - for (int I = 2, E = FirstUsers.size(); I < E; ++I) { - // Other elements - permutation of 2 vectors (the initial one and the - // next Ith incoming vector). 
- unsigned VF = ShuffleMask[I].size(); - for (unsigned Idx = 0; Idx < VF; ++Idx) { - int Mask = ShuffleMask[I][Idx]; - if (Mask != UndefMaskElem) - CombinedMask[Idx] = MaxVF + Mask; - else if (CombinedMask[Idx] != UndefMaskElem) - CombinedMask[Idx] = Idx; - } - for (unsigned Idx = VF; Idx < MaxVF; ++Idx) - if (CombinedMask[Idx] != UndefMaskElem) - CombinedMask[Idx] = Idx; - InstructionCost C = - TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C - << " for final shuffle of vector node and external " - "insertelement users " - << *VectorizableTree.front()->Scalars.front() << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost += C; - InstructionCost InsertCost = TTI->getScalarizationOverhead( - cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I], - /*Insert*/ true, /*Extract*/ false); - LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost - << " for insertelements gather.\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost -= InsertCost; - } } #ifndef NDEBUG @@ -5906,6 +7298,12 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask, } } + if (UsedTEs.empty()) { + assert(all_of(TE->Scalars, UndefValue::classof) && + "Expected vector of undefs only."); + return None; + } + unsigned VF = 0; if (UsedTEs.size() == 1) { // Try to find the perfect match in another gather node at first. @@ -5965,17 +7363,11 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask, return None; } -InstructionCost -BoUpSLP::getGatherCost(FixedVectorType *Ty, - const DenseSet<unsigned> &ShuffledIndices, - bool NeedToShuffle) const { - unsigned NumElts = Ty->getNumElements(); - APInt DemandedElts = APInt::getZero(NumElts); - for (unsigned I = 0; I < NumElts; ++I) - if (!ShuffledIndices.count(I)) - DemandedElts.setBit(I); +InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty, + const APInt &ShuffledIndices, + bool NeedToShuffle) const { InstructionCost Cost = - TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true, + TTI->getScalarizationOverhead(Ty, ~ShuffledIndices, /*Insert*/ true, /*Extract*/ false); if (NeedToShuffle) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); @@ -5992,19 +7384,19 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { // Find the cost of inserting/extracting values from the vector. // Check if the same elements are inserted several times and count them as // shuffle candidates. - DenseSet<unsigned> ShuffledElements; + APInt ShuffledElements = APInt::getZero(VL.size()); DenseSet<Value *> UniqueElements; // Iterate in reverse order to consider insert elements with the high cost. for (unsigned I = VL.size(); I > 0; --I) { unsigned Idx = I - 1; // No need to shuffle duplicates for constants. if (isConstant(VL[Idx])) { - ShuffledElements.insert(Idx); + ShuffledElements.setBit(Idx); continue; } if (!UniqueElements.insert(VL[Idx]).second) { DuplicateNonConst = true; - ShuffledElements.insert(Idx); + ShuffledElements.setBit(Idx); } } return getGatherCost(VecTy, ShuffledElements, DuplicateNonConst); @@ -6029,14 +7421,83 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { // Get the basic block this bundle is in. All instructions in the bundle - // should be in this block. + // should be in this block (except for extractelement-like instructions with + // constant indeces). 
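Illustrative sketch (not part of the diff): getGatherCost now tracks shuffled lanes in an APInt and derives the demanded-elements set as its complement; constant and duplicate lanes are marked while walking the scalars in reverse. A standalone model using std::bitset in place of APInt, with negative values standing in for constants:

#include <bitset>
#include <cstdio>
#include <string>
#include <vector>

int main() {
  constexpr size_t VF = 8;
  // Lanes of a gather; negative entries model constants, repeats model reuse.
  std::vector<int> Lanes = {10, -1, 10, 11, 12, -1, 13, 12};
  std::bitset<VF> Shuffled;    // lanes that will not need their own insertelement
  std::vector<int> Unique;     // values seen so far (models UniqueElements)
  for (size_t I = Lanes.size(); I > 0; --I) {
    size_t Idx = I - 1;
    if (Lanes[Idx] < 0) {      // constant lane: no shuffle needed for it
      Shuffled.set(Idx);
      continue;
    }
    bool Dup = false;
    for (int U : Unique)
      Dup |= (U == Lanes[Idx]);
    if (Dup)
      Shuffled.set(Idx);       // duplicate lane: covered by the shuffle
    else
      Unique.push_back(Lanes[Idx]);
  }
  std::bitset<VF> Demanded = ~Shuffled; // lanes still inserted individually
  std::printf("shuffled=%s demanded=%s\n", Shuffled.to_string().c_str(),
              Demanded.to_string().c_str());
  return 0;
}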
auto *Front = E->getMainOp(); auto *BB = Front->getParent(); assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool { + if (E->getOpcode() == Instruction::GetElementPtr && + !isa<GetElementPtrInst>(V)) + return true; auto *I = cast<Instruction>(V); - return !E->isOpcodeOrAlt(I) || I->getParent() == BB; + return !E->isOpcodeOrAlt(I) || I->getParent() == BB || + isVectorLikeInstWithConstOps(I); })); + auto &&FindLastInst = [E, Front, this, &BB]() { + Instruction *LastInst = Front; + for (Value *V : E->Scalars) { + auto *I = dyn_cast<Instruction>(V); + if (!I) + continue; + if (LastInst->getParent() == I->getParent()) { + if (LastInst->comesBefore(I)) + LastInst = I; + continue; + } + assert(isVectorLikeInstWithConstOps(LastInst) && + isVectorLikeInstWithConstOps(I) && + "Expected vector-like insts only."); + if (!DT->isReachableFromEntry(LastInst->getParent())) { + LastInst = I; + continue; + } + if (!DT->isReachableFromEntry(I->getParent())) + continue; + auto *NodeA = DT->getNode(LastInst->getParent()); + auto *NodeB = DT->getNode(I->getParent()); + assert(NodeA && "Should only process reachable instructions"); + assert(NodeB && "Should only process reachable instructions"); + assert((NodeA == NodeB) == + (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && + "Different nodes should have different DFS numbers"); + if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) + LastInst = I; + } + BB = LastInst->getParent(); + return LastInst; + }; + + auto &&FindFirstInst = [E, Front]() { + Instruction *FirstInst = Front; + for (Value *V : E->Scalars) { + auto *I = dyn_cast<Instruction>(V); + if (!I) + continue; + if (I->comesBefore(FirstInst)) + FirstInst = I; + } + return FirstInst; + }; + + // Set the insert point to the beginning of the basic block if the entry + // should not be scheduled. + if (E->State != TreeEntry::NeedToGather && + doesNotNeedToSchedule(E->Scalars)) { + Instruction *InsertInst; + if (all_of(E->Scalars, isUsedOutsideBlock)) + InsertInst = FindLastInst(); + else + InsertInst = FindFirstInst(); + // If the instruction is PHI, set the insert point after all the PHIs. + if (isa<PHINode>(InsertInst)) + InsertInst = BB->getFirstNonPHI(); + BasicBlock::iterator InsertPt = InsertInst->getIterator(); + Builder.SetInsertPoint(BB, InsertPt); + Builder.SetCurrentDebugLocation(Front->getDebugLoc()); + return; + } + // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; @@ -6045,8 +7506,10 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { // VL.back() and iterate over schedule data until we reach the end of the // bundle. The end of the bundle is marked by null ScheduleData. if (BlocksSchedules.count(BB)) { - auto *Bundle = - BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back())); + Value *V = E->isOneOf(E->Scalars.back()); + if (doesNotNeedToBeScheduled(V)) + V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled); + auto *Bundle = BlocksSchedules[BB]->getScheduleData(V); if (Bundle && Bundle->isPartOfBundle()) for (; Bundle; Bundle = Bundle->NextInBundle) if (Bundle->OpValue == Bundle->Inst) @@ -6072,19 +7535,16 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { // we both exit early from buildTree_rec and that the bundle be out-of-order // (causing us to iterate all the way to the end of the block). 
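Illustrative sketch (not part of the diff): the FindLastInst/FindFirstInst lambdas added above pick the boundary instruction of a bundle; within a single block that reduces to comparing instruction positions (the cross-block case, which consults dominator-tree DFS numbers, is omitted here). A standalone model where an instruction is just its position index:

#include <cstdio>
#include <vector>

// Stand-in for an instruction: just its position within one basic block.
struct Inst { int Pos; };

// comesBefore() is modeled by comparing positions inside the same block.
static const Inst *findLastInst(const std::vector<const Inst *> &Bundle) {
  const Inst *Last = Bundle.front();
  for (const Inst *I : Bundle)
    if (Last->Pos < I->Pos)
      Last = I;
  return Last;
}

static const Inst *findFirstInst(const std::vector<const Inst *> &Bundle) {
  const Inst *First = Bundle.front();
  for (const Inst *I : Bundle)
    if (I->Pos < First->Pos)
      First = I;
  return First;
}

int main() {
  Inst A{3}, B{7}, C{5};
  std::vector<const Inst *> Bundle = {&A, &B, &C};
  std::printf("first=%d last=%d\n", findFirstInst(Bundle)->Pos,
              findLastInst(Bundle)->Pos); // prints first=3 last=7
  return 0;
}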
if (!LastInst) { - SmallPtrSet<Value *, 16> Bundle(E->Scalars.begin(), E->Scalars.end()); - for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { - if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I)) - LastInst = &I; - if (Bundle.empty()) - break; - } + LastInst = FindLastInst(); + // If the instruction is PHI, set the insert point after all the PHIs. + if (isa<PHINode>(LastInst)) + LastInst = BB->getFirstNonPHI()->getPrevNode(); } assert(LastInst && "Failed to find last instruction in bundle"); // Set the insertion point after the last instruction in the bundle. Set the // debug location to Front. - Builder.SetInsertPoint(BB, ++LastInst->getIterator()); + Builder.SetInsertPoint(BB, std::next(LastInst->getIterator())); Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } @@ -6214,8 +7674,15 @@ public: } // namespace Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { - unsigned VF = VL.size(); + const unsigned VF = VL.size(); InstructionsState S = getSameOpcode(VL); + // Special processing for GEPs bundle, which may include non-gep values. + if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) { + const auto *It = + find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }); + if (It != VL.end()) + S = getSameOpcode(*It); + } if (S.getOpcode()) { if (TreeEntry *E = getTreeEntry(S.OpValue)) if (E->isSame(VL)) { @@ -6270,7 +7737,18 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { } } - // Check that every instruction appears once in this bundle. + // Can't vectorize this, so simply build a new vector with each lane + // corresponding to the requested value. + return createBuildVector(VL); +} +Value *BoUpSLP::createBuildVector(ArrayRef<Value *> VL) { + assert(any_of(VectorizableTree, + [VL](const std::unique_ptr<TreeEntry> &TE) { + return TE->State == TreeEntry::NeedToGather && TE->isSame(VL); + }) && + "Non-matching gather node."); + unsigned VF = VL.size(); + // Exploit possible reuse of values across lanes. 
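Illustrative sketch (not part of the diff): the createBuildVector path that continues just below deduplicates the requested lanes and records, per original lane, which unique lane to reuse. A standalone model of that reuse-index bookkeeping, with ints in place of scalars:

#include <cstdio>
#include <map>
#include <vector>

int main() {
  // Requested lanes; duplicates should map back to one materialized lane.
  std::vector<int> VL = {5, 7, 5, 9, 7, 5};
  std::vector<int> Unique;        // values actually placed in the build vector
  std::vector<int> ReuseIndices;  // per requested lane, which unique lane to read
  std::map<int, int> FirstLane;
  for (int V : VL) {
    auto It = FirstLane.find(V);
    if (It == FirstLane.end()) {
      It = FirstLane.emplace(V, static_cast<int>(Unique.size())).first;
      Unique.push_back(V);
    }
    ReuseIndices.push_back(It->second);
  }
  for (int I : ReuseIndices) std::printf("%d ", I); // prints 0 1 0 2 1 0
  std::printf("(unique=%zu)\n", Unique.size());
  return 0;
}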
SmallVector<int> ReuseShuffleIndicies; SmallVector<Value *> UniqueValues; if (VL.size() > 2) { @@ -6303,6 +7781,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { ReuseShuffleIndicies.append(VF - ReuseShuffleIndicies.size(), UndefMaskElem); } else if (UniqueValues.size() >= VF - 1 || UniqueValues.size() <= 1) { + if (UniqueValues.empty()) { + assert(all_of(VL, UndefValue::classof) && "Expected list of undefs."); + NumValues = VF; + } ReuseShuffleIndicies.clear(); UniqueValues.clear(); UniqueValues.append(VL.begin(), std::next(VL.begin(), NumValues)); @@ -6342,7 +7824,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { SmallVector<const TreeEntry *> Entries; Optional<TargetTransformInfo::ShuffleKind> Shuffle = isGatherShuffledEntry(E, Mask, Entries); - if (Shuffle.hasValue()) { + if (Shuffle) { assert((Entries.size() == 1 || Entries.size() == 2) && "Expected shuffle of 1 or 2 entries."); Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue, @@ -6376,14 +7858,20 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); switch (ShuffleOrOp) { case Instruction::PHI: { - assert( - (E->ReorderIndices.empty() || E != VectorizableTree.front().get()) && - "PHI reordering is free."); + assert((E->ReorderIndices.empty() || + E != VectorizableTree.front().get() || + !E->UserTreeIndices.empty()) && + "PHI reordering is free."); auto *PH = cast<PHINode>(VL0); Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); Value *V = NewPhi; + + // Adjust insertion point once all PHI's have been generated. + Builder.SetInsertPoint(&*PH->getParent()->getFirstInsertionPt()); + Builder.SetCurrentDebugLocation(PH->getDebugLoc()); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -6449,7 +7937,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { cast<FixedVectorType>(FirstInsert->getType())->getNumElements(); const unsigned NumScalars = E->Scalars.size(); - unsigned Offset = *getInsertIndex(VL0, 0); + unsigned Offset = *getInsertIndex(VL0); assert(Offset < NumElts && "Failed to find vector index offset"); // Create shuffle to resize vector @@ -6656,19 +8144,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { unsigned AS = LI->getPointerAddressSpace(); Value *PO = LI->getPointerOperand(); if (E->State == TreeEntry::Vectorize) { - Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS)); + NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign()); // The pointer operand uses an in-tree scalar so we add the new BitCast - // to ExternalUses list to make sure that an extract will be generated - // in the future. + // or LoadInst to ExternalUses list to make sure that an extract will + // be generated in the future. if (TreeEntry *Entry = getTreeEntry(PO)) { // Find which lane we need to extract. unsigned FoundLane = Entry->findLaneForValue(PO); - ExternalUses.emplace_back(PO, cast<User>(VecPtr), FoundLane); + ExternalUses.emplace_back( + PO, PO != VecPtr ? 
cast<User>(VecPtr) : NewLI, FoundLane); } - - NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign()); } else { assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state"); Value *VecPtr = vectorizeTree(E->getOperand(0)); @@ -6676,7 +8163,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Align CommonAlignment = LI->getAlign(); for (Value *V : E->Scalars) CommonAlignment = - commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); + std::min(CommonAlignment, cast<LoadInst>(V)->getAlign()); NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment); } Value *V = propagateMetadata(NewLI, E->Scalars); @@ -6701,17 +8188,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast( ScalarPtr, VecValue->getType()->getPointerTo(AS)); - StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr, - SI->getAlign()); + StoreInst *ST = + Builder.CreateAlignedStore(VecValue, VecPtr, SI->getAlign()); - // The pointer operand uses an in-tree scalar, so add the new BitCast to - // ExternalUses to make sure that an extract will be generated in the - // future. + // The pointer operand uses an in-tree scalar, so add the new BitCast or + // StoreInst to ExternalUses to make sure that an extract will be + // generated in the future. if (TreeEntry *Entry = getTreeEntry(ScalarPtr)) { // Find which lane we need to extract. unsigned FoundLane = Entry->findLaneForValue(ScalarPtr); - ExternalUses.push_back( - ExternalUser(ScalarPtr, cast<User>(VecPtr), FoundLane)); + ExternalUses.push_back(ExternalUser( + ScalarPtr, ScalarPtr != VecPtr ? cast<User>(VecPtr) : ST, + FoundLane)); } Value *V = propagateMetadata(ST, E->Scalars); @@ -6733,8 +8221,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs); - if (Instruction *I = dyn_cast<Instruction>(V)) - V = propagateMetadata(I, E->Scalars); + if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) { + SmallVector<Value *> GEPs; + for (Value *V : E->Scalars) { + if (isa<GetElementPtrInst>(V)) + GEPs.push_back(V); + } + V = propagateMetadata(I, GEPs); + } ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); @@ -6767,11 +8261,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ValueList OpVL; // Some intrinsics have scalar arguments. This argument should not be // vectorized. 
- if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) { + if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(IID, j)) { CallInst *CEI = cast<CallInst>(VL0); ScalarArg = CEI->getArgOperand(j); OpVecs.push_back(CEI->getArgOperand(j)); - if (hasVectorInstrinsicOverloadedScalarOpd(IID, j)) + if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j)) TysForDecl.push_back(ScalarArg->getType()); continue; } @@ -6779,6 +8273,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *OpVec = vectorizeTree(E->getOperand(j)); LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); + if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j)) + TysForDecl.push_back(OpVec->getType()); } Function *CF; @@ -6822,11 +8318,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ((Instruction::isBinaryOp(E->getOpcode()) && Instruction::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E->getOpcode()) && - Instruction::isCast(E->getAltOpcode()))) && + Instruction::isCast(E->getAltOpcode())) || + (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && "Invalid Shuffle Vector Operand"); Value *LHS = nullptr, *RHS = nullptr; - if (Instruction::isBinaryOp(E->getOpcode())) { + if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) { setInsertPointAfterBundle(E); LHS = vectorizeTree(E->getOperand(0)); RHS = vectorizeTree(E->getOperand(1)); @@ -6846,6 +8343,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS); V1 = Builder.CreateBinOp( static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS); + } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) { + V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS); + auto *AltCI = cast<CmpInst>(E->getAltOp()); + CmpInst::Predicate AltPred = AltCI->getPredicate(); + V1 = Builder.CreateCmp(AltPred, LHS, RHS); } else { V0 = Builder.CreateCast( static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy); @@ -6866,11 +8368,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // each vector operation. ValueList OpScalars, AltScalars; SmallVector<int> Mask; - buildSuffleEntryMask( + buildShuffleEntryMask( E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, [E](Instruction *I) { assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - return I->getOpcode() == E->getAltOpcode(); + return isAlternateInstruction(I, E->getMainOp(), E->getAltOp()); }, Mask, &OpScalars, &AltScalars); @@ -6901,6 +8403,17 @@ Value *BoUpSLP::vectorizeTree() { return vectorizeTree(ExternallyUsedValues); } +namespace { +/// Data type for handling buildvector sequences with the reused scalars from +/// other tree entries. +struct ShuffledInsertData { + /// List of insertelements to be replaced by shuffles. + SmallVector<InsertElementInst *> InsertElements; + /// The parent vectors and shuffle mask for the given list of inserts. + MapVector<Value *, SmallVector<int>> ValueMasks; +}; +} // namespace + Value * BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // All blocks must be scheduled before any instructions are inserted. @@ -6934,6 +8447,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); + SmallVector<ShuffledInsertData> ShuffledInserts; + // Maps vector instruction to original insertelement instruction + DenseMap<Value *, InsertElementInst *> VectorToInsertElement; // Extract all of the elements with the external uses. 
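Illustrative sketch (not part of the diff): for alternate-opcode bundles the hunk above now also handles compares by emitting one vector op per predicate and blending them with a per-lane mask built from which opcode each scalar used. A standalone scalar-loop model of that compute-both-then-select idea (add/sub stand in for the main/alternate opcodes):

#include <cstdio>
#include <vector>

int main() {
  // Per-lane inputs plus a flag telling which of the two opcodes the original
  // scalar used (false = main opcode, true = alternate opcode).
  std::vector<int>  A       = {1, 5, 3, 9};
  std::vector<int>  B       = {2, 4, 8, 6};
  std::vector<bool> UsesAlt = {false, true, false, true};

  std::vector<int> V0(A.size()), V1(A.size()), Blend(A.size());
  for (size_t I = 0; I < A.size(); ++I) {
    V0[I] = A[I] + B[I];                    // "main" opcode on every lane
    V1[I] = A[I] - B[I];                    // "alternate" opcode on every lane
    Blend[I] = UsesAlt[I] ? V1[I] : V0[I];  // per-lane select, i.e. the shuffle
  }
  for (int V : Blend) std::printf("%d ", V); // prints 3 1 11 3
  std::printf("\n");
  return 0;
}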
for (const auto &ExternalUse : ExternalUses) { Value *Scalar = ExternalUse.Scalar; @@ -6947,6 +8463,10 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { assert(E && "Invalid scalar"); assert(E->State != TreeEntry::NeedToGather && "Extracting from a gather list"); + // Non-instruction pointers are not deleted, just skip them. + if (E->getOpcode() == Instruction::GetElementPtr && + !isa<GetElementPtrInst>(Scalar)) + continue; Value *Vec = E->VectorizedValue; assert(Vec && "Can't find vectorizable value"); @@ -6973,6 +8493,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { assert(isa<FixedVectorType>(Scalar->getType()) && isa<InsertElementInst>(Scalar) && "In-tree scalar of vector type is not insertelement?"); + auto *IE = cast<InsertElementInst>(Scalar); + VectorToInsertElement.try_emplace(Vec, IE); return Vec; }; // If User == nullptr, the Scalar is used as extra arg. Generate @@ -7001,6 +8523,69 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { continue; } + if (auto *VU = dyn_cast<InsertElementInst>(User)) { + // Skip if the scalar is another vector op or Vec is not an instruction. + if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) { + if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) { + Optional<unsigned> InsertIdx = getInsertIndex(VU); + if (InsertIdx) { + // Need to use original vector, if the root is truncated. + if (MinBWs.count(Scalar) && + VectorizableTree[0]->VectorizedValue == Vec) + Vec = VectorRoot; + auto *It = + find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) { + // Checks if 2 insertelements are from the same buildvector. + InsertElementInst *VecInsert = Data.InsertElements.front(); + return areTwoInsertFromSameBuildVector(VU, VecInsert); + }); + unsigned Idx = *InsertIdx; + if (It == ShuffledInserts.end()) { + (void)ShuffledInserts.emplace_back(); + It = std::next(ShuffledInserts.begin(), + ShuffledInserts.size() - 1); + SmallVectorImpl<int> &Mask = It->ValueMasks[Vec]; + if (Mask.empty()) + Mask.assign(FTy->getNumElements(), UndefMaskElem); + // Find the insertvector, vectorized in tree, if any. + Value *Base = VU; + while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) { + if (IEBase != User && + (!IEBase->hasOneUse() || + getInsertIndex(IEBase).value_or(Idx) == Idx)) + break; + // Build the mask for the vectorized insertelement instructions. + if (const TreeEntry *E = getTreeEntry(IEBase)) { + do { + IEBase = cast<InsertElementInst>(Base); + int IEIdx = *getInsertIndex(IEBase); + assert(Mask[Idx] == UndefMaskElem && + "InsertElementInstruction used already."); + Mask[IEIdx] = IEIdx; + Base = IEBase->getOperand(0); + } while (E == getTreeEntry(Base)); + break; + } + Base = cast<InsertElementInst>(Base)->getOperand(0); + // After the vectorization the def-use chain has changed, need + // to look through original insertelement instructions, if they + // get replaced by vector instructions. + auto It = VectorToInsertElement.find(Base); + if (It != VectorToInsertElement.end()) + Base = It->second; + } + } + SmallVectorImpl<int> &Mask = It->ValueMasks[Vec]; + if (Mask.empty()) + Mask.assign(FTy->getNumElements(), UndefMaskElem); + Mask[Idx] = ExternalUse.Lane; + It->InsertElements.push_back(cast<InsertElementInst>(User)); + continue; + } + } + } + } + // Generate extracts for out-of-tree users. // Find the insertion point for the extractelement lane. 
if (auto *VecI = dyn_cast<Instruction>(Vec)) { @@ -7036,6 +8621,221 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); } + // Checks if the mask is an identity mask. + auto &&IsIdentityMask = [](ArrayRef<int> Mask, FixedVectorType *VecTy) { + int Limit = Mask.size(); + return VecTy->getNumElements() == Mask.size() && + all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) && + ShuffleVectorInst::isIdentityMask(Mask); + }; + // Tries to combine 2 different masks into single one. + auto &&CombineMasks = [](SmallVectorImpl<int> &Mask, ArrayRef<int> ExtMask) { + SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem); + for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) { + if (ExtMask[I] == UndefMaskElem) + continue; + NewMask[I] = Mask[ExtMask[I]]; + } + Mask.swap(NewMask); + }; + // Peek through shuffles, trying to simplify the final shuffle code. + auto &&PeekThroughShuffles = + [&IsIdentityMask, &CombineMasks](Value *&V, SmallVectorImpl<int> &Mask, + bool CheckForLengthChange = false) { + while (auto *SV = dyn_cast<ShuffleVectorInst>(V)) { + // Exit if not a fixed vector type or changing size shuffle. + if (!isa<FixedVectorType>(SV->getType()) || + (CheckForLengthChange && SV->changesLength())) + break; + // Exit if the identity or broadcast mask is found. + if (IsIdentityMask(Mask, cast<FixedVectorType>(SV->getType())) || + SV->isZeroEltSplat()) + break; + bool IsOp1Undef = isUndefVector(SV->getOperand(0)); + bool IsOp2Undef = isUndefVector(SV->getOperand(1)); + if (!IsOp1Undef && !IsOp2Undef) + break; + SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(), + SV->getShuffleMask().end()); + CombineMasks(ShuffleMask, Mask); + Mask.swap(ShuffleMask); + if (IsOp2Undef) + V = SV->getOperand(0); + else + V = SV->getOperand(1); + } + }; + // Smart shuffle instruction emission, walks through shuffles trees and + // tries to find the best matching vector for the actual shuffle + // instruction. + auto &&CreateShuffle = [this, &IsIdentityMask, &PeekThroughShuffles, + &CombineMasks](Value *V1, Value *V2, + ArrayRef<int> Mask) -> Value * { + assert(V1 && "Expected at least one vector value."); + if (V2 && !isUndefVector(V2)) { + // Peek through shuffles. + Value *Op1 = V1; + Value *Op2 = V2; + int VF = + cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue(); + SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem); + SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem); + for (int I = 0, E = Mask.size(); I < E; ++I) { + if (Mask[I] < VF) + CombinedMask1[I] = Mask[I]; + else + CombinedMask2[I] = Mask[I] - VF; + } + Value *PrevOp1; + Value *PrevOp2; + do { + PrevOp1 = Op1; + PrevOp2 = Op2; + PeekThroughShuffles(Op1, CombinedMask1, /*CheckForLengthChange=*/true); + PeekThroughShuffles(Op2, CombinedMask2, /*CheckForLengthChange=*/true); + // Check if we have 2 resizing shuffles - need to peek through operands + // again. 
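Illustrative sketch (not part of the diff): the CombineMasks lambda above composes two shuffle masks so that lane I of the result reads whatever the inner mask routed to lane ExtMask[I]. A standalone model of that composition (kUndef and combineMasks are illustrative names):

#include <cstdio>
#include <vector>

constexpr int kUndef = -1; // stand-in for UndefMaskElem

// Compose two shuffle masks: ExtMask is applied on top of Mask.
static std::vector<int> combineMasks(const std::vector<int> &Mask,
                                     const std::vector<int> &ExtMask) {
  std::vector<int> NewMask(ExtMask.size(), kUndef);
  for (size_t I = 0; I < ExtMask.size(); ++I) {
    if (ExtMask[I] == kUndef)
      continue;
    NewMask[I] = Mask[ExtMask[I]];
  }
  return NewMask;
}

int main() {
  std::vector<int> Mask    = {2, 0, 3, 1};      // inner shuffle
  std::vector<int> ExtMask = {1, 1, kUndef, 0}; // outer shuffle
  auto Combined = combineMasks(Mask, ExtMask);
  for (int V : Combined) std::printf("%d ", V); // prints 0 0 -1 2
  std::printf("\n");
  return 0;
}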
+ if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1)) + if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) + if (SV1->getOperand(0)->getType() == + SV2->getOperand(0)->getType() && + SV1->getOperand(0)->getType() != SV1->getType() && + isUndefVector(SV1->getOperand(1)) && + isUndefVector(SV2->getOperand(1))) { + Op1 = SV1->getOperand(0); + Op2 = SV2->getOperand(0); + SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(), + SV1->getShuffleMask().end()); + CombineMasks(ShuffleMask1, CombinedMask1); + CombinedMask1.swap(ShuffleMask1); + SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(), + SV2->getShuffleMask().end()); + CombineMasks(ShuffleMask2, CombinedMask2); + CombinedMask2.swap(ShuffleMask2); + } + } while (PrevOp1 != Op1 || PrevOp2 != Op2); + VF = cast<VectorType>(Op1->getType()) + ->getElementCount() + .getKnownMinValue(); + for (int I = 0, E = Mask.size(); I < E; ++I) { + if (CombinedMask2[I] != UndefMaskElem) { + assert(CombinedMask1[I] == UndefMaskElem && + "Expected undefined mask element"); + CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF); + } + } + Value *Vec = Builder.CreateShuffleVector( + Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2, + CombinedMask1); + if (auto *I = dyn_cast<Instruction>(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + return Vec; + } + if (isa<PoisonValue>(V1)) + return PoisonValue::get(FixedVectorType::get( + cast<VectorType>(V1->getType())->getElementType(), Mask.size())); + Value *Op = V1; + SmallVector<int> CombinedMask(Mask.begin(), Mask.end()); + PeekThroughShuffles(Op, CombinedMask); + if (!isa<FixedVectorType>(Op->getType()) || + !IsIdentityMask(CombinedMask, cast<FixedVectorType>(Op->getType()))) { + Value *Vec = Builder.CreateShuffleVector(Op, CombinedMask); + if (auto *I = dyn_cast<Instruction>(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + return Vec; + } + return Op; + }; + + auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask) { + unsigned VF = Mask.size(); + unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements(); + if (VF != VecVF) { + if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) { + Vec = CreateShuffle(Vec, nullptr, Mask); + return std::make_pair(Vec, true); + } + SmallVector<int> ResizeMask(VF, UndefMaskElem); + for (unsigned I = 0; I < VF; ++I) { + if (Mask[I] != UndefMaskElem) + ResizeMask[Mask[I]] = Mask[I]; + } + Vec = CreateShuffle(Vec, nullptr, ResizeMask); + } + + return std::make_pair(Vec, false); + }; + // Perform shuffling of the vectorize tree entries for better handling of + // external extracts. + for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) { + // Find the first and the last instruction in the list of insertelements. 
+ sort(ShuffledInserts[I].InsertElements, isFirstInsertElement); + InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front(); + InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back(); + Builder.SetInsertPoint(LastInsert); + auto Vector = ShuffledInserts[I].ValueMasks.takeVector(); + Value *NewInst = performExtractsShuffleAction<Value>( + makeMutableArrayRef(Vector.data(), Vector.size()), + FirstInsert->getOperand(0), + [](Value *Vec) { + return cast<VectorType>(Vec->getType()) + ->getElementCount() + .getKnownMinValue(); + }, + ResizeToVF, + [FirstInsert, &CreateShuffle](ArrayRef<int> Mask, + ArrayRef<Value *> Vals) { + assert((Vals.size() == 1 || Vals.size() == 2) && + "Expected exactly 1 or 2 input values."); + if (Vals.size() == 1) { + // Do not create shuffle if the mask is a simple identity + // non-resizing mask. + if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType()) + ->getNumElements() || + !ShuffleVectorInst::isIdentityMask(Mask)) + return CreateShuffle(Vals.front(), nullptr, Mask); + return Vals.front(); + } + return CreateShuffle(Vals.front() ? Vals.front() + : FirstInsert->getOperand(0), + Vals.back(), Mask); + }); + auto It = ShuffledInserts[I].InsertElements.rbegin(); + // Rebuild buildvector chain. + InsertElementInst *II = nullptr; + if (It != ShuffledInserts[I].InsertElements.rend()) + II = *It; + SmallVector<Instruction *> Inserts; + while (It != ShuffledInserts[I].InsertElements.rend()) { + assert(II && "Must be an insertelement instruction."); + if (*It == II) + ++It; + else + Inserts.push_back(cast<Instruction>(II)); + II = dyn_cast<InsertElementInst>(II->getOperand(0)); + } + for (Instruction *II : reverse(Inserts)) { + II->replaceUsesOfWith(II->getOperand(0), NewInst); + if (auto *NewI = dyn_cast<Instruction>(NewInst)) + if (II->getParent() == NewI->getParent() && II->comesBefore(NewI)) + II->moveAfter(NewI); + NewInst = II; + } + LastInsert->replaceAllUsesWith(NewInst); + for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) { + IE->replaceUsesOfWith(IE->getOperand(0), + PoisonValue::get(IE->getOperand(0)->getType())); + IE->replaceUsesOfWith(IE->getOperand(1), + PoisonValue::get(IE->getOperand(1)->getType())); + eraseInstruction(IE); + } + CSEBlocks.insert(LastInsert->getParent()); + } + // For each vectorized value: for (auto &TEPtr : VectorizableTree) { TreeEntry *Entry = TEPtr.get(); @@ -7050,6 +8850,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; + if (Entry->getOpcode() == Instruction::GetElementPtr && + !isa<GetElementPtrInst>(Scalar)) + continue; #ifndef NDEBUG Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { @@ -7057,7 +8860,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); // It is legal to delete users in the ignorelist. 
- assert((getTreeEntry(U) || is_contained(UserIgnoreList, U) || + assert((getTreeEntry(U) || + (UserIgnoreList && UserIgnoreList->contains(U)) || (isa_and_nonnull<Instruction>(U) && isDeleted(cast<Instruction>(U)))) && "Deleting out-of-tree value"); @@ -7225,9 +9029,11 @@ void BoUpSLP::optimizeGatherSequence() { BoUpSLP::ScheduleData * BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) { - ScheduleData *Bundle = nullptr; + ScheduleData *Bundle = nullptr; ScheduleData *PrevInBundle = nullptr; for (Value *V : VL) { + if (doesNotNeedToBeScheduled(V)) + continue; ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member " @@ -7239,8 +9045,6 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) { } else { Bundle = BundleMember; } - BundleMember->UnscheduledDepsInBundle = 0; - Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; // Group the instructions to a bundle. BundleMember->FirstInBundle = Bundle; @@ -7257,7 +9061,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, const InstructionsState &S) { // No need to schedule PHIs, insertelement, extractelement and extractvalue // instructions. - if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue)) + if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) || + doesNotNeedToSchedule(VL)) return nullptr; // Initialize the instruction bundle. @@ -7276,16 +9081,17 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); }); ReSchedule = true; } - if (ReSchedule) { - resetSchedule(); - initialFillReadyList(ReadyInsts); - } if (Bundle) { LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " << BB->getName() << "\n"); calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP); } + if (ReSchedule) { + resetSchedule(); + initialFillReadyList(ReadyInsts); + } + // Now try to schedule the new bundle or (if no bundle) just calculate // dependencies. As soon as the bundle is "ready" it means that there are no // cyclic dependencies and we can schedule it. Note that's important that we @@ -7293,14 +9099,17 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) && !ReadyInsts.empty()) { ScheduleData *Picked = ReadyInsts.pop_back_val(); - if (Picked->isSchedulingEntity() && Picked->isReady()) - schedule(Picked, ReadyInsts); + assert(Picked->isSchedulingEntity() && Picked->isReady() && + "must be ready to schedule"); + schedule(Picked, ReadyInsts); } }; // Make sure that the scheduling region contains all // instructions of the bundle. for (Value *V : VL) { + if (doesNotNeedToBeScheduled(V)) + continue; if (!extendSchedulingRegion(V, S)) { // If the scheduling region got new instructions at the lower end (or it // is a new region for the first bundle). This makes it necessary to @@ -7315,9 +9124,16 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, bool ReSchedule = false; for (Value *V : VL) { + if (doesNotNeedToBeScheduled(V)) + continue; ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); + + // Make sure we don't leave the pieces of the bundle in the ready list when + // whole bundle might not be ready. 
+ ReadyInsts.remove(BundleMember); + if (!BundleMember->IsScheduled) continue; // A bundle member was scheduled as single instruction before and now @@ -7339,16 +9155,24 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL, Value *OpValue) { - if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue)) + if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) || + doesNotNeedToSchedule(VL)) return; + if (doesNotNeedToBeScheduled(OpValue)) + OpValue = *find_if_not(VL, doesNotNeedToBeScheduled); ScheduleData *Bundle = getScheduleData(OpValue); LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); assert(!Bundle->IsScheduled && "Can't cancel bundle which is already scheduled"); - assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() && + assert(Bundle->isSchedulingEntity() && + (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) && "tried to unbundle something which is not a bundle"); + // Remove the bundle from the ready list. + if (Bundle->isReady()) + ReadyInsts.remove(Bundle); + // Un-bundle: make single instructions out of the bundle. ScheduleData *BundleMember = Bundle; while (BundleMember) { @@ -7356,8 +9180,8 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL, BundleMember->FirstInBundle = BundleMember; ScheduleData *Next = BundleMember->NextInBundle; BundleMember->NextInBundle = nullptr; - BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; - if (BundleMember->UnscheduledDepsInBundle == 0) { + BundleMember->TE = nullptr; + if (BundleMember->unscheduledDepsInBundle() == 0) { ReadyInsts.insert(BundleMember); } BundleMember = Next; @@ -7380,9 +9204,10 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, Instruction *I = dyn_cast<Instruction>(V); assert(I && "bundle member must be an instruction"); assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) && + !doesNotNeedToBeScheduled(I) && "phi nodes/insertelements/extractelements/extractvalues don't need to " "be scheduled"); - auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool { + auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool { ScheduleData *ISD = getScheduleData(I); if (!ISD) return false; @@ -7394,7 +9219,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, ExtraScheduleDataMap[I][S.OpValue] = SD; return true; }; - if (CheckSheduleForI(I)) + if (CheckScheduleForI(I)) return true; if (!ScheduleStart) { // It's the first instruction in the new region. 
@@ -7402,7 +9227,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, ScheduleStart = I; ScheduleEnd = I->getNextNode(); if (isOneOf(S, I) != I) - CheckSheduleForI(I); + CheckScheduleForI(I); assert(ScheduleEnd && "tried to vectorize a terminator?"); LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); return true; @@ -7430,7 +9255,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); ScheduleStart = I; if (isOneOf(S, I) != I) - CheckSheduleForI(I); + CheckScheduleForI(I); LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n"); return true; @@ -7444,7 +9269,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, nullptr); ScheduleEnd = I->getNextNode(); if (isOneOf(S, I) != I) - CheckSheduleForI(I); + CheckScheduleForI(I); assert(ScheduleEnd && "tried to vectorize a terminator?"); LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); return true; @@ -7456,7 +9281,10 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, ScheduleData *NextLoadStore) { ScheduleData *CurrentLoadStore = PrevLoadStore; for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { - ScheduleData *SD = ScheduleDataMap[I]; + // No need to allocate data for non-schedulable instructions. + if (doesNotNeedToBeScheduled(I)) + continue; + ScheduleData *SD = ScheduleDataMap.lookup(I); if (!SD) { SD = allocateScheduleDataChunks(); ScheduleDataMap[I] = SD; @@ -7479,6 +9307,10 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, } CurrentLoadStore = SD; } + + if (match(I, m_Intrinsic<Intrinsic::stacksave>()) || + match(I, m_Intrinsic<Intrinsic::stackrestore>())) + RegionHasStackSave = true; } if (NextLoadStore) { if (CurrentLoadStore) @@ -7511,8 +9343,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, // Handle def-use chain dependencies. if (BundleMember->OpValue != BundleMember->Inst) { - ScheduleData *UseSD = getScheduleData(BundleMember->Inst); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { + if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) { BundleMember->Dependencies++; ScheduleData *DestBundle = UseSD->FirstInBundle; if (!DestBundle->IsScheduled) @@ -7522,10 +9353,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, } } else { for (User *U : BundleMember->Inst->users()) { - assert(isa<Instruction>(U) && - "user of instruction must be instruction"); - ScheduleData *UseSD = getScheduleData(U); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { + if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) { BundleMember->Dependencies++; ScheduleData *DestBundle = UseSD->FirstInBundle; if (!DestBundle->IsScheduled) @@ -7536,6 +9364,75 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, } } + auto makeControlDependent = [&](Instruction *I) { + auto *DepDest = getScheduleData(I); + assert(DepDest && "must be in schedule window"); + DepDest->ControlDependencies.push_back(BundleMember); + BundleMember->Dependencies++; + ScheduleData *DestBundle = DepDest->FirstInBundle; + if (!DestBundle->IsScheduled) + BundleMember->incrementUnscheduledDeps(1); + if (!DestBundle->hasValidDependencies()) + WorkList.push_back(DestBundle); + }; + + // Any instruction which isn't safe to speculate at the begining of the + // block is control dependend on any early exit or non-willreturn call + // which proceeds it. 
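Illustrative sketch (not part of the diff): the scan that follows just below walks forward from an instruction that may not transfer execution to its successor, skips speculatable instructions, records a control dependence for each remaining one, and stops at the next non-transferring instruction. A toy standalone model of that scan; the Inst struct and its flags are stand-ins for isSafeToSpeculativelyExecute / isGuaranteedToTransferExecutionToSuccessor, and the wiring into the scheduler's dependence counters is omitted:

#include <cstdio>
#include <vector>

struct Inst {
  const char *Name;
  bool SafeToSpeculate;
  bool TransfersToSuccessor;
};

// Collect the later instructions that must keep their order relative to
// Block[From]; stop once another non-transferring instruction is found,
// since everything past that point is ordered transitively.
static std::vector<const Inst *> controlDependentOn(const std::vector<Inst> &Block,
                                                    size_t From) {
  std::vector<const Inst *> Deps;
  if (Block[From].TransfersToSuccessor)
    return Deps; // execution always reaches the rest, nothing to record
  for (size_t I = From + 1; I < Block.size(); ++I) {
    if (Block[I].SafeToSpeculate)
      continue;
    Deps.push_back(&Block[I]);
    if (!Block[I].TransfersToSuccessor)
      break;
  }
  return Deps;
}

int main() {
  std::vector<Inst> Block = {
      {"call @maybe_noreturn", false, false},
      {"add", true, true},
      {"store", false, true},
      {"udiv", false, true},        // may trap, not speculatable
      {"call @other", false, false},
      {"load", false, true}};       // past the second call, never scanned
  for (const Inst *I : controlDependentOn(Block, 0))
    std::printf("ordered after the call: %s\n", I->Name);
  return 0;
}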
+ if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) { + for (Instruction *I = BundleMember->Inst->getNextNode(); + I != ScheduleEnd; I = I->getNextNode()) { + if (isSafeToSpeculativelyExecute(I, &*BB->begin())) + continue; + + // Add the dependency + makeControlDependent(I); + + if (!isGuaranteedToTransferExecutionToSuccessor(I)) + // Everything past here must be control dependent on I. + break; + } + } + + if (RegionHasStackSave) { + // If we have an inalloc alloca instruction, it needs to be scheduled + // after any preceeding stacksave. We also need to prevent any alloca + // from reordering above a preceeding stackrestore. + if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) || + match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) { + for (Instruction *I = BundleMember->Inst->getNextNode(); + I != ScheduleEnd; I = I->getNextNode()) { + if (match(I, m_Intrinsic<Intrinsic::stacksave>()) || + match(I, m_Intrinsic<Intrinsic::stackrestore>())) + // Any allocas past here must be control dependent on I, and I + // must be memory dependend on BundleMember->Inst. + break; + + if (!isa<AllocaInst>(I)) + continue; + + // Add the dependency + makeControlDependent(I); + } + } + + // In addition to the cases handle just above, we need to prevent + // allocas from moving below a stacksave. The stackrestore case + // is currently thought to be conservatism. + if (isa<AllocaInst>(BundleMember->Inst)) { + for (Instruction *I = BundleMember->Inst->getNextNode(); + I != ScheduleEnd; I = I->getNextNode()) { + if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) && + !match(I, m_Intrinsic<Intrinsic::stackrestore>())) + continue; + + // Add the dependency + makeControlDependent(I); + break; + } + } + } + // Handle the memory dependencies (if any). ScheduleData *DepDest = BundleMember->NextLoadStore; if (!DepDest) @@ -7598,7 +9495,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, } } if (InsertInReadyList && SD->isReady()) { - ReadyInsts.push_back(SD); + ReadyInsts.insert(SD); LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n"); } @@ -7625,11 +9522,18 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n"); + // A key point - if we got here, pre-scheduling was able to find a valid + // scheduling of the sub-graph of the scheduling window which consists + // of all vector bundles and their transitive users. As such, we do not + // need to reschedule anything *outside of* that subgraph. + BS->resetSchedule(); // For the real scheduling we use a more sophisticated ready-list: it is // sorted by the original instruction location. This lets the final schedule // be as close as possible to the original instruction order. + // WARNING: If changing this order causes a correctness issue, that means + // there is some missing dependence edge in the schedule data graph. struct ScheduleDataCompare { bool operator()(ScheduleData *SD1, ScheduleData *SD2) const { return SD2->SchedulingPriority < SD1->SchedulingPriority; @@ -7637,21 +9541,22 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { }; std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts; - // Ensure that all dependency data is updated and fill the ready-list with - // initial instructions. + // Ensure that all dependency data is updated (for nodes in the sub-graph) + // and fill the ready-list with initial instructions. 
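[Editor's note] scheduleBlock above rebuilds the final order from a ready list sorted by SchedulingPriority, i.e. by the original position in the block, so the emitted schedule stays as close as possible to source order. A minimal list-scheduling sketch of that idea follows; the structs are toys, not ScheduleData, and the toy schedules top-down while the real scheduler works bottom-up, but the priority-ordered ready set is the same trick.

// Toy sketch: among all ready nodes, always pick the one that appeared
// earliest in the original block.
#include <cstdio>
#include <set>
#include <vector>

struct Node {
  int Priority;           // position in the original block
  int UnscheduledDeps;    // dependencies not scheduled yet
  std::vector<int> Users; // nodes that depend on this one
};

struct ByPriority {
  const std::vector<Node> *Nodes;
  bool operator()(int A, int B) const {
    return (*Nodes)[A].Priority < (*Nodes)[B].Priority;
  }
};

int main() {
  // Node 2 depends on nodes 0 and 1; node 3 depends on node 2.
  std::vector<Node> Nodes = {{0, 0, {2}}, {1, 0, {2}}, {2, 2, {3}}, {3, 1, {}}};
  std::set<int, ByPriority> Ready(ByPriority{&Nodes});
  for (int I = 0; I < (int)Nodes.size(); ++I)
    if (Nodes[I].UnscheduledDeps == 0)
      Ready.insert(I);
  while (!Ready.empty()) {
    int Picked = *Ready.begin(); // lowest original position first
    Ready.erase(Ready.begin());
    std::printf("schedule node %d\n", Picked);
    for (int U : Nodes[Picked].Users)
      if (--Nodes[U].UnscheduledDeps == 0)
        Ready.insert(U);
  }
  return 0;
}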
int Idx = 0; - int NumToSchedule = 0; for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { - BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { + BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) { + TreeEntry *SDTE = getTreeEntry(SD->Inst); + (void)SDTE; assert((isVectorLikeInstWithConstOps(SD->Inst) || - SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr)) && + SD->isPartOfBundle() == + (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; - if (SD->isSchedulingEntity()) { + + if (SD->isSchedulingEntity() && SD->isPartOfBundle()) BS->calculateDependencies(SD, false, this); - NumToSchedule++; - } }); } BS->initialFillReadyList(ReadyInsts); @@ -7674,9 +9579,23 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { } BS->schedule(picked, ReadyInsts); - NumToSchedule--; } - assert(NumToSchedule == 0 && "could not schedule all instructions"); + + // Check that we didn't break any of our invariants. +#ifdef EXPENSIVE_CHECKS + BS->verify(); +#endif + +#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS) + // Check that all schedulable entities got scheduled + for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { + BS->doForAllOpcodes(I, [&](ScheduleData *SD) { + if (SD->isSchedulingEntity() && SD->hasValidDependencies()) { + assert(SD->IsScheduled && "must be scheduled at this point"); + } + }); + } +#endif // Avoid duplicate scheduling of the block. BS->ScheduleStart = nullptr; @@ -7686,11 +9605,8 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) { // If V is a store, just return the width of the stored value (or value // truncated just before storing) without traversing the expression tree. // This is the common case. - if (auto *Store = dyn_cast<StoreInst>(V)) { - if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand())) - return DL->getTypeSizeInBits(Trunc->getSrcTy()); + if (auto *Store = dyn_cast<StoreInst>(V)) return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); - } if (auto *IEI = dyn_cast<InsertElementInst>(V)) return getVectorElementSize(IEI->getOperand(1)); @@ -8092,6 +10008,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, // Scan the blocks in the function in post order. for (auto BB : post_order(&F.getEntryBlock())) { + // Start new block - clear the list of reduction roots. + R.clearReductionData(); collectSeedInstructions(BB); // Vectorize trees that end at stores. 
@@ -8122,11 +10040,10 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, } bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, - unsigned Idx) { + unsigned Idx, unsigned MinVF) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() << "\n"); const unsigned Sz = R.getVectorElementSize(Chain[0]); - const unsigned MinVF = R.getMinVecRegSize() / Sz; unsigned VF = Chain.size(); if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) @@ -8265,9 +10182,15 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, unsigned EltSize = R.getVectorElementSize(Operands[0]); unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize); - unsigned MinVF = R.getMinVF(EltSize); unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts); + auto *Store = cast<StoreInst>(Operands[0]); + Type *StoreTy = Store->getValueOperand()->getType(); + Type *ValueTy = StoreTy; + if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand())) + ValueTy = Trunc->getSrcTy(); + unsigned MinVF = TTI->getStoreMinimumVF( + R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy); // FIXME: Is division-by-2 the correct step? Should we assert that the // register size is a power-of-2? @@ -8277,7 +10200,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size); if (!VectorizedStores.count(Slice.front()) && !VectorizedStores.count(Slice.back()) && - vectorizeStoreChain(Slice, R, Cnt)) { + vectorizeStoreChain(Slice, R, Cnt, MinVF)) { // Mark the vectorized stores so that we don't vectorize them again. VectorizedStores.insert(Slice.begin(), Slice.end()); Changed = true; @@ -8481,7 +10404,8 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { if (!I) return false; - if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I)) + if ((!isa<BinaryOperator>(I) && !isa<CmpInst>(I)) || + isa<VectorType>(I->getType())) return false; Value *P = I->getParent(); @@ -8492,32 +10416,40 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P) return false; - // Try to vectorize V. - if (tryToVectorizePair(Op0, Op1, R)) - return true; + // First collect all possible candidates + SmallVector<std::pair<Value *, Value *>, 4> Candidates; + Candidates.emplace_back(Op0, Op1); auto *A = dyn_cast<BinaryOperator>(Op0); auto *B = dyn_cast<BinaryOperator>(Op1); // Try to skip B. - if (B && B->hasOneUse()) { + if (A && B && B->hasOneUse()) { auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0)); auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1)); - if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R)) - return true; - if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R)) - return true; + if (B0 && B0->getParent() == P) + Candidates.emplace_back(A, B0); + if (B1 && B1->getParent() == P) + Candidates.emplace_back(A, B1); } - // Try to skip A. 
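[Editor's note] The vectorizeStores changes above pass a target-derived MinVF into vectorizeStoreChain and slice the collected store chain into power-of-two groups, skipping slices whose endpoints are already covered by a wider group (the surrounding loop over the slice size sits partly outside this hunk). A self-contained sketch of that slicing strategy, with NumStores, MaxVF and MinVF chosen arbitrarily:

// Toy sketch: try the widest power-of-two group first, then halve.
#include <cstdio>
#include <vector>

int main() {
  const unsigned NumStores = 11; // length of the collected store chain
  const unsigned MaxVF = 8;      // assumed register-width limit
  const unsigned MinVF = 2;      // assumed minimum profitable width
  std::vector<bool> Vectorized(NumStores, false);

  for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
    for (unsigned Cnt = 0; Cnt + Size <= NumStores; ++Cnt) {
      if (Vectorized[Cnt] || Vectorized[Cnt + Size - 1])
        continue; // an end of this slice is already covered by a wider group
      std::printf("vectorize stores [%u, %u) with VF %u\n", Cnt, Cnt + Size,
                  Size);
      for (unsigned I = Cnt; I < Cnt + Size; ++I)
        Vectorized[I] = true;
      Cnt += Size - 1; // continue after this slice
    }
  }
  return 0;
}

With 11 stores this vectorizes [0,8) at VF 8 and [8,10) at VF 2, leaving one scalar store, which is the kind of tail behaviour the FIXME about the division-by-2 step refers to.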
- if (A && A->hasOneUse()) { + if (B && A && A->hasOneUse()) { auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0)); auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1)); - if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R)) - return true; - if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R)) - return true; + if (A0 && A0->getParent() == P) + Candidates.emplace_back(A0, B); + if (A1 && A1->getParent() == P) + Candidates.emplace_back(A1, B); } - return false; + + if (Candidates.size() == 1) + return tryToVectorizePair(Op0, Op1, R); + + // We have multiple options. Try to pick the single best. + Optional<int> BestCandidate = R.findBestRootPair(Candidates); + if (!BestCandidate) + return false; + return tryToVectorizePair(Candidates[*BestCandidate].first, + Candidates[*BestCandidate].second, R); } namespace { @@ -8552,15 +10484,16 @@ class HorizontalReduction { using ReductionOpsType = SmallVector<Value *, 16>; using ReductionOpsListType = SmallVector<ReductionOpsType, 2>; ReductionOpsListType ReductionOps; - SmallVector<Value *, 32> ReducedVals; + /// List of possibly reduced values. + SmallVector<SmallVector<Value *>> ReducedVals; + /// Maps reduced value to the corresponding reduction operation. + DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps; // Use map vector to make stable output. MapVector<Instruction *, Value *> ExtraArgs; WeakTrackingVH ReductionRoot; /// The type of reduction operation. RecurKind RdxKind; - const unsigned INVALID_OPERAND_INDEX = std::numeric_limits<unsigned>::max(); - static bool isCmpSelMinMax(Instruction *I) { return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) && RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I)); @@ -8604,26 +10537,6 @@ class HorizontalReduction { return I->getOperand(Index); } - /// Checks if the ParentStackElem.first should be marked as a reduction - /// operation with an extra argument or as extra argument itself. - void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem, - Value *ExtraArg) { - if (ExtraArgs.count(ParentStackElem.first)) { - ExtraArgs[ParentStackElem.first] = nullptr; - // We ran into something like: - // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg. - // The whole ParentStackElem.first should be considered as an extra value - // in this case. - // Do not perform analysis of remaining operands of ParentStackElem.first - // instruction, this whole instruction is an extra argument. - ParentStackElem.second = INVALID_OPERAND_INDEX; - } else { - // We ran into something like: - // ParentStackElem.first += ... + ExtraArg + ... - ExtraArgs[ParentStackElem.first] = ExtraArg; - } - } - /// Creates reduction operation with the current opcode. static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS, Value *RHS, const Twine &Name, bool UseSelect) { @@ -8682,7 +10595,7 @@ class HorizontalReduction { } /// Creates reduction operation with the current opcode with the IR flags - /// from \p ReductionOps. + /// from \p ReductionOps, dropping nuw/nsw flags. 
static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, Value *RHS, const Twine &Name, const ReductionOpsListType &ReductionOps) { @@ -8696,31 +10609,21 @@ class HorizontalReduction { Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect); if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { if (auto *Sel = dyn_cast<SelectInst>(Op)) { - propagateIRFlags(Sel->getCondition(), ReductionOps[0]); - propagateIRFlags(Op, ReductionOps[1]); + propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr, + /*IncludeWrapFlags=*/false); + propagateIRFlags(Op, ReductionOps[1], nullptr, + /*IncludeWrapFlags=*/false); return Op; } } - propagateIRFlags(Op, ReductionOps[0]); - return Op; - } - - /// Creates reduction operation with the current opcode with the IR flags - /// from \p I. - static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, - Value *RHS, const Twine &Name, Instruction *I) { - auto *SelI = dyn_cast<SelectInst>(I); - Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, SelI != nullptr); - if (SelI && RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { - if (auto *Sel = dyn_cast<SelectInst>(Op)) - propagateIRFlags(Sel->getCondition(), SelI->getCondition()); - } - propagateIRFlags(Op, I); + propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false); return Op; } - static RecurKind getRdxKind(Instruction *I) { - assert(I && "Expected instruction for reduction matching"); + static RecurKind getRdxKind(Value *V) { + auto *I = dyn_cast<Instruction>(V); + if (!I) + return RecurKind::None; if (match(I, m_Add(m_Value(), m_Value()))) return RecurKind::Add; if (match(I, m_Mul(m_Value(), m_Value()))) @@ -8882,7 +10785,9 @@ public: HorizontalReduction() = default; /// Try to find a reduction tree. - bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst) { + bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst, + ScalarEvolution &SE, const DataLayout &DL, + const TargetLibraryInfo &TLI) { assert((!Phi || is_contained(Phi->operands(), Inst)) && "Phi needs to use the binary operator"); assert((isa<BinaryOperator>(Inst) || isa<SelectInst>(Inst) || @@ -8926,124 +10831,178 @@ public: ReductionRoot = Inst; - // The opcode for leaf values that we perform a reduction on. - // For example: load(x) + load(y) + load(z) + fptoui(w) - // The leaf opcode for 'w' does not match, so we don't include it as a - // potential candidate for the reduction. - unsigned LeafOpcode = 0; - - // Post-order traverse the reduction tree starting at Inst. We only handle - // true trees containing binary operators or selects. - SmallVector<std::pair<Instruction *, unsigned>, 32> Stack; - Stack.push_back(std::make_pair(Inst, getFirstOperandIndex(Inst))); - initReductionOps(Inst); - while (!Stack.empty()) { - Instruction *TreeN = Stack.back().first; - unsigned EdgeToVisit = Stack.back().second++; - const RecurKind TreeRdxKind = getRdxKind(TreeN); - bool IsReducedValue = TreeRdxKind != RdxKind; - - // Postorder visit. - if (IsReducedValue || EdgeToVisit >= getNumberOfOperands(TreeN)) { - if (IsReducedValue) - ReducedVals.push_back(TreeN); - else { - auto ExtraArgsIter = ExtraArgs.find(TreeN); - if (ExtraArgsIter != ExtraArgs.end() && !ExtraArgsIter->second) { - // Check if TreeN is an extra argument of its parent operation. - if (Stack.size() <= 1) { - // TreeN can't be an extra argument as it is a root reduction - // operation. 
- return false; - } - // Yes, TreeN is an extra argument, do not add it to a list of - // reduction operations. - // Stack[Stack.size() - 2] always points to the parent operation. - markExtraArg(Stack[Stack.size() - 2], TreeN); - ExtraArgs.erase(TreeN); - } else - addReductionOps(TreeN); - } - // Retract. - Stack.pop_back(); - continue; - } - - // Visit operands. - Value *EdgeVal = getRdxOperand(TreeN, EdgeToVisit); - auto *EdgeInst = dyn_cast<Instruction>(EdgeVal); - if (!EdgeInst) { - // Edge value is not a reduction instruction or a leaf instruction. - // (It may be a constant, function argument, or something else.) - markExtraArg(Stack.back(), EdgeVal); - continue; + // Iterate through all the operands of the possible reduction tree and + // gather all the reduced values, sorting them by their value id. + BasicBlock *BB = Inst->getParent(); + bool IsCmpSelMinMax = isCmpSelMinMax(Inst); + SmallVector<Instruction *> Worklist(1, Inst); + // Checks if the operands of the \p TreeN instruction are also reduction + // operations or should be treated as reduced values or an extra argument, + // which is not part of the reduction. + auto &&CheckOperands = [this, IsCmpSelMinMax, + BB](Instruction *TreeN, + SmallVectorImpl<Value *> &ExtraArgs, + SmallVectorImpl<Value *> &PossibleReducedVals, + SmallVectorImpl<Instruction *> &ReductionOps) { + for (int I = getFirstOperandIndex(TreeN), + End = getNumberOfOperands(TreeN); + I < End; ++I) { + Value *EdgeVal = getRdxOperand(TreeN, I); + ReducedValsToOps[EdgeVal].push_back(TreeN); + auto *EdgeInst = dyn_cast<Instruction>(EdgeVal); + // Edge has wrong parent - mark as an extra argument. + if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) && + !hasSameParent(EdgeInst, BB)) { + ExtraArgs.push_back(EdgeVal); + continue; + } + // If the edge is not an instruction, or it is different from the main + // reduction opcode or has too many uses - possible reduced value. + if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind || + IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) || + !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) || + !isVectorizable(getRdxKind(EdgeInst), EdgeInst)) { + PossibleReducedVals.push_back(EdgeVal); + continue; + } + ReductionOps.push_back(EdgeInst); } - RecurKind EdgeRdxKind = getRdxKind(EdgeInst); - // Continue analysis if the next operand is a reduction operation or - // (possibly) a leaf value. If the leaf value opcode is not set, - // the first met operation != reduction operation is considered as the - // leaf opcode. - // Only handle trees in the current basic block. - // Each tree node needs to have minimal number of users except for the - // ultimate reduction. - const bool IsRdxInst = EdgeRdxKind == RdxKind; - if (EdgeInst != Phi && EdgeInst != Inst && - hasSameParent(EdgeInst, Inst->getParent()) && - hasRequiredNumberOfUses(isCmpSelMinMax(Inst), EdgeInst) && - (!LeafOpcode || LeafOpcode == EdgeInst->getOpcode() || IsRdxInst)) { - if (IsRdxInst) { - // We need to be able to reassociate the reduction operations. - if (!isVectorizable(EdgeRdxKind, EdgeInst)) { - // I is an extra argument for TreeN (its parent operation). - markExtraArg(Stack.back(), EdgeInst); - continue; - } - } else if (!LeafOpcode) { - LeafOpcode = EdgeInst->getOpcode(); + }; + // Try to regroup reduced values so that it gets more profitable to try to + // reduce them. 
Values are grouped by their value ids, instructions - by + // instruction op id and/or alternate op id, plus do extra analysis for + // loads (grouping them by the distabce between pointers) and cmp + // instructions (grouping them by the predicate). + MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>> + PossibleReducedVals; + initReductionOps(Inst); + while (!Worklist.empty()) { + Instruction *TreeN = Worklist.pop_back_val(); + SmallVector<Value *> Args; + SmallVector<Value *> PossibleRedVals; + SmallVector<Instruction *> PossibleReductionOps; + CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps); + // If too many extra args - mark the instruction itself as a reduction + // value, not a reduction operation. + if (Args.size() < 2) { + addReductionOps(TreeN); + // Add extra args. + if (!Args.empty()) { + assert(Args.size() == 1 && "Expected only single argument."); + ExtraArgs[TreeN] = Args.front(); } - Stack.push_back( - std::make_pair(EdgeInst, getFirstOperandIndex(EdgeInst))); - continue; + // Add reduction values. The values are sorted for better vectorization + // results. + for (Value *V : PossibleRedVals) { + size_t Key, Idx; + std::tie(Key, Idx) = generateKeySubkey( + V, &TLI, + [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) { + auto It = PossibleReducedVals.find(Key); + if (It != PossibleReducedVals.end()) { + for (const auto &LoadData : It->second) { + auto *RLI = cast<LoadInst>(LoadData.second.front().first); + if (getPointersDiff(RLI->getType(), + RLI->getPointerOperand(), LI->getType(), + LI->getPointerOperand(), DL, SE, + /*StrictCheck=*/true)) + return hash_value(RLI->getPointerOperand()); + } + } + return hash_value(LI->getPointerOperand()); + }, + /*AllowAlternate=*/false); + ++PossibleReducedVals[Key][Idx] + .insert(std::make_pair(V, 0)) + .first->second; + } + Worklist.append(PossibleReductionOps.rbegin(), + PossibleReductionOps.rend()); + } else { + size_t Key, Idx; + std::tie(Key, Idx) = generateKeySubkey( + TreeN, &TLI, + [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) { + auto It = PossibleReducedVals.find(Key); + if (It != PossibleReducedVals.end()) { + for (const auto &LoadData : It->second) { + auto *RLI = cast<LoadInst>(LoadData.second.front().first); + if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(), + LI->getType(), LI->getPointerOperand(), + DL, SE, /*StrictCheck=*/true)) + return hash_value(RLI->getPointerOperand()); + } + } + return hash_value(LI->getPointerOperand()); + }, + /*AllowAlternate=*/false); + ++PossibleReducedVals[Key][Idx] + .insert(std::make_pair(TreeN, 0)) + .first->second; + } + } + auto PossibleReducedValsVect = PossibleReducedVals.takeVector(); + // Sort values by the total number of values kinds to start the reduction + // from the longest possible reduced values sequences. + for (auto &PossibleReducedVals : PossibleReducedValsVect) { + auto PossibleRedVals = PossibleReducedVals.second.takeVector(); + SmallVector<SmallVector<Value *>> PossibleRedValsVect; + for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end(); + It != E; ++It) { + PossibleRedValsVect.emplace_back(); + auto RedValsVect = It->second.takeVector(); + stable_sort(RedValsVect, [](const auto &P1, const auto &P2) { + return P1.second < P2.second; + }); + for (const std::pair<Value *, unsigned> &Data : RedValsVect) + PossibleRedValsVect.back().append(Data.second, Data.first); } - // I is an extra argument for TreeN (its parent operation). 
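[Editor's note] The regrouping added above replaces the old explicit operand stack: possible reduced values are bucketed by a key/sub-key hash (value kind or opcode, and for loads the base pointer), and the largest buckets are tried first so the longest run of similar values is vectorized before the stragglers. A toy version of that idea, using strings in place of the hash keys:

// Toy sketch: bucket reduction leaves by "what kind of value this is",
// then emit the biggest buckets first.
#include <algorithm>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  // Pretend these describe the leaves of an add-reduction.
  std::vector<std::string> Leaves = {"load %A", "sitofp", "load %A",
                                     "load %B", "load %A", "sitofp",
                                     "load %A"};
  std::map<std::string, std::vector<std::string>> Buckets;
  for (const std::string &L : Leaves)
    Buckets[L].push_back(L);

  std::vector<std::vector<std::string>> Grouped;
  for (auto &KV : Buckets)
    Grouped.push_back(KV.second);
  std::stable_sort(Grouped.begin(), Grouped.end(),
                   [](const auto &A, const auto &B) {
                     return A.size() > B.size();
                   });
  for (const auto &Group : Grouped)
    std::printf("try a reduction over %zu x \"%s\"\n", Group.size(),
                Group.front().c_str());
  return 0;
}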
- markExtraArg(Stack.back(), EdgeInst); - } + stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) { + return P1.size() > P2.size(); + }); + ReducedVals.emplace_back(); + for (ArrayRef<Value *> Data : PossibleRedValsVect) + ReducedVals.back().append(Data.rbegin(), Data.rend()); + } + // Sort the reduced values by number of same/alternate opcode and/or pointer + // operand. + stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) { + return P1.size() > P2.size(); + }); return true; } /// Attempt to vectorize the tree found by matchAssociativeReduction. Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { + constexpr int ReductionLimit = 4; + constexpr unsigned RegMaxNumber = 4; + constexpr unsigned RedValsMaxNumber = 128; // If there are a sufficient number of reduction values, reduce // to a nearby power-of-2. We can safely generate oversized // vectors and rely on the backend to split them to legal sizes. - unsigned NumReducedVals = ReducedVals.size(); - if (NumReducedVals < 4) + unsigned NumReducedVals = std::accumulate( + ReducedVals.begin(), ReducedVals.end(), 0, + [](int Num, ArrayRef<Value *> Vals) { return Num + Vals.size(); }); + if (NumReducedVals < ReductionLimit) return nullptr; - // Intersect the fast-math-flags from all reduction operations. - FastMathFlags RdxFMF; - RdxFMF.set(); - for (ReductionOpsType &RdxOp : ReductionOps) { - for (Value *RdxVal : RdxOp) { - if (auto *FPMO = dyn_cast<FPMathOperator>(RdxVal)) - RdxFMF &= FPMO->getFastMathFlags(); - } - } - IRBuilder<> Builder(cast<Instruction>(ReductionRoot)); - Builder.setFastMathFlags(RdxFMF); + // Track the reduced values in case if they are replaced by extractelement + // because of the vectorization. + DenseMap<Value *, WeakTrackingVH> TrackedVals; BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; // The same extra argument may be used several times, so log each attempt // to use it. for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) { assert(Pair.first && "DebugLoc must be set."); ExternallyUsedValues[Pair.second].push_back(Pair.first); + TrackedVals.try_emplace(Pair.second, Pair.second); } // The compare instruction of a min/max is the insertion point for new // instructions and may be replaced with a new compare instruction. - auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) { + auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) { assert(isa<SelectInst>(RdxRootInst) && "Expected min/max reduction to have select root instruction"); Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition(); @@ -9055,164 +11014,390 @@ public: // The reduction root is used as the insertion point for new instructions, // so set it as externally used to prevent it from being deleted. ExternallyUsedValues[ReductionRoot]; - SmallVector<Value *, 16> IgnoreList; - for (ReductionOpsType &RdxOp : ReductionOps) - IgnoreList.append(RdxOp.begin(), RdxOp.end()); - - unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); - if (NumReducedVals > ReduxWidth) { - // In the loop below, we are building a tree based on a window of - // 'ReduxWidth' values. - // If the operands of those values have common traits (compare predicate, - // constant operand, etc), then we want to group those together to - // minimize the cost of the reduction. - - // TODO: This should be extended to count common operands for - // compares and binops. - - // Step 1: Count the number of times each compare predicate occurs. 
- SmallDenseMap<unsigned, unsigned> PredCountMap; - for (Value *RdxVal : ReducedVals) { - CmpInst::Predicate Pred; - if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value()))) - ++PredCountMap[Pred]; - } - // Step 2: Sort the values so the most common predicates come first. - stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) { - CmpInst::Predicate PredA, PredB; - if (match(A, m_Cmp(PredA, m_Value(), m_Value())) && - match(B, m_Cmp(PredB, m_Value(), m_Value()))) { - return PredCountMap[PredA] > PredCountMap[PredB]; - } - return false; - }); - } + SmallDenseSet<Value *> IgnoreList; + for (ReductionOpsType &RdxOps : ReductionOps) + for (Value *RdxOp : RdxOps) { + if (!RdxOp) + continue; + IgnoreList.insert(RdxOp); + } + bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot)); + + // Need to track reduced vals, they may be changed during vectorization of + // subvectors. + for (ArrayRef<Value *> Candidates : ReducedVals) + for (Value *V : Candidates) + TrackedVals.try_emplace(V, V); + DenseMap<Value *, unsigned> VectorizedVals; Value *VectorizedTree = nullptr; - unsigned i = 0; - while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { - ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth); - V.buildTree(VL, IgnoreList); - if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) - break; - if (V.isLoadCombineReductionCandidate(RdxKind)) - break; - V.reorderTopToBottom(); - V.reorderBottomToTop(/*IgnoreReorder=*/true); - V.buildExternalUses(ExternallyUsedValues); - - // For a poison-safe boolean logic reduction, do not replace select - // instructions with logic ops. All reduced values will be frozen (see - // below) to prevent leaking poison. - if (isa<SelectInst>(ReductionRoot) && - isBoolLogicOp(cast<Instruction>(ReductionRoot)) && - NumReducedVals != ReduxWidth) - break; + bool CheckForReusedReductionOps = false; + // Try to vectorize elements based on their type. + for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) { + ArrayRef<Value *> OrigReducedVals = ReducedVals[I]; + InstructionsState S = getSameOpcode(OrigReducedVals); + SmallVector<Value *> Candidates; + DenseMap<Value *, Value *> TrackedToOrig; + for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) { + Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second; + // Check if the reduction value was not overriden by the extractelement + // instruction because of the vectorization and exclude it, if it is not + // compatible with other values. + if (auto *Inst = dyn_cast<Instruction>(RdxVal)) + if (isVectorLikeInstWithConstOps(Inst) && + (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) + continue; + Candidates.push_back(RdxVal); + TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]); + } + bool ShuffledExtracts = false; + // Try to handle shuffled extractelements. + if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() && + I + 1 < E) { + InstructionsState NextS = getSameOpcode(ReducedVals[I + 1]); + if (NextS.getOpcode() == Instruction::ExtractElement && + !NextS.isAltShuffle()) { + SmallVector<Value *> CommonCandidates(Candidates); + for (Value *RV : ReducedVals[I + 1]) { + Value *RdxVal = TrackedVals.find(RV)->second; + // Check if the reduction value was not overriden by the + // extractelement instruction because of the vectorization and + // exclude it, if it is not compatible with other values. 
+ if (auto *Inst = dyn_cast<Instruction>(RdxVal)) + if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst)) + continue; + CommonCandidates.push_back(RdxVal); + TrackedToOrig.try_emplace(RdxVal, RV); + } + SmallVector<int> Mask; + if (isFixedVectorShuffle(CommonCandidates, Mask)) { + ++I; + Candidates.swap(CommonCandidates); + ShuffledExtracts = true; + } + } + } + unsigned NumReducedVals = Candidates.size(); + if (NumReducedVals < ReductionLimit) + continue; - V.computeMinimumValueSizes(); + unsigned MaxVecRegSize = V.getMaxVecRegSize(); + unsigned EltSize = V.getVectorElementSize(Candidates[0]); + unsigned MaxElts = RegMaxNumber * PowerOf2Floor(MaxVecRegSize / EltSize); + + unsigned ReduxWidth = std::min<unsigned>( + PowerOf2Floor(NumReducedVals), std::max(RedValsMaxNumber, MaxElts)); + unsigned Start = 0; + unsigned Pos = Start; + // Restarts vectorization attempt with lower vector factor. + unsigned PrevReduxWidth = ReduxWidth; + bool CheckForReusedReductionOpsLocal = false; + auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals, + &CheckForReusedReductionOpsLocal, + &PrevReduxWidth, &V, + &IgnoreList](bool IgnoreVL = false) { + bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList); + if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) { + // Check if any of the reduction ops are gathered. If so, worth + // trying again with less number of reduction ops. + CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered; + } + ++Pos; + if (Pos < NumReducedVals - ReduxWidth + 1) + return IsAnyRedOpGathered; + Pos = Start; + ReduxWidth /= 2; + return IsAnyRedOpGathered; + }; + while (Pos < NumReducedVals - ReduxWidth + 1 && + ReduxWidth >= ReductionLimit) { + // Dependency in tree of the reduction ops - drop this attempt, try + // later. + if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth && + Start == 0) { + CheckForReusedReductionOps = true; + break; + } + PrevReduxWidth = ReduxWidth; + ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth); + // Beeing analyzed already - skip. + if (V.areAnalyzedReductionVals(VL)) { + (void)AdjustReducedVals(/*IgnoreVL=*/true); + continue; + } + // Early exit if any of the reduction values were deleted during + // previous vectorization attempts. + if (any_of(VL, [&V](Value *RedVal) { + auto *RedValI = dyn_cast<Instruction>(RedVal); + if (!RedValI) + return false; + return V.isDeleted(RedValI); + })) + break; + V.buildTree(VL, IgnoreList); + if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) { + if (!AdjustReducedVals()) + V.analyzedReductionVals(VL); + continue; + } + if (V.isLoadCombineReductionCandidate(RdxKind)) { + if (!AdjustReducedVals()) + V.analyzedReductionVals(VL); + continue; + } + V.reorderTopToBottom(); + // No need to reorder the root node at all. + V.reorderBottomToTop(/*IgnoreReorder=*/true); + // Keep extracted other reduction values, if they are used in the + // vectorization trees. + BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues( + ExternallyUsedValues); + for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) { + if (Cnt == I || (ShuffledExtracts && Cnt == I - 1)) + continue; + for_each(ReducedVals[Cnt], + [&LocalExternallyUsedValues, &TrackedVals](Value *V) { + if (isa<Instruction>(V)) + LocalExternallyUsedValues[TrackedVals[V]]; + }); + } + // Number of uses of the candidates in the vector of values. 
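[Editor's note] Stepping back from the hunk above: tryToReduce now walks the candidates of one bucket in windows of ReduxWidth values, advances the window when a tree is rejected, and halves the width once every position has been tried. A simplified sketch of that iteration; the tryVectorize stub and the constants are made up, and unlike the real code the sketch does not re-derive the width or the start position after a successful window.

// Toy sketch of the sliding-window walk with width halving.
#include <cstdio>

// Hypothetical stand-in for "building and costing a tree succeeded".
static bool tryVectorize(unsigned Pos, unsigned Width) {
  return Width <= 4 && Pos % 2 == 0; // arbitrary toy outcome
}

int main() {
  const unsigned NumReducedVals = 10;
  const unsigned ReductionLimit = 4;
  unsigned ReduxWidth = 8; // power-of-two floor, capped by register width
  unsigned Pos = 0;
  while (Pos < NumReducedVals - ReduxWidth + 1 &&
         ReduxWidth >= ReductionLimit) {
    if (tryVectorize(Pos, ReduxWidth)) {
      std::printf("vectorized candidates [%u, %u)\n", Pos, Pos + ReduxWidth);
      Pos += ReduxWidth; // continue after the vectorized window
      continue;
    }
    ++Pos; // try the next window of the same width
    if (Pos >= NumReducedVals - ReduxWidth + 1) {
      Pos = 0;         // restart from the beginning ...
      ReduxWidth /= 2; // ... with a narrower window
    }
  }
  return 0;
}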
+ SmallDenseMap<Value *, unsigned> NumUses; + for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) { + Value *V = Candidates[Cnt]; + if (NumUses.count(V) > 0) + continue; + NumUses[V] = std::count(VL.begin(), VL.end(), V); + } + for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) { + Value *V = Candidates[Cnt]; + if (NumUses.count(V) > 0) + continue; + NumUses[V] = std::count(VL.begin(), VL.end(), V); + } + // Gather externally used values. + SmallPtrSet<Value *, 4> Visited; + for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) { + Value *V = Candidates[Cnt]; + if (!Visited.insert(V).second) + continue; + unsigned NumOps = VectorizedVals.lookup(V) + NumUses[V]; + if (NumOps != ReducedValsToOps.find(V)->second.size()) + LocalExternallyUsedValues[V]; + } + for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) { + Value *V = Candidates[Cnt]; + if (!Visited.insert(V).second) + continue; + unsigned NumOps = VectorizedVals.lookup(V) + NumUses[V]; + if (NumOps != ReducedValsToOps.find(V)->second.size()) + LocalExternallyUsedValues[V]; + } + V.buildExternalUses(LocalExternallyUsedValues); + + V.computeMinimumValueSizes(); + + // Intersect the fast-math-flags from all reduction operations. + FastMathFlags RdxFMF; + RdxFMF.set(); + for (Value *U : IgnoreList) + if (auto *FPMO = dyn_cast<FPMathOperator>(U)) + RdxFMF &= FPMO->getFastMathFlags(); + // Estimate cost. + InstructionCost TreeCost = V.getTreeCost(VL); + InstructionCost ReductionCost = + getReductionCost(TTI, VL, ReduxWidth, RdxFMF); + InstructionCost Cost = TreeCost + ReductionCost; + if (!Cost.isValid()) { + LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); + return nullptr; + } + if (Cost >= -SLPCostThreshold) { + V.getORE()->emit([&]() { + return OptimizationRemarkMissed( + SV_NAME, "HorSLPNotBeneficial", + ReducedValsToOps.find(VL[0])->second.front()) + << "Vectorizing horizontal reduction is possible" + << "but not beneficial with cost " << ore::NV("Cost", Cost) + << " and threshold " + << ore::NV("Threshold", -SLPCostThreshold); + }); + if (!AdjustReducedVals()) + V.analyzedReductionVals(VL); + continue; + } - // Estimate cost. - InstructionCost TreeCost = - V.getTreeCost(makeArrayRef(&ReducedVals[i], ReduxWidth)); - InstructionCost ReductionCost = - getReductionCost(TTI, ReducedVals[i], ReduxWidth, RdxFMF); - InstructionCost Cost = TreeCost + ReductionCost; - if (!Cost.isValid()) { - LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); - return nullptr; - } - if (Cost >= -SLPCostThreshold) { + LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" + << Cost << ". (HorRdx)\n"); V.getORE()->emit([&]() { - return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial", - cast<Instruction>(VL[0])) - << "Vectorizing horizontal reduction is possible" - << "but not beneficial with cost " << ore::NV("Cost", Cost) - << " and threshold " - << ore::NV("Threshold", -SLPCostThreshold); + return OptimizationRemark( + SV_NAME, "VectorizedHorizontalReduction", + ReducedValsToOps.find(VL[0])->second.front()) + << "Vectorized horizontal reduction with cost " + << ore::NV("Cost", Cost) << " and with tree size " + << ore::NV("TreeSize", V.getTreeSize()); }); - break; - } - LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" - << Cost << ". 
(HorRdx)\n"); - V.getORE()->emit([&]() { - return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", - cast<Instruction>(VL[0])) - << "Vectorized horizontal reduction with cost " - << ore::NV("Cost", Cost) << " and with tree size " - << ore::NV("TreeSize", V.getTreeSize()); - }); + Builder.setFastMathFlags(RdxFMF); - // Vectorize a tree. - DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); - Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues); + // Vectorize a tree. + Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues); - // Emit a reduction. If the root is a select (min/max idiom), the insert - // point is the compare condition of that select. - Instruction *RdxRootInst = cast<Instruction>(ReductionRoot); - if (isCmpSelMinMax(RdxRootInst)) - Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst)); - else - Builder.SetInsertPoint(RdxRootInst); + // Emit a reduction. If the root is a select (min/max idiom), the insert + // point is the compare condition of that select. + Instruction *RdxRootInst = cast<Instruction>(ReductionRoot); + if (IsCmpSelMinMax) + Builder.SetInsertPoint(GetCmpForMinMaxReduction(RdxRootInst)); + else + Builder.SetInsertPoint(RdxRootInst); - // To prevent poison from leaking across what used to be sequential, safe, - // scalar boolean logic operations, the reduction operand must be frozen. - if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst)) - VectorizedRoot = Builder.CreateFreeze(VectorizedRoot); + // To prevent poison from leaking across what used to be sequential, + // safe, scalar boolean logic operations, the reduction operand must be + // frozen. + if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst)) + VectorizedRoot = Builder.CreateFreeze(VectorizedRoot); - Value *ReducedSubTree = - emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); + Value *ReducedSubTree = + emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); - if (!VectorizedTree) { - // Initialize the final value in the reduction. - VectorizedTree = ReducedSubTree; - } else { - // Update the final value in the reduction. - Builder.SetCurrentDebugLocation(Loc); - VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, - ReducedSubTree, "op.rdx", ReductionOps); + if (!VectorizedTree) { + // Initialize the final value in the reduction. + VectorizedTree = ReducedSubTree; + } else { + // Update the final value in the reduction. + Builder.SetCurrentDebugLocation( + cast<Instruction>(ReductionOps.front().front())->getDebugLoc()); + VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, + ReducedSubTree, "op.rdx", ReductionOps); + } + // Count vectorized reduced values to exclude them from final reduction. + for (Value *V : VL) + ++VectorizedVals.try_emplace(TrackedToOrig.find(V)->second, 0) + .first->getSecond(); + Pos += ReduxWidth; + Start = Pos; + ReduxWidth = PowerOf2Floor(NumReducedVals - Pos); } - i += ReduxWidth; - ReduxWidth = PowerOf2Floor(NumReducedVals - i); } - if (VectorizedTree) { // Finish the reduction. - for (; i < NumReducedVals; ++i) { - auto *I = cast<Instruction>(ReducedVals[i]); - Builder.SetCurrentDebugLocation(I->getDebugLoc()); - VectorizedTree = - createOp(Builder, RdxKind, VectorizedTree, I, "", ReductionOps); + // Need to add extra arguments and not vectorized possible reduction + // values. + // Try to avoid dependencies between the scalar remainders after + // reductions. 
+ auto &&FinalGen = + [this, &Builder, + &TrackedVals](ArrayRef<std::pair<Instruction *, Value *>> InstVals) { + unsigned Sz = InstVals.size(); + SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + + Sz % 2); + for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) { + Instruction *RedOp = InstVals[I + 1].first; + Builder.SetCurrentDebugLocation(RedOp->getDebugLoc()); + Value *RdxVal1 = InstVals[I].second; + Value *StableRdxVal1 = RdxVal1; + auto It1 = TrackedVals.find(RdxVal1); + if (It1 != TrackedVals.end()) + StableRdxVal1 = It1->second; + Value *RdxVal2 = InstVals[I + 1].second; + Value *StableRdxVal2 = RdxVal2; + auto It2 = TrackedVals.find(RdxVal2); + if (It2 != TrackedVals.end()) + StableRdxVal2 = It2->second; + Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1, + StableRdxVal2, "op.rdx", ReductionOps); + ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed); + } + if (Sz % 2 == 1) + ExtraReds[Sz / 2] = InstVals.back(); + return ExtraReds; + }; + SmallVector<std::pair<Instruction *, Value *>> ExtraReductions; + SmallPtrSet<Value *, 8> Visited; + for (ArrayRef<Value *> Candidates : ReducedVals) { + for (Value *RdxVal : Candidates) { + if (!Visited.insert(RdxVal).second) + continue; + unsigned NumOps = VectorizedVals.lookup(RdxVal); + for (Instruction *RedOp : + makeArrayRef(ReducedValsToOps.find(RdxVal)->second) + .drop_back(NumOps)) + ExtraReductions.emplace_back(RedOp, RdxVal); + } } for (auto &Pair : ExternallyUsedValues) { // Add each externally used value to the final reduction. - for (auto *I : Pair.second) { - Builder.SetCurrentDebugLocation(I->getDebugLoc()); - VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, - Pair.first, "op.extra", I); - } + for (auto *I : Pair.second) + ExtraReductions.emplace_back(I, Pair.first); + } + // Iterate through all not-vectorized reduction values/extra arguments. + while (ExtraReductions.size() > 1) { + SmallVector<std::pair<Instruction *, Value *>> NewReds = + FinalGen(ExtraReductions); + ExtraReductions.swap(NewReds); + } + // Final reduction. + if (ExtraReductions.size() == 1) { + Instruction *RedOp = ExtraReductions.back().first; + Builder.SetCurrentDebugLocation(RedOp->getDebugLoc()); + Value *RdxVal = ExtraReductions.back().second; + Value *StableRdxVal = RdxVal; + auto It = TrackedVals.find(RdxVal); + if (It != TrackedVals.end()) + StableRdxVal = It->second; + VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, + StableRdxVal, "op.rdx", ReductionOps); } ReductionRoot->replaceAllUsesWith(VectorizedTree); - // Mark all scalar reduction ops for deletion, they are replaced by the - // vector reductions. - V.eraseInstructions(IgnoreList); + // The original scalar reduction is expected to have no remaining + // uses outside the reduction tree itself. Assert that we got this + // correct, replace internal uses with undef, and mark for eventual + // deletion. 
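[Editor's note] FinalGen above folds the remaining scalar values and extra arguments back into the reduction pairwise rather than one at a time, so the leftover op.rdx chain grows logarithmically instead of linearly with the number of leftovers. A toy model of that pairing over plain integers with '+':

// Toy sketch: combine leftovers pairwise until one value remains.
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Leftovers = {3, 5, 7, 11, 13}; // not-vectorized scalars
  while (Leftovers.size() > 1) {
    std::vector<int> Next;
    for (size_t I = 0; I + 1 < Leftovers.size(); I += 2)
      Next.push_back(Leftovers[I] + Leftovers[I + 1]); // one "op.rdx"
    if (Leftovers.size() % 2 == 1)
      Next.push_back(Leftovers.back()); // odd element is carried over
    Leftovers.swap(Next);
  }
  std::printf("remaining scalar reduction value: %d\n", Leftovers.front());
  return 0;
}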
+#ifndef NDEBUG + SmallSet<Value *, 4> IgnoreSet; + for (ArrayRef<Value *> RdxOps : ReductionOps) + IgnoreSet.insert(RdxOps.begin(), RdxOps.end()); +#endif + for (ArrayRef<Value *> RdxOps : ReductionOps) { + for (Value *Ignore : RdxOps) { + if (!Ignore) + continue; +#ifndef NDEBUG + for (auto *U : Ignore->users()) { + assert(IgnoreSet.count(U) && + "All users must be either in the reduction ops list."); + } +#endif + if (!Ignore->use_empty()) { + Value *Undef = UndefValue::get(Ignore->getType()); + Ignore->replaceAllUsesWith(Undef); + } + V.eraseInstruction(cast<Instruction>(Ignore)); + } + } + } else if (!CheckForReusedReductionOps) { + for (ReductionOpsType &RdxOps : ReductionOps) + for (Value *RdxOp : RdxOps) + V.analyzedReductionRoot(cast<Instruction>(RdxOp)); } return VectorizedTree; } - unsigned numReductionValues() const { return ReducedVals.size(); } - private: /// Calculate the cost of a reduction. InstructionCost getReductionCost(TargetTransformInfo *TTI, - Value *FirstReducedVal, unsigned ReduxWidth, - FastMathFlags FMF) { + ArrayRef<Value *> ReducedVals, + unsigned ReduxWidth, FastMathFlags FMF) { TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + Value *FirstReducedVal = ReducedVals.front(); Type *ScalarTy = FirstReducedVal->getType(); FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth); - InstructionCost VectorCost, ScalarCost; + InstructionCost VectorCost = 0, ScalarCost; + // If all of the reduced values are constant, the vector cost is 0, since + // the reduction value can be calculated at the compile time. + bool AllConsts = all_of(ReducedVals, isConstant); switch (RdxKind) { case RecurKind::Add: case RecurKind::Mul: @@ -9222,17 +11407,22 @@ private: case RecurKind::FAdd: case RecurKind::FMul: { unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind); - VectorCost = - TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind); + if (!AllConsts) + VectorCost = + TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind); ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind); break; } case RecurKind::FMax: case RecurKind::FMin: { auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy); - auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); - VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, - /*IsUnsigned=*/false, CostKind); + if (!AllConsts) { + auto *VecCondTy = + cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); + VectorCost = + TTI->getMinMaxReductionCost(VectorTy, VecCondTy, + /*IsUnsigned=*/false, CostKind); + } CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind); ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy, SclCondTy, RdxPred, CostKind) + @@ -9245,11 +11435,14 @@ private: case RecurKind::UMax: case RecurKind::UMin: { auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy); - auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); - bool IsUnsigned = - RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin; - VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned, - CostKind); + if (!AllConsts) { + auto *VecCondTy = + cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); + bool IsUnsigned = + RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin; + VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, + IsUnsigned, CostKind); + } CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind); ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy, SclCondTy, RdxPred, 
CostKind) + @@ -9463,7 +11656,8 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) { /// performed. static bool tryToVectorizeHorReductionOrInstOperands( PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, - TargetTransformInfo *TTI, + TargetTransformInfo *TTI, ScalarEvolution &SE, const DataLayout &DL, + const TargetLibraryInfo &TLI, const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) { if (!ShouldVectorizeHor) return false; @@ -9482,7 +11676,7 @@ static bool tryToVectorizeHorReductionOrInstOperands( // horizontal reduction. // Interrupt the process if the Root instruction itself was vectorized or all // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized. - // Skip the analysis of CmpInsts.Compiler implements postanalysis of the + // Skip the analysis of CmpInsts. Compiler implements postanalysis of the // CmpInsts so we can skip extra attempts in // tryToVectorizeHorReductionOrInstOperands and save compile time. std::queue<std::pair<Instruction *, unsigned>> Stack; @@ -9490,13 +11684,16 @@ static bool tryToVectorizeHorReductionOrInstOperands( SmallPtrSet<Value *, 8> VisitedInstrs; SmallVector<WeakTrackingVH> PostponedInsts; bool Res = false; - auto &&TryToReduce = [TTI, &P, &R](Instruction *Inst, Value *&B0, - Value *&B1) -> Value * { + auto &&TryToReduce = [TTI, &SE, &DL, &P, &R, &TLI](Instruction *Inst, + Value *&B0, + Value *&B1) -> Value * { + if (R.isAnalyzedReductionRoot(Inst)) + return nullptr; bool IsBinop = matchRdxBop(Inst, B0, B1); bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value())); if (IsBinop || IsSelect) { HorizontalReduction HorRdx; - if (HorRdx.matchAssociativeReduction(P, Inst)) + if (HorRdx.matchAssociativeReduction(P, Inst, SE, DL, TLI)) return HorRdx.tryToReduce(R, TTI); } return nullptr; @@ -9541,7 +11738,7 @@ static bool tryToVectorizeHorReductionOrInstOperands( // Do not try to vectorize CmpInst operands, this is done separately. // Final attempt for binop args vectorization should happen after the loop // to try to find reductions. - if (!isa<CmpInst>(Inst)) + if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Inst)) PostponedInsts.push_back(Inst); } @@ -9554,8 +11751,8 @@ static bool tryToVectorizeHorReductionOrInstOperands( if (auto *I = dyn_cast<Instruction>(Op)) // Do not try to vectorize CmpInst operands, this is done // separately. - if (!isa<PHINode>(I) && !isa<CmpInst>(I) && !R.isDeleted(I) && - I->getParent() == BB) + if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) && + !R.isDeleted(I) && I->getParent() == BB) Stack.emplace(I, Level); } // Try to vectorized binops where reductions were not found. 
@@ -9579,8 +11776,8 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool { return tryToVectorize(I, R); }; - return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, - ExtraVectorization); + return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, *SE, *DL, + *TLI, ExtraVectorization); } bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, @@ -9748,12 +11945,16 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions( for (auto *I : reverse(Instructions)) { if (R.isDeleted(I)) continue; - if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) + if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) { OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); - else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) + } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) { OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R); - else if (isa<CmpInst>(I)) + } else if (isa<CmpInst>(I)) { PostponedCmps.push_back(I); + continue; + } + // Try to find reductions in buildvector sequnces. + OpsChanged |= vectorizeRootInstruction(nullptr, I, BB, R, TTI); } if (AtTerminator) { // Try to find reductions first. @@ -10171,7 +12372,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { DomTreeNodeBase<llvm::BasicBlock> *NodeI2 = DT->getNode(I2->getParent()); assert(NodeI1 && "Should only process reachable instructions"); - assert(NodeI1 && "Should only process reachable instructions"); + assert(NodeI2 && "Should only process reachable instructions"); assert((NodeI1 == NodeI2) == (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) && "Different nodes should have different DFS numbers"); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 8822c0004eb2..97f2b1a93815 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -72,17 +72,17 @@ class VPRecipeBuilder { VPRecipeBase *tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlanPtr &Plan); - /// Check if an induction recipe should be constructed for \I. If so build and - /// return it. If not, return null. - VPWidenIntOrFpInductionRecipe * - tryToOptimizeInductionPHI(PHINode *Phi, ArrayRef<VPValue *> Operands, - VFRange &Range) const; + /// Check if an induction recipe should be constructed for \p Phi. If so build + /// and return it. If not, return null. + VPRecipeBase *tryToOptimizeInductionPHI(PHINode *Phi, + ArrayRef<VPValue *> Operands, + VPlan &Plan, VFRange &Range); /// Optimize the special case where the operand of \p I is a constant integer /// induction variable. VPWidenIntOrFpInductionRecipe * tryToOptimizeInductionTruncate(TruncInst *I, ArrayRef<VPValue *> Operands, - VFRange &Range, VPlan &Plan) const; + VFRange &Range, VPlan &Plan); /// Handle non-loop phi nodes. Return a VPValue, if all incoming values match /// or a new VPBlendRecipe otherwise. 
Currently all such phi nodes are turned diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp index 342d4a074e10..4d709097c306 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -23,11 +23,10 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" -#include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" -#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" @@ -35,13 +34,13 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GenericDomTreeConstruction.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include <cassert> -#include <iterator> #include <string> #include <vector> @@ -60,7 +59,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) { } #endif -Value *VPLane::getAsRuntimeExpr(IRBuilder<> &Builder, +Value *VPLane::getAsRuntimeExpr(IRBuilderBase &Builder, const ElementCount &VF) const { switch (LaneKind) { case VPLane::Kind::ScalableLast: @@ -158,25 +157,25 @@ void VPBlockBase::setPlan(VPlan *ParentPlan) { } /// \return the VPBasicBlock that is the exit of Block, possibly indirectly. -const VPBasicBlock *VPBlockBase::getExitBasicBlock() const { +const VPBasicBlock *VPBlockBase::getExitingBasicBlock() const { const VPBlockBase *Block = this; while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) - Block = Region->getExit(); + Block = Region->getExiting(); return cast<VPBasicBlock>(Block); } -VPBasicBlock *VPBlockBase::getExitBasicBlock() { +VPBasicBlock *VPBlockBase::getExitingBasicBlock() { VPBlockBase *Block = this; while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) - Block = Region->getExit(); + Block = Region->getExiting(); return cast<VPBasicBlock>(Block); } VPBlockBase *VPBlockBase::getEnclosingBlockWithSuccessors() { if (!Successors.empty() || !Parent) return this; - assert(Parent->getExit() == this && - "Block w/o successors not the exit of its parent."); + assert(Parent->getExiting() == this && + "Block w/o successors not the exiting block of its parent."); return Parent->getEnclosingBlockWithSuccessors(); } @@ -188,28 +187,6 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() { return Parent->getEnclosingBlockWithPredecessors(); } -VPValue *VPBlockBase::getCondBit() { - return CondBitUser.getSingleOperandOrNull(); -} - -const VPValue *VPBlockBase::getCondBit() const { - return CondBitUser.getSingleOperandOrNull(); -} - -void VPBlockBase::setCondBit(VPValue *CV) { CondBitUser.resetSingleOpUser(CV); } - -VPValue *VPBlockBase::getPredicate() { - return PredicateUser.getSingleOperandOrNull(); -} - -const VPValue *VPBlockBase::getPredicate() const { - return PredicateUser.getSingleOperandOrNull(); -} - -void VPBlockBase::setPredicate(VPValue *CV) { - PredicateUser.resetSingleOpUser(CV); -} - void VPBlockBase::deleteCFG(VPBlockBase *Entry) { SmallVector<VPBlockBase *, 8> Blocks(depth_first(Entry)); @@ -245,6 +222,52 @@ Value 
*VPTransformState::get(VPValue *Def, const VPIteration &Instance) { // set(Def, Extract, Instance); return Extract; } +BasicBlock *VPTransformState::CFGState::getPreheaderBBFor(VPRecipeBase *R) { + VPRegionBlock *LoopRegion = R->getParent()->getEnclosingLoopRegion(); + return VPBB2IRBB[LoopRegion->getPreheaderVPBB()]; +} + +void VPTransformState::addNewMetadata(Instruction *To, + const Instruction *Orig) { + // If the loop was versioned with memchecks, add the corresponding no-alias + // metadata. + if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) + LVer->annotateInstWithNoAlias(To, Orig); +} + +void VPTransformState::addMetadata(Instruction *To, Instruction *From) { + propagateMetadata(To, From); + addNewMetadata(To, From); +} + +void VPTransformState::addMetadata(ArrayRef<Value *> To, Instruction *From) { + for (Value *V : To) { + if (Instruction *I = dyn_cast<Instruction>(V)) + addMetadata(I, From); + } +} + +void VPTransformState::setDebugLocFromInst(const Value *V) { + if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) { + const DILocation *DIL = Inst->getDebugLoc(); + + // When a FSDiscriminator is enabled, we don't need to add the multiply + // factors to the discriminators. + if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && + !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) { + // FIXME: For scalable vectors, assume vscale=1. + auto NewDIL = + DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); + if (NewDIL) + Builder.SetCurrentDebugLocation(*NewDIL); + else + LLVM_DEBUG(dbgs() << "Failed to create new discriminator: " + << DIL->getFilename() << " Line: " << DIL->getLine()); + } else + Builder.SetCurrentDebugLocation(DIL); + } else + Builder.SetCurrentDebugLocation(DebugLoc()); +} BasicBlock * VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { @@ -252,43 +275,36 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { // Pred stands for Predessor. Prev stands for Previous - last visited/created. BasicBlock *PrevBB = CFG.PrevBB; BasicBlock *NewBB = BasicBlock::Create(PrevBB->getContext(), getName(), - PrevBB->getParent(), CFG.LastBB); + PrevBB->getParent(), CFG.ExitBB); LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n'); // Hook up the new basic block to its predecessors. for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) { - VPBasicBlock *PredVPBB = PredVPBlock->getExitBasicBlock(); - auto &PredVPSuccessors = PredVPBB->getSuccessors(); + VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock(); + auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors(); BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB]; - // In outer loop vectorization scenario, the predecessor BBlock may not yet - // be visited(backedge). Mark the VPBasicBlock for fixup at the end of - // vectorization. We do not encounter this case in inner loop vectorization - // as we start out by building a loop skeleton with the vector loop header - // and latch blocks. As a result, we never enter this function for the - // header block in the non VPlan-native path. 
- if (!PredBB) { - assert(EnableVPlanNativePath && - "Unexpected null predecessor in non VPlan-native path"); - CFG.VPBBsToFix.push_back(PredVPBB); - continue; - } - assert(PredBB && "Predecessor basic-block not found building successor."); auto *PredBBTerminator = PredBB->getTerminator(); LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n'); + + auto *TermBr = dyn_cast<BranchInst>(PredBBTerminator); if (isa<UnreachableInst>(PredBBTerminator)) { assert(PredVPSuccessors.size() == 1 && "Predecessor ending w/o branch must have single successor."); + DebugLoc DL = PredBBTerminator->getDebugLoc(); PredBBTerminator->eraseFromParent(); - BranchInst::Create(NewBB, PredBB); + auto *Br = BranchInst::Create(NewBB, PredBB); + Br->setDebugLoc(DL); + } else if (TermBr && !TermBr->isConditional()) { + TermBr->setSuccessor(0, NewBB); } else { - assert(PredVPSuccessors.size() == 2 && - "Predecessor ending with branch must have two successors."); + // Set each forward successor here when it is created, excluding + // backedges. A backward successor is set when the branch is created. unsigned idx = PredVPSuccessors.front() == this ? 0 : 1; - assert(!PredBBTerminator->getSuccessor(idx) && + assert(!TermBr->getSuccessor(idx) && "Trying to reset an existing successor block."); - PredBBTerminator->setSuccessor(idx, NewBB); + TermBr->setSuccessor(idx, NewBB); } } return NewBB; @@ -300,27 +316,51 @@ void VPBasicBlock::execute(VPTransformState *State) { VPBlockBase *SingleHPred = nullptr; BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible. - // 1. Create an IR basic block, or reuse the last one if possible. - // The last IR basic block is reused, as an optimization, in three cases: - // A. the first VPBB reuses the loop header BB - when PrevVPBB is null; - // B. when the current VPBB has a single (hierarchical) predecessor which - // is PrevVPBB and the latter has a single (hierarchical) successor; and - // C. when the current VPBB is an entry of a region replica - where PrevVPBB - // is the exit of this region from a previous instance, or the predecessor - // of this region. - if (PrevVPBB && /* A */ - !((SingleHPred = getSingleHierarchicalPredecessor()) && - SingleHPred->getExitBasicBlock() == PrevVPBB && - PrevVPBB->getSingleHierarchicalSuccessor()) && /* B */ - !(Replica && getPredecessors().empty())) { /* C */ + auto IsLoopRegion = [](VPBlockBase *BB) { + auto *R = dyn_cast<VPRegionBlock>(BB); + return R && !R->isReplicator(); + }; + + // 1. Create an IR basic block, or reuse the last one or ExitBB if possible. + if (getPlan()->getVectorLoopRegion()->getSingleSuccessor() == this) { + // ExitBB can be re-used for the exit block of the Plan. + NewBB = State->CFG.ExitBB; + State->CFG.PrevBB = NewBB; + + // Update the branch instruction in the predecessor to branch to ExitBB. + VPBlockBase *PredVPB = getSingleHierarchicalPredecessor(); + VPBasicBlock *ExitingVPBB = PredVPB->getExitingBasicBlock(); + assert(PredVPB->getSingleSuccessor() == this && + "predecessor must have the current block as only successor"); + BasicBlock *ExitingBB = State->CFG.VPBB2IRBB[ExitingVPBB]; + // The Exit block of a loop is always set to be successor 0 of the Exiting + // block. 
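The three terminator cases handled above are easier to see side by side. A condensed sketch using the same LLVM calls as the hunk; the helper name and the way Idx is obtained are illustrative only:

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Hook PredBB up to NewBB. Idx is the free successor slot of a conditional
    // branch (0 or 1, depending on which successor the new block is).
    static void connectToNewBlock(BasicBlock *PredBB, BasicBlock *NewBB,
                                  unsigned Idx) {
      Instruction *Term = PredBB->getTerminator();
      if (isa<UnreachableInst>(Term)) {
        // Placeholder terminator: swap it for an unconditional branch and keep
        // the original debug location.
        DebugLoc DL = Term->getDebugLoc();
        Term->eraseFromParent();
        BranchInst::Create(NewBB, PredBB)->setDebugLoc(DL);
      } else if (auto *Br = dyn_cast<BranchInst>(Term)) {
        if (!Br->isConditional())
          Br->setSuccessor(0, NewBB);   // retarget the single forward edge
        else
          Br->setSuccessor(Idx, NewBB); // fill the not-yet-set successor slot
      }
    }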
+ cast<BranchInst>(ExitingBB->getTerminator())->setSuccessor(0, NewBB); + } else if (PrevVPBB && /* A */ + !((SingleHPred = getSingleHierarchicalPredecessor()) && + SingleHPred->getExitingBasicBlock() == PrevVPBB && + PrevVPBB->getSingleHierarchicalSuccessor() && + (SingleHPred->getParent() == getEnclosingLoopRegion() && + !IsLoopRegion(SingleHPred))) && /* B */ + !(Replica && getPredecessors().empty())) { /* C */ + // The last IR basic block is reused, as an optimization, in three cases: + // A. the first VPBB reuses the loop pre-header BB - when PrevVPBB is null; + // B. when the current VPBB has a single (hierarchical) predecessor which + // is PrevVPBB and the latter has a single (hierarchical) successor which + // both are in the same non-replicator region; and + // C. when the current VPBB is an entry of a region replica - where PrevVPBB + // is the exiting VPBB of this region from a previous instance, or the + // predecessor of this region. + NewBB = createEmptyBasicBlock(State->CFG); State->Builder.SetInsertPoint(NewBB); // Temporarily terminate with unreachable until CFG is rewired. UnreachableInst *Terminator = State->Builder.CreateUnreachable(); + // Register NewBB in its loop. In innermost loops its the same for all + // BB's. + if (State->CurrentVectorLoop) + State->CurrentVectorLoop->addBasicBlockToLoop(NewBB, *State->LI); State->Builder.SetInsertPoint(Terminator); - // Register NewBB in its loop. In innermost loops its the same for all BB's. - Loop *L = State->LI->getLoopFor(State->CFG.LastBB); - L->addBasicBlockToLoop(NewBB, *State->LI); State->CFG.PrevBB = NewBB; } @@ -334,29 +374,6 @@ void VPBasicBlock::execute(VPTransformState *State) { for (VPRecipeBase &Recipe : Recipes) Recipe.execute(*State); - VPValue *CBV; - if (EnableVPlanNativePath && (CBV = getCondBit())) { - assert(CBV->getUnderlyingValue() && - "Unexpected null underlying value for condition bit"); - - // Condition bit value in a VPBasicBlock is used as the branch selector. In - // the VPlan-native path case, since all branches are uniform we generate a - // branch instruction using the condition value from vector lane 0 and dummy - // successors. The successors are fixed later when the successor blocks are - // visited. - Value *NewCond = State->get(CBV, {0, 0}); - - // Replace the temporary unreachable terminator with the new conditional - // branch. 
- auto *CurrentTerminator = NewBB->getTerminator(); - assert(isa<UnreachableInst>(CurrentTerminator) && - "Expected to replace unreachable terminator with conditional " - "branch."); - auto *CondBr = BranchInst::Create(NewBB, nullptr, NewCond); - CondBr->setSuccessor(0, nullptr); - ReplaceInstWithInst(CurrentTerminator, CondBr); - } - LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB); } @@ -395,6 +412,61 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) { return SplitBlock; } +VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() { + VPRegionBlock *P = getParent(); + if (P && P->isReplicator()) { + P = P->getParent(); + assert(!cast<VPRegionBlock>(P)->isReplicator() && + "unexpected nested replicate regions"); + } + return P; +} + +static bool hasConditionalTerminator(const VPBasicBlock *VPBB) { + if (VPBB->empty()) { + assert( + VPBB->getNumSuccessors() < 2 && + "block with multiple successors doesn't have a recipe as terminator"); + return false; + } + + const VPRecipeBase *R = &VPBB->back(); + auto *VPI = dyn_cast<VPInstruction>(R); + bool IsCondBranch = + isa<VPBranchOnMaskRecipe>(R) || + (VPI && (VPI->getOpcode() == VPInstruction::BranchOnCond || + VPI->getOpcode() == VPInstruction::BranchOnCount)); + (void)IsCondBranch; + + if (VPBB->getNumSuccessors() >= 2 || VPBB->isExiting()) { + assert(IsCondBranch && "block with multiple successors not terminated by " + "conditional branch recipe"); + + return true; + } + + assert( + !IsCondBranch && + "block with 0 or 1 successors terminated by conditional branch recipe"); + return false; +} + +VPRecipeBase *VPBasicBlock::getTerminator() { + if (hasConditionalTerminator(this)) + return &back(); + return nullptr; +} + +const VPRecipeBase *VPBasicBlock::getTerminator() const { + if (hasConditionalTerminator(this)) + return &back(); + return nullptr; +} + +bool VPBasicBlock::isExiting() const { + return getParent()->getExitingBasicBlock() == this; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPBlockBase::printSuccessors(raw_ostream &O, const Twine &Indent) const { if (getSuccessors().empty()) { @@ -411,13 +483,6 @@ void VPBlockBase::printSuccessors(raw_ostream &O, const Twine &Indent) const { void VPBasicBlock::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << getName() << ":\n"; - if (const VPValue *Pred = getPredicate()) { - O << Indent << "BlockPredicate:"; - Pred->printAsOperand(O, SlotTracker); - if (const auto *PredInst = dyn_cast<VPInstruction>(Pred)) - O << " (" << PredInst->getParent()->getName() << ")"; - O << '\n'; - } auto RecipeIndent = Indent + " "; for (const VPRecipeBase &Recipe : *this) { @@ -426,14 +491,6 @@ void VPBasicBlock::print(raw_ostream &O, const Twine &Indent, } printSuccessors(O, Indent); - - if (const VPValue *CBV = getCondBit()) { - O << Indent << "CondBit: "; - CBV->printAsOperand(O, SlotTracker); - if (const auto *CBI = dyn_cast<VPInstruction>(CBV)) - O << " (" << CBI->getParent()->getName() << ")"; - O << '\n'; - } } #endif @@ -448,25 +505,26 @@ void VPRegionBlock::execute(VPTransformState *State) { ReversePostOrderTraversal<VPBlockBase *> RPOT(Entry); if (!isReplicator()) { + // Create and register the new vector loop. 
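A short usage sketch for the terminator query added above; VPBasicBlock::getTerminator and isExiting come from this diff, while the helper itself is hypothetical:

    #include "VPlan.h"
    #include "llvm/ADT/STLExtras.h"
    using namespace llvm;

    // Visit every recipe of a block except its terminator. getTerminator()
    // returns the trailing recipe only when the block ends in a conditional
    // branch recipe (two successors or a region exit), otherwise nullptr.
    static void forEachBodyRecipe(VPBasicBlock *VPBB,
                                  function_ref<void(VPRecipeBase &)> Fn) {
      VPRecipeBase *Term = VPBB->getTerminator();
      for (VPRecipeBase &R : *VPBB)
        if (&R != Term)
          Fn(R);
    }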
+ Loop *PrevLoop = State->CurrentVectorLoop; + State->CurrentVectorLoop = State->LI->AllocateLoop(); + BasicBlock *VectorPH = State->CFG.VPBB2IRBB[getPreheaderVPBB()]; + Loop *ParentLoop = State->LI->getLoopFor(VectorPH); + + // Insert the new loop into the loop nest and register the new basic blocks + // before calling any utilities such as SCEV that require valid LoopInfo. + if (ParentLoop) + ParentLoop->addChildLoop(State->CurrentVectorLoop); + else + State->LI->addTopLevelLoop(State->CurrentVectorLoop); + // Visit the VPBlocks connected to "this", starting from it. for (VPBlockBase *Block : RPOT) { - if (EnableVPlanNativePath) { - // The inner loop vectorization path does not represent loop preheader - // and exit blocks as part of the VPlan. In the VPlan-native path, skip - // vectorizing loop preheader block. In future, we may replace this - // check with the check for loop preheader. - if (Block->getNumPredecessors() == 0) - continue; - - // Skip vectorizing loop exit block. In future, we may replace this - // check with the check for loop exit. - if (Block->getNumSuccessors() == 0) - continue; - } - LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); Block->execute(State); } + + State->CurrentVectorLoop = PrevLoop; return; } @@ -508,341 +566,32 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent, } #endif -bool VPRecipeBase::mayWriteToMemory() const { - switch (getVPDefID()) { - case VPWidenMemoryInstructionSC: { - return cast<VPWidenMemoryInstructionRecipe>(this)->isStore(); - } - case VPReplicateSC: - case VPWidenCallSC: - return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) - ->mayWriteToMemory(); - case VPBranchOnMaskSC: - return false; - case VPWidenIntOrFpInductionSC: - case VPWidenCanonicalIVSC: - case VPWidenPHISC: - case VPBlendSC: - case VPWidenSC: - case VPWidenGEPSC: - case VPReductionSC: - case VPWidenSelectSC: { - const Instruction *I = - dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); - (void)I; - assert((!I || !I->mayWriteToMemory()) && - "underlying instruction may write to memory"); - return false; - } - default: - return true; - } -} - -bool VPRecipeBase::mayReadFromMemory() const { - switch (getVPDefID()) { - case VPWidenMemoryInstructionSC: { - return !cast<VPWidenMemoryInstructionRecipe>(this)->isStore(); - } - case VPReplicateSC: - case VPWidenCallSC: - return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) - ->mayReadFromMemory(); - case VPBranchOnMaskSC: - return false; - case VPWidenIntOrFpInductionSC: - case VPWidenCanonicalIVSC: - case VPWidenPHISC: - case VPBlendSC: - case VPWidenSC: - case VPWidenGEPSC: - case VPReductionSC: - case VPWidenSelectSC: { - const Instruction *I = - dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); - (void)I; - assert((!I || !I->mayReadFromMemory()) && - "underlying instruction may read from memory"); - return false; - } - default: - return true; - } -} - -bool VPRecipeBase::mayHaveSideEffects() const { - switch (getVPDefID()) { - case VPBranchOnMaskSC: - return false; - case VPWidenIntOrFpInductionSC: - case VPWidenCanonicalIVSC: - case VPWidenPHISC: - case VPBlendSC: - case VPWidenSC: - case VPWidenGEPSC: - case VPReductionSC: - case VPWidenSelectSC: { - const Instruction *I = - dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); - (void)I; - assert((!I || !I->mayHaveSideEffects()) && - "underlying instruction has side-effects"); - return false; - } - case VPReplicateSC: { - auto *R = 
cast<VPReplicateRecipe>(this); - return R->getUnderlyingInstr()->mayHaveSideEffects(); - } - default: - return true; - } -} - -void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { - assert(!Parent && "Recipe already in some VPBasicBlock"); - assert(InsertPos->getParent() && - "Insertion position not in any VPBasicBlock"); - Parent = InsertPos->getParent(); - Parent->getRecipeList().insert(InsertPos->getIterator(), this); -} - -void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { - assert(!Parent && "Recipe already in some VPBasicBlock"); - assert(InsertPos->getParent() && - "Insertion position not in any VPBasicBlock"); - Parent = InsertPos->getParent(); - Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this); -} - -void VPRecipeBase::removeFromParent() { - assert(getParent() && "Recipe not in any VPBasicBlock"); - getParent()->getRecipeList().remove(getIterator()); - Parent = nullptr; -} - -iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { - assert(getParent() && "Recipe not in any VPBasicBlock"); - return getParent()->getRecipeList().erase(getIterator()); -} - -void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { - removeFromParent(); - insertAfter(InsertPos); -} - -void VPRecipeBase::moveBefore(VPBasicBlock &BB, - iplist<VPRecipeBase>::iterator I) { - assert(I == BB.end() || I->getParent() == &BB); - removeFromParent(); - Parent = &BB; - BB.getRecipeList().insert(I, this); -} - -void VPInstruction::generateInstruction(VPTransformState &State, - unsigned Part) { - IRBuilder<> &Builder = State.Builder; - Builder.SetCurrentDebugLocation(DL); - - if (Instruction::isBinaryOp(getOpcode())) { - Value *A = State.get(getOperand(0), Part); - Value *B = State.get(getOperand(1), Part); - Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B); - State.set(this, V, Part); - return; - } - - switch (getOpcode()) { - case VPInstruction::Not: { - Value *A = State.get(getOperand(0), Part); - Value *V = Builder.CreateNot(A); - State.set(this, V, Part); - break; - } - case VPInstruction::ICmpULE: { - Value *IV = State.get(getOperand(0), Part); - Value *TC = State.get(getOperand(1), Part); - Value *V = Builder.CreateICmpULE(IV, TC); - State.set(this, V, Part); - break; - } - case Instruction::Select: { - Value *Cond = State.get(getOperand(0), Part); - Value *Op1 = State.get(getOperand(1), Part); - Value *Op2 = State.get(getOperand(2), Part); - Value *V = Builder.CreateSelect(Cond, Op1, Op2); - State.set(this, V, Part); - break; - } - case VPInstruction::ActiveLaneMask: { - // Get first lane of vector induction variable. - Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); - // Get the original loop tripcount. - Value *ScalarTC = State.get(getOperand(1), Part); - - auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); - auto *PredTy = VectorType::get(Int1Ty, State.VF); - Instruction *Call = Builder.CreateIntrinsic( - Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, - {VIVElem0, ScalarTC}, nullptr, "active.lane.mask"); - State.set(this, Call, Part); - break; - } - case VPInstruction::FirstOrderRecurrenceSplice: { - // Generate code to combine the previous and current values in vector v3. 
- // - // vector.ph: - // v_init = vector(..., ..., ..., a[-1]) - // br vector.body - // - // vector.body - // i = phi [0, vector.ph], [i+4, vector.body] - // v1 = phi [v_init, vector.ph], [v2, vector.body] - // v2 = a[i, i+1, i+2, i+3]; - // v3 = vector(v1(3), v2(0, 1, 2)) - - // For the first part, use the recurrence phi (v1), otherwise v2. - auto *V1 = State.get(getOperand(0), 0); - Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1); - if (!PartMinus1->getType()->isVectorTy()) { - State.set(this, PartMinus1, Part); - } else { - Value *V2 = State.get(getOperand(1), Part); - State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1), Part); - } - break; - } - - case VPInstruction::CanonicalIVIncrement: - case VPInstruction::CanonicalIVIncrementNUW: { - Value *Next = nullptr; - if (Part == 0) { - bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW; - auto *Phi = State.get(getOperand(0), 0); - // The loop step is equal to the vectorization factor (num of SIMD - // elements) times the unroll factor (num of SIMD instructions). - Value *Step = - createStepForVF(Builder, Phi->getType(), State.VF, State.UF); - Next = Builder.CreateAdd(Phi, Step, "index.next", IsNUW, false); - } else { - Next = State.get(this, 0); - } - - State.set(this, Next, Part); - break; - } - case VPInstruction::BranchOnCount: { - if (Part != 0) - break; - // First create the compare. - Value *IV = State.get(getOperand(0), Part); - Value *TC = State.get(getOperand(1), Part); - Value *Cond = Builder.CreateICmpEQ(IV, TC); - - // Now create the branch. - auto *Plan = getParent()->getPlan(); - VPRegionBlock *TopRegion = Plan->getVectorLoopRegion(); - VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock(); - if (Header->empty()) { - assert(EnableVPlanNativePath && - "empty entry block only expected in VPlanNativePath"); - Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); +void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, + Value *CanonicalIVStartValue, + VPTransformState &State, + bool IsEpilogueVectorization) { + + VPBasicBlock *ExitingVPBB = getVectorLoopRegion()->getExitingBasicBlock(); + auto *Term = dyn_cast<VPInstruction>(&ExitingVPBB->back()); + // Try to simplify BranchOnCount to 'BranchOnCond true' if TC <= VF * UF when + // preparing to execute the plan for the main vector loop. + if (!IsEpilogueVectorization && Term && + Term->getOpcode() == VPInstruction::BranchOnCount && + isa<ConstantInt>(TripCountV)) { + ConstantInt *C = cast<ConstantInt>(TripCountV); + uint64_t TCVal = C->getZExtValue(); + if (TCVal && TCVal <= State.VF.getKnownMinValue() * State.UF) { + auto *BOC = + new VPInstruction(VPInstruction::BranchOnCond, + {getOrAddExternalDef(State.Builder.getTrue())}); + Term->eraseFromParent(); + ExitingVPBB->appendRecipe(BOC); + // TODO: Further simplifications are possible + // 1. Replace inductions with constants. + // 2. Replace vector loop region with VPBasicBlock. } - // TODO: Once the exit block is modeled in VPlan, use it instead of going - // through State.CFG.LastBB. 
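To make the splice pseudocode above concrete (VF = 4 is an assumed example): the first-order splice of the previous vector v1 with the current vector v2 yields <v1[3], v2[0], v2[1], v2[2]>, i.e. the last old lane followed by the first VF-1 new lanes. A standalone sketch of the lane movement only:

    #include <array>

    // Sketch of Builder.CreateVectorSplice(V1, V2, -1) for an assumed VF of 4.
    std::array<int, 4> firstOrderSplice(const std::array<int, 4> &V1,
                                        const std::array<int, 4> &V2) {
      return {V1[3], V2[0], V2[1], V2[2]};
    }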
- BasicBlock *Exit = - cast<BranchInst>(State.CFG.LastBB->getTerminator())->getSuccessor(0); - - Builder.CreateCondBr(Cond, Exit, State.CFG.VPBB2IRBB[Header]); - Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); - break; - } - default: - llvm_unreachable("Unsupported opcode for instruction"); - } -} - -void VPInstruction::execute(VPTransformState &State) { - assert(!State.Instance && "VPInstruction executing an Instance"); - IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); - State.Builder.setFastMathFlags(FMF); - for (unsigned Part = 0; Part < State.UF; ++Part) - generateInstruction(State, Part); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPInstruction::dump() const { - VPSlotTracker SlotTracker(getParent()->getPlan()); - print(dbgs(), "", SlotTracker); -} - -void VPInstruction::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "EMIT "; - - if (hasResult()) { - printAsOperand(O, SlotTracker); - O << " = "; - } - - switch (getOpcode()) { - case VPInstruction::Not: - O << "not"; - break; - case VPInstruction::ICmpULE: - O << "icmp ule"; - break; - case VPInstruction::SLPLoad: - O << "combined load"; - break; - case VPInstruction::SLPStore: - O << "combined store"; - break; - case VPInstruction::ActiveLaneMask: - O << "active lane mask"; - break; - case VPInstruction::FirstOrderRecurrenceSplice: - O << "first-order splice"; - break; - case VPInstruction::CanonicalIVIncrement: - O << "VF * UF + "; - break; - case VPInstruction::CanonicalIVIncrementNUW: - O << "VF * UF +(nuw) "; - break; - case VPInstruction::BranchOnCount: - O << "branch-on-count "; - break; - default: - O << Instruction::getOpcodeName(getOpcode()); - } - - O << FMF; - - for (const VPValue *Operand : operands()) { - O << " "; - Operand->printAsOperand(O, SlotTracker); } - if (DL) { - O << ", !dbg "; - DL.print(O); - } -} -#endif - -void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) { - // Make sure the VPInstruction is a floating-point operation. - assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul || - Opcode == Instruction::FNeg || Opcode == Instruction::FSub || - Opcode == Instruction::FDiv || Opcode == Instruction::FRem || - Opcode == Instruction::FCmp) && - "this op can't take fast-math flags"); - FMF = FMFNew; -} - -void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, - Value *CanonicalIVStartValue, - VPTransformState &State) { // Check if the trip count is needed, and if so build it. if (TripCount && TripCount->getNumUsers()) { for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) @@ -868,111 +617,78 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, // When vectorizing the epilogue loop, the canonical induction start value // needs to be changed from zero to the value after the main vector loop. 
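To make the guard in prepareToExecute above concrete (the numbers are assumed): for a constant trip count of 6 with VF = 4 and UF = 2, VF * UF = 8 >= 6, so the vector loop body is known to execute exactly once and BranchOnCount can be folded to 'BranchOnCond true'. A standalone sketch of just that check:

    #include <cstdint>

    // Sketch only; the example values above (TC=6, VF=4, UF=2) are assumed.
    bool fitsInSingleVectorIteration(uint64_t TCVal, unsigned KnownMinVF,
                                     unsigned UF) {
      return TCVal != 0 && TCVal <= uint64_t(KnownMinVF) * UF;
    }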
if (CanonicalIVStartValue) { - VPValue *VPV = new VPValue(CanonicalIVStartValue); - addExternalDef(VPV); + VPValue *VPV = getOrAddExternalDef(CanonicalIVStartValue); auto *IV = getCanonicalIV(); assert(all_of(IV->users(), [](const VPUser *U) { + if (isa<VPScalarIVStepsRecipe>(U)) + return true; auto *VPI = cast<VPInstruction>(U); return VPI->getOpcode() == VPInstruction::CanonicalIVIncrement || VPI->getOpcode() == VPInstruction::CanonicalIVIncrementNUW; }) && - "the canonical IV should only be used by its increments when " + "the canonical IV should only be used by its increments or " + "ScalarIVSteps when " "resetting the start value"); IV->setOperand(0, VPV); } } -/// Generate the code inside the body of the vectorized loop. Assumes a single -/// LoopVectorBody basic-block was created for this. Introduce additional -/// basic-blocks as needed, and fill them all. +/// Generate the code inside the preheader and body of the vectorized loop. +/// Assumes a single pre-header basic-block was created for this. Introduce +/// additional basic-blocks as needed, and fill them all. void VPlan::execute(VPTransformState *State) { - // 0. Set the reverse mapping from VPValues to Values for code generation. + // Set the reverse mapping from VPValues to Values for code generation. for (auto &Entry : Value2VPValue) State->VPValue2Value[Entry.second] = Entry.first; - BasicBlock *VectorPreHeaderBB = State->CFG.PrevBB; - State->CFG.VectorPreHeader = VectorPreHeaderBB; - BasicBlock *VectorHeaderBB = VectorPreHeaderBB->getSingleSuccessor(); - assert(VectorHeaderBB && "Loop preheader does not have a single successor."); - - // 1. Make room to generate basic-blocks inside loop body if needed. - BasicBlock *VectorLatchBB = VectorHeaderBB->splitBasicBlock( - VectorHeaderBB->getFirstInsertionPt(), "vector.body.latch"); - Loop *L = State->LI->getLoopFor(VectorHeaderBB); - L->addBasicBlockToLoop(VectorLatchBB, *State->LI); - // Remove the edge between Header and Latch to allow other connections. - // Temporarily terminate with unreachable until CFG is rewired. - // Note: this asserts the generated code's assumption that - // getFirstInsertionPt() can be dereferenced into an Instruction. - VectorHeaderBB->getTerminator()->eraseFromParent(); - State->Builder.SetInsertPoint(VectorHeaderBB); - UnreachableInst *Terminator = State->Builder.CreateUnreachable(); - State->Builder.SetInsertPoint(Terminator); - - // 2. Generate code in loop body. + // Initialize CFG state. State->CFG.PrevVPBB = nullptr; - State->CFG.PrevBB = VectorHeaderBB; - State->CFG.LastBB = VectorLatchBB; + State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor(); + BasicBlock *VectorPreHeader = State->CFG.PrevBB; + State->Builder.SetInsertPoint(VectorPreHeader->getTerminator()); + // Generate code in the loop pre-header and body. for (VPBlockBase *Block : depth_first(Entry)) Block->execute(State); - // Setup branch terminator successors for VPBBs in VPBBsToFix based on - // VPBB's successors. - for (auto VPBB : State->CFG.VPBBsToFix) { - assert(EnableVPlanNativePath && - "Unexpected VPBBsToFix in non VPlan-native path"); - BasicBlock *BB = State->CFG.VPBB2IRBB[VPBB]; - assert(BB && "Unexpected null basic block for VPBB"); - - unsigned Idx = 0; - auto *BBTerminator = BB->getTerminator(); - - for (VPBlockBase *SuccVPBlock : VPBB->getHierarchicalSuccessors()) { - VPBasicBlock *SuccVPBB = SuccVPBlock->getEntryBasicBlock(); - BBTerminator->setSuccessor(Idx, State->CFG.VPBB2IRBB[SuccVPBB]); - ++Idx; - } - } - - // 3. 
Merge the temporary latch created with the last basic-block filled. - BasicBlock *LastBB = State->CFG.PrevBB; - assert(isa<BranchInst>(LastBB->getTerminator()) && - "Expected VPlan CFG to terminate with branch"); - - // Move both the branch and check from LastBB to VectorLatchBB. - auto *LastBranch = cast<BranchInst>(LastBB->getTerminator()); - LastBranch->moveBefore(VectorLatchBB->getTerminator()); - VectorLatchBB->getTerminator()->eraseFromParent(); - // Move condition so it is guaranteed to be next to branch. This is only done - // to avoid excessive test updates. - // TODO: Remove special handling once the increments for all inductions are - // modeled explicitly in VPlan. - cast<Instruction>(LastBranch->getCondition())->moveBefore(LastBranch); - // Connect LastBB to VectorLatchBB to facilitate their merge. - BranchInst::Create(VectorLatchBB, LastBB); - - // Merge LastBB with Latch. - bool Merged = MergeBlockIntoPredecessor(VectorLatchBB, nullptr, State->LI); - (void)Merged; - assert(Merged && "Could not merge last basic block with latch."); - VectorLatchBB = LastBB; + VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock(); + BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB]; // Fix the latch value of canonical, reduction and first-order recurrences // phis in the vector loop. - VPBasicBlock *Header = Entry->getEntryBasicBlock(); - if (Header->empty()) { - assert(EnableVPlanNativePath); - Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); - } + VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); for (VPRecipeBase &R : Header->phis()) { // Skip phi-like recipes that generate their backedege values themselves. - // TODO: Model their backedge values explicitly. - if (isa<VPWidenIntOrFpInductionRecipe>(&R) || isa<VPWidenPHIRecipe>(&R)) + if (isa<VPWidenPHIRecipe>(&R)) + continue; + + if (isa<VPWidenPointerInductionRecipe>(&R) || + isa<VPWidenIntOrFpInductionRecipe>(&R)) { + PHINode *Phi = nullptr; + if (isa<VPWidenIntOrFpInductionRecipe>(&R)) { + Phi = cast<PHINode>(State->get(R.getVPSingleValue(), 0)); + } else { + auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R); + // TODO: Split off the case that all users of a pointer phi are scalar + // from the VPWidenPointerInductionRecipe. + if (WidenPhi->onlyScalarsGenerated(State->VF)) + continue; + + auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi, 0)); + Phi = cast<PHINode>(GEP->getPointerOperand()); + } + + Phi->setIncomingBlock(1, VectorLatchBB); + + // Move the last step to the end of the latch block. This ensures + // consistent placement of all induction updates. + Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1)); + Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode()); continue; + } auto *PhiR = cast<VPHeaderPHIRecipe>(&R); // For canonical IV, first-order recurrences and in-order reduction phis, @@ -993,9 +709,12 @@ void VPlan::execute(VPTransformState *State) { } // We do not attempt to preserve DT for outer loop vectorization currently. 
- if (!EnableVPlanNativePath) - updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB, - L->getExitBlock()); + if (!EnableVPlanNativePath) { + BasicBlock *VectorHeaderBB = State->CFG.VPBB2IRBB[Header]; + State->DT->addNewBlock(VectorHeaderBB, VectorPreHeader); + updateDominatorTree(State->DT, VectorHeaderBB, VectorLatchBB, + State->CFG.ExitBB); + } } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1021,6 +740,17 @@ void VPlan::print(raw_ostream &O) const { O << '\n'; Block->print(O, "", SlotTracker); } + + if (!LiveOuts.empty()) + O << "\n"; + for (auto &KV : LiveOuts) { + O << "Live-out "; + KV.second->getPhi()->printAsOperand(O); + O << " = "; + KV.second->getOperand(0)->printAsOperand(O, SlotTracker); + O << "\n"; + } + O << "}\n"; } @@ -1034,11 +764,14 @@ LLVM_DUMP_METHOD void VPlan::dump() const { print(dbgs()); } #endif -void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, +void VPlan::addLiveOut(PHINode *PN, VPValue *V) { + assert(LiveOuts.count(PN) == 0 && "an exit value for PN already exists"); + LiveOuts.insert({PN, new VPLiveOut(PN, V)}); +} + +void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopHeaderBB, BasicBlock *LoopLatchBB, BasicBlock *LoopExitBB) { - BasicBlock *LoopHeaderBB = LoopPreHeaderBB->getSingleSuccessor(); - assert(LoopHeaderBB && "Loop preheader does not have a single successor."); // The vector body may be more than a single basic-block by this point. // Update the dominator tree information inside the vector body by propagating // it from header to latch, expecting only triangular control-flow, if any. @@ -1075,6 +808,7 @@ void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + Twine VPlanPrinter::getUID(const VPBlockBase *Block) { return (isa<VPRegionBlock>(Block) ? "cluster_N" : "N") + Twine(getOrCreateBID(Block)); @@ -1122,8 +856,8 @@ void VPlanPrinter::dumpBlock(const VPBlockBase *Block) { void VPlanPrinter::drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden, const Twine &Label) { // Due to "dot" we print an edge between two regions as an edge between the - // exit basic block and the entry basic of the respective regions. - const VPBlockBase *Tail = From->getExitBasicBlock(); + // exiting basic block and the entry basic of the respective regions. + const VPBlockBase *Tail = From->getExitingBasicBlock(); const VPBlockBase *Head = To->getEntryBasicBlock(); OS << Indent << getUID(Tail) << " -> " << getUID(Head); OS << " [ label=\"" << Label << '\"'; @@ -1213,328 +947,6 @@ void VPlanIngredient::print(raw_ostream &O) const { V->printAsOperand(O, false); } -void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-CALL "; - - auto *CI = cast<CallInst>(getUnderlyingInstr()); - if (CI->getType()->isVoidTy()) - O << "void "; - else { - printAsOperand(O, SlotTracker); - O << " = "; - } - - O << "call @" << CI->getCalledFunction()->getName() << "("; - printOperands(O, SlotTracker); - O << ")"; -} - -void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-SELECT "; - printAsOperand(O, SlotTracker); - O << " = select "; - getOperand(0)->printAsOperand(O, SlotTracker); - O << ", "; - getOperand(1)->printAsOperand(O, SlotTracker); - O << ", "; - getOperand(2)->printAsOperand(O, SlotTracker); - O << (InvariantCond ? 
" (condition is loop invariant)" : ""); -} - -void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN "; - printAsOperand(O, SlotTracker); - O << " = " << getUnderlyingInstr()->getOpcodeName() << " "; - printOperands(O, SlotTracker); -} - -void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-INDUCTION"; - if (getTruncInst()) { - O << "\\l\""; - O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; - O << " +\n" << Indent << "\" "; - getVPValue(0)->printAsOperand(O, SlotTracker); - } else - O << " " << VPlanIngredient(IV); -} -#endif - -bool VPWidenIntOrFpInductionRecipe::isCanonical() const { - auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); - auto *StepC = dyn_cast<SCEVConstant>(getInductionDescriptor().getStep()); - return StartC && StartC->isZero() && StepC && StepC->isOne(); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-GEP "; - O << (IsPtrLoopInvariant ? "Inv" : "Var"); - size_t IndicesNumber = IsIndexLoopInvariant.size(); - for (size_t I = 0; I < IndicesNumber; ++I) - O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]"; - - O << " "; - printAsOperand(O, SlotTracker); - O << " = getelementptr "; - printOperands(O, SlotTracker); -} - -void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-PHI "; - - auto *OriginalPhi = cast<PHINode>(getUnderlyingValue()); - // Unless all incoming values are modeled in VPlan print the original PHI - // directly. - // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming - // values as VPValues. - if (getNumOperands() != OriginalPhi->getNumOperands()) { - O << VPlanIngredient(OriginalPhi); - return; - } - - printAsOperand(O, SlotTracker); - O << " = phi "; - printOperands(O, SlotTracker); -} - -void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "BLEND "; - Phi->printAsOperand(O, false); - O << " ="; - if (getNumIncomingValues() == 1) { - // Not a User of any mask: not really blending, this is a - // single-predecessor phi. - O << " "; - getIncomingValue(0)->printAsOperand(O, SlotTracker); - } else { - for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { - O << " "; - getIncomingValue(I)->printAsOperand(O, SlotTracker); - O << "/"; - getMask(I)->printAsOperand(O, SlotTracker); - } - } -} - -void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "REDUCE "; - printAsOperand(O, SlotTracker); - O << " = "; - getChainOp()->printAsOperand(O, SlotTracker); - O << " +"; - if (isa<FPMathOperator>(getUnderlyingInstr())) - O << getUnderlyingInstr()->getFastMathFlags(); - O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " ("; - getVecOp()->printAsOperand(O, SlotTracker); - if (getCondOp()) { - O << ", "; - getCondOp()->printAsOperand(O, SlotTracker); - } - O << ")"; -} - -void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << (IsUniform ? 
"CLONE " : "REPLICATE "); - - if (!getUnderlyingInstr()->getType()->isVoidTy()) { - printAsOperand(O, SlotTracker); - O << " = "; - } - O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " "; - printOperands(O, SlotTracker); - - if (AlsoPack) - O << " (S->V)"; -} - -void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "PHI-PREDICATED-INSTRUCTION "; - printAsOperand(O, SlotTracker); - O << " = "; - printOperands(O, SlotTracker); -} - -void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN "; - - if (!isStore()) { - printAsOperand(O, SlotTracker); - O << " = "; - } - O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " "; - - printOperands(O, SlotTracker); -} -#endif - -void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { - Value *Start = getStartValue()->getLiveInIRValue(); - PHINode *EntryPart = PHINode::Create( - Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt()); - EntryPart->addIncoming(Start, State.CFG.VectorPreHeader); - EntryPart->setDebugLoc(DL); - for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) - State.set(this, EntryPart, Part); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "EMIT "; - printAsOperand(O, SlotTracker); - O << " = CANONICAL-INDUCTION"; -} -#endif - -void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { - Value *CanonicalIV = State.get(getOperand(0), 0); - Type *STy = CanonicalIV->getType(); - IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); - ElementCount VF = State.VF; - Value *VStart = VF.isScalar() - ? CanonicalIV - : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); - for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { - Value *VStep = createStepForVF(Builder, STy, VF, Part); - if (VF.isVector()) { - VStep = Builder.CreateVectorSplat(VF, VStep); - VStep = Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); - } - Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); - State.set(this, CanonicalVectorIV, Part); - } -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "EMIT "; - printAsOperand(O, SlotTracker); - O << " = WIDEN-CANONICAL-INDUCTION "; - printOperands(O, SlotTracker); -} -#endif - -void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { - auto &Builder = State.Builder; - // Create a vector from the initial value. - auto *VectorInit = getStartValue()->getLiveInIRValue(); - - Type *VecTy = State.VF.isScalar() - ? VectorInit->getType() - : VectorType::get(VectorInit->getType(), State.VF); - - if (State.VF.isVector()) { - auto *IdxTy = Builder.getInt32Ty(); - auto *One = ConstantInt::get(IdxTy, 1); - IRBuilder<>::InsertPointGuard Guard(Builder); - Builder.SetInsertPoint(State.CFG.VectorPreHeader->getTerminator()); - auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF); - auto *LastIdx = Builder.CreateSub(RuntimeVF, One); - VectorInit = Builder.CreateInsertElement( - PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init"); - } - - // Create a phi node for the new recurrence. 
- PHINode *EntryPart = PHINode::Create( - VecTy, 2, "vector.recur", &*State.CFG.PrevBB->getFirstInsertionPt()); - EntryPart->addIncoming(VectorInit, State.CFG.VectorPreHeader); - State.set(this, EntryPart, 0); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "FIRST-ORDER-RECURRENCE-PHI "; - printAsOperand(O, SlotTracker); - O << " = phi "; - printOperands(O, SlotTracker); -} -#endif - -void VPReductionPHIRecipe::execute(VPTransformState &State) { - PHINode *PN = cast<PHINode>(getUnderlyingValue()); - auto &Builder = State.Builder; - - // In order to support recurrences we need to be able to vectorize Phi nodes. - // Phi nodes have cycles, so we need to vectorize them in two stages. This is - // stage #1: We create a new vector PHI node with no incoming edges. We'll use - // this value when we vectorize all of the instructions that use the PHI. - bool ScalarPHI = State.VF.isScalar() || IsInLoop; - Type *VecTy = - ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF); - - BasicBlock *HeaderBB = State.CFG.PrevBB; - assert(State.LI->getLoopFor(HeaderBB)->getHeader() == HeaderBB && - "recipe must be in the vector loop header"); - unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF; - for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { - Value *EntryPart = - PHINode::Create(VecTy, 2, "vec.phi", &*HeaderBB->getFirstInsertionPt()); - State.set(this, EntryPart, Part); - } - - // Reductions do not have to start at zero. They can start with - // any loop invariant values. - VPValue *StartVPV = getStartValue(); - Value *StartV = StartVPV->getLiveInIRValue(); - - Value *Iden = nullptr; - RecurKind RK = RdxDesc.getRecurrenceKind(); - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || - RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) { - // MinMax reduction have the start value as their identify. - if (ScalarPHI) { - Iden = StartV; - } else { - IRBuilderBase::InsertPointGuard IPBuilder(Builder); - Builder.SetInsertPoint(State.CFG.VectorPreHeader->getTerminator()); - StartV = Iden = - Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident"); - } - } else { - Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(), - RdxDesc.getFastMathFlags()); - - if (!ScalarPHI) { - Iden = Builder.CreateVectorSplat(State.VF, Iden); - IRBuilderBase::InsertPointGuard IPBuilder(Builder); - Builder.SetInsertPoint(State.CFG.VectorPreHeader->getTerminator()); - Constant *Zero = Builder.getInt32(0); - StartV = Builder.CreateInsertElement(Iden, StartV, Zero); - } - } - - for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { - Value *EntryPart = State.get(this, Part); - // Make sure to add the reduction start value only to the - // first unroll part. - Value *StartVal = (Part == 0) ? 
StartV : Iden; - cast<PHINode>(EntryPart)->addIncoming(StartVal, State.CFG.VectorPreHeader); - } -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-REDUCTION-PHI "; - - printAsOperand(O, SlotTracker); - O << " = phi "; - printOperands(O, SlotTracker); -} #endif template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT); @@ -1594,7 +1006,10 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, continue; assert(isa<VPInstruction>(&VPI) && "Can only handle VPInstructions"); auto *VPInst = cast<VPInstruction>(&VPI); - auto *Inst = cast<Instruction>(VPInst->getUnderlyingValue()); + + auto *Inst = dyn_cast_or_null<Instruction>(VPInst->getUnderlyingValue()); + if (!Inst) + continue; auto *IG = IAI.getInterleaveGroup(Inst); if (!IG) continue; @@ -1622,7 +1037,7 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan, InterleavedAccessInfo &IAI) { Old2NewTy Old2New; - visitRegion(cast<VPRegionBlock>(Plan.getEntry()), Old2New, IAI); + visitRegion(Plan.getVectorLoopRegion(), Old2New, IAI); } void VPSlotTracker::assignSlot(const VPValue *V) { @@ -1632,8 +1047,8 @@ void VPSlotTracker::assignSlot(const VPValue *V) { void VPSlotTracker::assignSlots(const VPlan &Plan) { - for (const VPValue *V : Plan.VPExternalDefs) - assignSlot(V); + for (const auto &P : Plan.VPExternalDefs) + assignSlot(P.second); assignSlot(&Plan.VectorTripCount); if (Plan.BackedgeTakenCount) @@ -1651,7 +1066,19 @@ void VPSlotTracker::assignSlots(const VPlan &Plan) { } bool vputils::onlyFirstLaneUsed(VPValue *Def) { - return all_of(Def->users(), [Def](VPUser *U) { - return cast<VPRecipeBase>(U)->onlyFirstLaneUsed(Def); - }); + return all_of(Def->users(), + [Def](VPUser *U) { return U->onlyFirstLaneUsed(Def); }); +} + +VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, + ScalarEvolution &SE) { + if (auto *E = dyn_cast<SCEVConstant>(Expr)) + return Plan.getOrAddExternalDef(E->getValue()); + if (auto *E = dyn_cast<SCEVUnknown>(Expr)) + return Plan.getOrAddExternalDef(E->getValue()); + + VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock(); + VPValue *Step = new VPExpandSCEVRecipe(Expr, SE); + Preheader->appendRecipe(cast<VPRecipeBase>(Step->getDef())); + return Step; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h index bcaabca692cc..09da4a545d0d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h @@ -25,27 +25,26 @@ #ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H #define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H -#include "VPlanLoopInfo.h" #include "VPlanValue.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/ilist.h" #include "llvm/ADT/ilist_node.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DebugLoc.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/Support/InstructionCost.h" +#include "llvm/IR/FMF.h" +#include 
"llvm/Transforms/Utils/LoopVersioning.h" #include <algorithm> #include <cassert> #include <cstddef> -#include <map> #include <string> namespace llvm { @@ -54,6 +53,7 @@ class BasicBlock; class DominatorTree; class InductionDescriptor; class InnerLoopVectorizer; +class IRBuilderBase; class LoopInfo; class raw_ostream; class RecurrenceDescriptor; @@ -67,10 +67,11 @@ class VPlanSlp; /// Returns a calculation for the total number of elements for a given \p VF. /// For fixed width vectors this value is a constant, whereas for scalable /// vectors it is an expression determined at runtime. -Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF); +Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF); /// Return a value for Step multiplied by VF. -Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, int64_t Step); +Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, + int64_t Step); /// A range of powers-of-2 vectorization factors with fixed start and /// adjustable end. The range includes start and excludes end, e.g.,: @@ -151,7 +152,7 @@ public: /// Returns an expression describing the lane index that can be used at /// runtime. - Value *getAsRuntimeExpr(IRBuilder<> &Builder, const ElementCount &VF) const; + Value *getAsRuntimeExpr(IRBuilderBase &Builder, const ElementCount &VF) const; /// Returns the Kind of lane offset. Kind getKind() const { return LaneKind; } @@ -199,10 +200,10 @@ struct VPIteration { /// needed for generating the output IR. struct VPTransformState { VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI, - DominatorTree *DT, IRBuilder<> &Builder, + DominatorTree *DT, IRBuilderBase &Builder, InnerLoopVectorizer *ILV, VPlan *Plan) - : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan) { - } + : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan), + LVer(nullptr) {} /// The chosen Vectorization and Unroll Factors of the loop being vectorized. ElementCount VF; @@ -298,6 +299,27 @@ struct VPTransformState { Iter->second[Instance.Part][CacheIdx] = V; } + /// Add additional metadata to \p To that was not present on \p Orig. + /// + /// Currently this is used to add the noalias annotations based on the + /// inserted memchecks. Use this for instructions that are *cloned* into the + /// vector loop. + void addNewMetadata(Instruction *To, const Instruction *Orig); + + /// Add metadata from one instruction to another. + /// + /// This includes both the original MDs from \p From and additional ones (\see + /// addNewMetadata). Use this for *newly created* instructions in the vector + /// loop. + void addMetadata(Instruction *To, Instruction *From); + + /// Similar to the previous function but it adds the metadata to a + /// vector of instructions. + void addMetadata(ArrayRef<Value *> To, Instruction *From); + + /// Set the debug location in the builder using the debug location in \p V. + void setDebugLocFromInst(const Value *V); + /// Hold state information used when constructing the CFG of the output IR, /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks. struct CFGState { @@ -308,26 +330,19 @@ struct VPTransformState { /// header BasicBlock. BasicBlock *PrevBB = nullptr; - /// The last IR BasicBlock in the output IR. Set to the new latch - /// BasicBlock, used for placing the newly created BasicBlocks. - BasicBlock *LastBB = nullptr; - - /// The IR BasicBlock that is the preheader of the vector loop in the output - /// IR. 
- /// FIXME: The vector preheader should also be modeled in VPlan, so any code - /// that needs to be added to the preheader gets directly generated by - /// VPlan. There should be no need to manage a pointer to the IR BasicBlock. - BasicBlock *VectorPreHeader = nullptr; + /// The last IR BasicBlock in the output IR. Set to the exit block of the + /// vector loop. + BasicBlock *ExitBB = nullptr; /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case /// of replication, maps the BasicBlock of the last replica created. SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB; - /// Vector of VPBasicBlocks whose terminator instruction needs to be fixed - /// up at the end of vector code generation. - SmallVector<VPBasicBlock *, 8> VPBBsToFix; - CFGState() = default; + + /// Returns the BasicBlock* mapped to the pre-header of the loop region + /// containing \p R. + BasicBlock *getPreheaderBBFor(VPRecipeBase *R); } CFG; /// Hold a pointer to LoopInfo to register new basic blocks in the loop. @@ -337,7 +352,7 @@ struct VPTransformState { DominatorTree *DT; /// Hold a reference to the IRBuilder used to generate output IR code. - IRBuilder<> &Builder; + IRBuilderBase &Builder; VPValue2ValueTy VPValue2Value; @@ -353,41 +368,16 @@ struct VPTransformState { /// Holds recipes that may generate a poison value that is used after /// vectorization, even when their operands are not poison. SmallPtrSet<VPRecipeBase *, 16> MayGeneratePoisonRecipes; -}; - -/// VPUsers instance used by VPBlockBase to manage CondBit and the block -/// predicate. Currently VPBlockUsers are used in VPBlockBase for historical -/// reasons, but in the future the only VPUsers should either be recipes or -/// live-outs.VPBlockBase uses. -struct VPBlockUser : public VPUser { - VPBlockUser() : VPUser({}, VPUserID::Block) {} - VPValue *getSingleOperandOrNull() { - if (getNumOperands() == 1) - return getOperand(0); + /// The loop object for the current parent region, or nullptr. + Loop *CurrentVectorLoop = nullptr; - return nullptr; - } - const VPValue *getSingleOperandOrNull() const { - if (getNumOperands() == 1) - return getOperand(0); - - return nullptr; - } - - void resetSingleOpUser(VPValue *NewVal) { - assert(getNumOperands() <= 1 && "Didn't expect more than one operand!"); - if (!NewVal) { - if (getNumOperands() == 1) - removeLastOperand(); - return; - } - - if (getNumOperands() == 1) - setOperand(0, NewVal); - else - addOperand(NewVal); - } + /// LoopVersioning. It's only set up (non-null) if memchecks were + /// used. + /// + /// This is currently only used to add no-alias metadata based on the + /// memchecks. The actually versioning is performed manually. + std::unique_ptr<LoopVersioning> LVer; }; /// VPBlockBase is the building block of the Hierarchical Control-Flow Graph. @@ -410,16 +400,6 @@ class VPBlockBase { /// List of successor blocks. SmallVector<VPBlockBase *, 1> Successors; - /// Successor selector managed by a VPUser. For blocks with zero or one - /// successors, there is no operand. Otherwise there is exactly one operand - /// which is the branch condition. - VPBlockUser CondBitUser; - - /// If the block is predicated, its predicate is stored as an operand of this - /// VPUser to maintain the def-use relations. Otherwise there is no operand - /// here. - VPBlockUser PredicateUser; - /// VPlan containing the block. Can only be set on the entry block of the /// plan. 
VPlan *Plan = nullptr; @@ -493,11 +473,11 @@ public: const VPBasicBlock *getEntryBasicBlock() const; VPBasicBlock *getEntryBasicBlock(); - /// \return the VPBasicBlock that is the exit of this VPBlockBase, + /// \return the VPBasicBlock that is the exiting this VPBlockBase, /// recursively, if the latter is a VPRegionBlock. Otherwise, if this /// VPBlockBase is a VPBasicBlock, it is returned. - const VPBasicBlock *getExitBasicBlock() const; - VPBasicBlock *getExitBasicBlock(); + const VPBasicBlock *getExitingBasicBlock() const; + VPBasicBlock *getExitingBasicBlock(); const VPBlocksTy &getSuccessors() const { return Successors; } VPBlocksTy &getSuccessors() { return Successors; } @@ -565,20 +545,6 @@ public: return getEnclosingBlockWithPredecessors()->getSinglePredecessor(); } - /// \return the condition bit selecting the successor. - VPValue *getCondBit(); - /// \return the condition bit selecting the successor. - const VPValue *getCondBit() const; - /// Set the condition bit selecting the successor. - void setCondBit(VPValue *CV); - - /// \return the block's predicate. - VPValue *getPredicate(); - /// \return the block's predicate. - const VPValue *getPredicate() const; - /// Set the block's predicate. - void setPredicate(VPValue *Pred); - /// Set a given VPBlockBase \p Successor as the single successor of this /// VPBlockBase. This VPBlockBase is not added as predecessor of \p Successor. /// This VPBlockBase must have no successors. @@ -588,14 +554,11 @@ public: } /// Set two given VPBlockBases \p IfTrue and \p IfFalse to be the two - /// successors of this VPBlockBase. \p Condition is set as the successor - /// selector. This VPBlockBase is not added as predecessor of \p IfTrue or \p - /// IfFalse. This VPBlockBase must have no successors. - void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse, - VPValue *Condition) { + /// successors of this VPBlockBase. This VPBlockBase is not added as + /// predecessor of \p IfTrue or \p IfFalse. This VPBlockBase must have no + /// successors. + void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse) { assert(Successors.empty() && "Setting two successors when others exist."); - assert(Condition && "Setting two successors without condition!"); - setCondBit(Condition); appendSuccessor(IfTrue); appendSuccessor(IfFalse); } @@ -612,11 +575,8 @@ public: /// Remove all the predecessor of this block. void clearPredecessors() { Predecessors.clear(); } - /// Remove all the successors of this block and set to null its condition bit - void clearSuccessors() { - Successors.clear(); - setCondBit(nullptr); - } + /// Remove all the successors of this block. + void clearSuccessors() { Successors.clear(); } /// The method which generates the output IR that correspond to this /// VPBlockBase, thereby "executing" the VPlan. @@ -665,6 +625,32 @@ public: #endif }; +/// A value that is used outside the VPlan. The operand of the user needs to be +/// added to the associated LCSSA phi node. +class VPLiveOut : public VPUser { + PHINode *Phi; + +public: + VPLiveOut(PHINode *Phi, VPValue *Op) + : VPUser({Op}, VPUser::VPUserID::LiveOut), Phi(Phi) {} + + /// Fixup the wrapped LCSSA phi node in the unique exit block. This simply + /// means we need to add the appropriate incoming value from the middle + /// block as exiting edges from the scalar epilogue loop (if present) are + /// already in place, and we exit the vector loop exclusively to the middle + /// block. 
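For orientation, a hedged sketch of how VPLiveOut is meant to be used; VPlan::addLiveOut and VPLiveOut appear in this diff, while the collection loop and the getVPValueFor callback are purely illustrative:

    #include "VPlan.h"
    #include "llvm/ADT/STLExtras.h"
    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Record each LCSSA phi of the loop's unique exit block as a live-out, so
    // that VPLiveOut::fixPhi can later add the incoming value coming from the
    // middle block.
    static void collectLiveOuts(VPlan &Plan, BasicBlock *ExitBB,
                                function_ref<VPValue *(Value *)> getVPValueFor) {
      for (PHINode &ExitPhi : ExitBB->phis()) {
        // Assumes the exit phi currently has a single incoming edge (LCSSA).
        Value *FromLoop = ExitPhi.getIncomingValue(0);
        Plan.addLiveOut(&ExitPhi, getVPValueFor(FromLoop));
      }
    }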
+ void fixPhi(VPlan &Plan, VPTransformState &State); + + /// Returns true if the VPLiveOut uses scalars of operand \p Op. + bool usesScalars(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } + + PHINode *getPhi() const { return Phi; } +}; + /// VPRecipeBase is a base class modeling a sequence of one or more output IR /// instructions. VPRecipeBase owns the the VPValues it defines through VPDef /// and is responsible for deleting its defined values. Single-value @@ -699,6 +685,9 @@ public: /// Insert an unlinked recipe into a basic block immediately before /// the specified recipe. void insertBefore(VPRecipeBase *InsertPos); + /// Insert an unlinked recipe into \p BB immediately before the insertion + /// point \p IP; + void insertBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator IP); /// Insert an unlinked Recipe into a basic block immediately after /// the specified Recipe. @@ -759,14 +748,6 @@ public: bool mayReadOrWriteMemory() const { return mayReadFromMemory() || mayWriteToMemory(); } - - /// Returns true if the recipe only uses the first lane of operand \p Op. - /// Conservatively returns false. - virtual bool onlyFirstLaneUsed(const VPValue *Op) const { - assert(is_contained(operands(), Op) && - "Op must be an operand of the recipe"); - return false; - } }; inline bool VPUser::classof(const VPDef *Def) { @@ -804,6 +785,7 @@ public: CanonicalIVIncrement, CanonicalIVIncrementNUW, BranchOnCount, + BranchOnCond }; private: @@ -892,6 +874,7 @@ public: case Instruction::Unreachable: case Instruction::Fence: case Instruction::AtomicRMW: + case VPInstruction::BranchOnCond: case VPInstruction::BranchOnCount: return false; default: @@ -1049,27 +1032,25 @@ public: }; /// A recipe for handling phi nodes of integer and floating-point inductions, -/// producing their vector and scalar values. +/// producing their vector values. class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue { PHINode *IV; const InductionDescriptor &IndDesc; - bool NeedsScalarIV; bool NeedsVectorIV; public: - VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, + VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, const InductionDescriptor &IndDesc, - bool NeedsScalarIV, bool NeedsVectorIV) - : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(IV, this), - IV(IV), IndDesc(IndDesc), NeedsScalarIV(NeedsScalarIV), + bool NeedsVectorIV) + : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start, Step}), + VPValue(IV, this), IV(IV), IndDesc(IndDesc), NeedsVectorIV(NeedsVectorIV) {} - VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, + VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, const InductionDescriptor &IndDesc, - TruncInst *Trunc, bool NeedsScalarIV, - bool NeedsVectorIV) - : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(Trunc, this), - IV(IV), IndDesc(IndDesc), NeedsScalarIV(NeedsScalarIV), + TruncInst *Trunc, bool NeedsVectorIV) + : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start, Step}), + VPValue(Trunc, this), IV(IV), IndDesc(IndDesc), NeedsVectorIV(NeedsVectorIV) {} ~VPWidenIntOrFpInductionRecipe() override = default; @@ -1093,6 +1074,10 @@ public: VPValue *getStartValue() { return getOperand(0); } const VPValue *getStartValue() const { return getOperand(0); } + /// Returns the step value of the induction. 
+ VPValue *getStepValue() { return getOperand(1); } + const VPValue *getStepValue() const { return getOperand(1); } + /// Returns the first defined value as TruncInst, if it is one or nullptr /// otherwise. TruncInst *getTruncInst() { @@ -1102,6 +1087,8 @@ public: return dyn_cast_or_null<TruncInst>(getVPValue(0)->getUnderlyingValue()); } + PHINode *getPHINode() { return IV; } + /// Returns the induction descriptor for the recipe. const InductionDescriptor &getInductionDescriptor() const { return IndDesc; } @@ -1115,9 +1102,6 @@ public: return TruncI ? TruncI->getType() : IV->getType(); } - /// Returns true if a scalar phi needs to be created for the induction. - bool needsScalarIV() const { return NeedsScalarIV; } - /// Returns true if a vector phi needs to be created for the induction. bool needsVectorIV() const { return NeedsVectorIV; } }; @@ -1167,6 +1151,9 @@ public: VPValue *getStartValue() { return getNumOperands() == 0 ? nullptr : getOperand(0); } + VPValue *getStartValue() const { + return getNumOperands() == 0 ? nullptr : getOperand(0); + } /// Returns the incoming value from the loop backedge. VPValue *getBackedgeValue() { @@ -1180,6 +1167,52 @@ public: } }; +class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe { + const InductionDescriptor &IndDesc; + + /// SCEV used to expand step. + /// FIXME: move expansion of step to the pre-header, once it is modeled + /// explicitly. + ScalarEvolution &SE; + +public: + /// Create a new VPWidenPointerInductionRecipe for \p Phi with start value \p + /// Start. + VPWidenPointerInductionRecipe(PHINode *Phi, VPValue *Start, + const InductionDescriptor &IndDesc, + ScalarEvolution &SE) + : VPHeaderPHIRecipe(VPVWidenPointerInductionSC, VPWidenPointerInductionSC, + Phi), + IndDesc(IndDesc), SE(SE) { + addOperand(Start); + } + + ~VPWidenPointerInductionRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *B) { + return B->getVPDefID() == VPRecipeBase::VPWidenPointerInductionSC; + } + static inline bool classof(const VPHeaderPHIRecipe *R) { + return R->getVPDefID() == VPRecipeBase::VPWidenPointerInductionSC; + } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVWidenPointerInductionSC; + } + + /// Generate vector values for the pointer induction. + void execute(VPTransformState &State) override; + + /// Returns true if only scalar values will be generated. + bool onlyScalarsGenerated(ElementCount VF); + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A recipe for handling header phis that are widened in the vector loop. /// In the VPlan native path, all incoming VPValues & VPBasicBlock pairs are /// managed in the recipe directly. @@ -1363,9 +1396,8 @@ public: "Op must be an operand of the recipe"); // Recursing through Blend recipes only, must terminate at header phi's the // latest. - return all_of(users(), [this](VPUser *U) { - return cast<VPRecipeBase>(U)->onlyFirstLaneUsed(this); - }); + return all_of(users(), + [this](VPUser *U) { return U->onlyFirstLaneUsed(this); }); } }; @@ -1440,6 +1472,15 @@ public: unsigned getNumStoreOperands() const { return getNumOperands() - (HasMask ? 2 : 1); } + + /// The recipe only uses the first lane of the address. 
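Pointer inductions are split out into their own header-phi recipe, and whether it produces vector values at all can be queried per VF. A hypothetical construction; Phi, Start, IndDesc, SE and VF are assumed to be in scope:

  // Illustrative only: pointer inductions no longer share the int/FP recipe.
  auto *PtrIV = new VPWidenPointerInductionRecipe(Phi, Start, IndDesc, SE);
  if (PtrIV->onlyScalarsGenerated(VF)) {
    // Only per-lane scalar values will be emitted for this induction; no
    // vector value is materialized.
  }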
+ bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return Op == getAddr() && all_of(getStoredValues(), [Op](VPValue *StoredV) { + return Op != StoredV; + }); + } }; /// A recipe to represent inloop reduction operations, performing a reduction on @@ -1551,6 +1592,13 @@ public: "Op must be an operand of the recipe"); return isUniform(); } + + /// Returns true if the recipe uses scalars of operand \p Op. + bool usesScalars(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } }; /// A recipe for generating conditional branches on the bits of a mask. @@ -1590,6 +1638,13 @@ public: // Mask is optional. return getNumOperands() == 1 ? getOperand(0) : nullptr; } + + /// Returns true if the recipe uses scalars of operand \p Op. + bool usesScalars(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } }; /// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when @@ -1619,6 +1674,13 @@ public: void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; #endif + + /// Returns true if the recipe uses scalars of operand \p Op. + bool usesScalars(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } }; /// A Recipe for widening load/store operations. @@ -1627,7 +1689,7 @@ public: /// - For store: Address, stored value, optional mask /// TODO: We currently execute only per-part unless a specific instance is /// provided. -class VPWidenMemoryInstructionRecipe : public VPRecipeBase, public VPValue { +class VPWidenMemoryInstructionRecipe : public VPRecipeBase { Instruction &Ingredient; // Whether the loaded-from / stored-to addresses are consecutive. @@ -1649,10 +1711,10 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase, public VPValue { public: VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask, bool Consecutive, bool Reverse) - : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}), - VPValue(VPValue::VPVMemoryInstructionSC, &Load, this), Ingredient(Load), + : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}), Ingredient(Load), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); + new VPValue(VPValue::VPVMemoryInstructionSC, &Load, this); setMask(Mask); } @@ -1660,7 +1722,6 @@ public: VPValue *StoredValue, VPValue *Mask, bool Consecutive, bool Reverse) : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr, StoredValue}), - VPValue(VPValue::VPVMemoryInstructionSC, &Store, this), Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); setMask(Mask); @@ -1714,9 +1775,42 @@ public: "Op must be an operand of the recipe"); // Widened, consecutive memory operations only demand the first lane of - // their address. - return Op == getAddr() && isConsecutive(); + // their address, unless the same operand is also stored. That latter can + // happen with opaque pointers. + return Op == getAddr() && isConsecutive() && + (!isStore() || Op != getStoredValue()); + } + + Instruction &getIngredient() const { return Ingredient; } +}; + +/// Recipe to expand a SCEV expression. 
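Because VPWidenMemoryInstructionRecipe no longer inherits from VPValue, the value a widened load defines has to be reached through the VPDef side of the recipe. A sketch; MemR is assumed to be a VPWidenMemoryInstructionRecipe *:

  // Illustrative only: loads define exactly one VPValue, stores define none.
  if (!MemR->isStore()) {
    VPValue *Loaded = MemR->getVPSingleValue();
    (void)Loaded; // former recipe-as-value users now go through Loaded
  }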
+class VPExpandSCEVRecipe : public VPRecipeBase, public VPValue { + const SCEV *Expr; + ScalarEvolution &SE; + +public: + VPExpandSCEVRecipe(const SCEV *Expr, ScalarEvolution &SE) + : VPRecipeBase(VPExpandSCEVSC, {}), VPValue(nullptr, this), Expr(Expr), + SE(SE) {} + + ~VPExpandSCEVRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPExpandSCEVSC; } + + /// Generate a canonical vector induction variable of the vector loop, with + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + const SCEV *getSCEV() const { return Expr; } }; /// Canonical scalar induction phi of the vector loop. Starting at the specified @@ -1738,6 +1832,12 @@ public: static inline bool classof(const VPDef *D) { return D->getVPDefID() == VPCanonicalIVPHISC; } + static inline bool classof(const VPHeaderPHIRecipe *D) { + return D->getVPDefID() == VPCanonicalIVPHISC; + } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVCanonicalIVPHISC; + } /// Generate the canonical scalar induction phi of the vector loop. void execute(VPTransformState &State) override; @@ -1803,6 +1903,64 @@ public: } }; +/// A recipe for handling phi nodes of integer and floating-point inductions, +/// producing their scalar values. +class VPScalarIVStepsRecipe : public VPRecipeBase, public VPValue { + /// Scalar type to use for the generated values. + Type *Ty; + /// If not nullptr, truncate the generated values to TruncToTy. + Type *TruncToTy; + const InductionDescriptor &IndDesc; + +public: + VPScalarIVStepsRecipe(Type *Ty, const InductionDescriptor &IndDesc, + VPValue *CanonicalIV, VPValue *Start, VPValue *Step, + Type *TruncToTy) + : VPRecipeBase(VPScalarIVStepsSC, {CanonicalIV, Start, Step}), + VPValue(nullptr, this), Ty(Ty), TruncToTy(TruncToTy), IndDesc(IndDesc) { + } + + ~VPScalarIVStepsRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPScalarIVStepsSC; + } + /// Extra classof implementations to allow directly casting from VPUser -> + /// VPScalarIVStepsRecipe. + static inline bool classof(const VPUser *U) { + auto *R = dyn_cast<VPRecipeBase>(U); + return R && R->getVPDefID() == VPRecipeBase::VPScalarIVStepsSC; + } + static inline bool classof(const VPRecipeBase *R) { + return R->getVPDefID() == VPRecipeBase::VPScalarIVStepsSC; + } + + /// Generate the scalarized versions of the phi node as needed by their users. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// Returns true if the induction is canonical, i.e. starting at 0 and + /// incremented by UF * VF (= the original IV is incremented by 1). + bool isCanonical() const; + + VPCanonicalIVPHIRecipe *getCanonicalIV() const; + VPValue *getStartValue() const { return getOperand(1); } + VPValue *getStepValue() const { return getOperand(2); } + + /// Returns true if the recipe only uses the first lane of operand \p Op. 
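SCEV expansion is now modelled explicitly in the plan: the vputils helper declared further down returns a plain live-in VPValue for constants and unknowns, and otherwise a value defined by a VPExpandSCEVRecipe placed in (or reused from) the pre-header. A sketch; Plan, Expr and SE are assumed to be in scope:

  // Illustrative only: obtain a VPValue for an arbitrary SCEV expression.
  VPValue *Expanded = vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr, SE);
  (void)Expanded; // e.g. usable as the step operand of an induction recipe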
+ bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } +}; + /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It /// holds a sequence of zero or more VPRecipe's each representing a sequence of /// output IR instructions. All PHI-like recipes must come before any non-PHI recipes. @@ -1895,6 +2053,8 @@ public: /// SplitAt to the new block. Returns the new block. VPBasicBlock *splitAt(iterator SplitAt); + VPRegionBlock *getEnclosingLoopRegion(); + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print this VPBsicBlock to \p O, prefixing all lines with \p Indent. \p /// SlotTracker is used to print unnamed VPValue's using consequtive numbers. @@ -1906,6 +2066,14 @@ public: using VPBlockBase::print; // Get the print(raw_stream &O) version. #endif + /// If the block has multiple successors, return the branch recipe terminating + /// the block. If there are no or only a single successor, return nullptr; + VPRecipeBase *getTerminator(); + const VPRecipeBase *getTerminator() const; + + /// Returns true if the block is exiting it's parent region. + bool isExiting() const; + private: /// Create an IR BasicBlock to hold the output instructions generated by this /// VPBasicBlock, and return it. Update the CFGState accordingly. @@ -1913,7 +2081,7 @@ private: }; /// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks -/// which form a Single-Entry-Single-Exit subgraph of the output IR CFG. +/// which form a Single-Entry-Single-Exiting subgraph of the output IR CFG. /// A VPRegionBlock may indicate that its contents are to be replicated several /// times. This is designed to support predicated scalarization, in which a /// scalar if-then code structure needs to be generated VF * UF times. Having @@ -1924,25 +2092,26 @@ class VPRegionBlock : public VPBlockBase { /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock. VPBlockBase *Entry; - /// Hold the Single Exit of the SESE region modelled by the VPRegionBlock. - VPBlockBase *Exit; + /// Hold the Single Exiting block of the SESE region modelled by the + /// VPRegionBlock. + VPBlockBase *Exiting; /// An indicator whether this region is to generate multiple replicated /// instances of output IR corresponding to its VPBlockBases. bool IsReplicator; public: - VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exit, + VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name = "", bool IsReplicator = false) - : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exit(Exit), + : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exiting(Exiting), IsReplicator(IsReplicator) { assert(Entry->getPredecessors().empty() && "Entry block has predecessors."); - assert(Exit->getSuccessors().empty() && "Exit block has successors."); + assert(Exiting->getSuccessors().empty() && "Exit block has successors."); Entry->setParent(this); - Exit->setParent(this); + Exiting->setParent(this); } VPRegionBlock(const std::string &Name = "", bool IsReplicator = false) - : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exit(nullptr), + : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr), IsReplicator(IsReplicator) {} ~VPRegionBlock() override { @@ -1976,16 +2145,22 @@ public: // DominatorTreeBase representing the Graph type. 
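VPBasicBlock gains small structural queries that replace the old condition-bit bookkeeping. A sketch of how they compose; the helper below is hypothetical:

  // Illustrative only: a terminator recipe exists only for blocks with more
  // than one successor (e.g. a BranchOnCond or BranchOnCount VPInstruction).
  static bool isConditionalLatch(const VPBasicBlock *VPBB) {
    return VPBB->isExiting() && VPBB->getTerminator() != nullptr;
  }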
VPBlockBase &front() const { return *Entry; } - const VPBlockBase *getExit() const { return Exit; } - VPBlockBase *getExit() { return Exit; } + const VPBlockBase *getExiting() const { return Exiting; } + VPBlockBase *getExiting() { return Exiting; } - /// Set \p ExitBlock as the exit VPBlockBase of this VPRegionBlock. \p - /// ExitBlock must have no successors. - void setExit(VPBlockBase *ExitBlock) { - assert(ExitBlock->getSuccessors().empty() && + /// Set \p ExitingBlock as the exiting VPBlockBase of this VPRegionBlock. \p + /// ExitingBlock must have no successors. + void setExiting(VPBlockBase *ExitingBlock) { + assert(ExitingBlock->getSuccessors().empty() && "Exit block cannot have successors."); - Exit = ExitBlock; - ExitBlock->setParent(this); + Exiting = ExitingBlock; + ExitingBlock->setParent(this); + } + + /// Returns the pre-header VPBasicBlock of the loop region. + VPBasicBlock *getPreheaderVPBB() { + assert(!isReplicator() && "should only get pre-header of loop regions"); + return getSinglePredecessor()->getExitingBasicBlock(); } /// An indicator whether this region is to generate multiple replicated @@ -2119,11 +2294,11 @@ struct GraphTraits<Inverse<VPRegionBlock *>> using nodes_iterator = df_iterator<NodeRef>; static NodeRef getEntryNode(Inverse<GraphRef> N) { - return N.Graph->getExit(); + return N.Graph->getExiting(); } static nodes_iterator nodes_begin(GraphRef N) { - return nodes_iterator::begin(N->getExit()); + return nodes_iterator::begin(N->getExiting()); } static nodes_iterator nodes_end(GraphRef N) { @@ -2281,12 +2456,9 @@ class VPlan { /// Holds the name of the VPlan, for printing. std::string Name; - /// Holds all the external definitions created for this VPlan. - // TODO: Introduce a specific representation for external definitions in - // VPlan. External definitions must be immutable and hold a pointer to its - // underlying IR that will be used to implement its structural comparison - // (operators '==' and '<'). - SetVector<VPValue *> VPExternalDefs; + /// Holds all the external definitions created for this VPlan. External + /// definitions must be immutable and hold a pointer to their underlying IR. + DenseMap<Value *, VPValue *> VPExternalDefs; /// Represents the trip count of the original loop, for folding /// the tail. @@ -2307,13 +2479,13 @@ class VPlan { /// to be free when the plan's destructor is called. SmallVector<VPValue *, 16> VPValuesToFree; - /// Holds the VPLoopInfo analysis for this VPlan. - VPLoopInfo VPLInfo; - /// Indicates whether it is safe use the Value2VPValue mapping or if the /// mapping cannot be used any longer, because it is stale. bool Value2VPValueEnabled = true; + /// Values used outside the plan. + MapVector<PHINode *, VPLiveOut *> LiveOuts; + public: VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) { if (Entry) @@ -2321,6 +2493,8 @@ public: } ~VPlan() { + clearLiveOuts(); + if (Entry) { VPValue DummyValue; for (VPBlockBase *Block : depth_first(Entry)) @@ -2334,13 +2508,14 @@ public: delete TripCount; if (BackedgeTakenCount) delete BackedgeTakenCount; - for (VPValue *Def : VPExternalDefs) - delete Def; + for (auto &P : VPExternalDefs) + delete P.second; } /// Prepare the plan for execution, setting up the required live-in values. void prepareToExecute(Value *TripCount, Value *VectorTripCount, - Value *CanonicalIVStartValue, VPTransformState &State); + Value *CanonicalIVStartValue, VPTransformState &State, + bool IsEpilogueVectorization); /// Generate the IR code for this VPlan. 
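After the restructuring, the plan's entry is a plain VPBasicBlock (the vector pre-header) and the loop itself is the region reached through its single successor. A navigation sketch; Plan is assumed to be a fully built VPlan:

  // Illustrative only: walking the main blocks of the reshaped plan.
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *Preheader = LoopRegion->getPreheaderVPBB();
  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
  VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();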
void execute(struct VPTransformState *State); @@ -2383,9 +2558,13 @@ public: void setName(const Twine &newName) { Name = newName.str(); } - /// Add \p VPVal to the pool of external definitions if it's not already - /// in the pool. - void addExternalDef(VPValue *VPVal) { VPExternalDefs.insert(VPVal); } + /// Get the existing or add a new external definition for \p V. + VPValue *getOrAddExternalDef(Value *V) { + auto I = VPExternalDefs.insert({V, nullptr}); + if (I.second) + I.first->second = new VPValue(V); + return I.first->second; + } void addVPValue(Value *V) { assert(Value2VPValueEnabled && @@ -2432,10 +2611,6 @@ public: Value2VPValue.erase(V); } - /// Return the VPLoopInfo analysis for this VPlan. - VPLoopInfo &getVPLoopInfo() { return VPLInfo; } - const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; } - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print this VPlan to \p O. void print(raw_ostream &O) const; @@ -2465,7 +2640,10 @@ public: /// Returns the VPRegionBlock of the vector loop. VPRegionBlock *getVectorLoopRegion() { - return cast<VPRegionBlock>(getEntry()); + return cast<VPRegionBlock>(getEntry()->getSingleSuccessor()); + } + const VPRegionBlock *getVectorLoopRegion() const { + return cast<VPRegionBlock>(getEntry()->getSingleSuccessor()); } /// Returns the canonical induction recipe of the vector loop. @@ -2478,6 +2656,23 @@ public: return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin()); } + void addLiveOut(PHINode *PN, VPValue *V); + + void clearLiveOuts() { + for (auto &KV : LiveOuts) + delete KV.second; + LiveOuts.clear(); + } + + void removeLiveOut(PHINode *PN) { + delete LiveOuts[PN]; + LiveOuts.erase(PN); + } + + const MapVector<PHINode *, VPLiveOut *> &getLiveOuts() const { + return LiveOuts; + } + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. @@ -2567,9 +2762,8 @@ public: /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. \p BlockPtr's - /// successors are moved from \p BlockPtr to \p NewBlock and \p BlockPtr's - /// conditional bit is propagated to \p NewBlock. \p NewBlock must have - /// neither successors nor predecessors. + /// successors are moved from \p BlockPtr to \p NewBlock. \p NewBlock must + /// have neither successors nor predecessors. static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) { assert(NewBlock->getSuccessors().empty() && NewBlock->getPredecessors().empty() && @@ -2580,24 +2774,22 @@ public: disconnectBlocks(BlockPtr, Succ); connectBlocks(NewBlock, Succ); } - NewBlock->setCondBit(BlockPtr->getCondBit()); - BlockPtr->setCondBit(nullptr); connectBlocks(BlockPtr, NewBlock); } /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p /// BlockPtr. Add \p IfTrue and \p IfFalse as succesors of \p BlockPtr and \p /// BlockPtr as predecessor of \p IfTrue and \p IfFalse. Propagate \p BlockPtr - /// parent to \p IfTrue and \p IfFalse. \p Condition is set as the successor - /// selector. \p BlockPtr must have no successors and \p IfTrue and \p IfFalse - /// must have neither successors nor predecessors. + /// parent to \p IfTrue and \p IfFalse. \p BlockPtr must have no successors + /// and \p IfTrue and \p IfFalse must have neither successors nor + /// predecessors. 
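Exit values are now recorded as live-outs on the plan and fixed up in the middle block by VPLiveOut::fixPhi. A sketch of registering and inspecting one; LCSSAPhi is assumed to be the phi in the exit block and V the VPValue producing the exiting value:

  // Illustrative only: live-outs are keyed by their LCSSA phi.
  Plan.addLiveOut(LCSSAPhi, V);
  for (const auto &KV : Plan.getLiveOuts())
    assert(KV.second->getPhi() == KV.first && "live-out wraps its LCSSA phi");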
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, - VPValue *Condition, VPBlockBase *BlockPtr) { + VPBlockBase *BlockPtr) { assert(IfTrue->getSuccessors().empty() && "Can't insert IfTrue with successors."); assert(IfFalse->getSuccessors().empty() && "Can't insert IfFalse with successors."); - BlockPtr->setTwoSuccessors(IfTrue, IfFalse, Condition); + BlockPtr->setTwoSuccessors(IfTrue, IfFalse); IfTrue->setPredecessors({BlockPtr}); IfFalse->setPredecessors({BlockPtr}); IfTrue->setParent(BlockPtr->getParent()); @@ -2639,8 +2831,8 @@ public: R.moveBefore(*PredVPBB, PredVPBB->end()); VPBlockUtils::disconnectBlocks(PredVPBB, VPBB); auto *ParentRegion = cast<VPRegionBlock>(Block->getParent()); - if (ParentRegion->getExit() == Block) - ParentRegion->setExit(PredVPBB); + if (ParentRegion->getExiting() == Block) + ParentRegion->setExiting(PredVPBB); SmallVector<VPBlockBase *> Successors(Block->successors()); for (auto *Succ : Successors) { VPBlockUtils::disconnectBlocks(Block, Succ); @@ -2650,41 +2842,6 @@ public: return PredVPBB; } - /// Returns true if the edge \p FromBlock -> \p ToBlock is a back-edge. - static bool isBackEdge(const VPBlockBase *FromBlock, - const VPBlockBase *ToBlock, const VPLoopInfo *VPLI) { - assert(FromBlock->getParent() == ToBlock->getParent() && - FromBlock->getParent() && "Must be in same region"); - const VPLoop *FromLoop = VPLI->getLoopFor(FromBlock); - const VPLoop *ToLoop = VPLI->getLoopFor(ToBlock); - if (!FromLoop || !ToLoop || FromLoop != ToLoop) - return false; - - // A back-edge is a branch from the loop latch to its header. - return ToLoop->isLoopLatch(FromBlock) && ToBlock == ToLoop->getHeader(); - } - - /// Returns true if \p Block is a loop latch - static bool blockIsLoopLatch(const VPBlockBase *Block, - const VPLoopInfo *VPLInfo) { - if (const VPLoop *ParentVPL = VPLInfo->getLoopFor(Block)) - return ParentVPL->isLoopLatch(Block); - - return false; - } - - /// Count and return the number of succesors of \p PredBlock excluding any - /// backedges. - static unsigned countSuccessorsNoBE(VPBlockBase *PredBlock, - VPLoopInfo *VPLI) { - unsigned Count = 0; - for (VPBlockBase *SuccBlock : PredBlock->getSuccessors()) { - if (!VPBlockUtils::isBackEdge(PredBlock, SuccBlock, VPLI)) - Count++; - } - return Count; - } - /// Return an iterator range over \p Range which only includes \p BlockTy /// blocks. The accesses are casted to \p BlockTy. template <typename BlockTy, typename T> @@ -2845,6 +3002,13 @@ namespace vputils { /// Returns true if only the first lane of \p Def is used. bool onlyFirstLaneUsed(VPValue *Def); +/// Get or create a VPValue that corresponds to the expansion of \p Expr. If \p +/// Expr is a SCEVConstant or SCEVUnknown, return a VPValue wrapping the live-in +/// value. Otherwise return a VPExpandSCEVRecipe to expand \p Expr. If \p Plan's +/// pre-header already contains a recipe expanding \p Expr, return it. If not, +/// create a new one. +VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, + ScalarEvolution &SE); } // end namespace vputils } // end namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp index 379988733312..84b0dac862b6 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp @@ -42,9 +42,6 @@ private: // Vectorization plan that we are working on. 
VPlan &Plan; - // Output Top Region. - VPRegionBlock *TopRegion = nullptr; - // Builder of the VPlan instruction-level representation. VPBuilder VPIRBuilder; @@ -59,6 +56,9 @@ private: // Hold phi node's that need to be fixed once the plain CFG has been built. SmallVector<PHINode *, 8> PhisToFix; + /// Maps loops in the original IR to their corresponding region. + DenseMap<Loop *, VPRegionBlock *> Loop2Region; + // Utility functions. void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB); void fixPhiNodes(); @@ -73,8 +73,9 @@ public: PlainCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P) : TheLoop(Lp), LI(LI), Plan(P) {} - // Build the plain CFG and return its Top Region. - VPRegionBlock *buildPlainCFG(); + /// Build plain CFG for TheLoop. Return the pre-header VPBasicBlock connected + /// to a new VPRegionBlock (TopRegion) enclosing the plain CFG. + VPBasicBlock *buildPlainCFG(); }; } // anonymous namespace @@ -106,19 +107,32 @@ void PlainCFGBuilder::fixPhiNodes() { } } -// Create a new empty VPBasicBlock for an incoming BasicBlock or retrieve an -// existing one if it was already created. +// Create a new empty VPBasicBlock for an incoming BasicBlock in the region +// corresponding to the containing loop or retrieve an existing one if it was +// already created. If no region exists yet for the loop containing \p BB, a new +// one is created. VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) { auto BlockIt = BB2VPBB.find(BB); if (BlockIt != BB2VPBB.end()) // Retrieve existing VPBB. return BlockIt->second; + // Get or create a region for the loop containing BB. + Loop *CurrentLoop = LI->getLoopFor(BB); + VPRegionBlock *ParentR = nullptr; + if (CurrentLoop) { + auto Iter = Loop2Region.insert({CurrentLoop, nullptr}); + if (Iter.second) + Iter.first->second = new VPRegionBlock( + CurrentLoop->getHeader()->getName().str(), false /*isReplicator*/); + ParentR = Iter.first->second; + } + // Create new VPBB. LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << BB->getName() << "\n"); VPBasicBlock *VPBB = new VPBasicBlock(BB->getName()); BB2VPBB[BB] = VPBB; - VPBB->setParent(TopRegion); + VPBB->setParent(ParentR); return VPBB; } @@ -182,8 +196,7 @@ VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) { // A and B: Create VPValue and add it to the pool of external definitions and // to the Value->VPValue map. - VPValue *NewVPVal = new VPValue(IRVal); - Plan.addExternalDef(NewVPVal); + VPValue *NewVPVal = Plan.getOrAddExternalDef(IRVal); IRDef2VPValue[IRVal] = NewVPVal; return NewVPVal; } @@ -203,10 +216,13 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, "Instruction shouldn't have been visited."); if (auto *Br = dyn_cast<BranchInst>(Inst)) { - // Branch instruction is not explicitly represented in VPlan but we need - // to represent its condition bit when it's conditional. - if (Br->isConditional()) - getOrCreateVPOperand(Br->getCondition()); + // Conditional branch instruction are represented using BranchOnCond + // recipes. + if (Br->isConditional()) { + VPValue *Cond = getOrCreateVPOperand(Br->getCondition()); + VPBB->appendRecipe( + new VPInstruction(VPInstruction::BranchOnCond, {Cond})); + } // Skip the rest of the Instruction processing for Branch instructions. continue; @@ -238,11 +254,8 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, } // Main interface to build the plain CFG. -VPRegionBlock *PlainCFGBuilder::buildPlainCFG() { - // 1. Create the Top Region. It will be the parent of all VPBBs. 
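External definitions are now interned on the plan, so the builder can ask for the VPValue of a loop-invariant IR value without tracking duplicates itself. A sketch; IRVal is an assumed llvm::Value * defined outside the loop:

  // Illustrative only: repeated queries return the same wrapper.
  VPValue *A = Plan.getOrAddExternalDef(IRVal);
  VPValue *B = Plan.getOrAddExternalDef(IRVal);
  assert(A == B && "one external def per IR value");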
- TopRegion = new VPRegionBlock("TopRegion", false /*isReplicator*/); - - // 2. Scan the body of the loop in a topological order to visit each basic +VPBasicBlock *PlainCFGBuilder::buildPlainCFG() { + // 1. Scan the body of the loop in a topological order to visit each basic // block after having visited its predecessor basic blocks. Create a VPBB for // each BB and link it to its successor and predecessor VPBBs. Note that // predecessors must be set in the same order as they are in the incomming IR. @@ -251,21 +264,20 @@ VPRegionBlock *PlainCFGBuilder::buildPlainCFG() { // Loop PH needs to be explicitly visited since it's not taken into account by // LoopBlocksDFS. - BasicBlock *PreheaderBB = TheLoop->getLoopPreheader(); - assert((PreheaderBB->getTerminator()->getNumSuccessors() == 1) && + BasicBlock *ThePreheaderBB = TheLoop->getLoopPreheader(); + assert((ThePreheaderBB->getTerminator()->getNumSuccessors() == 1) && "Unexpected loop preheader"); - VPBasicBlock *PreheaderVPBB = getOrCreateVPBB(PreheaderBB); - for (auto &I : *PreheaderBB) { + VPBasicBlock *ThePreheaderVPBB = getOrCreateVPBB(ThePreheaderBB); + ThePreheaderVPBB->setName("vector.ph"); + for (auto &I : *ThePreheaderBB) { if (I.getType()->isVoidTy()) continue; - VPValue *VPV = new VPValue(&I); - Plan.addExternalDef(VPV); - IRDef2VPValue[&I] = VPV; + IRDef2VPValue[&I] = Plan.getOrAddExternalDef(&I); } // Create empty VPBB for Loop H so that we can link PH->H. VPBlockBase *HeaderVPBB = getOrCreateVPBB(TheLoop->getHeader()); - // Preheader's predecessors will be set during the loop RPO traversal below. - PreheaderVPBB->setOneSuccessor(HeaderVPBB); + HeaderVPBB->setName("vector.body"); + ThePreheaderVPBB->setOneSuccessor(HeaderVPBB); LoopBlocksRPO RPO(TheLoop); RPO.perform(LI); @@ -295,16 +307,13 @@ VPRegionBlock *PlainCFGBuilder::buildPlainCFG() { // Get VPBB's condition bit. assert(isa<BranchInst>(TI) && "Unsupported terminator!"); - auto *Br = cast<BranchInst>(TI); - Value *BrCond = Br->getCondition(); // Look up the branch condition to get the corresponding VPValue // representing the condition bit in VPlan (which may be in another VPBB). - assert(IRDef2VPValue.count(BrCond) && + assert(IRDef2VPValue.count(cast<BranchInst>(TI)->getCondition()) && "Missing condition bit in IRDef2VPValue!"); - VPValue *VPCondBit = IRDef2VPValue[BrCond]; - // Link successors using condition bit. - VPBB->setTwoSuccessors(SuccVPBB0, SuccVPBB1, VPCondBit); + // Link successors. + VPBB->setTwoSuccessors(SuccVPBB0, SuccVPBB1); } else llvm_unreachable("Number of successors not supported."); @@ -312,30 +321,61 @@ VPRegionBlock *PlainCFGBuilder::buildPlainCFG() { setVPBBPredsFromBB(VPBB, BB); } - // 3. Process outermost loop exit. We created an empty VPBB for the loop + // 2. Process outermost loop exit. We created an empty VPBB for the loop // single exit BB during the RPO traversal of the loop body but Instructions // weren't visited because it's not part of the the loop. BasicBlock *LoopExitBB = TheLoop->getUniqueExitBlock(); assert(LoopExitBB && "Loops with multiple exits are not supported."); VPBasicBlock *LoopExitVPBB = BB2VPBB[LoopExitBB]; - createVPInstructionsForVPBB(LoopExitVPBB, LoopExitBB); // Loop exit was already set as successor of the loop exiting BB. // We only set its predecessor VPBB now. setVPBBPredsFromBB(LoopExitVPBB, LoopExitBB); + // 3. Fix up region blocks for loops. 
For each loop, + // * use the header block as entry to the corresponding region, + // * use the latch block as exit of the corresponding region, + // * set the region as successor of the loop pre-header, and + // * set the exit block as successor to the region. + SmallVector<Loop *> LoopWorkList; + LoopWorkList.push_back(TheLoop); + while (!LoopWorkList.empty()) { + Loop *L = LoopWorkList.pop_back_val(); + BasicBlock *Header = L->getHeader(); + BasicBlock *Exiting = L->getLoopLatch(); + assert(Exiting == L->getExitingBlock() && + "Latch must be the only exiting block"); + VPRegionBlock *Region = Loop2Region[L]; + VPBasicBlock *HeaderVPBB = getOrCreateVPBB(Header); + VPBasicBlock *ExitingVPBB = getOrCreateVPBB(Exiting); + + // Disconnect backedge and pre-header from header. + VPBasicBlock *PreheaderVPBB = getOrCreateVPBB(L->getLoopPreheader()); + VPBlockUtils::disconnectBlocks(PreheaderVPBB, HeaderVPBB); + VPBlockUtils::disconnectBlocks(ExitingVPBB, HeaderVPBB); + + Region->setParent(PreheaderVPBB->getParent()); + Region->setEntry(HeaderVPBB); + VPBlockUtils::connectBlocks(PreheaderVPBB, Region); + + // Disconnect exit block from exiting (=latch) block, set exiting block and + // connect region to exit block. + VPBasicBlock *ExitVPBB = getOrCreateVPBB(L->getExitBlock()); + VPBlockUtils::disconnectBlocks(ExitingVPBB, ExitVPBB); + Region->setExiting(ExitingVPBB); + VPBlockUtils::connectBlocks(Region, ExitVPBB); + + // Queue sub-loops for processing. + LoopWorkList.append(L->begin(), L->end()); + } // 4. The whole CFG has been built at this point so all the input Values must // have a VPlan couterpart. Fix VPlan phi nodes by adding their corresponding // VPlan operands. fixPhiNodes(); - // 5. Final Top Region setup. Set outermost loop pre-header and single exit as - // Top Region entry and exit. - TopRegion->setEntry(PreheaderVPBB); - TopRegion->setExit(LoopExitVPBB); - return TopRegion; + return ThePreheaderVPBB; } -VPRegionBlock *VPlanHCFGBuilder::buildPlainCFG() { +VPBasicBlock *VPlanHCFGBuilder::buildPlainCFG() { PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan); return PCFGBuilder.buildPlainCFG(); } @@ -343,20 +383,15 @@ VPRegionBlock *VPlanHCFGBuilder::buildPlainCFG() { // Public interface to build a H-CFG. void VPlanHCFGBuilder::buildHierarchicalCFG() { // Build Top Region enclosing the plain CFG and set it as VPlan entry. - VPRegionBlock *TopRegion = buildPlainCFG(); - Plan.setEntry(TopRegion); + VPBasicBlock *EntryVPBB = buildPlainCFG(); + Plan.setEntry(EntryVPBB); LLVM_DEBUG(Plan.setName("HCFGBuilder: Plain CFG\n"); dbgs() << Plan); + VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); Verifier.verifyHierarchicalCFG(TopRegion); // Compute plain CFG dom tree for VPLInfo. VPDomTree.recalculate(*TopRegion); LLVM_DEBUG(dbgs() << "Dominator Tree after building the plain CFG.\n"; VPDomTree.print(dbgs())); - - // Compute VPLInfo and keep it in Plan. 
- VPLoopInfo &VPLInfo = Plan.getVPLoopInfo(); - VPLInfo.analyze(VPDomTree); - LLVM_DEBUG(dbgs() << "VPLoop Info After buildPlainCFG:\n"; - VPLInfo.print(dbgs())); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h index 238ee7e6347c..2d52990af268 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h @@ -24,13 +24,15 @@ #ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H #define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H -#include "VPlan.h" #include "VPlanDominatorTree.h" #include "VPlanVerifier.h" namespace llvm { class Loop; +class LoopInfo; +class VPRegionBlock; +class VPlan; class VPlanTestBase; /// Main class to build the VPlan H-CFG for an incoming IR. @@ -55,9 +57,9 @@ private: // are introduced. VPDominatorTree VPDomTree; - /// Build plain CFG for TheLoop. Return a new VPRegionBlock (TopRegion) - /// enclosing the plain CFG. - VPRegionBlock *buildPlainCFG(); + /// Build plain CFG for TheLoop. Return the pre-header VPBasicBlock connected + /// to a new VPRegionBlock (TopRegion) enclosing the plain CFG. + VPBasicBlock *buildPlainCFG(); public: VPlanHCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanLoopInfo.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanLoopInfo.h deleted file mode 100644 index 5208f2d58e2b..000000000000 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanLoopInfo.h +++ /dev/null @@ -1,44 +0,0 @@ -//===-- VPLoopInfo.h --------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file defines VPLoopInfo analysis and VPLoop class. VPLoopInfo is a -/// specialization of LoopInfoBase for VPBlockBase. VPLoops is a specialization -/// of LoopBase that is used to hold loop metadata from VPLoopInfo. Further -/// information can be found in VectorizationPlanner.rst. -/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H -#define LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H - -#include "llvm/Analysis/LoopInfoImpl.h" - -namespace llvm { -class VPBlockBase; - -/// Hold analysis information for every loop detected by VPLoopInfo. It is an -/// instantiation of LoopBase. -class VPLoop : public LoopBase<VPBlockBase, VPLoop> { -private: - friend class LoopInfoBase<VPBlockBase, VPLoop>; - explicit VPLoop(VPBlockBase *VPB) : LoopBase<VPBlockBase, VPLoop>(VPB) {} -}; - -/// VPLoopInfo provides analysis of natural loop for VPBlockBase-based -/// Hierarchical CFG. It is a specialization of LoopInfoBase class. -// TODO: VPLoopInfo is initially computed on top of the VPlan plain CFG, which -// is the same as the incoming IR CFG. If it's more efficient than running the -// whole loop detection algorithm, we may want to create a mechanism to -// translate LoopInfo into VPLoopInfo. However, that would require significant -// changes in LoopInfoBase class. 
-typedef LoopInfoBase<VPBlockBase, VPLoop> VPLoopInfo; - -} // namespace llvm - -#endif // LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp deleted file mode 100644 index e879a33db6ee..000000000000 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ /dev/null @@ -1,248 +0,0 @@ -//===-- VPlanPredicator.cpp -------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file implements the VPlanPredicator class which contains the public -/// interfaces to predicate and linearize the VPlan region. -/// -//===----------------------------------------------------------------------===// - -#include "VPlanPredicator.h" -#include "VPlan.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/GraphTraits.h" -#include "llvm/ADT/PostOrderIterator.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -#define DEBUG_TYPE "VPlanPredicator" - -using namespace llvm; - -// Generate VPInstructions at the beginning of CurrBB that calculate the -// predicate being propagated from PredBB to CurrBB depending on the edge type -// between them. For example if: -// i. PredBB is controlled by predicate %BP, and -// ii. The edge PredBB->CurrBB is the false edge, controlled by the condition -// bit value %CBV then this function will generate the following two -// VPInstructions at the start of CurrBB: -// %IntermediateVal = not %CBV -// %FinalVal = and %BP %IntermediateVal -// It returns %FinalVal. -VPValue *VPlanPredicator::getOrCreateNotPredicate(VPBasicBlock *PredBB, - VPBasicBlock *CurrBB) { - VPValue *CBV = PredBB->getCondBit(); - - // Set the intermediate value - this is either 'CBV', or 'not CBV' - // depending on the edge type. - EdgeType ET = getEdgeTypeBetween(PredBB, CurrBB); - VPValue *IntermediateVal = nullptr; - switch (ET) { - case EdgeType::TRUE_EDGE: - // CurrBB is the true successor of PredBB - nothing to do here. - IntermediateVal = CBV; - break; - - case EdgeType::FALSE_EDGE: - // CurrBB is the False successor of PredBB - compute not of CBV. - IntermediateVal = Builder.createNot(CBV, {}); - break; - } - - // Now AND intermediate value with PredBB's block predicate if it has one. - VPValue *BP = PredBB->getPredicate(); - if (BP) - return Builder.createAnd(BP, IntermediateVal, {}); - else - return IntermediateVal; -} - -// Generate a tree of ORs for all IncomingPredicates in WorkList. -// Note: This function destroys the original Worklist. -// -// P1 P2 P3 P4 P5 -// \ / \ / / -// OR1 OR2 / -// \ | / -// \ +/-+ -// \ / | -// OR3 | -// \ | -// OR4 <- Returns this -// | -// -// The algorithm uses a worklist of predicates as its main data structure. -// We pop a pair of values from the front (e.g. P1 and P2), generate an OR -// (in this example OR1), and push it back. In this example the worklist -// contains {P3, P4, P5, OR1}. -// The process iterates until we have only one element in the Worklist (OR4). -// The last element is the root predicate which is returned. 
-VPValue *VPlanPredicator::genPredicateTree(std::list<VPValue *> &Worklist) { - if (Worklist.empty()) - return nullptr; - - // The worklist initially contains all the leaf nodes. Initialize the tree - // using them. - while (Worklist.size() >= 2) { - // Pop a pair of values from the front. - VPValue *LHS = Worklist.front(); - Worklist.pop_front(); - VPValue *RHS = Worklist.front(); - Worklist.pop_front(); - - // Create an OR of these values. - VPValue *Or = Builder.createOr(LHS, RHS, {}); - - // Push OR to the back of the worklist. - Worklist.push_back(Or); - } - - assert(Worklist.size() == 1 && "Expected 1 item in worklist"); - - // The root is the last node in the worklist. - VPValue *Root = Worklist.front(); - - // This root needs to replace the existing block predicate. This is done in - // the caller function. - return Root; -} - -// Return whether the edge FromBlock -> ToBlock is a TRUE_EDGE or FALSE_EDGE -VPlanPredicator::EdgeType -VPlanPredicator::getEdgeTypeBetween(VPBlockBase *FromBlock, - VPBlockBase *ToBlock) { - unsigned Count = 0; - for (VPBlockBase *SuccBlock : FromBlock->getSuccessors()) { - if (SuccBlock == ToBlock) { - assert(Count < 2 && "Switch not supported currently"); - return (Count == 0) ? EdgeType::TRUE_EDGE : EdgeType::FALSE_EDGE; - } - Count++; - } - - llvm_unreachable("Broken getEdgeTypeBetween"); -} - -// Generate all predicates needed for CurrBlock by going through its immediate -// predecessor blocks. -void VPlanPredicator::createOrPropagatePredicates(VPBlockBase *CurrBlock, - VPRegionBlock *Region) { - // Blocks that dominate region exit inherit the predicate from the region. - // Return after setting the predicate. - if (VPDomTree.dominates(CurrBlock, Region->getExit())) { - VPValue *RegionBP = Region->getPredicate(); - CurrBlock->setPredicate(RegionBP); - return; - } - - // Collect all incoming predicates in a worklist. - std::list<VPValue *> IncomingPredicates; - - // Set the builder's insertion point to the top of the current BB - VPBasicBlock *CurrBB = cast<VPBasicBlock>(CurrBlock->getEntryBasicBlock()); - Builder.setInsertPoint(CurrBB, CurrBB->begin()); - - // For each predecessor, generate the VPInstructions required for - // computing 'BP AND (not) CBV" at the top of CurrBB. - // Collect the outcome of this calculation for all predecessors - // into IncomingPredicates. - for (VPBlockBase *PredBlock : CurrBlock->getPredecessors()) { - // Skip back-edges - if (VPBlockUtils::isBackEdge(PredBlock, CurrBlock, VPLI)) - continue; - - VPValue *IncomingPredicate = nullptr; - unsigned NumPredSuccsNoBE = - VPBlockUtils::countSuccessorsNoBE(PredBlock, VPLI); - - // If there is an unconditional branch to the currBB, then we don't create - // edge predicates. We use the predecessor's block predicate instead. - if (NumPredSuccsNoBE == 1) - IncomingPredicate = PredBlock->getPredicate(); - else if (NumPredSuccsNoBE == 2) { - // Emit recipes into CurrBlock if required - assert(isa<VPBasicBlock>(PredBlock) && "Only BBs have multiple exits"); - IncomingPredicate = - getOrCreateNotPredicate(cast<VPBasicBlock>(PredBlock), CurrBB); - } else - llvm_unreachable("FIXME: switch statement ?"); - - if (IncomingPredicate) - IncomingPredicates.push_back(IncomingPredicate); - } - - // Logically OR all incoming predicates by building the Predicate Tree. - VPValue *Predicate = genPredicateTree(IncomingPredicates); - - // Now update the block's predicate with the new one. - CurrBlock->setPredicate(Predicate); -} - -// Generate all predicates needed for Region. 
-void VPlanPredicator::predicateRegionRec(VPRegionBlock *Region) { - VPBasicBlock *EntryBlock = cast<VPBasicBlock>(Region->getEntry()); - ReversePostOrderTraversal<VPBlockBase *> RPOT(EntryBlock); - - // Generate edge predicates and append them to the block predicate. RPO is - // necessary since the predecessor blocks' block predicate needs to be set - // before the current block's block predicate can be computed. - for (VPBlockBase *Block : RPOT) { - // TODO: Handle nested regions once we start generating the same. - assert(!isa<VPRegionBlock>(Block) && "Nested region not expected"); - createOrPropagatePredicates(Block, Region); - } -} - -// Linearize the CFG within Region. -// TODO: Predication and linearization need RPOT for every region. -// This traversal is expensive. Since predication is not adding new -// blocks, we should be able to compute RPOT once in predication and -// reuse it here. This becomes even more important once we have nested -// regions. -void VPlanPredicator::linearizeRegionRec(VPRegionBlock *Region) { - ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry()); - VPBlockBase *PrevBlock = nullptr; - - for (VPBlockBase *CurrBlock : RPOT) { - // TODO: Handle nested regions once we start generating the same. - assert(!isa<VPRegionBlock>(CurrBlock) && "Nested region not expected"); - - // Linearize control flow by adding an unconditional edge between PrevBlock - // and CurrBlock skipping loop headers and latches to keep intact loop - // header predecessors and loop latch successors. - if (PrevBlock && !VPLI->isLoopHeader(CurrBlock) && - !VPBlockUtils::blockIsLoopLatch(PrevBlock, VPLI)) { - - LLVM_DEBUG(dbgs() << "Linearizing: " << PrevBlock->getName() << "->" - << CurrBlock->getName() << "\n"); - - PrevBlock->clearSuccessors(); - CurrBlock->clearPredecessors(); - VPBlockUtils::connectBlocks(PrevBlock, CurrBlock); - } - - PrevBlock = CurrBlock; - } -} - -// Entry point. The driver function for the predicator. -void VPlanPredicator::predicate() { - // Predicate the blocks within Region. - predicateRegionRec(cast<VPRegionBlock>(Plan.getEntry())); - - // Linearlize the blocks with Region. - linearizeRegionRec(cast<VPRegionBlock>(Plan.getEntry())); -} - -VPlanPredicator::VPlanPredicator(VPlan &Plan) - : Plan(Plan), VPLI(&(Plan.getVPLoopInfo())) { - // FIXME: Predicator is currently computing the dominator information for the - // top region. Once we start storing dominator information in a VPRegionBlock, - // we can avoid this recalculation. - VPDomTree.recalculate(*(cast<VPRegionBlock>(Plan.getEntry()))); -} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.h deleted file mode 100644 index a5db9a54da3c..000000000000 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.h +++ /dev/null @@ -1,74 +0,0 @@ -//===-- VPlanPredicator.h ---------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file defines the VPlanPredicator class which contains the public -/// interfaces to predicate and linearize the VPlan region. 
-/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H -#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H - -#include "LoopVectorizationPlanner.h" -#include "VPlan.h" -#include "VPlanDominatorTree.h" - -namespace llvm { - -class VPlanPredicator { -private: - enum class EdgeType { - TRUE_EDGE, - FALSE_EDGE, - }; - - // VPlan being predicated. - VPlan &Plan; - - // VPLoopInfo for Plan's HCFG. - VPLoopInfo *VPLI; - - // Dominator tree for Plan's HCFG. - VPDominatorTree VPDomTree; - - // VPlan builder used to generate VPInstructions for block predicates. - VPBuilder Builder; - - /// Get the type of edge from \p FromBlock to \p ToBlock. Returns TRUE_EDGE if - /// \p ToBlock is either the unconditional successor or the conditional true - /// successor of \p FromBlock and FALSE_EDGE otherwise. - EdgeType getEdgeTypeBetween(VPBlockBase *FromBlock, VPBlockBase *ToBlock); - - /// Create and return VPValue corresponding to the predicate for the edge from - /// \p PredBB to \p CurrentBlock. - VPValue *getOrCreateNotPredicate(VPBasicBlock *PredBB, VPBasicBlock *CurrBB); - - /// Generate and return the result of ORing all the predicate VPValues in \p - /// Worklist. - VPValue *genPredicateTree(std::list<VPValue *> &Worklist); - - /// Create or propagate predicate for \p CurrBlock in region \p Region using - /// predicate(s) of its predecessor(s) - void createOrPropagatePredicates(VPBlockBase *CurrBlock, - VPRegionBlock *Region); - - /// Predicate the CFG within \p Region. - void predicateRegionRec(VPRegionBlock *Region); - - /// Linearize the CFG within \p Region. - void linearizeRegionRec(VPRegionBlock *Region); - -public: - VPlanPredicator(VPlan &Plan); - - /// Predicate Plan's HCFG. - void predicate(); -}; -} // end namespace llvm -#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp new file mode 100644 index 000000000000..92422b17457c --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -0,0 +1,840 @@ +//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains implementations for different VPlan recipes. 
+/// +//===----------------------------------------------------------------------===// + +#include "VPlan.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/IVDescriptors.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" +#include <cassert> + +using namespace llvm; + +extern cl::opt<bool> EnableVPlanNativePath; + +bool VPRecipeBase::mayWriteToMemory() const { + switch (getVPDefID()) { + case VPWidenMemoryInstructionSC: { + return cast<VPWidenMemoryInstructionRecipe>(this)->isStore(); + } + case VPReplicateSC: + case VPWidenCallSC: + return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) + ->mayWriteToMemory(); + case VPBranchOnMaskSC: + return false; + case VPWidenIntOrFpInductionSC: + case VPWidenCanonicalIVSC: + case VPWidenPHISC: + case VPBlendSC: + case VPWidenSC: + case VPWidenGEPSC: + case VPReductionSC: + case VPWidenSelectSC: { + const Instruction *I = + dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); + (void)I; + assert((!I || !I->mayWriteToMemory()) && + "underlying instruction may write to memory"); + return false; + } + default: + return true; + } +} + +bool VPRecipeBase::mayReadFromMemory() const { + switch (getVPDefID()) { + case VPWidenMemoryInstructionSC: { + return !cast<VPWidenMemoryInstructionRecipe>(this)->isStore(); + } + case VPReplicateSC: + case VPWidenCallSC: + return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) + ->mayReadFromMemory(); + case VPBranchOnMaskSC: + return false; + case VPWidenIntOrFpInductionSC: + case VPWidenCanonicalIVSC: + case VPWidenPHISC: + case VPBlendSC: + case VPWidenSC: + case VPWidenGEPSC: + case VPReductionSC: + case VPWidenSelectSC: { + const Instruction *I = + dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); + (void)I; + assert((!I || !I->mayReadFromMemory()) && + "underlying instruction may read from memory"); + return false; + } + default: + return true; + } +} + +bool VPRecipeBase::mayHaveSideEffects() const { + switch (getVPDefID()) { + case VPWidenIntOrFpInductionSC: + case VPWidenPointerInductionSC: + case VPWidenCanonicalIVSC: + case VPWidenPHISC: + case VPBlendSC: + case VPWidenSC: + case VPWidenGEPSC: + case VPReductionSC: + case VPWidenSelectSC: + case VPScalarIVStepsSC: { + const Instruction *I = + dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); + (void)I; + assert((!I || !I->mayHaveSideEffects()) && + "underlying instruction has side-effects"); + return false; + } + case VPReplicateSC: { + auto *R = cast<VPReplicateRecipe>(this); + return R->getUnderlyingInstr()->mayHaveSideEffects(); + } + default: + return true; + } +} + +void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { + auto Lane = VPLane::getLastLaneForVF(State.VF); + VPValue *ExitValue = getOperand(0); + if (Plan.isUniformAfterVectorization(ExitValue)) + Lane = VPLane::getFirstLane(); + Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)), + State.Builder.GetInsertBlock()); +} + +void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { + assert(!Parent && "Recipe already in some VPBasicBlock"); + 
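The memory-effect queries defined above give recipe-level code motion a simple legality gate. A sketch; R and InsertPos are assumed VPRecipeBase pointers, with InsertPos already placed in the target block:

  // Illustrative only: move a recipe only if it neither touches memory nor
  // has other side effects.
  if (!R->mayHaveSideEffects() && !R->mayReadOrWriteMemory())
    R->moveAfter(InsertPos);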
assert(InsertPos->getParent() && + "Insertion position not in any VPBasicBlock"); + Parent = InsertPos->getParent(); + Parent->getRecipeList().insert(InsertPos->getIterator(), this); +} + +void VPRecipeBase::insertBefore(VPBasicBlock &BB, + iplist<VPRecipeBase>::iterator I) { + assert(!Parent && "Recipe already in some VPBasicBlock"); + assert(I == BB.end() || I->getParent() == &BB); + Parent = &BB; + BB.getRecipeList().insert(I, this); +} + +void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { + assert(!Parent && "Recipe already in some VPBasicBlock"); + assert(InsertPos->getParent() && + "Insertion position not in any VPBasicBlock"); + Parent = InsertPos->getParent(); + Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this); +} + +void VPRecipeBase::removeFromParent() { + assert(getParent() && "Recipe not in any VPBasicBlock"); + getParent()->getRecipeList().remove(getIterator()); + Parent = nullptr; +} + +iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { + assert(getParent() && "Recipe not in any VPBasicBlock"); + return getParent()->getRecipeList().erase(getIterator()); +} + +void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { + removeFromParent(); + insertAfter(InsertPos); +} + +void VPRecipeBase::moveBefore(VPBasicBlock &BB, + iplist<VPRecipeBase>::iterator I) { + removeFromParent(); + insertBefore(BB, I); +} + +void VPInstruction::generateInstruction(VPTransformState &State, + unsigned Part) { + IRBuilderBase &Builder = State.Builder; + Builder.SetCurrentDebugLocation(DL); + + if (Instruction::isBinaryOp(getOpcode())) { + Value *A = State.get(getOperand(0), Part); + Value *B = State.get(getOperand(1), Part); + Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B); + State.set(this, V, Part); + return; + } + + switch (getOpcode()) { + case VPInstruction::Not: { + Value *A = State.get(getOperand(0), Part); + Value *V = Builder.CreateNot(A); + State.set(this, V, Part); + break; + } + case VPInstruction::ICmpULE: { + Value *IV = State.get(getOperand(0), Part); + Value *TC = State.get(getOperand(1), Part); + Value *V = Builder.CreateICmpULE(IV, TC); + State.set(this, V, Part); + break; + } + case Instruction::Select: { + Value *Cond = State.get(getOperand(0), Part); + Value *Op1 = State.get(getOperand(1), Part); + Value *Op2 = State.get(getOperand(2), Part); + Value *V = Builder.CreateSelect(Cond, Op1, Op2); + State.set(this, V, Part); + break; + } + case VPInstruction::ActiveLaneMask: { + // Get first lane of vector induction variable. + Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); + // Get the original loop tripcount. + Value *ScalarTC = State.get(getOperand(1), Part); + + auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); + auto *PredTy = VectorType::get(Int1Ty, State.VF); + Instruction *Call = Builder.CreateIntrinsic( + Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, + {VIVElem0, ScalarTC}, nullptr, "active.lane.mask"); + State.set(this, Call, Part); + break; + } + case VPInstruction::FirstOrderRecurrenceSplice: { + // Generate code to combine the previous and current values in vector v3. + // + // vector.ph: + // v_init = vector(..., ..., ..., a[-1]) + // br vector.body + // + // vector.body + // i = phi [0, vector.ph], [i+4, vector.body] + // v1 = phi [v_init, vector.ph], [v2, vector.body] + // v2 = a[i, i+1, i+2, i+3]; + // v3 = vector(v1(3), v2(0, 1, 2)) + + // For the first part, use the recurrence phi (v1), otherwise v2. 
+ auto *V1 = State.get(getOperand(0), 0); + Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1); + if (!PartMinus1->getType()->isVectorTy()) { + State.set(this, PartMinus1, Part); + } else { + Value *V2 = State.get(getOperand(1), Part); + State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1), Part); + } + break; + } + case VPInstruction::CanonicalIVIncrement: + case VPInstruction::CanonicalIVIncrementNUW: { + Value *Next = nullptr; + if (Part == 0) { + bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW; + auto *Phi = State.get(getOperand(0), 0); + // The loop step is equal to the vectorization factor (num of SIMD + // elements) times the unroll factor (num of SIMD instructions). + Value *Step = + createStepForVF(Builder, Phi->getType(), State.VF, State.UF); + Next = Builder.CreateAdd(Phi, Step, "index.next", IsNUW, false); + } else { + Next = State.get(this, 0); + } + + State.set(this, Next, Part); + break; + } + case VPInstruction::BranchOnCond: { + if (Part != 0) + break; + + Value *Cond = State.get(getOperand(0), VPIteration(Part, 0)); + VPRegionBlock *ParentRegion = getParent()->getParent(); + VPBasicBlock *Header = ParentRegion->getEntryBasicBlock(); + + // Replace the temporary unreachable terminator with a new conditional + // branch, hooking it up to backward destination for exiting blocks now and + // to forward destination(s) later when they are created. + BranchInst *CondBr = + Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr); + + if (getParent()->isExiting()) + CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]); + + CondBr->setSuccessor(0, nullptr); + Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); + break; + } + case VPInstruction::BranchOnCount: { + if (Part != 0) + break; + // First create the compare. + Value *IV = State.get(getOperand(0), Part); + Value *TC = State.get(getOperand(1), Part); + Value *Cond = Builder.CreateICmpEQ(IV, TC); + + // Now create the branch. + auto *Plan = getParent()->getPlan(); + VPRegionBlock *TopRegion = Plan->getVectorLoopRegion(); + VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock(); + + // Replace the temporary unreachable terminator with a new conditional + // branch, hooking it up to backward destination (the header) now and to the + // forward destination (the exit/middle block) later when it is created. + // Note that CreateCondBr expects a valid BB as first argument, so we need + // to set it to nullptr later. 
+ BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), + State.CFG.VPBB2IRBB[Header]); + CondBr->setSuccessor(0, nullptr); + Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); + break; + } + default: + llvm_unreachable("Unsupported opcode for instruction"); + } +} + +void VPInstruction::execute(VPTransformState &State) { + assert(!State.Instance && "VPInstruction executing an Instance"); + IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); + State.Builder.setFastMathFlags(FMF); + for (unsigned Part = 0; Part < State.UF; ++Part) + generateInstruction(State, Part); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPInstruction::dump() const { + VPSlotTracker SlotTracker(getParent()->getPlan()); + print(dbgs(), "", SlotTracker); +} + +void VPInstruction::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EMIT "; + + if (hasResult()) { + printAsOperand(O, SlotTracker); + O << " = "; + } + + switch (getOpcode()) { + case VPInstruction::Not: + O << "not"; + break; + case VPInstruction::ICmpULE: + O << "icmp ule"; + break; + case VPInstruction::SLPLoad: + O << "combined load"; + break; + case VPInstruction::SLPStore: + O << "combined store"; + break; + case VPInstruction::ActiveLaneMask: + O << "active lane mask"; + break; + case VPInstruction::FirstOrderRecurrenceSplice: + O << "first-order splice"; + break; + case VPInstruction::CanonicalIVIncrement: + O << "VF * UF + "; + break; + case VPInstruction::CanonicalIVIncrementNUW: + O << "VF * UF +(nuw) "; + break; + case VPInstruction::BranchOnCond: + O << "branch-on-cond"; + break; + case VPInstruction::BranchOnCount: + O << "branch-on-count "; + break; + default: + O << Instruction::getOpcodeName(getOpcode()); + } + + O << FMF; + + for (const VPValue *Operand : operands()) { + O << " "; + Operand->printAsOperand(O, SlotTracker); + } + + if (DL) { + O << ", !dbg "; + DL.print(O); + } +} +#endif + +void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) { + // Make sure the VPInstruction is a floating-point operation. + assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul || + Opcode == Instruction::FNeg || Opcode == Instruction::FSub || + Opcode == Instruction::FDiv || Opcode == Instruction::FRem || + Opcode == Instruction::FCmp) && + "this op can't take fast-math flags"); + FMF = FMFNew; +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN-CALL "; + + auto *CI = cast<CallInst>(getUnderlyingInstr()); + if (CI->getType()->isVoidTy()) + O << "void "; + else { + printAsOperand(O, SlotTracker); + O << " = "; + } + + O << "call @" << CI->getCalledFunction()->getName() << "("; + printOperands(O, SlotTracker); + O << ")"; +} + +void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN-SELECT "; + printAsOperand(O, SlotTracker); + O << " = select "; + getOperand(0)->printAsOperand(O, SlotTracker); + O << ", "; + getOperand(1)->printAsOperand(O, SlotTracker); + O << ", "; + getOperand(2)->printAsOperand(O, SlotTracker); + O << (InvariantCond ? 
" (condition is loop invariant)" : ""); +} + +void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN "; + printAsOperand(O, SlotTracker); + O << " = " << getUnderlyingInstr()->getOpcodeName() << " "; + printOperands(O, SlotTracker); +} + +void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN-INDUCTION"; + if (getTruncInst()) { + O << "\\l\""; + O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; + O << " +\n" << Indent << "\" "; + getVPValue(0)->printAsOperand(O, SlotTracker); + } else + O << " " << VPlanIngredient(IV); + + O << ", "; + getStepValue()->printAsOperand(O, SlotTracker); +} +#endif + +bool VPWidenIntOrFpInductionRecipe::isCanonical() const { + auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); + auto *StepC = dyn_cast<SCEVConstant>(getInductionDescriptor().getStep()); + return StartC && StartC->isZero() && StepC && StepC->isOne(); +} + +VPCanonicalIVPHIRecipe *VPScalarIVStepsRecipe::getCanonicalIV() const { + return cast<VPCanonicalIVPHIRecipe>(getOperand(0)); +} + +bool VPScalarIVStepsRecipe::isCanonical() const { + auto *CanIV = getCanonicalIV(); + // The start value of the steps-recipe must match the start value of the + // canonical induction and it must step by 1. + if (CanIV->getStartValue() != getStartValue()) + return false; + auto *StepVPV = getStepValue(); + if (StepVPV->getDef()) + return false; + auto *StepC = dyn_cast_or_null<ConstantInt>(StepVPV->getLiveInIRValue()); + return StepC && StepC->isOne(); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent; + printAsOperand(O, SlotTracker); + O << Indent << "= SCALAR-STEPS "; + printOperands(O, SlotTracker); +} + +void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN-GEP "; + O << (IsPtrLoopInvariant ? "Inv" : "Var"); + size_t IndicesNumber = IsIndexLoopInvariant.size(); + for (size_t I = 0; I < IndicesNumber; ++I) + O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]"; + + O << " "; + printAsOperand(O, SlotTracker); + O << " = getelementptr "; + printOperands(O, SlotTracker); +} + +void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "BLEND "; + Phi->printAsOperand(O, false); + O << " ="; + if (getNumIncomingValues() == 1) { + // Not a User of any mask: not really blending, this is a + // single-predecessor phi. + O << " "; + getIncomingValue(0)->printAsOperand(O, SlotTracker); + } else { + for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { + O << " "; + getIncomingValue(I)->printAsOperand(O, SlotTracker); + O << "/"; + getMask(I)->printAsOperand(O, SlotTracker); + } + } +} + +void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "REDUCE "; + printAsOperand(O, SlotTracker); + O << " = "; + getChainOp()->printAsOperand(O, SlotTracker); + O << " +"; + if (isa<FPMathOperator>(getUnderlyingInstr())) + O << getUnderlyingInstr()->getFastMathFlags(); + O << " reduce." 
<< Instruction::getOpcodeName(RdxDesc->getOpcode()) << " ("; + getVecOp()->printAsOperand(O, SlotTracker); + if (getCondOp()) { + O << ", "; + getCondOp()->printAsOperand(O, SlotTracker); + } + O << ")"; + if (RdxDesc->IntermediateStore) + O << " (with final reduction value stored in invariant address sank " + "outside of loop)"; +} + +void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << (IsUniform ? "CLONE " : "REPLICATE "); + + if (!getUnderlyingInstr()->getType()->isVoidTy()) { + printAsOperand(O, SlotTracker); + O << " = "; + } + if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { + O << "call @" << CB->getCalledFunction()->getName() << "("; + interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), + O, [&O, &SlotTracker](VPValue *Op) { + Op->printAsOperand(O, SlotTracker); + }); + O << ")"; + } else { + O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " "; + printOperands(O, SlotTracker); + } + + if (AlsoPack) + O << " (S->V)"; +} + +void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "PHI-PREDICATED-INSTRUCTION "; + printAsOperand(O, SlotTracker); + O << " = "; + printOperands(O, SlotTracker); +} + +void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN "; + + if (!isStore()) { + getVPSingleValue()->printAsOperand(O, SlotTracker); + O << " = "; + } + O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " "; + + printOperands(O, SlotTracker); +} +#endif + +void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { + Value *Start = getStartValue()->getLiveInIRValue(); + PHINode *EntryPart = PHINode::Create( + Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt()); + + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + EntryPart->addIncoming(Start, VectorPH); + EntryPart->setDebugLoc(DL); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) + State.set(this, EntryPart, Part); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EMIT "; + printAsOperand(O, SlotTracker); + O << " = CANONICAL-INDUCTION"; +} +#endif + +bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) { + bool IsUniform = vputils::onlyFirstLaneUsed(this); + return all_of(users(), + [&](const VPUser *U) { return U->usesScalars(this); }) && + (IsUniform || !VF.isScalable()); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EMIT "; + printAsOperand(O, SlotTracker); + O << " = WIDEN-POINTER-INDUCTION "; + getStartValue()->printAsOperand(O, SlotTracker); + O << ", " << *IndDesc.getStep(); +} +#endif + +void VPExpandSCEVRecipe::execute(VPTransformState &State) { + assert(!State.Instance && "cannot be used in per-lane"); + const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout(); + SCEVExpander Exp(SE, DL, "induction"); + + Value *Res = Exp.expandCodeFor(Expr, Expr->getType(), + &*State.Builder.GetInsertPoint()); + + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) + State.set(this, Res, Part); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPExpandSCEVRecipe::print(raw_ostream &O, 
const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EMIT "; + getVPSingleValue()->printAsOperand(O, SlotTracker); + O << " = EXPAND SCEV " << *Expr; +} +#endif + +void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { + Value *CanonicalIV = State.get(getOperand(0), 0); + Type *STy = CanonicalIV->getType(); + IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); + ElementCount VF = State.VF; + Value *VStart = VF.isScalar() + ? CanonicalIV + : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { + Value *VStep = createStepForVF(Builder, STy, VF, Part); + if (VF.isVector()) { + VStep = Builder.CreateVectorSplat(VF, VStep); + VStep = + Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); + } + Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); + State.set(this, CanonicalVectorIV, Part); + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EMIT "; + printAsOperand(O, SlotTracker); + O << " = WIDEN-CANONICAL-INDUCTION "; + printOperands(O, SlotTracker); +} +#endif + +void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { + auto &Builder = State.Builder; + // Create a vector from the initial value. + auto *VectorInit = getStartValue()->getLiveInIRValue(); + + Type *VecTy = State.VF.isScalar() + ? VectorInit->getType() + : VectorType::get(VectorInit->getType(), State.VF); + + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + if (State.VF.isVector()) { + auto *IdxTy = Builder.getInt32Ty(); + auto *One = ConstantInt::get(IdxTy, 1); + IRBuilder<>::InsertPointGuard Guard(Builder); + Builder.SetInsertPoint(VectorPH->getTerminator()); + auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF); + auto *LastIdx = Builder.CreateSub(RuntimeVF, One); + VectorInit = Builder.CreateInsertElement( + PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init"); + } + + // Create a phi node for the new recurrence. + PHINode *EntryPart = PHINode::Create( + VecTy, 2, "vector.recur", &*State.CFG.PrevBB->getFirstInsertionPt()); + EntryPart->addIncoming(VectorInit, VectorPH); + State.set(this, EntryPart, 0); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "FIRST-ORDER-RECURRENCE-PHI "; + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif + +void VPReductionPHIRecipe::execute(VPTransformState &State) { + PHINode *PN = cast<PHINode>(getUnderlyingValue()); + auto &Builder = State.Builder; + + // In order to support recurrences we need to be able to vectorize Phi nodes. + // Phi nodes have cycles, so we need to vectorize them in two stages. This is + // stage #1: We create a new vector PHI node with no incoming edges. We'll use + // this value when we vectorize all of the instructions that use the PHI. + bool ScalarPHI = State.VF.isScalar() || IsInLoop; + Type *VecTy = + ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF); + + BasicBlock *HeaderBB = State.CFG.PrevBB; + assert(State.CurrentVectorLoop->getHeader() == HeaderBB && + "recipe must be in the vector loop header"); + unsigned LastPartForNewPhi = isOrdered() ? 
1 : State.UF; + for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { + Value *EntryPart = + PHINode::Create(VecTy, 2, "vec.phi", &*HeaderBB->getFirstInsertionPt()); + State.set(this, EntryPart, Part); + } + + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + + // Reductions do not have to start at zero. They can start with + // any loop invariant values. + VPValue *StartVPV = getStartValue(); + Value *StartV = StartVPV->getLiveInIRValue(); + + Value *Iden = nullptr; + RecurKind RK = RdxDesc.getRecurrenceKind(); + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || + RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) { + // MinMax reduction have the start value as their identify. + if (ScalarPHI) { + Iden = StartV; + } else { + IRBuilderBase::InsertPointGuard IPBuilder(Builder); + Builder.SetInsertPoint(VectorPH->getTerminator()); + StartV = Iden = + Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident"); + } + } else { + Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(), + RdxDesc.getFastMathFlags()); + + if (!ScalarPHI) { + Iden = Builder.CreateVectorSplat(State.VF, Iden); + IRBuilderBase::InsertPointGuard IPBuilder(Builder); + Builder.SetInsertPoint(VectorPH->getTerminator()); + Constant *Zero = Builder.getInt32(0); + StartV = Builder.CreateInsertElement(Iden, StartV, Zero); + } + } + + for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { + Value *EntryPart = State.get(this, Part); + // Make sure to add the reduction start value only to the + // first unroll part. + Value *StartVal = (Part == 0) ? StartV : Iden; + cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH); + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN-REDUCTION-PHI "; + + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif + +void VPWidenPHIRecipe::execute(VPTransformState &State) { + assert(EnableVPlanNativePath && + "Non-native vplans are not expected to have VPWidenPHIRecipes."); + + // Currently we enter here in the VPlan-native path for non-induction + // PHIs where all control flow is uniform. We simply widen these PHIs. + // Create a vector phi with no operands - the vector phi operands will be + // set at the end of vector code generation. + VPBasicBlock *Parent = getParent(); + VPRegionBlock *LoopRegion = Parent->getEnclosingLoopRegion(); + unsigned StartIdx = 0; + // For phis in header blocks of loop regions, use the index of the value + // coming from the preheader. + if (LoopRegion->getEntryBasicBlock() == Parent) { + for (unsigned I = 0; I < getNumOperands(); ++I) { + if (getIncomingBlock(I) == + LoopRegion->getSinglePredecessor()->getExitingBasicBlock()) + StartIdx = I; + } + } + Value *Op0 = State.get(getOperand(StartIdx), 0); + Type *VecTy = Op0->getType(); + Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi"); + State.set(this, VecPhi, 0); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN-PHI "; + + auto *OriginalPhi = cast<PHINode>(getUnderlyingValue()); + // Unless all incoming values are modeled in VPlan print the original PHI + // directly. + // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming + // values as VPValues. 
+ if (getNumOperands() != OriginalPhi->getNumOperands()) { + O << VPlanIngredient(OriginalPhi); + return; + } + + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp index 9e19e172dea5..3a7e77fd9efd 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -15,16 +15,10 @@ //===----------------------------------------------------------------------===// #include "VPlan.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/PostOrderIterator.h" +#include "VPlanValue.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Twine.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/VectorUtils.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/CFG.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" @@ -32,12 +26,9 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include <algorithm> #include <cassert> -#include <iterator> #include <utility> using namespace llvm; @@ -396,7 +387,7 @@ VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) { return markFailed(); assert(getOpcode(Values) && "Opcodes for all values must match"); - unsigned ValuesOpcode = getOpcode(Values).getValue(); + unsigned ValuesOpcode = *getOpcode(Values); SmallVector<VPValue *, 4> CombinedOperands; if (areCommutative(Values)) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 70ce773a8a85..cca484e13bf1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -13,6 +13,8 @@ #include "VPlanTransforms.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/IVDescriptors.h" using namespace llvm; @@ -22,17 +24,15 @@ void VPlanTransforms::VPInstructionsToVPRecipes( GetIntOrFpInductionDescriptor, SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE) { - auto *TopRegion = cast<VPRegionBlock>(Plan->getEntry()); - ReversePostOrderTraversal<VPBlockBase *> RPOT(TopRegion->getEntry()); - - for (VPBlockBase *Base : RPOT) { - // Do not widen instructions in pre-header and exit blocks. - if (Base->getNumPredecessors() == 0 || Base->getNumSuccessors() == 0) - continue; - - VPBasicBlock *VPBB = Base->getEntryBasicBlock(); + ReversePostOrderTraversal<VPBlockRecursiveTraversalWrapper<VPBlockBase *>> + RPOT(Plan->getEntry()); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) { + VPRecipeBase *Term = VPBB->getTerminator(); + auto EndIter = Term ? Term->getIterator() : VPBB->end(); // Introduce each ingredient into VPlan. 
- for (VPRecipeBase &Ingredient : llvm::make_early_inc_range(*VPBB)) { + for (VPRecipeBase &Ingredient : + make_early_inc_range(make_range(VPBB->begin(), EndIter))) { + VPValue *VPV = Ingredient.getVPSingleValue(); Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue()); if (DeadInstructions.count(Inst)) { @@ -47,8 +47,10 @@ void VPlanTransforms::VPInstructionsToVPRecipes( auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue()); if (const auto *II = GetIntOrFpInductionDescriptor(Phi)) { VPValue *Start = Plan->getOrAddVPValue(II->getStartValue()); + VPValue *Step = + vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE); NewRecipe = - new VPWidenIntOrFpInductionRecipe(Phi, Start, *II, false, true); + new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II, true); } else { Plan->addVPValue(Phi, VPPhi); continue; @@ -295,14 +297,19 @@ bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) { } void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) { - SmallVector<std::pair<VPRecipeBase *, VPValue *>> CastsToRemove; - for (auto &Phi : Plan.getEntry()->getEntryBasicBlock()->phis()) { + for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi); if (!IV || IV->getTruncInst()) continue; - // Visit all casts connected to IV and in Casts. Collect them. - // remember them for removal. + // A sequence of IR Casts has potentially been recorded for IV, which + // *must be bypassed* when the IV is vectorized, because the vectorized IV + // will produce the desired casted value. This sequence forms a def-use + // chain and is provided in reverse order, ending with the cast that uses + // the IV phi. Search for the recipe of the last cast in the chain and + // replace it with the original IV. Note that only the final cast is + // expected to have users outside the cast-chain and the dead casts left + // over will be cleaned up later. auto &Casts = IV->getInductionDescriptor().getCastInsts(); VPValue *FindMyCast = IV; for (Instruction *IRCast : reverse(Casts)) { @@ -315,14 +322,9 @@ void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) { break; } } - assert(FoundUserCast && "Missing a cast to remove"); - CastsToRemove.emplace_back(FoundUserCast, IV); FindMyCast = FoundUserCast->getVPSingleValue(); } - } - for (auto &E : CastsToRemove) { - E.first->getVPSingleValue()->replaceAllUsesWith(E.second); - E.first->eraseFromParent(); + FindMyCast->replaceAllUsesWith(IV); } } @@ -358,3 +360,73 @@ void VPlanTransforms::removeRedundantCanonicalIVs(VPlan &Plan) { } } } + +void VPlanTransforms::removeDeadRecipes(VPlan &Plan) { + ReversePostOrderTraversal<VPBlockRecursiveTraversalWrapper<VPBlockBase *>> + RPOT(Plan.getEntry()); + + for (VPBasicBlock *VPBB : reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) { + // The recipes in the block are processed in reverse order, to catch chains + // of dead recipes. 
+ for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) { + if (R.mayHaveSideEffects() || any_of(R.definedValues(), [](VPValue *V) { + return V->getNumUsers() > 0; + })) + continue; + R.eraseFromParent(); + } + } +} + +void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) { + SmallVector<VPRecipeBase *> ToRemove; + VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); + bool HasOnlyVectorVFs = !Plan.hasVF(ElementCount::getFixed(1)); + for (VPRecipeBase &Phi : HeaderVPBB->phis()) { + auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi); + if (!IV) + continue; + if (HasOnlyVectorVFs && + none_of(IV->users(), [IV](VPUser *U) { return U->usesScalars(IV); })) + continue; + + const InductionDescriptor &ID = IV->getInductionDescriptor(); + VPValue *Step = + vputils::getOrCreateVPValueForSCEVExpr(Plan, ID.getStep(), SE); + Instruction *TruncI = IV->getTruncInst(); + VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe( + IV->getPHINode()->getType(), ID, Plan.getCanonicalIV(), + IV->getStartValue(), Step, TruncI ? TruncI->getType() : nullptr); + HeaderVPBB->insert(Steps, HeaderVPBB->getFirstNonPhi()); + + // Update scalar users of IV to use Step instead. Use SetVector to ensure + // the list of users doesn't contain duplicates. + SetVector<VPUser *> Users(IV->user_begin(), IV->user_end()); + for (VPUser *U : Users) { + if (HasOnlyVectorVFs && !U->usesScalars(IV)) + continue; + for (unsigned I = 0, E = U->getNumOperands(); I != E; I++) { + if (U->getOperand(I) != IV) + continue; + U->setOperand(I, Steps); + } + } + } +} + +void VPlanTransforms::removeRedundantExpandSCEVRecipes(VPlan &Plan) { + DenseMap<const SCEV *, VPValue *> SCEV2VPV; + + for (VPRecipeBase &R : + make_early_inc_range(*Plan.getEntry()->getEntryBasicBlock())) { + auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R); + if (!ExpR) + continue; + + auto I = SCEV2VPV.insert({ExpR->getSCEV(), ExpR}); + if (I.second) + continue; + ExpR->replaceAllUsesWith(I.first->second); + ExpR->eraseFromParent(); + } +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index e74409a86466..3372e255dff7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -14,8 +14,7 @@ #define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H #include "VPlan.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" +#include "llvm/ADT/STLFunctionalExtras.h" namespace llvm { @@ -23,6 +22,7 @@ class InductionDescriptor; class Instruction; class PHINode; class ScalarEvolution; +class Loop; struct VPlanTransforms { /// Replaces the VPInstructions in \p Plan with corresponding @@ -49,6 +49,18 @@ struct VPlanTransforms { /// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV /// recipe, if it exists. static void removeRedundantCanonicalIVs(VPlan &Plan); + + static void removeDeadRecipes(VPlan &Plan); + + /// If any user of a VPWidenIntOrFpInductionRecipe needs scalar values, + /// provide them by building scalar steps off of the canonical scalar IV and + /// update the original IV's users. This is an optional optimization to reduce + /// the needs of vector extracts. 
+ static void optimizeInductions(VPlan &Plan, ScalarEvolution &SE); + + /// Remove redundant EpxandSCEVRecipes in \p Plan's entry block by replacing + /// them with already existing recipes expanding the same SCEV expression. + static void removeRedundantExpandSCEVRecipes(VPlan &Plan); }; } // namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h index 5296d2b9485c..5fc676834331 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -106,6 +106,7 @@ public: VPVFirstOrderRecurrencePHISC, VPVWidenPHISC, VPVWidenIntOrFpInductionSC, + VPVWidenPointerInductionSC, VPVPredInstPHI, VPVReductionPHISC, }; @@ -207,9 +208,7 @@ public: /// Subclass identifier (for isa/dyn_cast). enum class VPUserID { Recipe, - // TODO: Currently VPUsers are used in VPBlockBase, but in the future the - // only VPUsers should either be recipes or live-outs. - Block + LiveOut, }; private: @@ -286,6 +285,22 @@ public: /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPDef *Recipe); + + /// Returns true if the VPUser uses scalars of operand \p Op. Conservatively + /// returns if only first (scalar) lane is used, as default. + virtual bool usesScalars(const VPValue *Op) const { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return onlyFirstLaneUsed(Op); + } + + /// Returns true if the VPUser only uses the first lane of operand \p Op. + /// Conservatively returns false. + virtual bool onlyFirstLaneUsed(const VPValue *Op) const { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return false; + } }; /// This class augments a recipe with a set of VPValues defined by the recipe. @@ -327,10 +342,12 @@ public: /// type identification. using VPRecipeTy = enum { VPBranchOnMaskSC, + VPExpandSCEVSC, VPInstructionSC, VPInterleaveSC, VPReductionSC, VPReplicateSC, + VPScalarIVStepsSC, VPWidenCallSC, VPWidenCanonicalIVSC, VPWidenGEPSC, @@ -344,6 +361,7 @@ public: VPFirstOrderRecurrencePHISC, VPWidenPHISC, VPWidenIntOrFpInductionSC, + VPWidenPointerInductionSC, VPPredInstPHISC, VPReductionPHISC, VPFirstPHISC = VPBlendSC, diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index d36f250995e1..f917883145c0 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -43,17 +43,20 @@ static bool hasDuplicates(const SmallVectorImpl<VPBlockBase *> &VPBlockVec) { /// \p Region. Checks in this function are generic for VPBlockBases. They are /// not specific for VPBasicBlocks or VPRegionBlocks. static void verifyBlocksInRegion(const VPRegionBlock *Region) { - for (const VPBlockBase *VPB : - make_range(df_iterator<const VPBlockBase *>::begin(Region->getEntry()), - df_iterator<const VPBlockBase *>::end(Region->getExit()))) { + for (const VPBlockBase *VPB : make_range( + df_iterator<const VPBlockBase *>::begin(Region->getEntry()), + df_iterator<const VPBlockBase *>::end(Region->getExiting()))) { // Check block's parent. assert(VPB->getParent() == Region && "VPBlockBase has wrong parent"); + auto *VPBB = dyn_cast<VPBasicBlock>(VPB); // Check block's condition bit. 
- if (VPB->getNumSuccessors() > 1) - assert(VPB->getCondBit() && "Missing condition bit!"); + if (VPB->getNumSuccessors() > 1 || (VPBB && VPBB->isExiting())) + assert(VPBB && VPBB->getTerminator() && + "Block has multiple successors but doesn't " + "have a proper branch recipe!"); else - assert(!VPB->getCondBit() && "Unexpected condition bit!"); + assert((!VPBB || !VPBB->getTerminator()) && "Unexpected branch recipe!"); // Check block's successors. const auto &Successors = VPB->getSuccessors(); @@ -94,13 +97,14 @@ static void verifyBlocksInRegion(const VPRegionBlock *Region) { /// VPBlockBases. Do not recurse inside nested VPRegionBlocks. static void verifyRegion(const VPRegionBlock *Region) { const VPBlockBase *Entry = Region->getEntry(); - const VPBlockBase *Exit = Region->getExit(); + const VPBlockBase *Exiting = Region->getExiting(); - // Entry and Exit shouldn't have any predecessor/successor, respectively. + // Entry and Exiting shouldn't have any predecessor/successor, respectively. assert(!Entry->getNumPredecessors() && "Region entry has predecessors."); - assert(!Exit->getNumSuccessors() && "Region exit has successors."); + assert(!Exiting->getNumSuccessors() && + "Region exiting block has successors."); (void)Entry; - (void)Exit; + (void)Exiting; verifyBlocksInRegion(Region); } @@ -111,9 +115,9 @@ static void verifyRegionRec(const VPRegionBlock *Region) { verifyRegion(Region); // Recurse inside nested regions. - for (const VPBlockBase *VPB : - make_range(df_iterator<const VPBlockBase *>::begin(Region->getEntry()), - df_iterator<const VPBlockBase *>::end(Region->getExit()))) { + for (const VPBlockBase *VPB : make_range( + df_iterator<const VPBlockBase *>::begin(Region->getEntry()), + df_iterator<const VPBlockBase *>::end(Region->getExiting()))) { if (const auto *SubRegion = dyn_cast<VPRegionBlock>(VPB)) verifyRegionRec(SubRegion); } @@ -157,7 +161,7 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { } } - const VPRegionBlock *TopRegion = cast<VPRegionBlock>(Plan.getEntry()); + const VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); const VPBasicBlock *Entry = dyn_cast<VPBasicBlock>(TopRegion->getEntry()); if (!Entry) { errs() << "VPlan entry block is not a VPBasicBlock\n"; @@ -170,19 +174,19 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { return false; } - const VPBasicBlock *Exit = dyn_cast<VPBasicBlock>(TopRegion->getExit()); - if (!Exit) { - errs() << "VPlan exit block is not a VPBasicBlock\n"; + const VPBasicBlock *Exiting = dyn_cast<VPBasicBlock>(TopRegion->getExiting()); + if (!Exiting) { + errs() << "VPlan exiting block is not a VPBasicBlock\n"; return false; } - if (Exit->empty()) { - errs() << "VPlan vector loop exit must end with BranchOnCount " + if (Exiting->empty()) { + errs() << "VPlan vector loop exiting block must end with BranchOnCount " "VPInstruction but is empty\n"; return false; } - auto *LastInst = dyn_cast<VPInstruction>(std::prev(Exit->end())); + auto *LastInst = dyn_cast<VPInstruction>(std::prev(Exiting->end())); if (!LastInst || LastInst->getOpcode() != VPInstruction::BranchOnCount) { errs() << "VPlan vector loop exit must end with BranchOnCount " "VPInstruction\n"; @@ -197,10 +201,17 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { errs() << "region entry block has predecessors\n"; return false; } - if (Region->getExit()->getNumSuccessors() != 0) { - errs() << "region exit block has successors\n"; + if (Region->getExiting()->getNumSuccessors() != 0) { + errs() << "region exiting block has successors\n"; return 
false; } } + + for (auto &KV : Plan.getLiveOuts()) + if (KV.second->getNumOperands() != 1) { + errs() << "live outs must have a single operand\n"; + return false; + } + return true; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 258f6c67e54d..90598937affc 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -103,11 +103,13 @@ private: bool foldSingleElementStore(Instruction &I); bool scalarizeLoadExtract(Instruction &I); bool foldShuffleOfBinops(Instruction &I); + bool foldShuffleFromReductions(Instruction &I); + bool foldSelectShuffle(Instruction &I, bool FromReduction = false); void replaceValue(Value &Old, Value &New) { Old.replaceAllUsesWith(&New); - New.takeName(&Old); if (auto *NewI = dyn_cast<Instruction>(&New)) { + New.takeName(&Old); Worklist.pushUsersToWorkList(*NewI); Worklist.pushValue(NewI); } @@ -255,12 +257,12 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) { ExtractElementInst *VectorCombine::getShuffleExtract( ExtractElementInst *Ext0, ExtractElementInst *Ext1, unsigned PreferredExtractIndex = InvalidIndex) const { - assert(isa<ConstantInt>(Ext0->getIndexOperand()) && - isa<ConstantInt>(Ext1->getIndexOperand()) && - "Expected constant extract indexes"); + auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand()); + auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand()); + assert(Index0C && Index1C && "Expected constant extract indexes"); - unsigned Index0 = cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue(); - unsigned Index1 = cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue(); + unsigned Index0 = Index0C->getZExtValue(); + unsigned Index1 = Index1C->getZExtValue(); // If the extract indexes are identical, no shuffle is needed. if (Index0 == Index1) @@ -306,9 +308,10 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, const Instruction &I, ExtractElementInst *&ConvertToShuffle, unsigned PreferredExtractIndex) { - assert(isa<ConstantInt>(Ext0->getOperand(1)) && - isa<ConstantInt>(Ext1->getOperand(1)) && - "Expected constant extract indexes"); + auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getOperand(1)); + auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getOperand(1)); + assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes"); + unsigned Opcode = I.getOpcode(); Type *ScalarTy = Ext0->getType(); auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType()); @@ -331,8 +334,8 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, // Get cost estimates for the extract elements. These costs will factor into // both sequences. - unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue(); - unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue(); + unsigned Ext0Index = Ext0IndexC->getZExtValue(); + unsigned Ext1Index = Ext1IndexC->getZExtValue(); InstructionCost Extract0Cost = TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index); @@ -694,8 +697,9 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { ScalarInst->copyIRFlags(&I); // Fold the vector constants in the original vectors into a new base vector. - Constant *NewVecC = IsCmp ? ConstantExpr::getCompare(Pred, VecC0, VecC1) - : ConstantExpr::get(Opcode, VecC0, VecC1); + Value *NewVecC = + IsCmp ? 
Builder.CreateCmp(Pred, VecC0, VecC1) + : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1); Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index); replaceValue(I, *Insert); return true; @@ -1015,12 +1019,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { return false; NumInstChecked++; } - } - - if (!LastCheckedInst) - LastCheckedInst = UI; - else if (LastCheckedInst->comesBefore(UI)) LastCheckedInst = UI; + } auto ScalarIdx = canScalarizeAccess(FixedVT, UI->getOperand(1), &I, AC, DT); if (!ScalarIdx.isSafe()) { @@ -1117,6 +1117,339 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) { return true; } +/// Given a commutative reduction, the order of the input lanes does not alter +/// the results. We can use this to remove certain shuffles feeding the +/// reduction, removing the need to shuffle at all. +bool VectorCombine::foldShuffleFromReductions(Instruction &I) { + auto *II = dyn_cast<IntrinsicInst>(&I); + if (!II) + return false; + switch (II->getIntrinsicID()) { + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_mul: + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_xor: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_umin: + case Intrinsic::vector_reduce_umax: + break; + default: + return false; + } + + // Find all the inputs when looking through operations that do not alter the + // lane order (binops, for example). Currently we look for a single shuffle, + // and can ignore splat values. + std::queue<Value *> Worklist; + SmallPtrSet<Value *, 4> Visited; + ShuffleVectorInst *Shuffle = nullptr; + if (auto *Op = dyn_cast<Instruction>(I.getOperand(0))) + Worklist.push(Op); + + while (!Worklist.empty()) { + Value *CV = Worklist.front(); + Worklist.pop(); + if (Visited.contains(CV)) + continue; + + // Splats don't change the order, so can be safely ignored. + if (isSplatValue(CV)) + continue; + + Visited.insert(CV); + + if (auto *CI = dyn_cast<Instruction>(CV)) { + if (CI->isBinaryOp()) { + for (auto *Op : CI->operand_values()) + Worklist.push(Op); + continue; + } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) { + if (Shuffle && Shuffle != SV) + return false; + Shuffle = SV; + continue; + } + } + + // Anything else is currently an unknown node. + return false; + } + + if (!Shuffle) + return false; + + // Check all uses of the binary ops and shuffles are also included in the + // lane-invariant operations (Visited should be the list of lanewise + // instructions, including the shuffle that we found). + for (auto *V : Visited) + for (auto *U : V->users()) + if (!Visited.contains(U) && U != &I) + return false; + + FixedVectorType *VecType = + dyn_cast<FixedVectorType>(II->getOperand(0)->getType()); + if (!VecType) + return false; + FixedVectorType *ShuffleInputType = + dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType()); + if (!ShuffleInputType) + return false; + int NumInputElts = ShuffleInputType->getNumElements(); + + // Find the mask from sorting the lanes into order. This is most likely to + // become a identity or concat mask. Undef elements are pushed to the end. + SmallVector<int> ConcatMask; + Shuffle->getShuffleMask(ConcatMask); + sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; }); + bool UsesSecondVec = + any_of(ConcatMask, [&](int M) { return M >= NumInputElts; }); + InstructionCost OldCost = TTI.getShuffleCost( + UsesSecondVec ? 
TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType, + Shuffle->getShuffleMask()); + InstructionCost NewCost = TTI.getShuffleCost( + UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType, + ConcatMask); + + LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle + << "\n"); + LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost + << "\n"); + if (NewCost < OldCost) { + Builder.SetInsertPoint(Shuffle); + Value *NewShuffle = Builder.CreateShuffleVector( + Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask); + LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n"); + replaceValue(*Shuffle, *NewShuffle); + } + + // See if we can re-use foldSelectShuffle, getting it to reduce the size of + // the shuffle into a nicer order, as it can ignore the order of the shuffles. + return foldSelectShuffle(*Shuffle, true); +} + +/// This method looks for groups of shuffles acting on binops, of the form: +/// %x = shuffle ... +/// %y = shuffle ... +/// %a = binop %x, %y +/// %b = binop %x, %y +/// shuffle %a, %b, selectmask +/// We may, especially if the shuffle is wider than legal, be able to convert +/// the shuffle to a form where only parts of a and b need to be computed. On +/// architectures with no obvious "select" shuffle, this can reduce the total +/// number of operations if the target reports them as cheaper. +bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { + auto *SVI = dyn_cast<ShuffleVectorInst>(&I); + auto *VT = dyn_cast<FixedVectorType>(I.getType()); + if (!SVI || !VT) + return false; + auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0)); + auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1)); + if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() || + VT != Op0->getType()) + return false; + auto *SVI0A = dyn_cast<ShuffleVectorInst>(Op0->getOperand(0)); + auto *SVI0B = dyn_cast<ShuffleVectorInst>(Op0->getOperand(1)); + auto *SVI1A = dyn_cast<ShuffleVectorInst>(Op1->getOperand(0)); + auto *SVI1B = dyn_cast<ShuffleVectorInst>(Op1->getOperand(1)); + auto checkSVNonOpUses = [&](Instruction *I) { + if (!I || I->getOperand(0)->getType() != VT) + return true; + return any_of(I->users(), [&](User *U) { return U != Op0 && U != Op1; }); + }; + if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) || + checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B)) + return false; + + // Collect all the uses that are shuffles that we can transform together. We + // may not have a single shuffle, but a group that can all be transformed + // together profitably. + SmallVector<ShuffleVectorInst *> Shuffles; + auto collectShuffles = [&](Instruction *I) { + for (auto *U : I->users()) { + auto *SV = dyn_cast<ShuffleVectorInst>(U); + if (!SV || SV->getType() != VT) + return false; + if (!llvm::is_contained(Shuffles, SV)) + Shuffles.push_back(SV); + } + return true; + }; + if (!collectShuffles(Op0) || !collectShuffles(Op1)) + return false; + // From a reduction, we need to be processing a single shuffle, otherwise the + // other uses will not be lane-invariant. + if (FromReduction && Shuffles.size() > 1) + return false; + + // For each of the output shuffles, we try to sort all the first vector + // elements to the beginning, followed by the second array elements at the + // end. If the binops are legalized to smaller vectors, this may reduce total + // number of binops. We compute the ReconstructMask mask needed to convert + // back to the original lane order. 
+ SmallVector<int> V1, V2; + SmallVector<SmallVector<int>> ReconstructMasks; + int MaxV1Elt = 0, MaxV2Elt = 0; + unsigned NumElts = VT->getNumElements(); + for (ShuffleVectorInst *SVN : Shuffles) { + SmallVector<int> Mask; + SVN->getShuffleMask(Mask); + + // Check the operands are the same as the original, or reversed (in which + // case we need to commute the mask). + Value *SVOp0 = SVN->getOperand(0); + Value *SVOp1 = SVN->getOperand(1); + if (SVOp0 == Op1 && SVOp1 == Op0) { + std::swap(SVOp0, SVOp1); + ShuffleVectorInst::commuteShuffleMask(Mask, NumElts); + } + if (SVOp0 != Op0 || SVOp1 != Op1) + return false; + + // Calculate the reconstruction mask for this shuffle, as the mask needed to + // take the packed values from Op0/Op1 and reconstructing to the original + // order. + SmallVector<int> ReconstructMask; + for (unsigned I = 0; I < Mask.size(); I++) { + if (Mask[I] < 0) { + ReconstructMask.push_back(-1); + } else if (Mask[I] < static_cast<int>(NumElts)) { + MaxV1Elt = std::max(MaxV1Elt, Mask[I]); + auto It = find(V1, Mask[I]); + if (It != V1.end()) + ReconstructMask.push_back(It - V1.begin()); + else { + ReconstructMask.push_back(V1.size()); + V1.push_back(Mask[I]); + } + } else { + MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts); + auto It = find(V2, Mask[I] - NumElts); + if (It != V2.end()) + ReconstructMask.push_back(NumElts + It - V2.begin()); + else { + ReconstructMask.push_back(NumElts + V2.size()); + V2.push_back(Mask[I] - NumElts); + } + } + } + + // For reductions, we know that the lane ordering out doesn't alter the + // result. In-order can help simplify the shuffle away. + if (FromReduction) + sort(ReconstructMask); + ReconstructMasks.push_back(ReconstructMask); + } + + // If the Maximum element used from V1 and V2 are not larger than the new + // vectors, the vectors are already packes and performing the optimization + // again will likely not help any further. This also prevents us from getting + // stuck in a cycle in case the costs do not also rule it out. + if (V1.empty() || V2.empty() || + (MaxV1Elt == static_cast<int>(V1.size()) - 1 && + MaxV2Elt == static_cast<int>(V2.size()) - 1)) + return false; + + // Calculate the masks needed for the new input shuffles, which get padded + // with undef + SmallVector<int> V1A, V1B, V2A, V2B; + for (unsigned I = 0; I < V1.size(); I++) { + V1A.push_back(SVI0A->getMaskValue(V1[I])); + V1B.push_back(SVI0B->getMaskValue(V1[I])); + } + for (unsigned I = 0; I < V2.size(); I++) { + V2A.push_back(SVI1A->getMaskValue(V2[I])); + V2B.push_back(SVI1B->getMaskValue(V2[I])); + } + while (V1A.size() < NumElts) { + V1A.push_back(UndefMaskElem); + V1B.push_back(UndefMaskElem); + } + while (V2A.size() < NumElts) { + V2A.push_back(UndefMaskElem); + V2B.push_back(UndefMaskElem); + } + + auto AddShuffleCost = [&](InstructionCost C, ShuffleVectorInst *SV) { + return C + + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, SV->getShuffleMask()); + }; + auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) { + return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask); + }; + + // Get the costs of the shuffles + binops before and after with the new + // shuffle masks. + InstructionCost CostBefore = + TTI.getArithmeticInstrCost(Op0->getOpcode(), VT) + + TTI.getArithmeticInstrCost(Op1->getOpcode(), VT); + CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(), + InstructionCost(0), AddShuffleCost); + // This set helps us only cost each unique shuffle once. 
+ SmallPtrSet<ShuffleVectorInst *, 4> InputShuffles( + {SVI0A, SVI0B, SVI1A, SVI1B}); + CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(), + InstructionCost(0), AddShuffleCost); + + // The new binops will be unused for lanes past the used shuffle lengths. + // These types attempt to get the correct cost for that from the target. + FixedVectorType *Op0SmallVT = + FixedVectorType::get(VT->getScalarType(), V1.size()); + FixedVectorType *Op1SmallVT = + FixedVectorType::get(VT->getScalarType(), V2.size()); + InstructionCost CostAfter = + TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT) + + TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT); + CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(), + InstructionCost(0), AddShuffleMaskCost); + std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B}); + CostAfter += + std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(), + InstructionCost(0), AddShuffleMaskCost); + + if (CostBefore <= CostAfter) + return false; + + // The cost model has passed, create the new instructions. + Builder.SetInsertPoint(SVI0A); + Value *NSV0A = Builder.CreateShuffleVector(SVI0A->getOperand(0), + SVI0A->getOperand(1), V1A); + Builder.SetInsertPoint(SVI0B); + Value *NSV0B = Builder.CreateShuffleVector(SVI0B->getOperand(0), + SVI0B->getOperand(1), V1B); + Builder.SetInsertPoint(SVI1A); + Value *NSV1A = Builder.CreateShuffleVector(SVI1A->getOperand(0), + SVI1A->getOperand(1), V2A); + Builder.SetInsertPoint(SVI1B); + Value *NSV1B = Builder.CreateShuffleVector(SVI1B->getOperand(0), + SVI1B->getOperand(1), V2B); + Builder.SetInsertPoint(Op0); + Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(), + NSV0A, NSV0B); + if (auto *I = dyn_cast<Instruction>(NOp0)) + I->copyIRFlags(Op0, true); + Builder.SetInsertPoint(Op1); + Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(), + NSV1A, NSV1B); + if (auto *I = dyn_cast<Instruction>(NOp1)) + I->copyIRFlags(Op1, true); + + for (int S = 0, E = ReconstructMasks.size(); S != E; S++) { + Builder.SetInsertPoint(Shuffles[S]); + Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]); + replaceValue(*Shuffles[S], *NSV); + } + + Worklist.pushValue(NSV0A); + Worklist.pushValue(NSV0B); + Worklist.pushValue(NSV1A); + Worklist.pushValue(NSV1B); + for (auto *S : Shuffles) + Worklist.add(S); + return true; +} + /// This is the entry point for all transforms. Pass manager differences are /// handled in the callers of this function. bool VectorCombine::run() { @@ -1136,6 +1469,8 @@ bool VectorCombine::run() { MadeChange |= foldBitcastShuf(I); MadeChange |= foldExtractedCmps(I); MadeChange |= foldShuffleOfBinops(I); + MadeChange |= foldShuffleFromReductions(I); + MadeChange |= foldSelectShuffle(I); } MadeChange |= scalarizeBinopOrCmp(I); MadeChange |= scalarizeLoadExtract(I); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/Vectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/Vectorize.cpp index 010ca28fc237..208e5eeea864 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/Vectorize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/Vectorize.cpp @@ -15,7 +15,6 @@ #include "llvm/Transforms/Vectorize.h" #include "llvm-c/Initialization.h" #include "llvm-c/Transforms/Vectorize.h" -#include "llvm/Analysis/Passes.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/InitializePasses.h" #include "llvm/PassRegistry.h" |
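
For readers tracing the new VectorCombine::foldSelectShuffle transform added above: the pattern it targets is a select-style shuffle that recombines two binary operators fed by the same pair of input shuffles. A hypothetical LLVM IR input of that shape (illustrative only, not taken from this patch or its tests) would be:

  define <8 x i32> @select_shuffle(<8 x i32> %a, <8 x i32> %b) {
    %x = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
    %y = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
    ; two binops over the same shuffled inputs
    %p = add <8 x i32> %x, %y
    %q = mul <8 x i32> %x, %y
    ; select-style shuffle: lanes 0-3 taken from %p, lanes 4-7 from %q
    %r = shufflevector <8 x i32> %p, <8 x i32> %q, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
    ret <8 x i32> %r
  }

The fold only rewrites such a case when the re-packed shuffles and the narrower binops (here operating on <4 x i32> halves) are reported as cheaper by TargetTransformInfo, so whether it actually fires is target-dependent.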