Diffstat (limited to 'lib/Transforms/Vectorize')
-rw-r--r--  lib/Transforms/Vectorize/CMakeLists.txt          |    1
-rw-r--r--  lib/Transforms/Vectorize/LoadStoreVectorizer.cpp |   84
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp       | 2577
-rw-r--r--  lib/Transforms/Vectorize/SLPVectorizer.cpp       | 2302
-rw-r--r--  lib/Transforms/Vectorize/VPlan.cpp               |  557
-rw-r--r--  lib/Transforms/Vectorize/VPlan.h                 | 1194
-rw-r--r--  lib/Transforms/Vectorize/VPlanBuilder.h          |   61
-rw-r--r--  lib/Transforms/Vectorize/VPlanValue.h            |  146
-rw-r--r--  lib/Transforms/Vectorize/Vectorize.cpp           |    1
9 files changed, 5312 insertions, 1611 deletions
diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt index 1aea73cd4a32..7622ed6d194f 100644 --- a/lib/Transforms/Vectorize/CMakeLists.txt +++ b/lib/Transforms/Vectorize/CMakeLists.txt @@ -3,6 +3,7 @@ add_llvm_library(LLVMVectorize LoopVectorize.cpp SLPVectorizer.cpp Vectorize.cpp + VPlan.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 9cf66382b581..dc83b6d4d292 100644 --- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -1,4 +1,4 @@ -//===----- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer ----------===// +//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===// // // The LLVM Compiler Infrastructure // @@ -6,47 +6,67 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/Triple.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/OrderedBasicBlock.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/IR/User.h" #include "llvm/IR/Value.h" -#include "llvm/Support/CommandLine.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Vectorize.h" +#include <algorithm> +#include <cassert> +#include <cstdlib> +#include <tuple> +#include <utility> using namespace llvm; #define DEBUG_TYPE "load-store-vectorizer" + STATISTIC(NumVectorInstructions, "Number of vector accesses generated"); STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized"); -namespace { - // FIXME: Assuming stack alignment of 4 is always good enough static const unsigned StackAdjustedAlignment = 4; -typedef SmallVector<Instruction *, 8> InstrList; -typedef MapVector<Value *, InstrList> InstrListMap; + +namespace { + +using InstrList = SmallVector<Instruction *, 8>; +using InstrListMap = MapVector<Value *, InstrList>; class Vectorizer { Function &F; @@ -163,7 +183,10 @@ public: AU.setPreservesCFG(); } }; -} + +} // end anonymous namespace + +char LoadStoreVectorizer::ID = 0; INITIALIZE_PASS_BEGIN(LoadStoreVectorizer, DEBUG_TYPE, "Vectorize 
load and Store instructions", false, false) @@ -175,8 +198,6 @@ INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(LoadStoreVectorizer, DEBUG_TYPE, "Vectorize load and store instructions", false, false) -char LoadStoreVectorizer::ID = 0; - Pass *llvm::createLoadStoreVectorizerPass() { return new LoadStoreVectorizer(); } @@ -480,6 +501,10 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { MemoryInstrs.push_back(&I); else ChainInstrs.push_back(&I); + } else if (isa<IntrinsicInst>(&I) && + cast<IntrinsicInst>(&I)->getIntrinsicID() == + Intrinsic::sideeffect) { + // Ignore llvm.sideeffect calls. } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) { DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I << '\n'); break; @@ -593,7 +618,14 @@ Vectorizer::collectInstructions(BasicBlock *BB) { // Skip weird non-byte sizes. They probably aren't worth the effort of // handling correctly. unsigned TySize = DL.getTypeSizeInBits(Ty); - if (TySize < 8) + if ((TySize % 8) != 0) + continue; + + // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain + // functions are currently using an integer type for the vectorized + // load/store, and does not support casting between the integer type and a + // vector of pointers (e.g. i64 to <2 x i16*>) + if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy()) continue; Value *Ptr = LI->getPointerOperand(); @@ -605,7 +637,7 @@ Vectorizer::collectInstructions(BasicBlock *BB) { continue; // Make sure all the users of a vector are constant-index extracts. - if (isa<VectorType>(Ty) && !all_of(LI->users(), [](const User *U) { + if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) { const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U); return EEI && isa<ConstantInt>(EEI->getOperand(1)); })) @@ -614,7 +646,6 @@ Vectorizer::collectInstructions(BasicBlock *BB) { // Save the load locations. Value *ObjPtr = GetUnderlyingObject(Ptr, DL); LoadRefs[ObjPtr].push_back(LI); - } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { if (!SI->isSimple()) continue; @@ -627,19 +658,28 @@ Vectorizer::collectInstructions(BasicBlock *BB) { if (!VectorType::isValidElementType(Ty->getScalarType())) continue; + // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain + // functions are currently using an integer type for the vectorized + // load/store, and does not support casting between the integer type and a + // vector of pointers (e.g. i64 to <2 x i16*>) + if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy()) + continue; + // Skip weird non-byte sizes. They probably aren't worth the effort of // handling correctly. unsigned TySize = DL.getTypeSizeInBits(Ty); - if (TySize < 8) + if ((TySize % 8) != 0) continue; Value *Ptr = SI->getPointerOperand(); unsigned AS = Ptr->getType()->getPointerAddressSpace(); unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); + + // No point in looking at these if they're too big to vectorize. 
if (TySize > VecRegSize / 2) continue; - if (isa<VectorType>(Ty) && !all_of(SI->users(), [](const User *U) { + if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) { const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U); return EEI && isa<ConstantInt>(EEI->getOperand(1)); })) @@ -680,8 +720,8 @@ bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) { SmallVector<int, 16> Heads, Tails; int ConsecutiveChain[64]; - // Do a quadratic search on all of the given stores and find all of the pairs - // of stores that follow each other. + // Do a quadratic search on all of the given loads/stores and find all of the + // pairs of loads/stores that follow each other. for (int i = 0, e = Instrs.size(); i < e; ++i) { ConsecutiveChain[i] = -1; for (int j = e - 1; j >= 0; --j) { @@ -748,7 +788,7 @@ bool Vectorizer::vectorizeStoreChain( SmallPtrSet<Instruction *, 16> *InstructionsProcessed) { StoreInst *S0 = cast<StoreInst>(Chain[0]); - // If the vector has an int element, default to int for the whole load. + // If the vector has an int element, default to int for the whole store. Type *StoreTy; for (Instruction *I : Chain) { StoreTy = cast<StoreInst>(I)->getValueOperand()->getType(); diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 012b10c8a9b0..fbcdc0df0f1c 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -47,62 +47,97 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize/LoopVectorize.h" +#include "VPlan.h" +#include "VPlanBuilder.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" -#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" 
#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" -#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" #include "llvm/Pass.h" -#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" -#include "llvm/Transforms/Vectorize.h" #include <algorithm> -#include <map> +#include <cassert> +#include <cstdint> +#include <cstdlib> +#include <functional> +#include <iterator> +#include <limits> +#include <memory> +#include <string> #include <tuple> +#include <utility> +#include <vector> using namespace llvm; -using namespace llvm::PatternMatch; #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME @@ -245,12 +280,12 @@ createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, namespace { -// Forward declarations. -class LoopVectorizeHints; class LoopVectorizationLegality; class LoopVectorizationCostModel; class LoopVectorizationRequirements; +} // end anonymous namespace + /// Returns true if the given loop body has a cycle, excluding the loop /// itself. static bool hasCyclesInLoopBody(const Loop &L) { @@ -324,7 +359,6 @@ static unsigned getMemInstAddressSpace(Value *I) { /// type is irregular if its allocated size doesn't equal the store size of an /// element of the corresponding vector type at the given vectorization factor. static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) { - // Determine if an array of VF elements of type Ty is "bitcast compatible" // with a <VF x Ty> vector. if (VF > 1) { @@ -349,7 +383,7 @@ static unsigned getReciprocalPredBlockProb() { return 2; } static Value *addFastMathFlag(Value *V) { if (isa<FPMathOperator>(V)) { FastMathFlags Flags; - Flags.setUnsafeAlgebra(); + Flags.setFast(); cast<Instruction>(V)->setFastMathFlags(Flags); } return V; @@ -362,6 +396,8 @@ static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { : ConstantFP::get(Ty, C); } +namespace llvm { + /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). 
/// This class performs the widening of scalars into vectors, or multiple @@ -387,16 +423,16 @@ public: LoopVectorizationCostModel *CM) : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), - Builder(PSE.getSE()->getContext()), Induction(nullptr), - OldInduction(nullptr), VectorLoopValueMap(UnrollFactor, VecWidth), - TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM), - AddedSafetyChecks(false) {} + Builder(PSE.getSE()->getContext()), + VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {} + virtual ~InnerLoopVectorizer() = default; /// Create a new empty loop. Unlink the old loop and connect the new one. - void createVectorizedLoopSkeleton(); + /// Return the pre-header block of the new loop. + BasicBlock *createVectorizedLoopSkeleton(); - /// Vectorize a single instruction within the innermost loop. - void vectorizeInstruction(Instruction &I); + /// Widen a single instruction within the innermost loop. + void widenInstruction(Instruction &I); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. void fixVectorizedLoop(); @@ -404,28 +440,83 @@ public: // Return true if any runtime check is added. bool areSafetyChecksAdded() { return AddedSafetyChecks; } - virtual ~InnerLoopVectorizer() {} - -protected: - /// A small list of PHINodes. - typedef SmallVector<PHINode *, 4> PhiVector; - /// A type for vectorized values in the new loop. Each value from the /// original loop, when vectorized, is represented by UF vector values in the /// new unrolled loop, where UF is the unroll factor. - typedef SmallVector<Value *, 2> VectorParts; + using VectorParts = SmallVector<Value *, 2>; + + /// Vectorize a single PHINode in a block. This method handles the induction + /// variable canonicalization. It supports both VF = 1 for unrolled loops and + /// arbitrary length vectors. + void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF); + + /// A helper function to scalarize a single Instruction in the innermost loop. + /// Generates a sequence of scalar instances for each lane between \p MinLane + /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, + /// inclusive.. + void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance, + bool IfPredicateInstr); + + /// Widen an integer or floating-point induction variable \p IV. If \p Trunc + /// is provided, the integer induction variable will first be truncated to + /// the corresponding type. + void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr); + + /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a + /// vector or scalar value on-demand if one is not yet available. When + /// vectorizing a loop, we visit the definition of an instruction before its + /// uses. When visiting the definition, we either vectorize or scalarize the + /// instruction, creating an entry for it in the corresponding map. (In some + /// cases, such as induction variables, we will create both vector and scalar + /// entries.) Then, as we encounter uses of the definition, we derive values + /// for each scalar or vector use unless such a value is already available. + /// For example, if we scalarize a definition and one of its uses is vector, + /// we build the required vector on-demand with an insertelement sequence + /// when visiting the use. Otherwise, if the use is scalar, we can use the + /// existing scalar definition. 
+ /// + /// Return a value in the new loop corresponding to \p V from the original + /// loop at unroll index \p Part. If the value has already been vectorized, + /// the corresponding vector entry in VectorLoopValueMap is returned. If, + /// however, the value has a scalar entry in VectorLoopValueMap, we construct + /// a new vector value on-demand by inserting the scalar values into a vector + /// with an insertelement sequence. If the value has been neither vectorized + /// nor scalarized, it must be loop invariant, so we simply broadcast the + /// value into a vector. + Value *getOrCreateVectorValue(Value *V, unsigned Part); + + /// Return a value in the new loop corresponding to \p V from the original + /// loop at unroll and vector indices \p Instance. If the value has been + /// vectorized but not scalarized, the necessary extractelement instruction + /// will be generated. + Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance); + + /// Construct the vector value of a scalarized value \p V one lane at a time. + void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); + + /// Try to vectorize the interleaved access group that \p Instr belongs to. + void vectorizeInterleaveGroup(Instruction *Instr); + + /// Vectorize Load and Store instructions, optionally masking the vector + /// operations if \p BlockInMask is non-null. + void vectorizeMemoryInstruction(Instruction *Instr, + VectorParts *BlockInMask = nullptr); + + /// \brief Set the debug location in the builder using the debug location in + /// the instruction. + void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); + +protected: + friend class LoopVectorizationPlanner; + + /// A small list of PHINodes. + using PhiVector = SmallVector<PHINode *, 4>; /// A type for scalarized values in the new loop. Each value from the /// original loop, when scalarized, is represented by UF x VF scalar values /// in the new unrolled loop, where UF is the unroll factor and VF is the /// vectorization factor. - typedef SmallVector<SmallVector<Value *, 4>, 2> ScalarParts; - - // When we if-convert we need to create edge masks. We have to cache values - // so that we don't end up with exponential recursion/IR. - typedef DenseMap<std::pair<BasicBlock *, BasicBlock *>, VectorParts> - EdgeMaskCacheTy; - typedef DenseMap<BasicBlock *, VectorParts> BlockMaskCacheTy; + using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; /// Set up the values of the IVs correctly when exiting the vector loop. void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, @@ -457,40 +548,14 @@ protected: /// the block that was created for it. void sinkScalarOperands(Instruction *PredInst); - /// Predicate conditional instructions that require predication on their - /// respective conditions. - void predicateInstructions(); - /// Shrinks vector element sizes to the smallest bitwidth they can be legally /// represented as. void truncateToMinimalBitwidths(); - /// A helper function that computes the predicate of the block BB, assuming - /// that the header block of the loop is set to True. It returns the *entry* - /// mask for the block BB. - VectorParts createBlockInMask(BasicBlock *BB); - /// A helper function that computes the predicate of the edge between SRC - /// and DST. - VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); - - /// Vectorize a single PHINode in a block. This method handles the induction - /// variable canonicalization. It supports both VF = 1 for unrolled loops and - /// arbitrary length vectors. 
- void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF); - /// Insert the new loop to the loop hierarchy and pass manager /// and update the analysis passes. void updateAnalysis(); - /// This instruction is un-vectorizable. Implement it as a sequence - /// of scalars. If \p IfPredicateInstr is true we need to 'hide' each - /// scalarized instruction behind an if block predicated on the control - /// dependence of the instruction. - void scalarizeInstruction(Instruction *Instr, bool IfPredicateInstr = false); - - /// Vectorize Load and Store instructions, - virtual void vectorizeMemoryInstruction(Instruction *Instr); - /// Create a broadcast instruction. This method generates a broadcast /// instruction (shuffle) for loop invariant values and for the induction /// value. If this is the induction variable then we extend it to N, N+1, ... @@ -521,11 +586,6 @@ protected: void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, Value *Step, Instruction *EntryVal); - /// Widen an integer or floating-point induction variable \p IV. If \p Trunc - /// is provided, the integer induction variable will first be truncated to - /// the corresponding type. - void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr); - /// Returns true if an instruction \p I should be scalarized instead of /// vectorized for the chosen vectorization factor. bool shouldScalarizeInstruction(Instruction *I) const; @@ -533,37 +593,19 @@ protected: /// Returns true if we should generate a scalar version of \p IV. bool needsScalarInduction(Instruction *IV) const; - /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a - /// vector or scalar value on-demand if one is not yet available. When - /// vectorizing a loop, we visit the definition of an instruction before its - /// uses. When visiting the definition, we either vectorize or scalarize the - /// instruction, creating an entry for it in the corresponding map. (In some - /// cases, such as induction variables, we will create both vector and scalar - /// entries.) Then, as we encounter uses of the definition, we derive values - /// for each scalar or vector use unless such a value is already available. - /// For example, if we scalarize a definition and one of its uses is vector, - /// we build the required vector on-demand with an insertelement sequence - /// when visiting the use. Otherwise, if the use is scalar, we can use the - /// existing scalar definition. - /// - /// Return a value in the new loop corresponding to \p V from the original - /// loop at unroll index \p Part. If the value has already been vectorized, - /// the corresponding vector entry in VectorLoopValueMap is returned. If, - /// however, the value has a scalar entry in VectorLoopValueMap, we construct - /// a new vector value on-demand by inserting the scalar values into a vector - /// with an insertelement sequence. If the value has been neither vectorized - /// nor scalarized, it must be loop invariant, so we simply broadcast the - /// value into a vector. - Value *getOrCreateVectorValue(Value *V, unsigned Part); - - /// Return a value in the new loop corresponding to \p V from the original - /// loop at unroll index \p Part and vector index \p Lane. If the value has - /// been vectorized but not scalarized, the necessary extractelement - /// instruction will be generated. - Value *getOrCreateScalarValue(Value *V, unsigned Part, unsigned Lane); - - /// Try to vectorize the interleaved access group that \p Instr belongs to. 
- void vectorizeInterleaveGroup(Instruction *Instr); + /// If there is a cast involved in the induction variable \p ID, which should + /// be ignored in the vectorized loop body, this function records the + /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the + /// cast. We had already proved that the casted Phi is equal to the uncasted + /// Phi in the vectorized loop (under a runtime guard), and therefore + /// there is no need to vectorize the cast - the same value can be used in the + /// vector loop for both the Phi and the cast. + /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified, + /// Otherwise, \p VectorLoopValue is a widened/vectorized value. + void recordVectorLoopValueForInductionCast (const InductionDescriptor &ID, + Value *VectorLoopValue, + unsigned Part, + unsigned Lane = UINT_MAX); /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); @@ -574,12 +616,19 @@ protected: /// Returns (and creates if needed) the trip count of the widened loop. Value *getOrCreateVectorTripCount(Loop *NewLoop); + /// Returns a bitcasted value to the requested vector type. + /// Also handles bitcasts of vector<float> <-> vector<pointer> types. + Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, + const DataLayout &DL); + /// Emit a bypass check to see if the vector trip count is zero, including if /// it overflows. void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); + /// Emit a bypass check to see if all of the SCEV assumptions we've /// had to make are correct. void emitSCEVChecks(Loop *L, BasicBlock *Bypass); + /// Emit bypass checks to check any memory assumptions we may have made. void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); @@ -601,149 +650,32 @@ protected: /// vector of instructions. void addMetadata(ArrayRef<Value *> To, Instruction *From); - /// \brief Set the debug location in the builder using the debug location in - /// the instruction. - void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); - - /// This is a helper class for maintaining vectorization state. It's used for - /// mapping values from the original loop to their corresponding values in - /// the new loop. Two mappings are maintained: one for vectorized values and - /// one for scalarized values. Vectorized values are represented with UF - /// vector values in the new loop, and scalarized values are represented with - /// UF x VF scalar values in the new loop. UF and VF are the unroll and - /// vectorization factors, respectively. - /// - /// Entries can be added to either map with setVectorValue and setScalarValue, - /// which assert that an entry was not already added before. If an entry is to - /// replace an existing one, call resetVectorValue. This is currently needed - /// to modify the mapped values during "fix-up" operations that occur once the - /// first phase of widening is complete. These operations include type - /// truncation and the second phase of recurrence widening. - /// - /// Entries from either map can be retrieved using the getVectorValue and - /// getScalarValue functions, which assert that the desired value exists. - - struct ValueMap { - - /// Construct an empty map with the given unroll and vectorization factors. - ValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {} - - /// \return True if the map has any vector entry for \p Key. 
- bool hasAnyVectorValue(Value *Key) const { - return VectorMapStorage.count(Key); - } - - /// \return True if the map has a vector entry for \p Key and \p Part. - bool hasVectorValue(Value *Key, unsigned Part) const { - assert(Part < UF && "Queried Vector Part is too large."); - if (!hasAnyVectorValue(Key)) - return false; - const VectorParts &Entry = VectorMapStorage.find(Key)->second; - assert(Entry.size() == UF && "VectorParts has wrong dimensions."); - return Entry[Part] != nullptr; - } - - /// \return True if the map has any scalar entry for \p Key. - bool hasAnyScalarValue(Value *Key) const { - return ScalarMapStorage.count(Key); - } - - /// \return True if the map has a scalar entry for \p Key, \p Part and - /// \p Part. - bool hasScalarValue(Value *Key, unsigned Part, unsigned Lane) const { - assert(Part < UF && "Queried Scalar Part is too large."); - assert(Lane < VF && "Queried Scalar Lane is too large."); - if (!hasAnyScalarValue(Key)) - return false; - const ScalarParts &Entry = ScalarMapStorage.find(Key)->second; - assert(Entry.size() == UF && "ScalarParts has wrong dimensions."); - assert(Entry[Part].size() == VF && "ScalarParts has wrong dimensions."); - return Entry[Part][Lane] != nullptr; - } - - /// Retrieve the existing vector value that corresponds to \p Key and - /// \p Part. - Value *getVectorValue(Value *Key, unsigned Part) { - assert(hasVectorValue(Key, Part) && "Getting non-existent value."); - return VectorMapStorage[Key][Part]; - } - - /// Retrieve the existing scalar value that corresponds to \p Key, \p Part - /// and \p Lane. - Value *getScalarValue(Value *Key, unsigned Part, unsigned Lane) { - assert(hasScalarValue(Key, Part, Lane) && "Getting non-existent value."); - return ScalarMapStorage[Key][Part][Lane]; - } - - /// Set a vector value associated with \p Key and \p Part. Assumes such a - /// value is not already set. If it is, use resetVectorValue() instead. - void setVectorValue(Value *Key, unsigned Part, Value *Vector) { - assert(!hasVectorValue(Key, Part) && "Vector value already set for part"); - if (!VectorMapStorage.count(Key)) { - VectorParts Entry(UF); - VectorMapStorage[Key] = Entry; - } - VectorMapStorage[Key][Part] = Vector; - } - - /// Set a scalar value associated with \p Key for \p Part and \p Lane. - /// Assumes such a value is not already set. - void setScalarValue(Value *Key, unsigned Part, unsigned Lane, - Value *Scalar) { - assert(!hasScalarValue(Key, Part, Lane) && "Scalar value already set"); - if (!ScalarMapStorage.count(Key)) { - ScalarParts Entry(UF); - for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part].resize(VF, nullptr); - // TODO: Consider storing uniform values only per-part, as they occupy - // lane 0 only, keeping the other VF-1 redundant entries null. - ScalarMapStorage[Key] = Entry; - } - ScalarMapStorage[Key][Part][Lane] = Scalar; - } - - /// Reset the vector value associated with \p Key for the given \p Part. - /// This function can be used to update values that have already been - /// vectorized. This is the case for "fix-up" operations including type - /// truncation and the second phase of recurrence vectorization. - void resetVectorValue(Value *Key, unsigned Part, Value *Vector) { - assert(hasVectorValue(Key, Part) && "Vector value not set for part"); - VectorMapStorage[Key][Part] = Vector; - } - - private: - /// The unroll factor. Each entry in the vector map contains UF vector - /// values. - unsigned UF; - - /// The vectorization factor. Each entry in the scalar map contains UF x VF - /// scalar values. 
- unsigned VF; - - /// The vector and scalar map storage. We use std::map and not DenseMap - /// because insertions to DenseMap invalidate its iterators. - std::map<Value *, VectorParts> VectorMapStorage; - std::map<Value *, ScalarParts> ScalarMapStorage; - }; - /// The original loop. Loop *OrigLoop; + /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies /// dynamic knowledge to simplify SCEV expressions and converts them to a /// more usable form. PredicatedScalarEvolution &PSE; + /// Loop Info. LoopInfo *LI; + /// Dominator Tree. DominatorTree *DT; + /// Alias Analysis. AliasAnalysis *AA; + /// Target Library Info. const TargetLibraryInfo *TLI; + /// Target Transform Info. const TargetTransformInfo *TTI; + /// Assumption Cache. AssumptionCache *AC; + /// Interface to emit optimization remarks. OptimizationRemarkEmitter *ORE; @@ -758,7 +690,6 @@ protected: /// vector elements. unsigned VF; -protected: /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. unsigned UF; @@ -770,39 +701,45 @@ protected: /// The vector-loop preheader. BasicBlock *LoopVectorPreHeader; + /// The scalar-loop preheader. BasicBlock *LoopScalarPreHeader; + /// Middle Block between the vector and the scalar. BasicBlock *LoopMiddleBlock; + /// The ExitBlock of the scalar loop. BasicBlock *LoopExitBlock; + /// The vector loop body. BasicBlock *LoopVectorBody; + /// The scalar loop body. BasicBlock *LoopScalarBody; + /// A list of all bypass blocks. The first block is the entry of the loop. SmallVector<BasicBlock *, 4> LoopBypassBlocks; /// The new Induction variable which was added to the new block. - PHINode *Induction; + PHINode *Induction = nullptr; + /// The induction variable of the old basic block. - PHINode *OldInduction; + PHINode *OldInduction = nullptr; /// Maps values from the original loop to their corresponding values in the /// vectorized loop. A key value can map to either vector values, scalar /// values or both kinds of values, depending on whether the key was /// vectorized and scalarized. - ValueMap VectorLoopValueMap; + VectorizerValueMap VectorLoopValueMap; + + /// Store instructions that were predicated. + SmallVector<Instruction *, 4> PredicatedInstructions; - /// Store instructions that should be predicated, as a pair - /// <StoreInst, Predicate> - SmallVector<std::pair<Instruction *, Value *>, 4> PredicatedInstructions; - EdgeMaskCacheTy EdgeMaskCache; - BlockMaskCacheTy BlockMaskCache; /// Trip count of the original loop. - Value *TripCount; + Value *TripCount = nullptr; + /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) - Value *VectorTripCount; + Value *VectorTripCount = nullptr; /// The legality analysis. LoopVectorizationLegality *Legal; @@ -811,7 +748,7 @@ protected: LoopVectorizationCostModel *Cost; // Record whether runtime checks are added. - bool AddedSafetyChecks; + bool AddedSafetyChecks = false; // Holds the end values for each induction variable. We save the end values // so we can later fix-up the external users of the induction variables. 
@@ -831,7 +768,6 @@ public: UnrollFactor, LVL, CM) {} private: - void vectorizeMemoryInstruction(Instruction *Instr) override; Value *getBroadcastInstrs(Value *V) override; Value *getStepVector(Value *Val, int StartIdx, Value *Step, Instruction::BinaryOps Opcode = @@ -839,6 +775,8 @@ private: Value *reverseVector(Value *Vec) override; }; +} // end namespace llvm + /// \brief Look for a meaningful debug location on the instruction or it's /// operands. static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { @@ -861,7 +799,8 @@ static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) { const DILocation *DIL = Inst->getDebugLoc(); - if (DIL && Inst->getFunction()->isDebugInfoForProfiling()) + if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && + !isa<DbgInfoIntrinsic>(Inst)) B.SetCurrentDebugLocation(DIL->cloneWithDuplicationFactor(UF * VF)); else B.SetCurrentDebugLocation(DIL); @@ -908,6 +847,8 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, } } +namespace llvm { + /// \brief The group of interleaved loads/stores sharing the same stride and /// close to each other. /// @@ -937,7 +878,7 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, class InterleaveGroup { public: InterleaveGroup(Instruction *Instr, int Stride, unsigned Align) - : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(Instr) { + : Align(Align), InsertPos(Instr) { assert(Align && "The alignment should be non-zero"); Factor = std::abs(Stride); @@ -1010,13 +951,26 @@ public: Instruction *getInsertPos() const { return InsertPos; } void setInsertPos(Instruction *Inst) { InsertPos = Inst; } + /// Add metadata (e.g. alias info) from the instructions in this group to \p + /// NewInst. + /// + /// FIXME: this function currently does not add noalias metadata a'la + /// addNewMedata. To do that we need to compute the intersection of the + /// noalias info from all members. + void addMetadata(Instruction *NewInst) const { + SmallVector<Value *, 4> VL; + std::transform(Members.begin(), Members.end(), std::back_inserter(VL), + [](std::pair<int, Instruction *> p) { return p.second; }); + propagateMetadata(NewInst, VL); + } + private: unsigned Factor; // Interleave Factor. bool Reverse; unsigned Align; DenseMap<int, Instruction *> Members; - int SmallestKey; - int LargestKey; + int SmallestKey = 0; + int LargestKey = 0; // To avoid breaking dependences, vectorized instructions of an interleave // group should be inserted at either the first load or the last store in @@ -1031,6 +985,9 @@ private: // store i32 %odd // Insert Position Instruction *InsertPos; }; +} // end namespace llvm + +namespace { /// \brief Drive the analysis of interleaved memory accesses in the loop. /// @@ -1044,8 +1001,7 @@ class InterleavedAccessInfo { public: InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L, DominatorTree *DT, LoopInfo *LI) - : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(nullptr), - RequiresScalarEpilogue(false) {} + : PSE(PSE), TheLoop(L), DT(DT), LI(LI) {} ~InterleavedAccessInfo() { SmallSet<InterleaveGroup *, 4> DelSet; @@ -1065,14 +1021,6 @@ public: return InterleaveGroupMap.count(Instr); } - /// \brief Return the maximum interleave factor of all interleaved groups. 
- unsigned getMaxInterleaveFactor() const { - unsigned MaxFactor = 1; - for (auto &Entry : InterleaveGroupMap) - MaxFactor = std::max(MaxFactor, Entry.second->getFactor()); - return MaxFactor; - } - /// \brief Get the interleave group that \p Instr belongs to. /// /// \returns nullptr if doesn't have such group. @@ -1095,15 +1043,16 @@ private: /// The interleaved access analysis can also add new predicates (for example /// by versioning strides of pointers). PredicatedScalarEvolution &PSE; + Loop *TheLoop; DominatorTree *DT; LoopInfo *LI; - const LoopAccessInfo *LAI; + const LoopAccessInfo *LAI = nullptr; /// True if the loop may contain non-reversed interleaved groups with /// out-of-bounds accesses. We ensure we don't speculatively access memory /// out-of-bounds by executing at least one scalar epilogue iteration. - bool RequiresScalarEpilogue; + bool RequiresScalarEpilogue = false; /// Holds the relationships between the members and the interleave group. DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap; @@ -1114,21 +1063,26 @@ private: /// \brief The descriptor for a strided memory access. struct StrideDescriptor { + StrideDescriptor() = default; StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size, unsigned Align) : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {} - StrideDescriptor() = default; - // The access's stride. It is negative for a reverse access. int64_t Stride = 0; - const SCEV *Scev = nullptr; // The scalar expression of this access - uint64_t Size = 0; // The size of the memory object. - unsigned Align = 0; // The alignment of this access. + + // The scalar expression of this access. + const SCEV *Scev = nullptr; + + // The size of the memory object. + uint64_t Size = 0; + + // The alignment of this access. + unsigned Align = 0; }; /// \brief A type for holding instructions and their stride descriptors. - typedef std::pair<Instruction *, StrideDescriptor> StrideEntry; + using StrideEntry = std::pair<Instruction *, StrideDescriptor>; /// \brief Create a new interleave group with the given instruction \p Instr, /// stride \p Stride and alignment \p Align. @@ -1179,7 +1133,6 @@ private: /// not necessary or is prevented because \p A and \p B may be dependent. bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A, StrideEntry *B) const { - // Code motion for interleaved accesses can potentially hoist strided loads // and sink strided stores. The code below checks the legality of the // following two conditions: @@ -1244,7 +1197,7 @@ private: /// for example 'force', means a decision has been made. So, we need to be /// careful NOT to add them if the user hasn't specifically asked so. class LoopVectorizeHints { - enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE }; + enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED }; /// Hint - associates name and validation with the hint value. struct Hint { @@ -1263,6 +1216,8 @@ class LoopVectorizeHints { return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; case HK_FORCE: return (Val <= 1); + case HK_ISVECTORIZED: + return (Val==0 || Val==1); } return false; } @@ -1270,16 +1225,21 @@ class LoopVectorizeHints { /// Vectorization width. Hint Width; + /// Vectorization interleave factor. Hint Interleave; + /// Vectorization forced Hint Force; + /// Already Vectorized + Hint IsVectorized; + /// Return the loop metadata prefix. static StringRef Prefix() { return "llvm.loop."; } /// True if there is any unsafe math in the loop. 
- bool PotentiallyUnsafe; + bool PotentiallyUnsafe = false; public: enum ForceKind { @@ -1294,7 +1254,7 @@ public: HK_WIDTH), Interleave("interleave.count", DisableInterleaving, HK_UNROLL), Force("vectorize.enable", FK_Undefined, HK_FORCE), - PotentiallyUnsafe(false), TheLoop(L), ORE(ORE) { + IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) { // Populate values with existing loop metadata. getHintsFromMetadata(); @@ -1302,14 +1262,19 @@ public: if (VectorizerParams::isInterleaveForced()) Interleave.Value = VectorizerParams::VectorizationInterleave; + if (IsVectorized.Value != 1) + // If the vectorization width and interleaving count are both 1 then + // consider the loop to have been already vectorized because there's + // nothing more that we can do. + IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1; DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() << "LV: Interleaving disabled by the pass manager\n"); } /// Mark the loop L as already vectorized by setting the width to 1. void setAlreadyVectorized() { - Width.Value = Interleave.Value = 1; - Hint Hints[] = {Width, Interleave}; + IsVectorized.Value = 1; + Hint Hints[] = {IsVectorized}; writeHintsToMetadata(Hints); } @@ -1326,19 +1291,19 @@ public: return false; } - if (getWidth() == 1 && getInterleave() == 1) { - // FIXME: Add a separate metadata to indicate when the loop has already - // been vectorized instead of setting width and count to 1. + if (getIsVectorized() == 1) { DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n"); // FIXME: Add interleave.disable metadata. This will allow // vectorize.disable to be used without disabling the pass and errors // to differentiate between disabled vectorization and a width of 1. - ORE.emit(OptimizationRemarkAnalysis(vectorizeAnalysisPassName(), + ORE.emit([&]() { + return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(), "AllDisabled", L->getStartLoc(), L->getHeader()) << "loop not vectorized: vectorization and interleaving are " - "explicitly disabled, or vectorize width and interleave " - "count are both set to 1"); + "explicitly disabled, or the loop has already been " + "vectorized"; + }); return false; } @@ -1348,29 +1313,35 @@ public: /// Dumps all the hint information. 
void emitRemarkWithHints() const { using namespace ore; - if (Force.Value == LoopVectorizeHints::FK_Disabled) - ORE.emit(OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled", + + ORE.emit([&]() { + if (Force.Value == LoopVectorizeHints::FK_Disabled) + return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled", TheLoop->getStartLoc(), TheLoop->getHeader()) - << "loop not vectorized: vectorization is explicitly disabled"); - else { - OptimizationRemarkMissed R(LV_NAME, "MissedDetails", - TheLoop->getStartLoc(), TheLoop->getHeader()); - R << "loop not vectorized"; - if (Force.Value == LoopVectorizeHints::FK_Enabled) { - R << " (Force=" << NV("Force", true); - if (Width.Value != 0) - R << ", Vector Width=" << NV("VectorWidth", Width.Value); - if (Interleave.Value != 0) - R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value); - R << ")"; + << "loop not vectorized: vectorization is explicitly disabled"; + else { + OptimizationRemarkMissed R(LV_NAME, "MissedDetails", + TheLoop->getStartLoc(), + TheLoop->getHeader()); + R << "loop not vectorized"; + if (Force.Value == LoopVectorizeHints::FK_Enabled) { + R << " (Force=" << NV("Force", true); + if (Width.Value != 0) + R << ", Vector Width=" << NV("VectorWidth", Width.Value); + if (Interleave.Value != 0) + R << ", Interleave Count=" + << NV("InterleaveCount", Interleave.Value); + R << ")"; + } + return R; } - ORE.emit(R); - } + }); } unsigned getWidth() const { return Width.Value; } unsigned getInterleave() const { return Interleave.Value; } + unsigned getIsVectorized() const { return IsVectorized.Value; } enum ForceKind getForce() const { return (ForceKind)Force.Value; } /// \brief If hints are provided that force vectorization, use the AlwaysPrint @@ -1454,7 +1425,7 @@ private: return; unsigned Val = C->getZExtValue(); - Hint *Hints[] = {&Width, &Interleave, &Force}; + Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized}; for (auto H : Hints) { if (Name == H->Name) { if (H->validate(Val)) @@ -1489,7 +1460,7 @@ private: /// Sets current hints into loop metadata, keeping other values intact. void writeHintsToMetadata(ArrayRef<Hint> HintTypes) { - if (HintTypes.size() == 0) + if (HintTypes.empty()) return; // Reserve the first element to LoopID (see below). @@ -1525,6 +1496,8 @@ private: OptimizationRemarkEmitter &ORE; }; +} // end anonymous namespace + static void emitMissedWarning(Function *F, Loop *L, const LoopVectorizeHints &LH, OptimizationRemarkEmitter *ORE) { @@ -1546,6 +1519,8 @@ static void emitMissedWarning(Function *F, Loop *L, } } +namespace { + /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and /// to what vectorization factor. /// This class does not look at the profitability of vectorization, only the @@ -1568,22 +1543,20 @@ public: std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI, OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R, LoopVectorizeHints *H) - : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), - GetLAA(GetLAA), LAI(nullptr), ORE(ORE), InterleaveInfo(PSE, L, DT, LI), - PrimaryInduction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false), - Requirements(R), Hints(H) {} + : TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), GetLAA(GetLAA), + ORE(ORE), InterleaveInfo(PSE, L, DT, LI), Requirements(R), Hints(H) {} /// ReductionList contains the reduction descriptors for all /// of the reductions that were found in the loop. 
- typedef DenseMap<PHINode *, RecurrenceDescriptor> ReductionList; + using ReductionList = DenseMap<PHINode *, RecurrenceDescriptor>; /// InductionList saves induction variables and maps them to the /// induction descriptor. - typedef MapVector<PHINode *, InductionDescriptor> InductionList; + using InductionList = MapVector<PHINode *, InductionDescriptor>; /// RecurrenceSet contains the phi nodes that are recurrences other than /// inductions and reductions. - typedef SmallPtrSet<const PHINode *, 8> RecurrenceSet; + using RecurrenceSet = SmallPtrSet<const PHINode *, 8>; /// Returns true if it is legal to vectorize this loop. /// This does not mean that it is profitable to vectorize this @@ -1608,7 +1581,17 @@ public: /// Returns the widest induction type. Type *getWidestInductionType() { return WidestIndTy; } - /// Returns True if V is an induction variable in this loop. + /// Returns True if V is a Phi node of an induction variable in this loop. + bool isInductionPhi(const Value *V); + + /// Returns True if V is a cast that is part of an induction def-use chain, + /// and had been proven to be redundant under a runtime guard (in other + /// words, the cast has the same SCEV expression as the induction phi). + bool isCastedInductionVariable(const Value *V); + + /// Returns True if V can be considered as an induction variable in this + /// loop. V can be the induction phi, or some redundant cast in the def-use + /// chain of the inducion phi. bool isInductionVariable(const Value *V); /// Returns True if PN is a reduction variable in this loop. @@ -1629,6 +1612,8 @@ public: /// 0 - Stride is unknown or non-consecutive. /// 1 - Address is consecutive. /// -1 - Address is consecutive, and decreasing. + /// NOTE: This method must only be used before modifying the original scalar + /// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965). int isConsecutivePtr(Value *Ptr); /// Returns true if the value V is uniform within the loop. @@ -1646,11 +1631,6 @@ public: return InterleaveInfo.isInterleaved(Instr); } - /// \brief Return the maximum interleave factor of all interleaved groups. - unsigned getMaxInterleaveFactor() const { - return InterleaveInfo.getMaxInterleaveFactor(); - } - /// \brief Get the interleaved access group that \p Instr belongs to. const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) { return InterleaveInfo.getInterleaveGroup(Instr); @@ -1664,6 +1644,10 @@ public: unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); } + uint64_t getMaxSafeRegisterWidth() const { + return LAI->getDepChecker().getMaxSafeRegisterWidth(); + } + bool hasStride(Value *V) { return LAI->hasStride(V); } /// Returns true if the target machine supports masked store operation @@ -1671,21 +1655,25 @@ public: bool isLegalMaskedStore(Type *DataType, Value *Ptr) { return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType); } + /// Returns true if the target machine supports masked load operation /// for the given \p DataType and kind of access to \p Ptr. bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType); } + /// Returns true if the target machine supports masked scatter operation /// for the given \p DataType. bool isLegalMaskedScatter(Type *DataType) { return TTI->isLegalMaskedScatter(DataType); } + /// Returns true if the target machine supports masked gather operation /// for the given \p DataType. 
bool isLegalMaskedGather(Type *DataType) { return TTI->isLegalMaskedGather(DataType); } + /// Returns true if the target machine can represent \p V as a masked gather /// or scatter operation. bool isLegalGatherOrScatter(Value *V) { @@ -1701,6 +1689,7 @@ public: /// Returns true if vector representation of the instruction \p I /// requires mask. bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); } + unsigned getNumStores() const { return LAI->getNumStores(); } unsigned getNumLoads() const { return LAI->getNumLoads(); } unsigned getNumPredStores() const { return NumPredStores; } @@ -1766,27 +1755,34 @@ private: return LAI ? &LAI->getSymbolicStrides() : nullptr; } - unsigned NumPredStores; + unsigned NumPredStores = 0; /// The loop that we evaluate. Loop *TheLoop; + /// A wrapper around ScalarEvolution used to add runtime SCEV checks. /// Applies dynamic knowledge to simplify SCEV expressions in the context /// of existing SCEV assumptions. The analysis will also add a minimal set /// of new predicates if this is required to enable vectorization and /// unrolling. PredicatedScalarEvolution &PSE; + /// Target Library Info. TargetLibraryInfo *TLI; + /// Target Transform Info const TargetTransformInfo *TTI; + /// Dominator Tree. DominatorTree *DT; + // LoopAccess analysis. std::function<const LoopAccessInfo &(Loop &)> *GetLAA; + // And the loop-accesses info corresponding to this loop. This pointer is // null until canVectorizeMemory sets it up. - const LoopAccessInfo *LAI; + const LoopAccessInfo *LAI = nullptr; + /// Interface to emit optimization remarks. OptimizationRemarkEmitter *ORE; @@ -1798,27 +1794,38 @@ private: /// Holds the primary induction variable. This is the counter of the /// loop. - PHINode *PrimaryInduction; + PHINode *PrimaryInduction = nullptr; + /// Holds the reduction variables. ReductionList Reductions; + /// Holds all of the induction variables that we found in the loop. /// Notice that inductions don't need to start at zero and that induction /// variables can be pointers. InductionList Inductions; + + /// Holds all the casts that participate in the update chain of the induction + /// variables, and that have been proven to be redundant (possibly under a + /// runtime guard). These casts can be ignored when creating the vectorized + /// loop body. + SmallPtrSet<Instruction *, 4> InductionCastsToIgnore; + /// Holds the phi nodes that are first-order recurrences. RecurrenceSet FirstOrderRecurrences; + /// Holds instructions that need to sink past other instructions to handle /// first-order recurrences. DenseMap<Instruction *, Instruction *> SinkAfter; + /// Holds the widest induction type encountered. - Type *WidestIndTy; + Type *WidestIndTy = nullptr; /// Allowed outside users. This holds the induction and reduction /// vars which can be accessed from outside the loop. SmallPtrSet<Value *, 4> AllowedExit; /// Can we assume the absence of NaNs. - bool HasFunNoNaNAttr; + bool HasFunNoNaNAttr = false; /// Vectorization requirements that will go through late-evaluation. LoopVectorizationRequirements *Requirements; @@ -1856,9 +1863,13 @@ public: /// Information about vectorization costs struct VectorizationFactor { - unsigned Width; // Vector width with best cost - unsigned Cost; // Cost of the loop with that width + // Vector width with best cost + unsigned Width; + + // Cost of the loop with that width + unsigned Cost; }; + /// \return The most profitable vectorization factor and the cost of that VF. 
/// This method checks every power of two up to MaxVF. If UserVF is not ZERO /// then this vectorization factor will be selected if vectorization is @@ -1897,8 +1908,10 @@ public: struct RegisterUsage { /// Holds the number of loop invariant values that are used in the loop. unsigned LoopInvariantRegs; + /// Holds the maximum number of concurrent live intervals in the loop. unsigned MaxLocalUsers; + /// Holds the number of instructions in the loop. unsigned NumInstructions; }; @@ -1920,6 +1933,7 @@ public: /// \returns True if it is more profitable to scalarize instruction \p I for /// vectorization factor \p VF. bool isProfitableToScalarize(Instruction *I, unsigned VF) const { + assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1."); auto Scalars = InstsToScalarize.find(VF); assert(Scalars != InstsToScalarize.end() && "VF not yet analyzed for scalarization profitability"); @@ -1954,7 +1968,8 @@ public: /// Decision that was taken during cost calculation for memory instruction. enum InstWidening { CM_Unknown, - CM_Widen, + CM_Widen, // For consecutive accesses with stride +1. + CM_Widen_Reverse, // For consecutive accesses with stride -1. CM_Interleave, CM_GatherScatter, CM_Scalarize @@ -2010,7 +2025,6 @@ public: /// is an induction variable. Such a truncate will be removed by adding a new /// induction variable with the destination type. bool isOptimizableIVTruncate(Instruction *I, unsigned VF) { - // If the instruction is not a truncate, return false. auto *Trunc = dyn_cast<TruncInst>(I); if (!Trunc) @@ -2030,13 +2044,29 @@ public: return false; // If the truncated value is not an induction variable, return false. - return Legal->isInductionVariable(Op); + return Legal->isInductionPhi(Op); + } + + /// Collects the instructions to scalarize for each predicated instruction in + /// the loop. + void collectInstsToScalarize(unsigned VF); + + /// Collect Uniform and Scalar values for the given \p VF. + /// The sets depend on CM decision for Load/Store instructions + /// that may be vectorized as interleave, gather-scatter or scalarized. + void collectUniformsAndScalars(unsigned VF) { + // Do the analysis once. + if (VF == 1 || Uniforms.count(VF)) + return; + setCostBasedWideningDecision(VF); + collectLoopUniforms(VF); + collectLoopScalars(VF); } private: /// \return An upper bound for the vectorization factor, larger than zero. /// One is returned if vectorization should best be avoided due to cost. - unsigned computeFeasibleMaxVF(bool OptForSize); + unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount); /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually @@ -2045,7 +2075,7 @@ private: /// is /// false, then all operations will be scalarized (i.e. no vectorization has /// actually taken place). - typedef std::pair<unsigned, bool> VectorizationCostTy; + using VectorizationCostTy = std::pair<unsigned, bool>; /// Returns the expected execution cost. The unit of the cost does /// not matter because we use the 'cost' units to compare different @@ -2102,7 +2132,7 @@ private: /// A type representing the costs for instructions if they were to be /// scalarized rather than vectorized. The entries are Instruction-Cost /// pairs. - typedef DenseMap<Instruction *, unsigned> ScalarCostsTy; + using ScalarCostsTy = DenseMap<Instruction *, unsigned>; /// A set containing all BasicBlocks that are known to present after /// vectorization as a predicated block. 
@@ -2134,10 +2164,6 @@ private: int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, unsigned VF); - /// Collects the instructions to scalarize for each predicated instruction in - /// the loop. - void collectInstsToScalarize(unsigned VF); - /// Collect the instructions that are uniform after vectorization. An /// instruction is uniform if we represent it with a single scalar value in /// the vectorized loop corresponding to each vector iteration. Examples of @@ -2156,72 +2182,137 @@ private: /// iteration of the original scalar loop. void collectLoopScalars(unsigned VF); - /// Collect Uniform and Scalar values for the given \p VF. - /// The sets depend on CM decision for Load/Store instructions - /// that may be vectorized as interleave, gather-scatter or scalarized. - void collectUniformsAndScalars(unsigned VF) { - // Do the analysis once. - if (VF == 1 || Uniforms.count(VF)) - return; - setCostBasedWideningDecision(VF); - collectLoopUniforms(VF); - collectLoopScalars(VF); - } - /// Keeps cost model vectorization decision and cost for instructions. /// Right now it is used for memory instructions only. - typedef DenseMap<std::pair<Instruction *, unsigned>, - std::pair<InstWidening, unsigned>> - DecisionList; + using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, + std::pair<InstWidening, unsigned>>; DecisionList WideningDecisions; public: /// The loop that we evaluate. Loop *TheLoop; + /// Predicated scalar evolution analysis. PredicatedScalarEvolution &PSE; + /// Loop Info analysis. LoopInfo *LI; + /// Vectorization legality. LoopVectorizationLegality *Legal; + /// Vector target information. const TargetTransformInfo &TTI; + /// Target Library Info. const TargetLibraryInfo *TLI; + /// Demanded bits analysis. DemandedBits *DB; + /// Assumption cache. AssumptionCache *AC; + /// Interface to emit optimization remarks. OptimizationRemarkEmitter *ORE; const Function *TheFunction; + /// Loop Vectorize Hint. const LoopVectorizeHints *Hints; + /// Values to ignore in the cost model. SmallPtrSet<const Value *, 16> ValuesToIgnore; + /// Values to ignore in the cost model when VF > 1. SmallPtrSet<const Value *, 16> VecValuesToIgnore; }; +} // end anonymous namespace + +namespace llvm { + +/// InnerLoopVectorizer vectorizes loops which contain only one basic /// LoopVectorizationPlanner - drives the vectorization process after having /// passed Legality checks. +/// The planner builds and optimizes the Vectorization Plans which record the +/// decisions how to vectorize the given loop. In particular, represent the +/// control-flow of the vectorized version, the replication of instructions that +/// are to be scalarized, and interleave access groups. class LoopVectorizationPlanner { + /// The loop that we evaluate. + Loop *OrigLoop; + + /// Loop Info analysis. + LoopInfo *LI; + + /// Target Library Info. + const TargetLibraryInfo *TLI; + + /// Target Transform Info. + const TargetTransformInfo *TTI; + + /// The legality analysis. + LoopVectorizationLegality *Legal; + + /// The profitablity analysis. + LoopVectorizationCostModel &CM; + + using VPlanPtr = std::unique_ptr<VPlan>; + + SmallVector<VPlanPtr, 4> VPlans; + + /// This class is used to enable the VPlan to invoke a method of ILV. This is + /// needed until the method is refactored out of ILV and becomes reusable. 
+ struct VPCallbackILV : public VPCallback { + InnerLoopVectorizer &ILV; + + VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {} + + Value *getOrCreateVectorValues(Value *V, unsigned Part) override { + return ILV.getOrCreateVectorValue(V, Part); + } + }; + + /// A builder used to construct the current plan. + VPBuilder Builder; + + /// When we if-convert we need to create edge masks. We have to cache values + /// so that we don't end up with exponential recursion/IR. Note that + /// if-conversion currently takes place during VPlan-construction, so these + /// caches are only used at that stage. + using EdgeMaskCacheTy = + DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>; + using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>; + EdgeMaskCacheTy EdgeMaskCache; + BlockMaskCacheTy BlockMaskCache; + + unsigned BestVF = 0; + unsigned BestUF = 0; + public: - LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI, + LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, LoopVectorizationLegality *Legal, LoopVectorizationCostModel &CM) - : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {} - - ~LoopVectorizationPlanner() {} + : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {} /// Plan how to best vectorize, return the best VF and its cost. LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize, unsigned UserVF); - /// Generate the IR code for the vectorized loop. - void executePlan(InnerLoopVectorizer &ILV); + /// Finalize the best decision and dispose of all other VPlans. + void setBestPlan(unsigned VF, unsigned UF); + + /// Generate the IR code for the body of the vectorized loop according to the + /// best selected VPlan. + void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT); + + void printPlans(raw_ostream &O) { + for (const auto &Plan : VPlans) + O << *Plan; + } protected: /// Collect the instructions from the original loop that would be trivially @@ -2229,20 +2320,102 @@ protected: void collectTriviallyDeadInstructions( SmallPtrSetImpl<Instruction *> &DeadInstructions); -private: - /// The loop that we evaluate. - Loop *OrigLoop; + /// A range of powers-of-2 vectorization factors with fixed start and + /// adjustable end. The range includes start and excludes end, e.g.,: + /// [1, 9) = {1, 2, 4, 8} + struct VFRange { + // A power of 2. + const unsigned Start; - /// Loop Info analysis. - LoopInfo *LI; + // Need not be a power of 2. If End <= Start range is empty. + unsigned End; + }; - /// The legality analysis. - LoopVectorizationLegality *Legal; + /// Test a \p Predicate on a \p Range of VF's. Return the value of applying + /// \p Predicate on Range.Start, possibly decreasing Range.End such that the + /// returned value holds for the entire \p Range. + bool getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate, + VFRange &Range); - /// The profitablity analysis. - LoopVectorizationCostModel &CM; + /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, + /// according to the information gathered by Legal when it checked if it is + /// legal to vectorize the loop. + void buildVPlans(unsigned MinVF, unsigned MaxVF); + +private: + /// A helper function that computes the predicate of the block BB, assuming + /// that the header block of the loop is set to True. It returns the *entry* + /// mask for the block BB. 
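A rough sketch of the VFRange clamping idiom introduced above, assuming the implementation walks the power-of-two factors and shrinks End at the first VF that would decide differently from Range.Start; this is a simplification in plain C++, not the planner itself.

#include <cstdio>
#include <functional>

struct VFRange {
  const unsigned Start; // a power of 2
  unsigned End;         // exclusive; need not be a power of 2
};

// Returns the decision taken for Range.Start and clamps Range.End so the
// same decision holds for every VF in the half-open range.
static bool getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
                                     VFRange &Range) {
  bool PredicateAtStart = Predicate(Range.Start);
  for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
    if (Predicate(VF) != PredicateAtStart) {
      Range.End = VF;
      break;
    }
  return PredicateAtStart;
}

int main() {
  VFRange Range{1, 9}; // covers {1, 2, 4, 8}
  // Hypothetical decision: "widen" only while VF <= 2.
  bool Widen = getDecisionAndClampRange([](unsigned VF) { return VF <= 2; }, Range);
  std::printf("decision=%d holds for VFs [%u, %u)\n", Widen, Range.Start, Range.End);
}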
+ VPValue *createBlockInMask(BasicBlock *BB, VPlanPtr &Plan); + + /// A helper function that computes the predicate of the edge between SRC + /// and DST. + VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan); + + /// Check if \I belongs to an Interleave Group within the given VF \p Range, + /// \return true in the first returned value if so and false otherwise. + /// Build a new VPInterleaveGroup Recipe if \I is the primary member of an IG + /// for \p Range.Start, and provide it as the second returned value. + /// Note that if \I is an adjunct member of an IG for \p Range.Start, the + /// \return value is <true, nullptr>, as it is handled by another recipe. + /// \p Range.End may be decreased to ensure same decision from \p Range.Start + /// to \p Range.End. + VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range); + + // Check if \I is a memory instruction to be widened for \p Range.Start and + // potentially masked. Such instructions are handled by a recipe that takes an + // additional VPInstruction for the mask. + VPWidenMemoryInstructionRecipe *tryToWidenMemory(Instruction *I, + VFRange &Range, + VPlanPtr &Plan); + + /// Check if an induction recipe should be constructed for \I within the given + /// VF \p Range. If so build and return it. If not, return null. \p Range.End + /// may be decreased to ensure same decision from \p Range.Start to + /// \p Range.End. + VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I, + VFRange &Range); + + /// Handle non-loop phi nodes. Currently all such phi nodes are turned into + /// a sequence of select instructions as the vectorizer currently performs + /// full if-conversion. + VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan); + + /// Check if \p I can be widened within the given VF \p Range. If \p I can be + /// widened for \p Range.Start, check if the last recipe of \p VPBB can be + /// extended to include \p I or else build a new VPWidenRecipe for it and + /// append it to \p VPBB. Return true if \p I can be widened for Range.Start, + /// false otherwise. Range.End may be decreased to ensure same decision from + /// \p Range.Start to \p Range.End. + bool tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range); + + /// Build a VPReplicationRecipe for \p I and enclose it within a Region if it + /// is predicated. \return \p VPBB augmented with this new recipe if \p I is + /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new + /// Region. Update the packing decision of predicated instructions if they + /// feed \p I. Range.End may be decreased to ensure same recipe behavior from + /// \p Range.Start to \p Range.End. + VPBasicBlock *handleReplication( + Instruction *I, VFRange &Range, VPBasicBlock *VPBB, + DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, + VPlanPtr &Plan); + + /// Create a replicating region for instruction \p I that requires + /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I. + VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe, + VPlanPtr &Plan); + + /// Build a VPlan according to the information gathered by Legal. \return a + /// VPlan for vectorization factors \p Range.Start and up to \p Range.End + /// exclusive, possibly decreasing \p Range.End. + VPlanPtr buildVPlan(VFRange &Range, + const SmallPtrSetImpl<Value *> &NeedDef); }; +} // end namespace llvm + +namespace { + /// \brief This holds vectorization requirements that must be verified late in /// the process. 
The requirements are set by legalize and costmodel. Once /// vectorization has been determined to be possible and profitable the @@ -2257,8 +2430,7 @@ private: /// followed by a non-expert user. class LoopVectorizationRequirements { public: - LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) - : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr), ORE(ORE) {} + LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) : ORE(ORE) {} void addUnsafeAlgebraInst(Instruction *I) { // First unsafe algebra instruction. @@ -2272,12 +2444,14 @@ public: const char *PassName = Hints.vectorizeAnalysisPassName(); bool Failed = false; if (UnsafeAlgebraInst && !Hints.allowReordering()) { - ORE.emit( - OptimizationRemarkAnalysisFPCommute(PassName, "CantReorderFPOps", - UnsafeAlgebraInst->getDebugLoc(), - UnsafeAlgebraInst->getParent()) - << "loop not vectorized: cannot prove it is safe to reorder " - "floating-point operations"); + ORE.emit([&]() { + return OptimizationRemarkAnalysisFPCommute( + PassName, "CantReorderFPOps", + UnsafeAlgebraInst->getDebugLoc(), + UnsafeAlgebraInst->getParent()) + << "loop not vectorized: cannot prove it is safe to reorder " + "floating-point operations"; + }); Failed = true; } @@ -2288,11 +2462,13 @@ public: NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; if ((ThresholdReached && !Hints.allowReordering()) || PragmaThresholdReached) { - ORE.emit(OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps", + ORE.emit([&]() { + return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps", L->getStartLoc(), L->getHeader()) << "loop not vectorized: cannot prove it is safe to reorder " - "memory operations"); + "memory operations"; + }); DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); Failed = true; } @@ -2301,13 +2477,15 @@ public: } private: - unsigned NumRuntimePointerChecks; - Instruction *UnsafeAlgebraInst; + unsigned NumRuntimePointerChecks = 0; + Instruction *UnsafeAlgebraInst = nullptr; /// Interface to emit optimization remarks. OptimizationRemarkEmitter &ORE; }; +} // end anonymous namespace + static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) { if (L.empty()) { if (!hasCyclesInLoopBody(L)) @@ -2318,11 +2496,15 @@ static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) { addAcyclicInnerLoop(*InnerL, V); } +namespace { + /// The LoopVectorize Pass. 
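The ORE.emit changes above wrap remark construction in a lambda, so the remark object and its message are only built if remarks are actually going to be emitted. A small sketch of that deferral pattern, with a hypothetical MiniRemarkEmitter in place of OptimizationRemarkEmitter:

#include <cstdio>
#include <functional>
#include <string>

// Hypothetical emitter: the callback is only invoked (and the remark string
// only built) when remark output is enabled.
struct MiniRemarkEmitter {
  bool Enabled;
  void emit(const std::function<std::string()> &MakeRemark) {
    if (!Enabled)
      return; // the potentially expensive remark is never constructed
    std::printf("remark: %s\n", MakeRemark().c_str());
  }
};

int main() {
  MiniRemarkEmitter ORE{/*Enabled=*/false};
  ORE.emit([&]() {
    return std::string("loop not vectorized: cannot prove it is safe to "
                       "reorder floating-point operations");
  });
  ORE.Enabled = true;
  ORE.emit([&]() { return std::string("now the message is actually built"); });
}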
struct LoopVectorize : public FunctionPass { /// Pass identification, replacement for typeid static char ID; + LoopVectorizePass Impl; + explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true) : FunctionPass(ID) { Impl.DisableUnrolling = NoUnrolling; @@ -2330,8 +2512,6 @@ struct LoopVectorize : public FunctionPass { initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); } - LoopVectorizePass Impl; - bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; @@ -2450,6 +2630,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( Instruction *LastInduction = VecInd; for (unsigned Part = 0; Part < UF; ++Part) { VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); + recordVectorLoopValueForInductionCast(II, LastInduction, Part); if (isa<TruncInst>(EntryVal)) addMetadata(LastInduction, EntryVal); LastInduction = cast<Instruction>(addFastMathFlag( @@ -2480,11 +2661,26 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { auto *I = cast<Instruction>(U); return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); }; - return any_of(IV->users(), isScalarInst); + return llvm::any_of(IV->users(), isScalarInst); } -void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { +void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( + const InductionDescriptor &ID, Value *VectorLoopVal, unsigned Part, + unsigned Lane) { + const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); + if (Casts.empty()) + return; + // Only the first Cast instruction in the Casts vector is of interest. + // The rest of the Casts (if exist) have no uses outside the + // induction update chain itself. + Instruction *CastInst = *Casts.begin(); + if (Lane < UINT_MAX) + VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); + else + VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); +} +void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { assert((IV->getType()->isIntegerTy() || IV != OldInduction) && "Primary induction variable must have an integer type"); @@ -2564,6 +2760,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { Value *EntryPart = getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); + recordVectorLoopValueForInductionCast(ID, EntryPart, Part); if (Trunc) addMetadata(EntryPart, Trunc); } @@ -2622,7 +2819,7 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, // Floating point operations had to be 'fast' to enable the induction. FastMathFlags Flags; - Flags.setUnsafeAlgebra(); + Flags.setFast(); Value *MulOp = Builder.CreateFMul(Cv, Step); if (isa<Instruction>(MulOp)) @@ -2638,7 +2835,6 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal, const InductionDescriptor &ID) { - // We shouldn't have to build scalar steps if we aren't vectorizing. assert(VF > 1 && "VF should be greater than one"); @@ -2663,21 +2859,21 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, // iteration. If EntryVal is uniform, we only need to generate the first // lane. Otherwise, we generate all VF values. unsigned Lanes = - Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 : VF; - + Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 
1 + : VF; // Compute the scalar steps and save the results in VectorLoopValueMap. for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Lane = 0; Lane < Lanes; ++Lane) { auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); - VectorLoopValueMap.setScalarValue(EntryVal, Part, Lane, Add); + VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); + recordVectorLoopValueForInductionCast(ID, Add, Part, Lane); } } } int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { - const ValueToValueMap &Strides = getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap(); @@ -2708,8 +2904,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { // instead. If it has been scalarized, and we actually need the value in // vector form, we will construct the vector values on demand. if (VectorLoopValueMap.hasAnyScalarValue(V)) { - - Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, Part, 0); + Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); // If we've scalarized a value, that value should be an instruction. auto *I = cast<Instruction>(V); @@ -2726,8 +2921,8 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { // of the Part unroll iteration. Otherwise, the last instruction is the one // we created for the last vector lane of the Part unroll iteration. unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; - auto *LastInst = - cast<Instruction>(VectorLoopValueMap.getScalarValue(V, Part, LastLane)); + auto *LastInst = cast<Instruction>( + VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); // Set the insert point after the last scalarized instruction. This ensures // the insertelement sequence will directly follow the scalar definitions. @@ -2744,14 +2939,15 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { Value *VectorValue = nullptr; if (Cost->isUniformAfterVectorization(I, VF)) { VectorValue = getBroadcastInstrs(ScalarValue); + VectorLoopValueMap.setVectorValue(V, Part, VectorValue); } else { - VectorValue = UndefValue::get(VectorType::get(V->getType(), VF)); + // Initialize packing with insertelements to start from undef. + Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); + VectorLoopValueMap.setVectorValue(V, Part, Undef); for (unsigned Lane = 0; Lane < VF; ++Lane) - VectorValue = Builder.CreateInsertElement( - VectorValue, getOrCreateScalarValue(V, Part, Lane), - Builder.getInt32(Lane)); + packScalarIntoVectorValue(V, {Part, Lane}); + VectorValue = VectorLoopValueMap.getVectorValue(V, Part); } - VectorLoopValueMap.setVectorValue(V, Part, VectorValue); Builder.restoreIP(OldIP); return VectorValue; } @@ -2763,28 +2959,29 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { return B; } -Value *InnerLoopVectorizer::getOrCreateScalarValue(Value *V, unsigned Part, - unsigned Lane) { - +Value * +InnerLoopVectorizer::getOrCreateScalarValue(Value *V, + const VPIteration &Instance) { // If the value is not an instruction contained in the loop, it should // already be scalar. if (OrigLoop->isLoopInvariant(V)) return V; - assert(Lane > 0 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) - : true && "Uniform values only have lane zero"); + assert(Instance.Lane > 0 + ? 
!Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) + : true && "Uniform values only have lane zero"); // If the value from the original loop has not been vectorized, it is // represented by UF x VF scalar values in the new loop. Return the requested // scalar value. - if (VectorLoopValueMap.hasScalarValue(V, Part, Lane)) - return VectorLoopValueMap.getScalarValue(V, Part, Lane); + if (VectorLoopValueMap.hasScalarValue(V, Instance)) + return VectorLoopValueMap.getScalarValue(V, Instance); // If the value has not been scalarized, get its entry in VectorLoopValueMap // for the given unroll part. If this entry is not a vector type (i.e., the // vectorization factor is one), there is no need to generate an // extractelement instruction. - auto *U = getOrCreateVectorValue(V, Part); + auto *U = getOrCreateVectorValue(V, Instance.Part); if (!U->getType()->isVectorTy()) { assert(VF == 1 && "Value not scalarized has non-vector type"); return U; @@ -2793,7 +2990,20 @@ Value *InnerLoopVectorizer::getOrCreateScalarValue(Value *V, unsigned Part, // Otherwise, the value from the original loop has been vectorized and is // represented by UF vector values. Extract and return the requested scalar // value from the appropriate vector lane. - return Builder.CreateExtractElement(U, Builder.getInt32(Lane)); + return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); +} + +void InnerLoopVectorizer::packScalarIntoVectorValue( + Value *V, const VPIteration &Instance) { + assert(V != Induction && "The new induction variable should not be used."); + assert(!V->getType()->isVectorTy() && "Can't pack a vector"); + assert(!V->getType()->isVoidTy() && "Type does not produce a value"); + + Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); + Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); + VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, + Builder.getInt32(Instance.Lane)); + VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); } Value *InnerLoopVectorizer::reverseVector(Value *Vec) { @@ -2843,6 +3053,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { if (Instr != Group->getInsertPos()) return; + const DataLayout &DL = Instr->getModule()->getDataLayout(); Value *Ptr = getPointerOperand(Instr); // Prepare for the vector type of the interleaved load/store. @@ -2866,7 +3077,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { Index += (VF - 1) * Group->getFactor(); for (unsigned Part = 0; Part < UF; Part++) { - Value *NewPtr = getOrCreateScalarValue(Ptr, Part, 0); + Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0}); // Notice current instruction could be any index. Need to adjust the address // to the member of index 0. @@ -2890,13 +3101,12 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { // Vectorize the interleaved load group. if (isa<LoadInst>(Instr)) { - // For each unroll part, create a wide load for the group. SmallVector<Value *, 2> NewLoads; for (unsigned Part = 0; Part < UF; Part++) { auto *NewLoad = Builder.CreateAlignedLoad( NewPtrs[Part], Group->getAlignment(), "wide.vec"); - addMetadata(NewLoad, Instr); + Group->addMetadata(NewLoad); NewLoads.push_back(NewLoad); } @@ -2917,7 +3127,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { // If this member has different type, cast the result type. 
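The VPIteration-based accessors above replace the old (Part, Lane) argument pairs. Below is a toy illustration of the duality between packScalarIntoVectorValue and getOrCreateScalarValue, using plain containers where the real code inserts and extracts IR vector lanes; everything here is illustrative only.

#include <cassert>
#include <cstdio>
#include <vector>

struct VPIteration { unsigned Part, Lane; };

int main() {
  const unsigned UF = 2, VF = 4;
  // Per-part "vector values", standing in for the VF-wide IR vectors.
  std::vector<std::vector<int>> VectorParts(UF, std::vector<int>(VF, 0));

  // packScalarIntoVectorValue analogue: insert one scalar lane into its part.
  VPIteration Instance{1, 2};
  VectorParts[Instance.Part][Instance.Lane] = 42;

  // getOrCreateScalarValue analogue: pull the requested lane back out of the
  // per-part vector (the IR version emits an extractelement instead).
  int Scalar = VectorParts[Instance.Part][Instance.Lane];
  assert(Scalar == 42);
  std::printf("part %u, lane %u -> %d\n", Instance.Part, Instance.Lane, Scalar);
}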
if (Member->getType() != ScalarTy) { VectorType *OtherVTy = VectorType::get(Member->getType(), VF); - StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy); + StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); } if (Group->isReverse()) @@ -2946,9 +3156,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { if (Group->isReverse()) StoredVec = reverseVector(StoredVec); - // If this member has different type, cast it to an unified type. + // If this member has different type, cast it to a unified type. + if (StoredVec->getType() != SubVT) - StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT); + StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); StoredVecs.push_back(StoredVec); } @@ -2963,11 +3174,13 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { Instruction *NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment()); - addMetadata(NewStoreInstr, Instr); + + Group->addMetadata(NewStoreInstr); } } -void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { +void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, + VectorParts *BlockInMask) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast<LoadInst>(Instr); StoreInst *SI = dyn_cast<StoreInst>(Instr); @@ -2992,14 +3205,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { Alignment = DL.getABITypeAlignment(ScalarDataTy); unsigned AddressSpace = getMemInstAddressSpace(Instr); - // Scalarize the memory instruction if necessary. - if (Decision == LoopVectorizationCostModel::CM_Scalarize) - return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr)); - // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. - int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); - bool Reverse = ConsecutiveStride < 0; + bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); + bool ConsecutiveStride = + Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); bool CreateGatherScatter = (Decision == LoopVectorizationCostModel::CM_GatherScatter); @@ -3010,9 +3220,13 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Handle consecutive loads/stores. if (ConsecutiveStride) - Ptr = getOrCreateScalarValue(Ptr, 0, 0); + Ptr = getOrCreateScalarValue(Ptr, {0, 0}); + + VectorParts Mask; + bool isMaskRequired = BlockInMask; + if (isMaskRequired) + Mask = *BlockInMask; - VectorParts Mask = createBlockInMask(Instr->getParent()); // Handle Stores: if (SI) { assert(!Legal->isUniform(SI->getPointerOperand()) && @@ -3023,7 +3237,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { Instruction *NewSI = nullptr; Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part); if (CreateGatherScatter) { - Value *MaskPart = Legal->isMaskRequired(SI) ? Mask[Part] : nullptr; + Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; Value *VectorGep = getOrCreateVectorValue(Ptr, Part); NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, MaskPart); @@ -3045,13 +3259,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF)); PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); - Mask[Part] = reverseVector(Mask[Part]); + if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
+ Mask[Part] = reverseVector(Mask[Part]); } Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); - if (Legal->isMaskRequired(SI)) + if (isMaskRequired) NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, Mask[Part]); else @@ -3068,7 +3283,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { for (unsigned Part = 0; Part < UF; ++Part) { Value *NewLI; if (CreateGatherScatter) { - Value *MaskPart = Legal->isMaskRequired(LI) ? Mask[Part] : nullptr; + Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; Value *VectorGep = getOrCreateVectorValue(Ptr, Part); NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, nullptr, "wide.masked.gather"); @@ -3083,12 +3298,13 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // wide load needs to start at the last vector element. PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF)); PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); - Mask[Part] = reverseVector(Mask[Part]); + if (isMaskRequired) // Reverse of a null all-one mask is a null mask. + Mask[Part] = reverseVector(Mask[Part]); } Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); - if (Legal->isMaskRequired(LI)) + if (isMaskRequired) NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part], UndefValue::get(DataTy), "wide.masked.load"); @@ -3105,71 +3321,41 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { } void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, + const VPIteration &Instance, bool IfPredicateInstr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); - DEBUG(dbgs() << "LV: Scalarizing" - << (IfPredicateInstr ? " and predicating:" : ":") << *Instr - << '\n'); - // Holds vector parameters or scalars, in case of uniform vals. - SmallVector<VectorParts, 4> Params; setDebugLocFromInst(Builder, Instr); // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); - VectorParts Cond; - if (IfPredicateInstr) - Cond = createBlockInMask(Instr->getParent()); - - // Determine the number of scalars we need to generate for each unroll - // iteration. If the instruction is uniform, we only need to generate the - // first lane. Otherwise, we generate all VF values. - unsigned Lanes = Cost->isUniformAfterVectorization(Instr, VF) ? 1 : VF; - - // For each vector unroll 'part': - for (unsigned Part = 0; Part < UF; ++Part) { - // For each scalar that we create: - for (unsigned Lane = 0; Lane < Lanes; ++Lane) { + Instruction *Cloned = Instr->clone(); + if (!IsVoidRetTy) + Cloned->setName(Instr->getName() + ".cloned"); - // Start if-block. - Value *Cmp = nullptr; - if (IfPredicateInstr) { - Cmp = Cond[Part]; - if (Cmp->getType()->isVectorTy()) - Cmp = Builder.CreateExtractElement(Cmp, Builder.getInt32(Lane)); - Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, - ConstantInt::get(Cmp->getType(), 1)); - } - - Instruction *Cloned = Instr->clone(); - if (!IsVoidRetTy) - Cloned->setName(Instr->getName() + ".cloned"); - - // Replace the operands of the cloned instructions with their scalar - // equivalents in the new loop. 
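A simplified model of the reverse-consecutive path above: each part's pointer is stepped back by Part * VF and then by VF - 1 so the wide access covers the preceding lanes, and the mask, when one is required, is reversed along with the data. This is plain C++ over an array rather than generated IR, and the mask values are made up.

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  const int VF = 4, UF = 2;
  std::vector<int> Memory(64);
  for (int I = 0; I < 64; ++I) Memory[I] = I;
  int *Ptr = &Memory[32]; // address of the current scalar iteration

  for (int Part = 0; Part < UF; ++Part) {
    // Mirrors: GEP(Ptr, -Part * VF) followed by GEP(PartPtr, 1 - VF).
    int *PartPtr = Ptr - Part * VF + (1 - VF);
    std::vector<bool> Mask = {true, false, true, true}; // hypothetical lane mask
    std::reverse(Mask.begin(), Mask.end());             // reverse with the data

    std::printf("part %d loads [%d..%d], mask:", Part, *PartPtr, *(PartPtr + VF - 1));
    for (bool B : Mask)
      std::printf(" %d", (int)B);
    std::printf("\n");
  }
}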
- for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { - auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Part, Lane); - Cloned->setOperand(op, NewOp); - } - addNewMetadata(Cloned, Instr); + // Replace the operands of the cloned instructions with their scalar + // equivalents in the new loop. + for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { + auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance); + Cloned->setOperand(op, NewOp); + } + addNewMetadata(Cloned, Instr); - // Place the cloned scalar in the new loop. - Builder.Insert(Cloned); + // Place the cloned scalar in the new loop. + Builder.Insert(Cloned); - // Add the cloned scalar to the scalar map entry. - VectorLoopValueMap.setScalarValue(Instr, Part, Lane, Cloned); + // Add the cloned scalar to the scalar map entry. + VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); - // If we just cloned a new assumption, add it the assumption cache. - if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) - if (II->getIntrinsicID() == Intrinsic::assume) - AC->registerAssumption(II); + // If we just cloned a new assumption, add it the assumption cache. + if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) + if (II->getIntrinsicID() == Intrinsic::assume) + AC->registerAssumption(II); - // End if-block. - if (IfPredicateInstr) - PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp)); - } - } + // End if-block. + if (IfPredicateInstr) + PredicatedInstructions.push_back(Cloned); } PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, @@ -3281,6 +3467,36 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { return VectorTripCount; } +Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, + const DataLayout &DL) { + // Verify that V is a vector type with same number of elements as DstVTy. + unsigned VF = DstVTy->getNumElements(); + VectorType *SrcVecTy = cast<VectorType>(V->getType()); + assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); + Type *SrcElemTy = SrcVecTy->getElementType(); + Type *DstElemTy = DstVTy->getElementType(); + assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && + "Vector elements must have same size"); + + // Do a direct cast if element types are castable. + if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { + return Builder.CreateBitOrPointerCast(V, DstVTy); + } + // V cannot be directly casted to desired vector type. + // May happen when V is a floating point vector but DstVTy is a vector of + // pointers or vice-versa. Handle this using a two-step bitcast using an + // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 
+ assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && + "Only one type should be a pointer type"); + assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && + "Only one type should be a floating point type"); + Type *IntTy = + IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); + VectorType *VecIntTy = VectorType::get(IntTy, VF); + Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); + return Builder.CreateBitOrPointerCast(CastVal, DstVTy); +} + void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass) { Value *Count = getOrCreateTripCount(L); @@ -3373,7 +3589,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { LVer->prepareNoAliasMetadata(); } -void InnerLoopVectorizer::createVectorizedLoopSkeleton() { +BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the @@ -3435,7 +3651,7 @@ void InnerLoopVectorizer::createVectorizedLoopSkeleton() { MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); // Create and register the new vector loop. - Loop *Lp = new Loop(); + Loop *Lp = LI->AllocateLoop(); Loop *ParentLoop = OrigLoop->getParentLoop(); // Insert the new loop into the loop nest and register the new basic blocks @@ -3554,6 +3770,8 @@ void InnerLoopVectorizer::createVectorizedLoopSkeleton() { LoopVectorizeHints Hints(Lp, true, *ORE); Hints.setAlreadyVectorized(); + + return LoopVectorPreHeader; } // Fix up external users of the induction variable. At this point, we are @@ -3622,22 +3840,27 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, } namespace { + struct CSEDenseMapInfo { static bool canHandle(const Instruction *I) { return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); } + static inline Instruction *getEmptyKey() { return DenseMapInfo<Instruction *>::getEmptyKey(); } + static inline Instruction *getTombstoneKey() { return DenseMapInfo<Instruction *>::getTombstoneKey(); } + static unsigned getHashValue(const Instruction *I) { assert(canHandle(I) && "Unknown instruction!"); return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), I->value_op_end())); } + static bool isEqual(const Instruction *LHS, const Instruction *RHS) { if (LHS == getEmptyKey() || RHS == getEmptyKey() || LHS == getTombstoneKey() || RHS == getTombstoneKey()) @@ -3645,7 +3868,8 @@ struct CSEDenseMapInfo { return LHS->isIdenticalTo(RHS); } }; -} + +} // end anonymous namespace ///\brief Perform cse of induction variable instructions. static void cse(BasicBlock *BB) { @@ -3777,7 +4001,6 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { // For every instruction `I` in MinBWs, truncate the operands, create a // truncated version of `I` and reextend its result. InstCombine runs // later and will remove any ext/trunc pairs. - // SmallPtrSet<Value *, 4> Erased; for (const auto &KV : Cost->getMinimalBitwidths()) { // If the value wasn't vectorized, we must maintain the original scalar @@ -3927,7 +4150,8 @@ void InnerLoopVectorizer::fixVectorizedLoop() { IVEndValues[Entry.first], LoopMiddleBlock); fixLCSSAPHIs(); - predicateInstructions(); + for (Instruction *PI : PredicatedInstructions) + sinkScalarOperands(&*PI); // Remove redundant induction instructions. 
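createBitOrPointerCast above falls back to a two-step cast through an integer vector of matching element width when the element types cannot be bitcast directly (pointer vs. floating point). The sketch below is a bit-level analogue in plain C++, using memcpy where the real code emits the two casts; it is illustrative only and assumes 32-bit lanes.

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  const unsigned VF = 4;
  float SrcVec[VF] = {1.0f, 2.5f, -3.0f, 0.125f};
  std::uint32_t IntVec[VF]; // intermediate "vector of iN" (N = 32 here)
  float DstVec[VF];         // destination with bit-identical lanes

  // Step 1: bit-reinterpret each lane into the intermediate integer vector.
  std::memcpy(IntVec, SrcVec, sizeof(SrcVec));
  // Step 2: bit-reinterpret the integer lanes into the destination lanes.
  std::memcpy(DstVec, IntVec, sizeof(IntVec));

  for (unsigned I = 0; I < VF; ++I)
    std::printf("lane %u: %f -> 0x%08x -> %f\n", I, SrcVec[I],
                (unsigned)IntVec[I], DstVec[I]);
}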
cse(LoopVectorBody); @@ -3953,7 +4177,6 @@ void InnerLoopVectorizer::fixCrossIterationPHIs() { } void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { - // This is the second phase of vectorizing first-order recurrences. An // overview of the transformation is described below. Suppose we have the // following loop. @@ -4211,7 +4434,8 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // entire expression in the smaller type. if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); - Builder.SetInsertPoint(LoopVectorBody->getTerminator()); + Builder.SetInsertPoint( + LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); VectorParts RdxParts(UF); for (unsigned Part = 0; Part < UF; ++Part) { RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); @@ -4317,7 +4541,6 @@ void InnerLoopVectorizer::fixLCSSAPHIs() { } void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { - // The basic block and loop containing the predicated instruction. auto *PredBB = PredInst->getParent(); auto *VectorLoop = LI->getLoopFor(PredBB); @@ -4346,7 +4569,6 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { // through the worklist doesn't sink a single instruction. bool Changed; do { - // Add the instructions that need to be reanalyzed to the worklist, and // reset the changed indicator. Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); @@ -4365,7 +4587,7 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { // It's legal to sink the instruction if all its uses occur in the // predicated block. Otherwise, there's nothing to do yet, and we may // need to reanalyze the instruction. - if (!all_of(I->uses(), isBlockOfUsePredicated)) { + if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { InstsToReanalyze.push_back(I); continue; } @@ -4382,200 +4604,11 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { } while (Changed); } -void InnerLoopVectorizer::predicateInstructions() { - - // For each instruction I marked for predication on value C, split I into its - // own basic block to form an if-then construct over C. Since I may be fed by - // an extractelement instruction or other scalar operand, we try to - // iteratively sink its scalar operands into the predicated block. If I feeds - // an insertelement instruction, we try to move this instruction into the - // predicated block as well. For non-void types, a phi node will be created - // for the resulting value (either vector or scalar). - // - // So for some predicated instruction, e.g. the conditional sdiv in: - // - // for.body: - // ... - // %add = add nsw i32 %mul, %0 - // %cmp5 = icmp sgt i32 %2, 7 - // br i1 %cmp5, label %if.then, label %if.end - // - // if.then: - // %div = sdiv i32 %0, %1 - // br label %if.end - // - // if.end: - // %x.0 = phi i32 [ %div, %if.then ], [ %add, %for.body ] - // - // the sdiv at this point is scalarized and if-converted using a select. - // The inactive elements in the vector are not used, but the predicated - // instruction is still executed for all vector elements, essentially: - // - // vector.body: - // ... 
- // %17 = add nsw <2 x i32> %16, %wide.load - // %29 = extractelement <2 x i32> %wide.load, i32 0 - // %30 = extractelement <2 x i32> %wide.load51, i32 0 - // %31 = sdiv i32 %29, %30 - // %32 = insertelement <2 x i32> undef, i32 %31, i32 0 - // %35 = extractelement <2 x i32> %wide.load, i32 1 - // %36 = extractelement <2 x i32> %wide.load51, i32 1 - // %37 = sdiv i32 %35, %36 - // %38 = insertelement <2 x i32> %32, i32 %37, i32 1 - // %predphi = select <2 x i1> %26, <2 x i32> %38, <2 x i32> %17 - // - // Predication will now re-introduce the original control flow to avoid false - // side-effects by the sdiv instructions on the inactive elements, yielding - // (after cleanup): - // - // vector.body: - // ... - // %5 = add nsw <2 x i32> %4, %wide.load - // %8 = icmp sgt <2 x i32> %wide.load52, <i32 7, i32 7> - // %9 = extractelement <2 x i1> %8, i32 0 - // br i1 %9, label %pred.sdiv.if, label %pred.sdiv.continue - // - // pred.sdiv.if: - // %10 = extractelement <2 x i32> %wide.load, i32 0 - // %11 = extractelement <2 x i32> %wide.load51, i32 0 - // %12 = sdiv i32 %10, %11 - // %13 = insertelement <2 x i32> undef, i32 %12, i32 0 - // br label %pred.sdiv.continue - // - // pred.sdiv.continue: - // %14 = phi <2 x i32> [ undef, %vector.body ], [ %13, %pred.sdiv.if ] - // %15 = extractelement <2 x i1> %8, i32 1 - // br i1 %15, label %pred.sdiv.if54, label %pred.sdiv.continue55 - // - // pred.sdiv.if54: - // %16 = extractelement <2 x i32> %wide.load, i32 1 - // %17 = extractelement <2 x i32> %wide.load51, i32 1 - // %18 = sdiv i32 %16, %17 - // %19 = insertelement <2 x i32> %14, i32 %18, i32 1 - // br label %pred.sdiv.continue55 - // - // pred.sdiv.continue55: - // %20 = phi <2 x i32> [ %14, %pred.sdiv.continue ], [ %19, %pred.sdiv.if54 ] - // %predphi = select <2 x i1> %8, <2 x i32> %20, <2 x i32> %5 - - for (auto KV : PredicatedInstructions) { - BasicBlock::iterator I(KV.first); - BasicBlock *Head = I->getParent(); - auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false, - /*BranchWeights=*/nullptr, DT, LI); - I->moveBefore(T); - sinkScalarOperands(&*I); - - BasicBlock *PredicatedBlock = I->getParent(); - Twine BBNamePrefix = Twine("pred.") + I->getOpcodeName(); - PredicatedBlock->setName(BBNamePrefix + ".if"); - PredicatedBlock->getSingleSuccessor()->setName(BBNamePrefix + ".continue"); - - // If the instruction is non-void create a Phi node at reconvergence point. - if (!I->getType()->isVoidTy()) { - Value *IncomingTrue = nullptr; - Value *IncomingFalse = nullptr; - - if (I->hasOneUse() && isa<InsertElementInst>(*I->user_begin())) { - // If the predicated instruction is feeding an insert-element, move it - // into the Then block; Phi node will be created for the vector. - InsertElementInst *IEI = cast<InsertElementInst>(*I->user_begin()); - IEI->moveBefore(T); - IncomingTrue = IEI; // the new vector with the inserted element. - IncomingFalse = IEI->getOperand(0); // the unmodified vector - } else { - // Phi node will be created for the scalar predicated instruction. 
- IncomingTrue = &*I; - IncomingFalse = UndefValue::get(I->getType()); - } - - BasicBlock *PostDom = I->getParent()->getSingleSuccessor(); - assert(PostDom && "Then block has multiple successors"); - PHINode *Phi = - PHINode::Create(IncomingTrue->getType(), 2, "", &PostDom->front()); - IncomingTrue->replaceAllUsesWith(Phi); - Phi->addIncoming(IncomingFalse, Head); - Phi->addIncoming(IncomingTrue, I->getParent()); - } - } - - DEBUG(DT->verifyDomTree()); -} - -InnerLoopVectorizer::VectorParts -InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { - assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); - - // Look for cached value. - std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); - EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); - if (ECEntryIt != EdgeMaskCache.end()) - return ECEntryIt->second; - - VectorParts SrcMask = createBlockInMask(Src); - - // The terminator has to be a branch inst! - BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); - assert(BI && "Unexpected terminator found"); - - if (BI->isConditional()) { - - VectorParts EdgeMask(UF); - for (unsigned Part = 0; Part < UF; ++Part) { - auto *EdgeMaskPart = getOrCreateVectorValue(BI->getCondition(), Part); - if (BI->getSuccessor(0) != Dst) - EdgeMaskPart = Builder.CreateNot(EdgeMaskPart); - - EdgeMaskPart = Builder.CreateAnd(EdgeMaskPart, SrcMask[Part]); - EdgeMask[Part] = EdgeMaskPart; - } - - EdgeMaskCache[Edge] = EdgeMask; - return EdgeMask; - } - - EdgeMaskCache[Edge] = SrcMask; - return SrcMask; -} - -InnerLoopVectorizer::VectorParts -InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { - assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); - - // Look for cached value. - BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); - if (BCEntryIt != BlockMaskCache.end()) - return BCEntryIt->second; - - VectorParts BlockMask(UF); - - // Loop incoming mask is all-one. - if (OrigLoop->getHeader() == BB) { - Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1); - for (unsigned Part = 0; Part < UF; ++Part) - BlockMask[Part] = getOrCreateVectorValue(C, Part); - BlockMaskCache[BB] = BlockMask; - return BlockMask; - } - - // This is the block mask. We OR all incoming edges, and with zero. - Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0); - for (unsigned Part = 0; Part < UF; ++Part) - BlockMask[Part] = getOrCreateVectorValue(Zero, Part); - - // For each pred: - for (pred_iterator It = pred_begin(BB), E = pred_end(BB); It != E; ++It) { - VectorParts EM = createEdgeMask(*It, BB); - for (unsigned Part = 0; Part < UF; ++Part) - BlockMask[Part] = Builder.CreateOr(BlockMask[Part], EM[Part]); - } - - BlockMaskCache[BB] = BlockMask; - return BlockMask; -} - void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF) { + assert(PN->getParent() == OrigLoop->getHeader() && + "Non-header phis should have been handled elsewhere"); + PHINode *P = cast<PHINode>(PN); // In order to support recurrences we need to be able to vectorize Phi nodes. // Phi nodes have cycles, so we need to vectorize them in two stages. This is @@ -4594,43 +4627,6 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, } setDebugLocFromInst(Builder, P); - // Check for PHI nodes that are lowered to vector selects. 
- if (P->getParent() != OrigLoop->getHeader()) { - // We know that all PHIs in non-header blocks are converted into - // selects, so we don't have to worry about the insertion order and we - // can just use the builder. - // At this point we generate the predication tree. There may be - // duplications since this is a simple recursive scan, but future - // optimizations will clean it up. - - unsigned NumIncoming = P->getNumIncomingValues(); - - // Generate a sequence of selects of the form: - // SELECT(Mask3, In3, - // SELECT(Mask2, In2, - // ( ...))) - VectorParts Entry(UF); - for (unsigned In = 0; In < NumIncoming; In++) { - VectorParts Cond = - createEdgeMask(P->getIncomingBlock(In), P->getParent()); - - for (unsigned Part = 0; Part < UF; ++Part) { - Value *In0 = getOrCreateVectorValue(P->getIncomingValue(In), Part); - // We might have single edge PHIs (blocks) - use an identity - // 'select' for the first PHI operand. - if (In == 0) - Entry[Part] = Builder.CreateSelect(Cond[Part], In0, In0); - else - // Select between the current value and the previous incoming edge - // based on the incoming mask. - Entry[Part] = Builder.CreateSelect(Cond[Part], In0, Entry[Part], - "predphi"); - } - } - for (unsigned Part = 0; Part < UF; ++Part) - VectorLoopValueMap.setVectorValue(P, Part, Entry[Part]); - return; - } // This PHINode must be an induction variable. // Make sure that we know about it. @@ -4646,7 +4642,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, llvm_unreachable("Unknown induction"); case InductionDescriptor::IK_IntInduction: case InductionDescriptor::IK_FpInduction: - return widenIntOrFpInduction(P); + llvm_unreachable("Integer/fp induction is handled elsewhere."); case InductionDescriptor::IK_PtrInduction: { // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); @@ -4665,7 +4661,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL); SclrGep->setName("next.gep"); - VectorLoopValueMap.setScalarValue(P, Part, Lane, SclrGep); + VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); } } return; @@ -4691,25 +4687,11 @@ static bool mayDivideByZero(Instruction &I) { return !CInt || CInt->isZero(); } -void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { - // Scalarize instructions that should remain scalar after vectorization. - if (VF > 1 && - !(isa<BranchInst>(&I) || isa<PHINode>(&I) || isa<DbgInfoIntrinsic>(&I)) && - shouldScalarizeInstruction(&I)) { - scalarizeInstruction(&I, Legal->isScalarWithPredication(&I)); - return; - } - +void InnerLoopVectorizer::widenInstruction(Instruction &I) { switch (I.getOpcode()) { case Instruction::Br: - // Nothing to do for PHIs and BR, since we already took care of the - // loop control flow instructions. - break; - case Instruction::PHI: { - // Vectorize PHINodes. - widenPHIInstruction(&I, UF, VF); - break; - } // End of PHI. + case Instruction::PHI: + llvm_unreachable("This instruction is handled by a different recipe."); case Instruction::GetElementPtr: { // Construct a vector GEP by widening the operands of the scalar GEP as // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP @@ -4746,7 +4728,6 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { // values in the vector mapping with initVector, as we do for other // instructions. 
for (unsigned Part = 0; Part < UF; ++Part) { - // The pointer operand of the new GEP. If it's loop-invariant, we // won't broadcast it. auto *Ptr = @@ -4782,13 +4763,6 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { case Instruction::SDiv: case Instruction::SRem: case Instruction::URem: - // Scalarize with predication if this instruction may divide by zero and - // block execution is conditional, otherwise fallthrough. - if (Legal->isScalarWithPredication(&I)) { - scalarizeInstruction(&I, true); - break; - } - LLVM_FALLTHROUGH; case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: @@ -4836,7 +4810,7 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { // We have to take the 'vectorized' value and pick the first lane. // Instcombine will make this a no-op. - auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), 0, 0); + auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); for (unsigned Part = 0; Part < UF; ++Part) { Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); @@ -4862,8 +4836,10 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part); Value *C = nullptr; if (FCmp) { + // Propagate fast math flags. + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + Builder.setFastMathFlags(Cmp->getFastMathFlags()); C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); - cast<FCmpInst>(C)->copyFastMathFlags(Cmp); } else { C = Builder.CreateICmp(Cmp->getPredicate(), A, B); } @@ -4874,10 +4850,6 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { break; } - case Instruction::Store: - case Instruction::Load: - vectorizeMemoryInstruction(&I); - break; case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: @@ -4893,16 +4865,6 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { auto *CI = dyn_cast<CastInst>(&I); setDebugLocFromInst(Builder, CI); - // Optimize the special case where the source is a constant integer - // induction variable. Notice that we can only optimize the 'trunc' case - // because (a) FP conversions lose precision, (b) sext/zext may wrap, and - // (c) other casts depend on pointer size. - if (Cost->isOptimizableIVTruncate(CI, VF)) { - widenIntOrFpInduction(cast<PHINode>(CI->getOperand(0)), - cast<TruncInst>(CI)); - break; - } - /// Vectorize casts. Type *DestTy = (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); @@ -4933,11 +4895,7 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || - ID == Intrinsic::lifetime_start)) { - scalarizeInstruction(&I); - break; - } + // The flag shows whether we use Intrinsic or a usual Call for vectorized // version of the instruction. // Is it beneficial to perform intrinsic call compared to lib call? 
@@ -4945,10 +4903,8 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); bool UseVectorIntrinsic = ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; - if (!UseVectorIntrinsic && NeedToScalarize) { - scalarizeInstruction(&I); - break; - } + assert((UseVectorIntrinsic || !NeedToScalarize) && + "Instruction should be scalarized elsewhere."); for (unsigned Part = 0; Part < UF; ++Part) { SmallVector<Value *, 4> Args; @@ -4998,9 +4954,9 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { } default: - // All other instructions are unsupported. Scalarize them. - scalarizeInstruction(&I); - break; + // This instruction is not vectorized by simple widening. + DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); + llvm_unreachable("Unhandled instruction!"); } // end of switch. } @@ -5012,14 +4968,11 @@ void InnerLoopVectorizer::updateAnalysis() { assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && "Entry does not dominate exit."); - DT->addNewBlock(LI->getLoopFor(LoopVectorBody)->getHeader(), - LoopVectorPreHeader); DT->addNewBlock(LoopMiddleBlock, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); - DEBUG(DT->verifyDomTree()); } @@ -5094,12 +5047,15 @@ bool LoopVectorizationLegality::canVectorize() { // Store the result and return it at the end instead of exiting early, in case // allowExtraAnalysis is used to report multiple reasons for not vectorizing. bool Result = true; + + bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); + if (DoExtraAnalysis) // We must have a loop in canonical form. Loops with indirectbr in them cannot // be canonicalized. 
if (!TheLoop->getLoopPreheader()) { ORE->emit(createMissedAnalysis("CFGNotUnderstood") << "loop control flow is not understood by vectorizer"); - if (ORE->allowExtraAnalysis()) + if (DoExtraAnalysis) Result = false; else return false; @@ -5112,7 +5068,7 @@ bool LoopVectorizationLegality::canVectorize() { if (!TheLoop->empty()) { ORE->emit(createMissedAnalysis("NotInnermostLoop") << "loop is not the innermost loop"); - if (ORE->allowExtraAnalysis()) + if (DoExtraAnalysis) Result = false; else return false; @@ -5122,7 +5078,7 @@ bool LoopVectorizationLegality::canVectorize() { if (TheLoop->getNumBackEdges() != 1) { ORE->emit(createMissedAnalysis("CFGNotUnderstood") << "loop control flow is not understood by vectorizer"); - if (ORE->allowExtraAnalysis()) + if (DoExtraAnalysis) Result = false; else return false; @@ -5132,7 +5088,7 @@ bool LoopVectorizationLegality::canVectorize() { if (!TheLoop->getExitingBlock()) { ORE->emit(createMissedAnalysis("CFGNotUnderstood") << "loop control flow is not understood by vectorizer"); - if (ORE->allowExtraAnalysis()) + if (DoExtraAnalysis) Result = false; else return false; @@ -5144,7 +5100,7 @@ bool LoopVectorizationLegality::canVectorize() { if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { ORE->emit(createMissedAnalysis("CFGNotUnderstood") << "loop control flow is not understood by vectorizer"); - if (ORE->allowExtraAnalysis()) + if (DoExtraAnalysis) Result = false; else return false; @@ -5158,7 +5114,7 @@ bool LoopVectorizationLegality::canVectorize() { unsigned NumBlocks = TheLoop->getNumBlocks(); if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); - if (ORE->allowExtraAnalysis()) + if (DoExtraAnalysis) Result = false; else return false; @@ -5167,7 +5123,7 @@ bool LoopVectorizationLegality::canVectorize() { // Check if we can vectorize the instructions and CFG in this loop. if (!canVectorizeInstrs()) { DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); - if (ORE->allowExtraAnalysis()) + if (DoExtraAnalysis) Result = false; else return false; @@ -5176,7 +5132,7 @@ bool LoopVectorizationLegality::canVectorize() { // Go over each instruction and look at memory deps. if (!canVectorizeMemory()) { DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); - if (ORE->allowExtraAnalysis()) + if (DoExtraAnalysis) Result = false; else return false; @@ -5207,7 +5163,7 @@ bool LoopVectorizationLegality::canVectorize() { << "Too many SCEV assumptions need to be made and checked " << "at runtime"); DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n"); - if (ORE->allowExtraAnalysis()) + if (DoExtraAnalysis) Result = false; else return false; @@ -5263,6 +5219,15 @@ void LoopVectorizationLegality::addInductionPhi( PHINode *Phi, const InductionDescriptor &ID, SmallPtrSetImpl<Value *> &AllowedExit) { Inductions[Phi] = ID; + + // In case this induction also comes with casts that we know we can ignore + // in the vectorized loop body, record them here. All casts could be recorded + // here for ignoring, but suffices to record only the first (as it is the + // only one that may bw used outside the cast sequence). 
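The repeated "if (DoExtraAnalysis) Result = false; else return false;" edits above compute the allowExtraAnalysis flag once, so canVectorize can either report every failure reason in one pass or bail out at the first one. A compact sketch of that accumulate-or-bail pattern, with made-up checks standing in for the legality queries:

#include <cstdio>
#include <vector>

// Hypothetical checks standing in for the legality queries in canVectorize().
static bool hasPreheader() { return false; }
static bool isInnermost() { return true; }
static bool hasSingleBackedge() { return false; }

int main() {
  const bool DoExtraAnalysis = true; // computed once up front
  bool Result = true;
  std::vector<const char *> Remarks;

  if (!hasPreheader()) {
    Remarks.push_back("loop control flow is not understood by vectorizer");
    if (DoExtraAnalysis) Result = false; else return 1;
  }
  if (!isInnermost()) {
    Remarks.push_back("loop is not the innermost loop");
    if (DoExtraAnalysis) Result = false; else return 1;
  }
  if (!hasSingleBackedge()) {
    Remarks.push_back("loop control flow is not understood by vectorizer");
    if (DoExtraAnalysis) Result = false; else return 1;
  }

  for (const char *R : Remarks)
    std::printf("remark: %s\n", R);
  return Result ? 0 : 1;
}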
+ const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); + if (!Casts.empty()) + InductionCastsToIgnore.insert(*Casts.begin()); + Type *PhiTy = Phi->getType(); const DataLayout &DL = Phi->getModule()->getDataLayout(); @@ -5300,7 +5265,6 @@ void LoopVectorizationLegality::addInductionPhi( } DEBUG(dbgs() << "LV: Found an induction variable.\n"); - return; } bool LoopVectorizationLegality::canVectorizeInstrs() { @@ -5439,7 +5403,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // operations, shuffles, or casts, as they don't change precision or // semantics. } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) && - !I.hasUnsafeAlgebra()) { + !I.isFast()) { DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n"); Hints->setPotentiallyUnsafe(); } @@ -5451,7 +5415,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { << "value cannot be used outside the loop"); return false; } - } // next instr. } @@ -5474,7 +5437,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { - // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does // this check. Collecting Scalars for VF=1 does not make any sense. @@ -5517,7 +5479,6 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in // PossibleNonScalarPtrs. auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { - // We only care about bitcast and getelementptr instructions contained in // the loop. if (!isLoopVaryingBitCastOrGEP(Ptr)) @@ -5532,7 +5493,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // If the use of the pointer will be a scalar use, and all users of the // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, // place the pointer in PossibleNonScalarPtrs. - if (isScalarUse(MemAccess, Ptr) && all_of(I->users(), [&](User *U) { + if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { return isa<LoadInst>(U) || isa<StoreInst>(U); })) ScalarPtrs.insert(I); @@ -5604,7 +5565,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) continue; auto *Src = cast<Instruction>(Dst->getOperand(0)); - if (all_of(Src->users(), [&](User *U) -> bool { + if (llvm::all_of(Src->users(), [&](User *U) -> bool { auto *J = cast<Instruction>(U); return !TheLoop->contains(J) || Worklist.count(J) || ((isa<LoadInst>(J) || isa<StoreInst>(J)) && @@ -5631,7 +5592,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // Determine if all users of the induction variable are scalar after // vectorization. - auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool { + auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { auto *I = cast<Instruction>(U); return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); }); @@ -5640,10 +5601,11 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // Determine if all users of the induction variable update instruction are // scalar after vectorization. 
- auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool { - auto *I = cast<Instruction>(U); - return I == Ind || !TheLoop->contains(I) || Worklist.count(I); - }); + auto ScalarIndUpdate = + llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { + auto *I = cast<Instruction>(U); + return I == Ind || !TheLoop->contains(I) || Worklist.count(I); + }); if (!ScalarIndUpdate) continue; @@ -5703,7 +5665,6 @@ bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I, } void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { - // We should not collect Uniforms more than once per VF. Right now, // this function is called from collectUniformsAndScalars(), which // already does this check. Collecting Uniforms for VF=1 does not make any @@ -5754,6 +5715,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { "Widening decision should be ready at this moment"); return (WideningDecision == CM_Widen || + WideningDecision == CM_Widen_Reverse || WideningDecision == CM_Interleave); }; // Iterate over the instructions in the loop, and collect all @@ -5766,7 +5728,6 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // the getelementptr won't remain uniform. for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { - // If there's no pointer operand, there's nothing to do. auto *Ptr = dyn_cast_or_null<Instruction>(getPointerOperand(&I)); if (!Ptr) @@ -5774,9 +5735,10 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // True if all users of Ptr are memory accesses that have Ptr as their // pointer operand. - auto UsersAreMemAccesses = all_of(Ptr->users(), [&](User *U) -> bool { - return getPointerOperand(U) == Ptr; - }); + auto UsersAreMemAccesses = + llvm::all_of(Ptr->users(), [&](User *U) -> bool { + return getPointerOperand(U) == Ptr; + }); // Ensure the memory instruction will not be scalarized or used by // gather/scatter, making its pointer operand non-uniform. If the pointer @@ -5812,7 +5774,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { if (isOutOfScope(OV)) continue; auto *OI = cast<Instruction>(OV); - if (all_of(OI->users(), [&](User *U) -> bool { + if (llvm::all_of(OI->users(), [&](User *U) -> bool { auto *J = cast<Instruction>(U); return !TheLoop->contains(J) || Worklist.count(J) || (OI == getPointerOperand(J) && isUniformDecision(J, VF)); @@ -5841,7 +5803,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // Determine if all users of the induction variable are uniform after // vectorization. - auto UniformInd = all_of(Ind->users(), [&](User *U) -> bool { + auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { auto *I = cast<Instruction>(U); return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || isVectorizedMemAccessUse(I, Ind); @@ -5851,11 +5813,12 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // Determine if all users of the induction variable update instruction are // uniform after vectorization. 
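// Illustrative sketch of the property checked above (hypothetical code): i
// and i++ only feed the address computation of consecutive, widened accesses,
// so both remain uniform after vectorization and no vector induction is
// materialized. Storing i itself (e.g. a[i] = i) would break this, since each
// lane would then need its own value of i.
void addOne(int *a, const int *b, int n) {
  for (int i = 0; i < n; ++i)
    a[i] = b[i] + 1;
}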
- auto UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool { - auto *I = cast<Instruction>(U); - return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || - isVectorizedMemAccessUse(I, IndUpdate); - }); + auto UniformIndUpdate = + llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { + auto *I = cast<Instruction>(U); + return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || + isVectorizedMemAccessUse(I, IndUpdate); + }); if (!UniformIndUpdate) continue; @@ -5874,9 +5837,10 @@ bool LoopVectorizationLegality::canVectorizeMemory() { InterleaveInfo.setLAI(LAI); const OptimizationRemarkAnalysis *LAR = LAI->getReport(); if (LAR) { - OptimizationRemarkAnalysis VR(Hints->vectorizeAnalysisPassName(), - "loop not vectorized: ", *LAR); - ORE->emit(VR); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(), + "loop not vectorized: ", *LAR); + }); } if (!LAI->canVectorizeMemory()) return false; @@ -5894,7 +5858,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return true; } -bool LoopVectorizationLegality::isInductionVariable(const Value *V) { +bool LoopVectorizationLegality::isInductionPhi(const Value *V) { Value *In0 = const_cast<Value *>(V); PHINode *PN = dyn_cast_or_null<PHINode>(In0); if (!PN) @@ -5903,6 +5867,15 @@ bool LoopVectorizationLegality::isInductionVariable(const Value *V) { return Inductions.count(PN); } +bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) { + auto *Inst = dyn_cast<Instruction>(V); + return (Inst && InductionCastsToIgnore.count(Inst)); +} + +bool LoopVectorizationLegality::isInductionVariable(const Value *V) { + return isInductionPhi(V) || isCastedInductionVariable(V); +} + bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) { return FirstOrderRecurrences.count(Phi); } @@ -5972,7 +5945,6 @@ bool LoopVectorizationLegality::blockCanBePredicated( void InterleavedAccessInfo::collectConstStrideAccesses( MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo, const ValueToValueMap &Strides) { - auto &DL = TheLoop->getHeader()->getModule()->getDataLayout(); // Since it's desired that the load/store instructions be maintained in @@ -6126,7 +6098,6 @@ void InterleavedAccessInfo::analyzeInterleaving( // but not with (4). If we did, the dependent access (3) would be within // the boundaries of the (2, 4) group. if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI)) { - // If a dependence exists and A is already in a group, we know that A // must be a store since A precedes B and WAR dependences are allowed. // Thus, A would be sunk below B. We release A's group to prevent this @@ -6205,9 +6176,11 @@ void InterleavedAccessInfo::analyzeInterleaving( // Remove interleaved store groups with gaps. for (InterleaveGroup *Group : StoreGroups) - if (Group->getNumMembers() != Group->getFactor()) + if (Group->getNumMembers() != Group->getFactor()) { + DEBUG(dbgs() << "LV: Invalidate candidate interleaved store group due " + "to gaps.\n"); releaseGroup(Group); - + } // Remove interleaved groups with gaps (currently only loads) whose memory // accesses may wrap around. We have to revisit the getPtrStride analysis, // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does @@ -6222,9 +6195,7 @@ void InterleavedAccessInfo::analyzeInterleaving( // This means that we can forcefully peel the loop in order to only have to // check the first pointer for no-wrap. 
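// Illustrative sketch of a load group with a gap (hypothetical code): only
// member 0 of the factor-2 group is read, so the widened group load also
// covers the y slots and may touch memory the scalar loop never reads near
// the end of the buffer -- hence the wrap checks discussed above.
struct Pair { int x, y; };
int sumX(const Pair *P, int N) {
  int S = 0;
  for (int i = 0; i < N; ++i)
    S += P[i].x; // stride-2 access; P[i].y is the gap
  return S;
}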
When we'll change to use Assume=true // we'll only need at most one runtime check per interleaved group. - // for (InterleaveGroup *Group : LoadGroups) { - // Case 1: A full group. Can Skip the checks; For full groups, if the wide // load would wrap around the address space we would do a memory access at // nullptr even without the transformation. @@ -6260,6 +6231,8 @@ void InterleavedAccessInfo::analyzeInterleaving( // to look for a member at index factor - 1, since every group must have // a member at index zero. if (Group->isReverse()) { + DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " + "a reverse access with gaps.\n"); releaseGroup(Group); continue; } @@ -6277,8 +6250,21 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { return None; } + if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { + // TODO: It may by useful to do since it's still likely to be dynamically + // uniform if the target can skip. + DEBUG(dbgs() << "LV: Not inserting runtime ptr check for divergent target"); + + ORE->emit( + createMissedAnalysis("CantVersionLoopWithDivergentTarget") + << "runtime pointer checks needed. Not enabled for divergent target"); + + return None; + } + + unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize. - return computeFeasibleMaxVF(OptForSize); + return computeFeasibleMaxVF(OptForSize, TC); if (Legal->getRuntimePointerChecking()->Need) { ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") @@ -6291,7 +6277,6 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { } // If we optimize the program for size, avoid creating the tail loop. - unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); // If we don't know the precise trip count, don't try to vectorize. @@ -6303,7 +6288,7 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { return None; } - unsigned MaxVF = computeFeasibleMaxVF(OptForSize); + unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC); if (TC % MaxVF != 0) { // If the trip count that we found modulo the vectorization factor is not @@ -6324,46 +6309,52 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { return MaxVF; } -unsigned LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize) { +unsigned +LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, + unsigned ConstTripCount) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); unsigned WidestRegister = TTI.getRegisterBitWidth(true); - unsigned MaxSafeDepDist = -1U; - // Get the maximum safe dependence distance in bits computed by LAA. If the - // loop contains any interleaved accesses, we divide the dependence distance - // by the maximum interleave factor of all interleaved groups. Note that - // although the division ensures correctness, this is a fairly conservative - // computation because the maximum distance computed by LAA may not involve - // any of the interleaved accesses. - if (Legal->getMaxSafeDepDistBytes() != -1U) - MaxSafeDepDist = - Legal->getMaxSafeDepDistBytes() * 8 / Legal->getMaxInterleaveFactor(); + // Get the maximum safe dependence distance in bits computed by LAA. 
+ // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from + // the memory accesses that is most restrictive (involved in the smallest + // dependence distance). + unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); + + WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); - WidestRegister = - ((WidestRegister < MaxSafeDepDist) ? WidestRegister : MaxSafeDepDist); unsigned MaxVectorSize = WidestRegister / WidestType; DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / " << WidestType << " bits.\n"); - DEBUG(dbgs() << "LV: The Widest register is: " << WidestRegister + DEBUG(dbgs() << "LV: The Widest register safe to use is: " << WidestRegister << " bits.\n"); + assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements" + " into one vector!"); if (MaxVectorSize == 0) { DEBUG(dbgs() << "LV: The target has no vector registers.\n"); MaxVectorSize = 1; + return MaxVectorSize; + } else if (ConstTripCount && ConstTripCount < MaxVectorSize && + isPowerOf2_32(ConstTripCount)) { + // We need to clamp the VF to be the ConstTripCount. There is no point in + // choosing a higher viable VF as done in the loop below. + DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " + << ConstTripCount << "\n"); + MaxVectorSize = ConstTripCount; + return MaxVectorSize; } - assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements" - " into one vector!"); - unsigned MaxVF = MaxVectorSize; if (MaximizeBandwidth && !OptForSize) { - // Collect all viable vectorization factors. + // Collect all viable vectorization factors larger than the default MaxVF + // (i.e. MaxVectorSize). SmallVector<unsigned, 8> VFs; unsigned NewMaxVectorSize = WidestRegister / SmallestType; - for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2) + for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) VFs.push_back(VS); // For each VF calculate its register usage. @@ -6485,7 +6476,6 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, unsigned VF, unsigned LoopCost) { - // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. // There are many micro-architectural considerations that we can't predict @@ -6573,7 +6563,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // Interleave if we vectorized this loop and there is a reduction that could // benefit from interleaving. - if (VF > 1 && Legal->getReductionVars()->size()) { + if (VF > 1 && !Legal->getReductionVars()->empty()) { DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); return IC; } @@ -6604,7 +6594,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // by this point), we can increase the critical path length if the loop // we're interleaving is inside another loop. Limit, by default to 2, so the // critical path only gets increased by one reduction operation. 
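// A worked instance of the clamping in computeFeasibleMaxVF above, under
// assumed numbers: a 256-bit target register, i32 as the widest element type,
// a 128-bit MaxSafeRegisterWidth reported by LAA, and a constant trip count
// of 2.
unsigned WidestRegister = std::min(256u, 128u); // target width vs. safe width
unsigned MaxVectorSize = WidestRegister / 32;   // = 4 lanes of i32
unsigned MaxVF = 2; // clamped to the power-of-two trip count, since 2 < 4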
- if (Legal->getReductionVars()->size() && TheLoop->getLoopDepth() > 1) { + if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) { unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); SmallIC = std::min(SmallIC, F); StoresIC = std::min(StoresIC, F); @@ -6623,7 +6613,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // Interleave if this is a large loop (small loops are already dealt with by // this point) that could benefit from interleaving. - bool HasReductions = (Legal->getReductionVars()->size() > 0); + bool HasReductions = !Legal->getReductionVars()->empty(); if (TTI.enableAggressiveInterleaving(HasReductions)) { DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); return IC; @@ -6661,7 +6651,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { // Each 'key' in the map opens a new interval. The values // of the map are the index of the 'last seen' usage of the // instruction that is the key. - typedef DenseMap<Instruction *, unsigned> IntervalMap; + using IntervalMap = DenseMap<Instruction *, unsigned>; + // Maps instruction to its index. DenseMap<unsigned, Instruction *> IdxToInstr; // Marks the end of each interval. @@ -6700,7 +6691,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { } // Saves the list of intervals that end with the index in 'key'. - typedef SmallVector<Instruction *, 2> InstrList; + using InstrList = SmallVector<Instruction *, 2>; DenseMap<unsigned, InstrList> TransposeEnds; // Transpose the EndPoints to a list of values that end at each index. @@ -6795,7 +6786,6 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { } void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { - // If we aren't vectorizing the loop, or if we've already collected the // instructions to scalarize, there's nothing to do. Collection may already // have occurred if we have a user-selected VF and are now computing the @@ -6829,7 +6819,6 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { int LoopVectorizationCostModel::computePredInstDiscount( Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, unsigned VF) { - assert(!isUniformAfterVectorization(PredInst, VF) && "Instruction marked uniform-after-vectorization will be predicated"); @@ -6844,7 +6833,6 @@ int LoopVectorizationCostModel::computePredInstDiscount( // Returns true if the given instruction can be scalarized. auto canBeScalarized = [&](Instruction *I) -> bool { - // We only attempt to scalarize instructions forming a single-use chain // from the original predicated block that would otherwise be vectorized. // Although not strictly necessary, we give up on instructions we know will @@ -6947,13 +6935,6 @@ LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::expectedCost(unsigned VF) { VectorizationCostTy Cost; - // Collect Uniform and Scalar instructions after vectorization with VF. - collectUniformsAndScalars(VF); - - // Collect the instructions (and their associated costs) that will be more - // profitable to scalarize. - collectInstsToScalarize(VF); - // For each block. for (BasicBlock *BB : TheLoop->blocks()) { VectorizationCostTy BlockCost; @@ -6965,7 +6946,8 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { continue; // Skip ignored values. 
- if (ValuesToIgnore.count(&I)) + if (ValuesToIgnore.count(&I) || + (VF > 1 && VecValuesToIgnore.count(&I))) continue; VectorizationCostTy C = getInstructionCost(&I, VF); @@ -7004,14 +6986,16 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { static const SCEV *getAddressAccessSCEV( Value *Ptr, LoopVectorizationLegality *Legal, - ScalarEvolution *SE, + PredicatedScalarEvolution &PSE, const Loop *TheLoop) { + auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); if (!Gep) return nullptr; // We are looking for a gep with all loop invariant indices except for one // which should be an induction variable. + auto SE = PSE.getSE(); unsigned NumOperands = Gep->getNumOperands(); for (unsigned i = 1; i < NumOperands; ++i) { Value *Opd = Gep->getOperand(i); @@ -7021,7 +7005,7 @@ static const SCEV *getAddressAccessSCEV( } // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. - return SE->getSCEV(Ptr); + return PSE.getSCEV(Ptr); } static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { @@ -7041,7 +7025,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // Figure out whether the access is strided and get the stride value // if it's known in compile time - const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop); + const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); // Get the cost of the scalar memory instruction and address computation. unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); @@ -7145,7 +7129,6 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, unsigned VF) { - // Calculate scalar cost only. Vectorization cost should be ready at this // moment. if (VF == 1) { @@ -7202,12 +7185,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { // We assume that widening is the best solution when possible. if (Legal->memoryInstructionCanBeWidened(&I, VF)) { unsigned Cost = getConsecutiveMemOpCost(&I, VF); - setWideningDecision(&I, VF, CM_Widen, Cost); + int ConsecutiveStride = Legal->isConsecutivePtr(getPointerOperand(&I)); + assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && + "Expected consecutive stride."); + InstWidening Decision = + ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; + setWideningDecision(&I, VF, Decision, Cost); continue; } // Choose between Interleaving, Gather/Scatter or Scalarization. - unsigned InterleaveCost = UINT_MAX; + unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); unsigned NumAccesses = 1; if (Legal->isAccessInterleaved(&I)) { auto Group = Legal->getInterleavedAccessGroup(&I); @@ -7224,7 +7212,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { unsigned GatherScatterCost = Legal->isLegalGatherOrScatter(&I) ? 
getGatherScatterCost(&I, VF) * NumAccesses - : UINT_MAX; + : std::numeric_limits<unsigned>::max(); unsigned ScalarizationCost = getMemInstScalarizationCost(&I, VF) * NumAccesses; @@ -7282,7 +7270,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { for (auto &Op : I->operands()) if (auto *InstOp = dyn_cast<Instruction>(Op)) if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && - AddrDefs.insert(InstOp).second == true) + AddrDefs.insert(InstOp).second) Worklist.push_back(InstOp); } @@ -7292,7 +7280,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { // by cost functions, but since this involves the task of finding out // if the loaded register is involved in an address computation, it is // instead changed here when we know this is the case. - if (getWideningDecision(I, VF) == CM_Widen) + InstWidening Decision = getWideningDecision(I, VF); + if (Decision == CM_Widen || Decision == CM_Widen_Reverse) // Scalarize a widened load of address. setWideningDecision(I, VF, CM_Scalarize, (VF * getMemoryInstructionCost(I, 1))); @@ -7551,7 +7540,9 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } char LoopVectorize::ID = 0; + static const char lv_name[] = "Loop Vectorization"; + INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) @@ -7568,13 +7559,14 @@ INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { + Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) { return new LoopVectorize(NoUnrolling, AlwaysVectorize); } -} -bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { +} // end namespace llvm +bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { // Check if the pointer operand of a load or store instruction is // consecutive. if (auto *Ptr = getPointerOperand(Inst)) @@ -7593,11 +7585,17 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } + // Ignore type-casting instructions we identified during induction + // detection. + for (auto &Induction : *Legal->getInductionVars()) { + InductionDescriptor &IndDes = Induction.second; + const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); + VecValuesToIgnore.insert(Casts.begin(), Casts.end()); + } } LoopVectorizationCostModel::VectorizationFactor LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) { - // Width 1 means no vectorize, cost 0 means uncomputed cost. const LoopVectorizationCostModel::VectorizationFactor NoVectorization = {1U, 0U}; @@ -7611,11 +7609,26 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) { // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. CM.selectUserVectorizationFactor(UserVF); + buildVPlans(UserVF, UserVF); + DEBUG(printPlans(dbgs())); return {UserVF, 0}; } unsigned MaxVF = MaybeMaxVF.getValue(); assert(MaxVF != 0 && "MaxVF is zero."); + + for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { + // Collect Uniform and Scalar instructions after vectorization with VF. + CM.collectUniformsAndScalars(VF); + + // Collect the instructions (and their associated costs) that will be more + // profitable to scalarize. 
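// Illustrative sketch of the two consecutive-stride cases distinguished above
// (hypothetical code): 'in' is consecutive with stride -1 and gets
// CM_Widen_Reverse (a wide load plus a reverse shuffle), while 'out' has
// stride +1 and gets plain CM_Widen.
void reverseCopy(int *out, const int *in, int n) {
  for (int i = 0; i < n; ++i)
    out[i] = in[n - 1 - i];
}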
+ if (VF > 1) + CM.collectInstsToScalarize(VF); + } + + buildVPlans(1, MaxVF); + DEBUG(printPlans(dbgs())); if (MaxVF == 1) return NoVectorization; @@ -7623,11 +7636,28 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) { return CM.selectVectorizationFactor(MaxVF); } -void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) { +void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { + DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n'); + BestVF = VF; + BestUF = UF; + + erase_if(VPlans, [VF](const VPlanPtr &Plan) { + return !Plan->hasVF(VF); + }); + assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); +} + +void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, + DominatorTree *DT) { // Perform the actual loop transformation. // 1. Create a new empty loop. Unlink the old loop and connect the new one. - ILV.createVectorizedLoopSkeleton(); + VPCallbackILV CallbackILV(ILV); + + VPTransformState State{BestVF, BestUF, LI, + DT, ILV.Builder, ILV.VectorLoopValueMap, + &ILV, CallbackILV}; + State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); //===------------------------------------------------===// // @@ -7638,36 +7668,8 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) { //===------------------------------------------------===// // 2. Copy and widen instructions from the old loop into the new loop. - - // Move instructions to handle first-order recurrences. - DenseMap<Instruction *, Instruction *> SinkAfter = Legal->getSinkAfter(); - for (auto &Entry : SinkAfter) { - Entry.first->removeFromParent(); - Entry.first->insertAfter(Entry.second); - DEBUG(dbgs() << "Sinking" << *Entry.first << " after" << *Entry.second - << " to vectorize a 1st order recurrence.\n"); - } - - // Collect instructions from the original loop that will become trivially dead - // in the vectorized loop. We don't need to vectorize these instructions. For - // example, original induction update instructions can become dead because we - // separately emit induction "steps" when generating code for the new loop. - // Similarly, we create a new latch condition when setting up the structure - // of the new loop, so the old one can become dead. - SmallPtrSet<Instruction *, 4> DeadInstructions; - collectTriviallyDeadInstructions(DeadInstructions); - - // Scan the loop in a topological order to ensure that defs are vectorized - // before users. - LoopBlocksDFS DFS(OrigLoop); - DFS.perform(LI); - - // Vectorize all instructions in the original loop that will not become - // trivially dead when vectorized. - for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) - for (Instruction &I : *BB) - if (!DeadInstructions.count(&I)) - ILV.vectorizeInstruction(I); + assert(VPlans.size() == 1 && "Not a single VPlan to execute."); + VPlans.front()->execute(&State); // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. 
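// A minimal sketch of how the planner entry points fit together in the caller
// (mirroring processLoop later in this patch; local names are illustrative):
LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
auto VF = LVP.plan(OptForSize, UserVF); // build and keep all candidate VPlans
LVP.setBestPlan(VF.Width, IC);          // discard plans not covering VF.Width
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, &LVL,
                       &CM);
LVP.executePlan(LB, DT);                // VPlan::execute drives the ILV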
@@ -7691,18 +7693,23 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions( for (auto &Induction : *Legal->getInductionVars()) { PHINode *Ind = Induction.first; auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); - if (all_of(IndUpdate->users(), [&](User *U) -> bool { + if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { return U == Ind || DeadInstructions.count(cast<Instruction>(U)); })) DeadInstructions.insert(IndUpdate); - } -} - -void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { - auto *SI = dyn_cast<StoreInst>(Instr); - bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent())); - return scalarizeInstruction(Instr, IfPredicateInstr); + // We record as "Dead" also the type-casting instructions we had identified + // during induction analysis. We don't need any handling for them in the + // vectorized loop because we have proven that, under a proper runtime + // test guarding the vectorized loop, the value of the phi, and the casted + // value of the phi, are the same. The last instruction in this casting chain + // will get its scalar/vector/widened def from the scalar/vector/widened def + // of the respective phi node. Any other casts in the induction def-use chain + // have no other uses outside the phi update chain, and will be ignored. + InductionDescriptor &IndDes = Induction.second; + const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); + DeadInstructions.insert(Casts.begin(), Casts.end()); + } } Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } @@ -7760,6 +7767,722 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { } } +bool LoopVectorizationPlanner::getDecisionAndClampRange( + const std::function<bool(unsigned)> &Predicate, VFRange &Range) { + assert(Range.End > Range.Start && "Trying to test an empty VF range."); + bool PredicateAtRangeStart = Predicate(Range.Start); + + for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) + if (Predicate(TmpVF) != PredicateAtRangeStart) { + Range.End = TmpVF; + break; + } + + return PredicateAtRangeStart; +} + +/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, +/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range +/// of VF's starting at a given VF and extending it as much as possible. Each +/// vectorization decision can potentially shorten this sub-range during +/// buildVPlan(). +void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { + + // Collect conditions feeding internal conditional branches; they need to be + // represented in VPlan for it to model masking. + SmallPtrSet<Value *, 1> NeedDef; + + auto *Latch = OrigLoop->getLoopLatch(); + for (BasicBlock *BB : OrigLoop->blocks()) { + if (BB == Latch) + continue; + BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); + if (Branch && Branch->isConditional()) + NeedDef.insert(Branch->getCondition()); + } + + for (unsigned VF = MinVF; VF < MaxVF + 1;) { + VFRange SubRange = {VF, MaxVF + 1}; + VPlans.push_back(buildVPlan(SubRange, NeedDef)); + VF = SubRange.End; + } +} + +VPValue *LoopVectorizationPlanner::createEdgeMask(BasicBlock *Src, + BasicBlock *Dst, + VPlanPtr &Plan) { + assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); + + // Look for cached value. 
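// Illustrative sketch of how the mask helpers compose for a simple
// if-then-else inside the loop body (caching omitted; 'Br' is hypothetical):
VPValue *Cond = Plan->getVPValue(Br->getCondition()); // recorded via NeedDef
VPValue *ThenMask = Cond;                    // edge mask header -> then
VPValue *ElseMask = Builder.createNot(Cond); // edge mask header -> else
VPValue *JoinMask = Builder.createOr(ThenMask, ElseMask); // join's block-in mask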
+ std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); + EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); + if (ECEntryIt != EdgeMaskCache.end()) + return ECEntryIt->second; + + VPValue *SrcMask = createBlockInMask(Src, Plan); + + // The terminator has to be a branch inst! + BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); + assert(BI && "Unexpected terminator found"); + + if (!BI->isConditional()) + return EdgeMaskCache[Edge] = SrcMask; + + VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); + assert(EdgeMask && "No Edge Mask found for condition"); + + if (BI->getSuccessor(0) != Dst) + EdgeMask = Builder.createNot(EdgeMask); + + if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. + EdgeMask = Builder.createAnd(EdgeMask, SrcMask); + + return EdgeMaskCache[Edge] = EdgeMask; +} + +VPValue *LoopVectorizationPlanner::createBlockInMask(BasicBlock *BB, + VPlanPtr &Plan) { + assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); + + // Look for cached value. + BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); + if (BCEntryIt != BlockMaskCache.end()) + return BCEntryIt->second; + + // All-one mask is modelled as no-mask following the convention for masked + // load/store/gather/scatter. Initialize BlockMask to no-mask. + VPValue *BlockMask = nullptr; + + // Loop incoming mask is all-one. + if (OrigLoop->getHeader() == BB) + return BlockMaskCache[BB] = BlockMask; + + // This is the block mask. We OR all incoming edges. + for (auto *Predecessor : predecessors(BB)) { + VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); + if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. + return BlockMaskCache[BB] = EdgeMask; + + if (!BlockMask) { // BlockMask has its initialized nullptr value. + BlockMask = EdgeMask; + continue; + } + + BlockMask = Builder.createOr(BlockMask, EdgeMask); + } + + return BlockMaskCache[BB] = BlockMask; +} + +VPInterleaveRecipe * +LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I, + VFRange &Range) { + const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(I); + if (!IG) + return nullptr; + + // Now check if IG is relevant for VF's in the given range. + auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> { + return [=](unsigned VF) -> bool { + return (VF >= 2 && // Query is illegal for VF == 1 + CM.getWideningDecision(I, VF) == + LoopVectorizationCostModel::CM_Interleave); + }; + }; + if (!getDecisionAndClampRange(isIGMember(I), Range)) + return nullptr; + + // I is a member of an InterleaveGroup for VF's in the (possibly trimmed) + // range. If it's the primary member of the IG construct a VPInterleaveRecipe. + // Otherwise, it's an adjunct member of the IG, do not construct any Recipe. 
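// Illustrative sketch (hypothetical code): both loads of 'in' below are
// members of a single factor-2 interleave group, but only the member chosen
// as the insert position gets a VPInterleaveRecipe; the other member is an
// adjunct and produces no recipe of its own.
void sumPairs(int *out, const int *in, int n) {
  for (int i = 0; i < n; ++i)
    out[i] = in[2 * i] + in[2 * i + 1];
}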
+ assert(I == IG->getInsertPos() && + "Generating a recipe for an adjunct member of an interleave group"); + + return new VPInterleaveRecipe(IG); +} + +VPWidenMemoryInstructionRecipe * +LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range, + VPlanPtr &Plan) { + if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) + return nullptr; + + auto willWiden = [&](unsigned VF) -> bool { + if (VF == 1) + return false; + if (CM.isScalarAfterVectorization(I, VF) || + CM.isProfitableToScalarize(I, VF)) + return false; + LoopVectorizationCostModel::InstWidening Decision = + CM.getWideningDecision(I, VF); + assert(Decision != LoopVectorizationCostModel::CM_Unknown && + "CM decision should be taken at this point."); + assert(Decision != LoopVectorizationCostModel::CM_Interleave && + "Interleave memory opportunity should be caught earlier."); + return Decision != LoopVectorizationCostModel::CM_Scalarize; + }; + + if (!getDecisionAndClampRange(willWiden, Range)) + return nullptr; + + VPValue *Mask = nullptr; + if (Legal->isMaskRequired(I)) + Mask = createBlockInMask(I->getParent(), Plan); + + return new VPWidenMemoryInstructionRecipe(*I, Mask); +} + +VPWidenIntOrFpInductionRecipe * +LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I, + VFRange &Range) { + if (PHINode *Phi = dyn_cast<PHINode>(I)) { + // Check if this is an integer or fp induction. If so, build the recipe that + // produces its scalar and vector values. + InductionDescriptor II = Legal->getInductionVars()->lookup(Phi); + if (II.getKind() == InductionDescriptor::IK_IntInduction || + II.getKind() == InductionDescriptor::IK_FpInduction) + return new VPWidenIntOrFpInductionRecipe(Phi); + + return nullptr; + } + + // Optimize the special case where the source is a constant integer + // induction variable. Notice that we can only optimize the 'trunc' case + // because (a) FP conversions lose precision, (b) sext/zext may wrap, and + // (c) other casts depend on pointer size. + + // Determine whether \p K is a truncation based on an induction variable that + // can be optimized. + auto isOptimizableIVTruncate = + [&](Instruction *K) -> std::function<bool(unsigned)> { + return + [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; + }; + + if (isa<TruncInst>(I) && + getDecisionAndClampRange(isOptimizableIVTruncate(I), Range)) + return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), + cast<TruncInst>(I)); + return nullptr; +} + +VPBlendRecipe * +LoopVectorizationPlanner::tryToBlend(Instruction *I, VPlanPtr &Plan) { + PHINode *Phi = dyn_cast<PHINode>(I); + if (!Phi || Phi->getParent() == OrigLoop->getHeader()) + return nullptr; + + // We know that all PHIs in non-header blocks are converted into selects, so + // we don't have to worry about the insertion order and we can just use the + // builder. At this point we generate the predication tree. There may be + // duplications since this is a simple recursive scan, but future + // optimizations will clean it up. 
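// Illustrative sketch of a non-header phi that becomes a blend (hypothetical
// code): after if-conversion, t is a phi in the join block and is emitted as
// a per-part vector select driven by the incoming edge masks.
void blend(int *out, const int *a, const int *b, const int *c, int n) {
  for (int i = 0; i < n; ++i) {
    int t;
    if (c[i])
      t = a[i];
    else
      t = b[i];
    out[i] = t;
  }
}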
+ + SmallVector<VPValue *, 2> Masks; + unsigned NumIncoming = Phi->getNumIncomingValues(); + for (unsigned In = 0; In < NumIncoming; In++) { + VPValue *EdgeMask = + createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); + assert((EdgeMask || NumIncoming == 1) && + "Multiple predecessors with one having a full mask"); + if (EdgeMask) + Masks.push_back(EdgeMask); + } + return new VPBlendRecipe(Phi, Masks); +} + +bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB, + VFRange &Range) { + if (Legal->isScalarWithPredication(I)) + return false; + + auto IsVectorizableOpcode = [](unsigned Opcode) { + switch (Opcode) { + case Instruction::Add: + case Instruction::And: + case Instruction::AShr: + case Instruction::BitCast: + case Instruction::Br: + case Instruction::Call: + case Instruction::FAdd: + case Instruction::FCmp: + case Instruction::FDiv: + case Instruction::FMul: + case Instruction::FPExt: + case Instruction::FPToSI: + case Instruction::FPToUI: + case Instruction::FPTrunc: + case Instruction::FRem: + case Instruction::FSub: + case Instruction::GetElementPtr: + case Instruction::ICmp: + case Instruction::IntToPtr: + case Instruction::Load: + case Instruction::LShr: + case Instruction::Mul: + case Instruction::Or: + case Instruction::PHI: + case Instruction::PtrToInt: + case Instruction::SDiv: + case Instruction::Select: + case Instruction::SExt: + case Instruction::Shl: + case Instruction::SIToFP: + case Instruction::SRem: + case Instruction::Store: + case Instruction::Sub: + case Instruction::Trunc: + case Instruction::UDiv: + case Instruction::UIToFP: + case Instruction::URem: + case Instruction::Xor: + case Instruction::ZExt: + return true; + } + return false; + }; + + if (!IsVectorizableOpcode(I->getOpcode())) + return false; + + if (CallInst *CI = dyn_cast<CallInst>(I)) { + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || + ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) + return false; + } + + auto willWiden = [&](unsigned VF) -> bool { + if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) || + CM.isProfitableToScalarize(I, VF))) + return false; + if (CallInst *CI = dyn_cast<CallInst>(I)) { + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + // The following case may be scalarized depending on the VF. + // The flag shows whether we use Intrinsic or a usual Call for vectorized + // version of the instruction. + // Is it beneficial to perform intrinsic call compared to lib call? + bool NeedToScalarize; + unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); + bool UseVectorIntrinsic = + ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; + return UseVectorIntrinsic || !NeedToScalarize; + } + if (isa<LoadInst>(I) || isa<StoreInst>(I)) { + assert(CM.getWideningDecision(I, VF) == + LoopVectorizationCostModel::CM_Scalarize && + "Memory widening decisions should have been taken care by now"); + return false; + } + return true; + }; + + if (!getDecisionAndClampRange(willWiden, Range)) + return false; + + // Success: widen this instruction. We optimize the common case where + // consecutive instructions can be represented by a single recipe. 
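// Illustrative sketch (hypothetical code): the add and the multiply below are
// consecutive widenable instructions, so the second is appended to the
// VPWidenRecipe created for the first rather than getting a recipe of its own.
void scaleSum(int *out, const int *a, const int *b, int n) {
  for (int i = 0; i < n; ++i)
    out[i] = (a[i] + b[i]) * 3;
}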
+ if (!VPBB->empty()) { + VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back()); + if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I)) + return true; + } + + VPBB->appendRecipe(new VPWidenRecipe(I)); + return true; +} + +VPBasicBlock *LoopVectorizationPlanner::handleReplication( + Instruction *I, VFRange &Range, VPBasicBlock *VPBB, + DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, + VPlanPtr &Plan) { + bool IsUniform = getDecisionAndClampRange( + [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, + Range); + + bool IsPredicated = Legal->isScalarWithPredication(I); + auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); + + // Find if I uses a predicated instruction. If so, it will use its scalar + // value. Avoid hoisting the insert-element which packs the scalar value into + // a vector value, as that happens iff all users use the vector value. + for (auto &Op : I->operands()) + if (auto *PredInst = dyn_cast<Instruction>(Op)) + if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) + PredInst2Recipe[PredInst]->setAlsoPack(false); + + // Finalize the recipe for Instr, first if it is not predicated. + if (!IsPredicated) { + DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); + VPBB->appendRecipe(Recipe); + return VPBB; + } + DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); + assert(VPBB->getSuccessors().empty() && + "VPBB has successors when handling predicated replication."); + // Record predicated instructions for above packing optimizations. + PredInst2Recipe[I] = Recipe; + VPBlockBase *Region = + VPBB->setOneSuccessor(createReplicateRegion(I, Recipe, Plan)); + return cast<VPBasicBlock>(Region->setOneSuccessor(new VPBasicBlock())); +} + +VPRegionBlock * +LoopVectorizationPlanner::createReplicateRegion(Instruction *Instr, + VPRecipeBase *PredRecipe, + VPlanPtr &Plan) { + // Instructions marked for predication are replicated and placed under an + // if-then construct to prevent side-effects. + + // Generate recipes to compute the block mask for this region. + VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); + + // Build the triangular if-then region. + std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); + assert(Instr->getParent() && "Predicated instruction not in any basic block"); + auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); + auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); + auto *PHIRecipe = + Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); + auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); + auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); + VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); + + // Note: first set Entry as region entry and then connect successors starting + // from it in order, to propagate the "parent" of each VPBasicBlock. + Entry->setTwoSuccessors(Pred, Exit); + Pred->setOneSuccessor(Exit); + + return Region; +} + +LoopVectorizationPlanner::VPlanPtr +LoopVectorizationPlanner::buildVPlan(VFRange &Range, + const SmallPtrSetImpl<Value *> &NeedDef) { + EdgeMaskCache.clear(); + BlockMaskCache.clear(); + DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); + DenseMap<Instruction *, Instruction *> SinkAfterInverse; + + // Collect instructions from the original loop that will become trivially dead + // in the vectorized loop. We don't need to vectorize these instructions. 
For + // example, original induction update instructions can become dead because we + // separately emit induction "steps" when generating code for the new loop. + // Similarly, we create a new latch condition when setting up the structure + // of the new loop, so the old one can become dead. + SmallPtrSet<Instruction *, 4> DeadInstructions; + collectTriviallyDeadInstructions(DeadInstructions); + + // Hold a mapping from predicated instructions to their recipes, in order to + // fix their AlsoPack behavior if a user is determined to replicate and use a + // scalar instead of vector value. + DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; + + // Create a dummy pre-entry VPBasicBlock to start building the VPlan. + VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); + auto Plan = llvm::make_unique<VPlan>(VPBB); + + // Represent values that will have defs inside VPlan. + for (Value *V : NeedDef) + Plan->addVPValue(V); + + // Scan the body of the loop in a topological order to visit each basic block + // after having visited its predecessor basic blocks. + LoopBlocksDFS DFS(OrigLoop); + DFS.perform(LI); + + for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { + // Relevant instructions from basic block BB will be grouped into VPRecipe + // ingredients and fill a new VPBasicBlock. + unsigned VPBBsForBB = 0; + auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); + VPBB->setOneSuccessor(FirstVPBBForBB); + VPBB = FirstVPBBForBB; + Builder.setInsertPoint(VPBB); + + std::vector<Instruction *> Ingredients; + + // Organize the ingredients to vectorize from current basic block in the + // right order. + for (Instruction &I : *BB) { + Instruction *Instr = &I; + + // First filter out irrelevant instructions, to ensure no recipes are + // built for them. + if (isa<BranchInst>(Instr) || isa<DbgInfoIntrinsic>(Instr) || + DeadInstructions.count(Instr)) + continue; + + // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct + // member of the IG, do not construct any Recipe for it. + const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(Instr); + if (IG && Instr != IG->getInsertPos() && + Range.Start >= 2 && // Query is illegal for VF == 1 + CM.getWideningDecision(Instr, Range.Start) == + LoopVectorizationCostModel::CM_Interleave) { + if (SinkAfterInverse.count(Instr)) + Ingredients.push_back(SinkAfterInverse.find(Instr)->second); + continue; + } + + // Move instructions to handle first-order recurrences, step 1: avoid + // handling this instruction until after we've handled the instruction it + // should follow. + auto SAIt = SinkAfter.find(Instr); + if (SAIt != SinkAfter.end()) { + DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" << *SAIt->second + << " to vectorize a 1st order recurrence.\n"); + SinkAfterInverse[SAIt->second] = Instr; + continue; + } + + Ingredients.push_back(Instr); + + // Move instructions to handle first-order recurrences, step 2: push the + // instruction to be sunk at its insertion point. + auto SAInvIt = SinkAfterInverse.find(Instr); + if (SAInvIt != SinkAfterInverse.end()) + Ingredients.push_back(SAInvIt->second); + } + + // Introduce each ingredient into VPlan. + for (Instruction *Instr : Ingredients) { + VPRecipeBase *Recipe = nullptr; + + // Check if Instr should belong to an interleave memory recipe, or already + // does. In the latter case Instr is irrelevant. 
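// Illustrative sketch of a first-order recurrence, the pattern the SinkAfter
// bookkeeping above is for (hypothetical code): 'prev' carries a value from
// the previous iteration, and the ingredient using it may have to be sunk so
// it is emitted after the instruction it must follow.
void diffPrev(int *out, const int *a, int n) {
  int prev = 0;
  for (int i = 0; i < n; ++i) {
    out[i] = a[i] - prev;
    prev = a[i];
  }
}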
+ if ((Recipe = tryToInterleaveMemory(Instr, Range))) { + VPBB->appendRecipe(Recipe); + continue; + } + + // Check if Instr is a memory operation that should be widened. + if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) { + VPBB->appendRecipe(Recipe); + continue; + } + + // Check if Instr should form some PHI recipe. + if ((Recipe = tryToOptimizeInduction(Instr, Range))) { + VPBB->appendRecipe(Recipe); + continue; + } + if ((Recipe = tryToBlend(Instr, Plan))) { + VPBB->appendRecipe(Recipe); + continue; + } + if (PHINode *Phi = dyn_cast<PHINode>(Instr)) { + VPBB->appendRecipe(new VPWidenPHIRecipe(Phi)); + continue; + } + + // Check if Instr is to be widened by a general VPWidenRecipe, after + // having first checked for specific widening recipes that deal with + // Interleave Groups, Inductions and Phi nodes. + if (tryToWiden(Instr, VPBB, Range)) + continue; + + // Otherwise, if all widening options failed, Instruction is to be + // replicated. This may create a successor for VPBB. + VPBasicBlock *NextVPBB = + handleReplication(Instr, Range, VPBB, PredInst2Recipe, Plan); + if (NextVPBB != VPBB) { + VPBB = NextVPBB; + VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) + : ""); + } + } + } + + // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks + // may also be empty, such as the last one VPBB, reflecting original + // basic-blocks with no recipes. + VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); + assert(PreEntry->empty() && "Expecting empty pre-entry block."); + VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); + PreEntry->disconnectSuccessor(Entry); + delete PreEntry; + + std::string PlanName; + raw_string_ostream RSO(PlanName); + unsigned VF = Range.Start; + Plan->addVF(VF); + RSO << "Initial VPlan for VF={" << VF; + for (VF *= 2; VF < Range.End; VF *= 2) { + Plan->addVF(VF); + RSO << "," << VF; + } + RSO << "},UF>=1"; + RSO.flush(); + Plan->setName(PlanName); + + return Plan; +} + +void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { + O << " +\n" + << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; + IG->getInsertPos()->printAsOperand(O, false); + O << "\\l\""; + for (unsigned i = 0; i < IG->getFactor(); ++i) + if (Instruction *I = IG->getMember(i)) + O << " +\n" + << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\""; +} + +void VPWidenRecipe::execute(VPTransformState &State) { + for (auto &Instr : make_range(Begin, End)) + State.ILV->widenInstruction(Instr); +} + +void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { + assert(!State.Instance && "Int or FP induction being replicated."); + State.ILV->widenIntOrFpInduction(IV, Trunc); +} + +void VPWidenPHIRecipe::execute(VPTransformState &State) { + State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); +} + +void VPBlendRecipe::execute(VPTransformState &State) { + State.ILV->setDebugLocFromInst(State.Builder, Phi); + // We know that all PHIs in non-header blocks are converted into + // selects, so we don't have to worry about the insertion order and we + // can just use the builder. + // At this point we generate the predication tree. There may be + // duplications since this is a simple recursive scan, but future + // optimizations will clean it up. 
+ + unsigned NumIncoming = Phi->getNumIncomingValues(); + + assert((User || NumIncoming == 1) && + "Multiple predecessors with predecessors having a full mask"); + // Generate a sequence of selects of the form: + // SELECT(Mask3, In3, + // SELECT(Mask2, In2, + // ( ...))) + InnerLoopVectorizer::VectorParts Entry(State.UF); + for (unsigned In = 0; In < NumIncoming; ++In) { + for (unsigned Part = 0; Part < State.UF; ++Part) { + // We might have single edge PHIs (blocks) - use an identity + // 'select' for the first PHI operand. + Value *In0 = + State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part); + if (In == 0) + Entry[Part] = In0; // Initialize with the first incoming value. + else { + // Select between the current value and the previous incoming edge + // based on the incoming mask. + Value *Cond = State.get(User->getOperand(In), Part); + Entry[Part] = + State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); + } + } + } + for (unsigned Part = 0; Part < State.UF; ++Part) + State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); +} + +void VPInterleaveRecipe::execute(VPTransformState &State) { + assert(!State.Instance && "Interleave group being replicated."); + State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); +} + +void VPReplicateRecipe::execute(VPTransformState &State) { + if (State.Instance) { // Generate a single instance. + State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated); + // Insert scalar instance packing it into a vector. + if (AlsoPack && State.VF > 1) { + // If we're constructing lane 0, initialize to start from undef. + if (State.Instance->Lane == 0) { + Value *Undef = + UndefValue::get(VectorType::get(Ingredient->getType(), State.VF)); + State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); + } + State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance); + } + return; + } + + // Generate scalar instances for all VF lanes of all UF parts, unless the + // instruction is uniform inwhich case generate only the first lane for each + // of the UF parts. + unsigned EndLane = IsUniform ? 1 : State.VF; + for (unsigned Part = 0; Part < State.UF; ++Part) + for (unsigned Lane = 0; Lane < EndLane; ++Lane) + State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated); +} + +void VPBranchOnMaskRecipe::execute(VPTransformState &State) { + assert(State.Instance && "Branch on Mask works only on single instance."); + + unsigned Part = State.Instance->Part; + unsigned Lane = State.Instance->Lane; + + Value *ConditionBit = nullptr; + if (!User) // Block in mask is all-one. + ConditionBit = State.Builder.getTrue(); + else { + VPValue *BlockInMask = User->getOperand(0); + ConditionBit = State.get(BlockInMask, Part); + if (ConditionBit->getType()->isVectorTy()) + ConditionBit = State.Builder.CreateExtractElement( + ConditionBit, State.Builder.getInt32(Lane)); + } + + // Replace the temporary unreachable terminator with a new conditional branch, + // whose two destinations will be set later when they are created. 
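// Illustrative sketch (hypothetical code, assuming a target without masked
// stores): the conditional store below is scalarized and predicated, so each
// lane is emitted under the pred.store.if / pred.store.continue diamond that
// the branch-on-mask and predicated-phi recipes above stitch together.
void maskedCopy(int *out, const int *a, const int *c, int n) {
  for (int i = 0; i < n; ++i)
    if (c[i])
      out[i] = a[i];
}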
+ auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); + assert(isa<UnreachableInst>(CurrentTerminator) && + "Expected to replace unreachable terminator with conditional branch."); + auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); + CondBr->setSuccessor(0, nullptr); + ReplaceInstWithInst(CurrentTerminator, CondBr); +} + +void VPPredInstPHIRecipe::execute(VPTransformState &State) { + assert(State.Instance && "Predicated instruction PHI works per instance."); + Instruction *ScalarPredInst = cast<Instruction>( + State.ValueMap.getScalarValue(PredInst, *State.Instance)); + BasicBlock *PredicatedBB = ScalarPredInst->getParent(); + BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); + assert(PredicatingBB && "Predicated block has no single predecessor."); + + // By current pack/unpack logic we need to generate only a single phi node: if + // a vector value for the predicated instruction exists at this point it means + // the instruction has vector users only, and a phi for the vector value is + // needed. In this case the recipe of the predicated instruction is marked to + // also do that packing, thereby "hoisting" the insert-element sequence. + // Otherwise, a phi node for the scalar value is needed. + unsigned Part = State.Instance->Part; + if (State.ValueMap.hasVectorValue(PredInst, Part)) { + Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); + InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); + PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); + VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. + VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. + State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. + } else { + Type *PredInstType = PredInst->getType(); + PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); + Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); + Phi->addIncoming(ScalarPredInst, PredicatedBB); + State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); + } +} + +void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { + if (!User) + return State.ILV->vectorizeMemoryInstruction(&Instr); + + // Last (and currently only) operand is a mask. + InnerLoopVectorizer::VectorParts MaskValues(State.UF); + VPValue *Mask = User->getOperand(User->getNumOperands() - 1); + for (unsigned Part = 0; Part < State.UF; ++Part) + MaskValues[Part] = State.get(Mask, Part); + State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); +} + bool LoopVectorizePass::processLoop(Loop *L) { assert(L->empty() && "Only process inner loops."); @@ -7878,7 +8601,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { CM.collectValuesToIgnore(); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, &LVL, CM); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM); // Get user vectorization factor. unsigned UserVF = Hints.getWidth(); @@ -7941,48 +8664,61 @@ bool LoopVectorizePass::processLoop(Loop *L) { const char *VAPassName = Hints.vectorizeAnalysisPassName(); if (!VectorizeLoop && !InterleaveLoop) { // Do not vectorize or interleaving the loop. 
- ORE->emit(OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, - L->getStartLoc(), L->getHeader()) - << VecDiagMsg.second); - ORE->emit(OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, - L->getStartLoc(), L->getHeader()) - << IntDiagMsg.second); + ORE->emit([&]() { + return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, + L->getStartLoc(), L->getHeader()) + << VecDiagMsg.second; + }); + ORE->emit([&]() { + return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, + L->getStartLoc(), L->getHeader()) + << IntDiagMsg.second; + }); return false; } else if (!VectorizeLoop && InterleaveLoop) { DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); - ORE->emit(OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, - L->getStartLoc(), L->getHeader()) - << VecDiagMsg.second); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, + L->getStartLoc(), L->getHeader()) + << VecDiagMsg.second; + }); } else if (VectorizeLoop && !InterleaveLoop) { DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " << DebugLocStr << '\n'); - ORE->emit(OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, - L->getStartLoc(), L->getHeader()) - << IntDiagMsg.second); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, + L->getStartLoc(), L->getHeader()) + << IntDiagMsg.second; + }); } else if (VectorizeLoop && InterleaveLoop) { DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " << DebugLocStr << '\n'); DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); } + LVP.setBestPlan(VF.Width, IC); + using namespace ore; + if (!VectorizeLoop) { assert(IC > 1 && "interleave count should not be 1 or 0"); // If we decided that it is not legal to vectorize the loop, then // interleave it. InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM); - LVP.executePlan(Unroller); + LVP.executePlan(Unroller, DT); - ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), - L->getHeader()) - << "interleaved loop (interleaved count: " - << NV("InterleaveCount", IC) << ")"); + ORE->emit([&]() { + return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), + L->getHeader()) + << "interleaved loop (interleaved count: " + << NV("InterleaveCount", IC) << ")"; + }); } else { // If we decided that it is *legal* to vectorize the loop, then do it. InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, &LVL, &CM); - LVP.executePlan(LB); + LVP.executePlan(LB, DT); ++LoopsVectorized; // Add metadata to disable runtime unrolling a scalar loop when there are @@ -7992,11 +8728,13 @@ bool LoopVectorizePass::processLoop(Loop *L) { AddRuntimeUnrollDisableMetaData(L); // Report the vectorization decision. - ORE->emit(OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), - L->getHeader()) - << "vectorized loop (vectorization width: " - << NV("VectorizationFactor", VF.Width) - << ", interleaved count: " << NV("InterleaveCount", IC) << ")"); + ORE->emit([&]() { + return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), + L->getHeader()) + << "vectorized loop (vectorization width: " + << NV("VectorizationFactor", VF.Width) + << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; + }); } // Mark the loop as already vectorized to avoid vectorizing again. 
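The hunk above switches every remark to the lambda form of ORE->emit, so the remark object and its message string are only constructed when remark emission is actually enabled. A small standalone sketch of that deferred-construction idiom follows; the RemarkEmitter type below is illustrative and is not the OptimizationRemarkEmitter API.

    #include <iostream>
    #include <string>

    // The emitter only invokes the callable, and therefore only pays for
    // building the message, when remarks are enabled.
    struct RemarkEmitter {
      bool Enabled;
      template <typename BuilderT> void emit(BuilderT Build) {
        if (Enabled)
          std::cout << Build() << '\n'; // Message constructed lazily.
      }
    };

    int main() {
      RemarkEmitter ORE{true};
      int InterleaveCount = 4;
      ORE.emit([&] {
        return std::string("interleaved loop (interleave count: ") +
               std::to_string(InterleaveCount) + ")";
      });
    }
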
@@ -8012,7 +8750,6 @@ bool LoopVectorizePass::runImpl( DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, OptimizationRemarkEmitter &ORE_) { - SE = &SE_; LI = &LI_; TTI = &TTI_; @@ -8068,10 +8805,8 @@ bool LoopVectorizePass::runImpl( // Process each loop nest in the function. return Changed; - } - PreservedAnalyses LoopVectorizePass::run(Function &F, FunctionAnalysisManager &AM) { auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); @@ -8088,7 +8823,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); std::function<const LoopAccessInfo &(Loop &)> GetLAA = [&](Loop &L) -> const LoopAccessInfo & { - LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI}; + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, nullptr}; return LAM.getResult<LoopAccessAnalysis>(L, AR); }; bool Changed = diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index dcbcab459a6b..76ba62f5d596 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6,6 +6,7 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// +// // This pass implements the Bottom Up SLP vectorizer. It detects consecutive // stores that can be put together into vector-stores. Next, it attempts to // construct vectorizable tree using the use-def chains. If a profitable tree @@ -15,39 +16,89 @@ // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks. // //===----------------------------------------------------------------------===// + #include "llvm/Transforms/Vectorize/SLPVectorizer.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/iterator.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/NoFolder.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/PassManager.h" +#include 
"llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/DOTGraphTraits.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/KnownBits.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Vectorize.h" #include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> #include <memory> +#include <set> +#include <string> +#include <tuple> +#include <utility> +#include <vector> using namespace llvm; +using namespace llvm::PatternMatch; using namespace slpvectorizer; #define SV_NAME "slp-vectorizer" @@ -156,6 +207,119 @@ static bool isSplat(ArrayRef<Value *> VL) { return true; } +/// Checks if the vector of instructions can be represented as a shuffle, like: +/// %x0 = extractelement <4 x i8> %x, i32 0 +/// %x3 = extractelement <4 x i8> %x, i32 3 +/// %y1 = extractelement <4 x i8> %y, i32 1 +/// %y2 = extractelement <4 x i8> %y, i32 2 +/// %x0x0 = mul i8 %x0, %x0 +/// %x3x3 = mul i8 %x3, %x3 +/// %y1y1 = mul i8 %y1, %y1 +/// %y2y2 = mul i8 %y2, %y2 +/// %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0 +/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1 +/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2 +/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3 +/// ret <4 x i8> %ins4 +/// can be transformed into: +/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5, +/// i32 6> +/// %2 = mul <4 x i8> %1, %1 +/// ret <4 x i8> %2 +/// We convert this initially to something like: +/// %x0 = extractelement <4 x i8> %x, i32 0 +/// %x3 = extractelement <4 x i8> %x, i32 3 +/// %y1 = extractelement <4 x i8> %y, i32 1 +/// %y2 = extractelement <4 x i8> %y, i32 2 +/// %1 = insertelement <4 x i8> undef, i8 %x0, i32 0 +/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1 +/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2 +/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3 +/// %5 = mul <4 x i8> %4, %4 +/// %6 = extractelement <4 x i8> %5, i32 0 +/// %ins1 = insertelement <4 x i8> undef, i8 %6, i32 0 +/// %7 = extractelement <4 x i8> %5, i32 1 +/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1 +/// %8 = extractelement <4 x i8> %5, i32 2 +/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2 +/// %9 = extractelement <4 x i8> %5, i32 3 +/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3 +/// ret <4 x i8> %ins4 +/// InstCombiner transforms this into a shuffle and vector mul +static Optional<TargetTransformInfo::ShuffleKind> +isShuffle(ArrayRef<Value *> VL) { + auto *EI0 = cast<ExtractElementInst>(VL[0]); + unsigned Size = EI0->getVectorOperandType()->getVectorNumElements(); + Value *Vec1 = nullptr; + Value *Vec2 = nullptr; + enum ShuffleMode {Unknown, FirstAlternate, SecondAlternate, Permute}; + ShuffleMode CommonShuffleMode = Unknown; + for (unsigned I = 0, E = VL.size(); I < E; ++I) { + auto *EI = cast<ExtractElementInst>(VL[I]); + auto *Vec = EI->getVectorOperand(); + // All vector operands must have the same number of vector elements. 
+ if (Vec->getType()->getVectorNumElements() != Size) + return None; + auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand()); + if (!Idx) + return None; + // Undefined behavior if Idx is negative or >= Size. + if (Idx->getValue().uge(Size)) + continue; + unsigned IntIdx = Idx->getValue().getZExtValue(); + // We can extractelement from undef vector. + if (isa<UndefValue>(Vec)) + continue; + // For correct shuffling we have to have at most 2 different vector operands + // in all extractelement instructions. + if (Vec1 && Vec2 && Vec != Vec1 && Vec != Vec2) + return None; + if (CommonShuffleMode == Permute) + continue; + // If the extract index is not the same as the operation number, it is a + // permutation. + if (IntIdx != I) { + CommonShuffleMode = Permute; + continue; + } + // Check the shuffle mode for the current operation. + if (!Vec1) + Vec1 = Vec; + else if (Vec != Vec1) + Vec2 = Vec; + // Example: shufflevector A, B, <0,5,2,7> + // I is odd and IntIdx for A == I - FirstAlternate shuffle. + // I is even and IntIdx for B == I - FirstAlternate shuffle. + // Example: shufflevector A, B, <4,1,6,3> + // I is even and IntIdx for A == I - SecondAlternate shuffle. + // I is odd and IntIdx for B == I - SecondAlternate shuffle. + const bool IIsEven = I & 1; + const bool CurrVecIsA = Vec == Vec1; + const bool IIsOdd = !IIsEven; + const bool CurrVecIsB = !CurrVecIsA; + ShuffleMode CurrentShuffleMode = + ((IIsOdd && CurrVecIsA) || (IIsEven && CurrVecIsB)) ? FirstAlternate + : SecondAlternate; + // Common mode is not set or the same as the shuffle mode of the current + // operation - alternate. + if (CommonShuffleMode == Unknown) + CommonShuffleMode = CurrentShuffleMode; + // Common shuffle mode is not the same as the shuffle mode of the current + // operation - permutation. + if (CommonShuffleMode != CurrentShuffleMode) + CommonShuffleMode = Permute; + } + // If we're not crossing lanes in different vectors, consider it as blending. + if ((CommonShuffleMode == FirstAlternate || + CommonShuffleMode == SecondAlternate) && + Vec2) + return TargetTransformInfo::SK_Alternate; + // If Vec2 was never used, we have a permutation of a single vector, otherwise + // we have permutation of 2 vectors. + return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc + : TargetTransformInfo::SK_PermuteSingleSrc; +} + ///\returns Opcode that can be clubbed with \p Op to create an alternate /// sequence which can later be merged as a ShuffleVector instruction. static unsigned getAltOpcode(unsigned Op) { @@ -173,50 +337,107 @@ static unsigned getAltOpcode(unsigned Op) { } } -/// true if the \p Value is odd, false otherwise. static bool isOdd(unsigned Value) { return Value & 1; } -///\returns bool representing if Opcode \p Op can be part -/// of an alternate sequence which can later be merged as -/// a ShuffleVector instruction. -static bool canCombineAsAltInst(unsigned Op) { - return Op == Instruction::FAdd || Op == Instruction::FSub || - Op == Instruction::Sub || Op == Instruction::Add; +static bool sameOpcodeOrAlt(unsigned Opcode, unsigned AltOpcode, + unsigned CheckedOpcode) { + return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode; } -/// \returns ShuffleVector instruction if instructions in \p VL have -/// alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence. -/// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...) 
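isShuffle above classifies a bundle of extractelement instructions: if every element at position I extracts lane I from one of at most two source vectors, the bundle is a lane-preserving blend (SK_Alternate); otherwise it is a permutation of one or two sources. Below is a simplified standalone sketch of that classification over plain (source, index) pairs; it ignores the undef and out-of-range index cases the real code handles.

    #include <iostream>
    #include <utility>
    #include <vector>

    enum class Kind { Alternate, PermuteSingleSrc, PermuteTwoSrc };

    // Each element is (source vector id, extract index).
    static Kind classify(const std::vector<std::pair<int, unsigned>> &Extracts) {
      bool SawSecondSource = false;
      bool Permuted = false;
      int FirstSource = Extracts.front().first;
      for (unsigned I = 0; I < Extracts.size(); ++I) {
        if (Extracts[I].first != FirstSource)
          SawSecondSource = true;
        if (Extracts[I].second != I) // Lane moved: not a pure blend.
          Permuted = true;
      }
      if (!Permuted && SawSecondSource)
        return Kind::Alternate;
      return SawSecondSource ? Kind::PermuteTwoSrc : Kind::PermuteSingleSrc;
    }

    int main() {
      // Equivalent of shufflevector A, B, <0, 5, 2, 7>: lane I comes from A or
      // B but always from index I of its source.
      std::vector<std::pair<int, unsigned>> Blend = {{0, 0}, {1, 1}, {0, 2}, {1, 3}};
      std::cout << (classify(Blend) == Kind::Alternate) << '\n'; // 1
    }
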
-static unsigned isAltInst(ArrayRef<Value *> VL) { - Instruction *I0 = dyn_cast<Instruction>(VL[0]); - unsigned Opcode = I0->getOpcode(); - unsigned AltOpcode = getAltOpcode(Opcode); - for (int i = 1, e = VL.size(); i < e; i++) { - Instruction *I = dyn_cast<Instruction>(VL[i]); - if (!I || I->getOpcode() != (isOdd(i) ? AltOpcode : Opcode)) - return 0; - } - return Instruction::ShuffleVector; +/// Chooses the correct key for scheduling data. If \p Op has the same (or +/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p +/// OpValue. +static Value *isOneOf(Value *OpValue, Value *Op) { + auto *I = dyn_cast<Instruction>(Op); + if (!I) + return OpValue; + auto *OpInst = cast<Instruction>(OpValue); + unsigned OpInstOpcode = OpInst->getOpcode(); + unsigned IOpcode = I->getOpcode(); + if (sameOpcodeOrAlt(OpInstOpcode, getAltOpcode(OpInstOpcode), IOpcode)) + return Op; + return OpValue; } -/// \returns The opcode if all of the Instructions in \p VL have the same -/// opcode, or zero. -static unsigned getSameOpcode(ArrayRef<Value *> VL) { - Instruction *I0 = dyn_cast<Instruction>(VL[0]); +namespace { + +/// Contains data for the instructions going to be vectorized. +struct RawInstructionsData { + /// Main Opcode of the instructions going to be vectorized. + unsigned Opcode = 0; + + /// The list of instructions have some instructions with alternate opcodes. + bool HasAltOpcodes = false; +}; + +} // end anonymous namespace + +/// Checks the list of the vectorized instructions \p VL and returns info about +/// this list. +static RawInstructionsData getMainOpcode(ArrayRef<Value *> VL) { + auto *I0 = dyn_cast<Instruction>(VL[0]); if (!I0) - return 0; + return {}; + RawInstructionsData Res; unsigned Opcode = I0->getOpcode(); - for (int i = 1, e = VL.size(); i < e; i++) { - Instruction *I = dyn_cast<Instruction>(VL[i]); - if (!I || Opcode != I->getOpcode()) { - if (canCombineAsAltInst(Opcode) && i == 1) - return isAltInst(VL); - return 0; + // Walk through the list of the vectorized instructions + // in order to check its structure described by RawInstructionsData. + for (unsigned Cnt = 0, E = VL.size(); Cnt != E; ++Cnt) { + auto *I = dyn_cast<Instruction>(VL[Cnt]); + if (!I) + return {}; + if (Opcode != I->getOpcode()) + Res.HasAltOpcodes = true; + } + Res.Opcode = Opcode; + return Res; +} + +namespace { + +/// Main data required for vectorization of instructions. +struct InstructionsState { + /// The very first instruction in the list with the main opcode. + Value *OpValue = nullptr; + + /// The main opcode for the list of instructions. + unsigned Opcode = 0; + + /// Some of the instructions in the list have alternate opcodes. + bool IsAltShuffle = false; + + InstructionsState() = default; + InstructionsState(Value *OpValue, unsigned Opcode, bool IsAltShuffle) + : OpValue(OpValue), Opcode(Opcode), IsAltShuffle(IsAltShuffle) {} +}; + +} // end anonymous namespace + +/// \returns analysis of the Instructions in \p VL described in +/// InstructionsState, the Opcode that we suppose the whole list +/// could be vectorized even if its structure is diverse. 
+static InstructionsState getSameOpcode(ArrayRef<Value *> VL) { + auto Res = getMainOpcode(VL); + unsigned Opcode = Res.Opcode; + if (!Res.HasAltOpcodes) + return InstructionsState(VL[0], Opcode, false); + auto *OpInst = cast<Instruction>(VL[0]); + unsigned AltOpcode = getAltOpcode(Opcode); + // Examine each element in the list instructions VL to determine + // if some operations there could be considered as an alternative + // (for example as subtraction relates to addition operation). + for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { + auto *I = cast<Instruction>(VL[Cnt]); + unsigned InstOpcode = I->getOpcode(); + if ((Res.HasAltOpcodes && + InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode)) || + (!Res.HasAltOpcodes && InstOpcode != Opcode)) { + return InstructionsState(OpInst, 0, false); } } - return Opcode; + return InstructionsState(OpInst, Opcode, Res.HasAltOpcodes); } /// \returns true if all of the values in \p VL have the same type or false @@ -247,7 +468,6 @@ static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) { /// possible scalar operand in vectorized instruction. static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI) { - unsigned Opcode = UserInst->getOpcode(); switch (Opcode) { case Instruction::Load: { @@ -292,24 +512,25 @@ static bool isSimple(Instruction *I) { } namespace llvm { + namespace slpvectorizer { + /// Bottom Up SLP Vectorizer. class BoUpSLP { public: - typedef SmallVector<Value *, 8> ValueList; - typedef SmallVector<Instruction *, 16> InstrList; - typedef SmallPtrSet<Value *, 16> ValueSet; - typedef SmallVector<StoreInst *, 8> StoreList; - typedef MapVector<Value *, SmallVector<Instruction *, 2>> - ExtraValueToDebugLocsMap; + using ValueList = SmallVector<Value *, 8>; + using InstrList = SmallVector<Instruction *, 16>; + using ValueSet = SmallPtrSet<Value *, 16>; + using StoreList = SmallVector<StoreInst *, 8>; + using ExtraValueToDebugLocsMap = + MapVector<Value *, SmallVector<Instruction *, 2>>; BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE) - : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func), - SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB), - DL(DL), ORE(ORE), Builder(Se->getContext()) { + : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), + DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) { CodeMetrics::collectEphemeralValues(F, AC, EphValues); // Use the vector register size specified by the target unless overridden // by a command-line option. @@ -331,6 +552,7 @@ public: /// \brief Vectorize the tree that starts with the elements in \p VL. /// Returns the vectorized root. Value *vectorizeTree(); + /// Vectorize the tree but with the list of externally used values \p /// ExternallyUsedValues. Values in this MapVector can be replaced but the /// generated extractvalue instructions. @@ -348,6 +570,7 @@ public: /// the purpose of scheduling and extraction in the \p UserIgnoreLst. 
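getSameOpcode above accepts a bundle either when all opcodes match, or when the opcodes strictly alternate between the main opcode and its alternate (for example add/sub); anything else forces the bundle to be gathered. A standalone sketch of that decision, with plain enums standing in for LLVM opcodes and no claim to match the full getAltOpcode table.

    #include <iostream>
    #include <vector>

    enum Op { Add, Sub, Mul, Other };

    static Op altOpcode(Op O) { return O == Add ? Sub : O == Sub ? Add : Other; }

    struct State { Op MainOp; bool IsAltShuffle; bool Valid; };

    static State analyze(const std::vector<Op> &Bundle) {
      Op Main = Bundle[0];
      bool HasAlt = false;
      for (Op O : Bundle)
        if (O != Main)
          HasAlt = true;              // Some element uses a different opcode.
      if (!HasAlt)
        return {Main, false, true};   // Uniform bundle: one vector opcode.
      for (unsigned I = 0; I < Bundle.size(); ++I) {
        Op Expected = (I & 1) ? altOpcode(Main) : Main;
        if (Bundle[I] != Expected)
          return {Main, false, false}; // Neither uniform nor alternating: gather.
      }
      return {Main, true, true};       // Alternating main/alt opcodes (add/sub).
    }

    int main() {
      std::cout << analyze({Add, Sub, Add, Sub}).IsAltShuffle << '\n'; // 1
      std::cout << analyze({Add, Mul, Add, Mul}).Valid << '\n';        // 0
    }
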
void buildTree(ArrayRef<Value *> Roots, ArrayRef<Value *> UserIgnoreLst = None); + /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking /// into account (anf updating it, if required) list of externally used @@ -374,7 +597,7 @@ public: unsigned getTreeSize() const { return VectorizableTree.size(); } /// \brief Perform LICM and CSE on the newly generated gather sequences. - void optimizeGatherSequence(); + void optimizeGatherSequence(Function &F); /// \returns true if it is beneficial to reverse the vector order. bool shouldReorder() const { @@ -416,21 +639,30 @@ public: private: struct TreeEntry; + /// Checks if all users of \p I are the part of the vectorization tree. + bool areAllUsersVectorized(Instruction *I) const; + /// \returns the cost of the vectorizable entry. int getEntryCost(TreeEntry *E); /// This is the recursive part of buildTree. - void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int); + void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int UserIndx = -1, + int OpdNum = 0); /// \returns True if the ExtractElement/ExtractValue instructions in VL can /// be vectorized to use the original vector (or aggregate "bitcast" to a vector). - bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const; + bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const; - /// Vectorize a single entry in the tree. - Value *vectorizeTree(TreeEntry *E); + /// Vectorize a single entry in the tree.\p OpdNum indicate the ordinality of + /// operand corrsponding to this tree entry \p E for the user tree entry + /// indicated by \p UserIndx. + // In other words, "E == TreeEntry[UserIndx].getOperand(OpdNum)". + Value *vectorizeTree(TreeEntry *E, int OpdNum = 0, int UserIndx = -1); - /// Vectorize a single entry in the tree, starting in \p VL. - Value *vectorizeTree(ArrayRef<Value *> VL); + /// Vectorize a single entry in the tree, starting in \p VL.\p OpdNum indicate + /// the ordinality of operand corrsponding to the \p VL of scalar values for the + /// user indicated by \p UserIndx this \p VL feeds into. + Value *vectorizeTree(ArrayRef<Value *> VL, int OpdNum = 0, int UserIndx = -1); /// \returns the pointer to the vectorized value if \p VL is already /// vectorized, or NULL. They may happen in cycles. @@ -447,7 +679,7 @@ private: /// \brief Set the Builder insert point to one after the last instruction in /// the bundle - void setInsertPointAfterBundle(ArrayRef<Value *> VL); + void setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue); /// \returns a vector from a collection of scalars in \p VL. Value *Gather(ArrayRef<Value *> VL, VectorType *Ty); @@ -458,18 +690,17 @@ private: /// \reorder commutative operands in alt shuffle if they result in /// vectorized code. - void reorderAltShuffleOperands(ArrayRef<Value *> VL, + void reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left, SmallVectorImpl<Value *> &Right); + /// \reorder commutative operands to get better probability of /// generating vectorized code. 
- void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, + void reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left, SmallVectorImpl<Value *> &Right); struct TreeEntry { - TreeEntry(std::vector<TreeEntry> &Container) - : Scalars(), VectorizedValue(nullptr), NeedToGather(0), - Container(Container) {} + TreeEntry(std::vector<TreeEntry> &Container) : Container(Container) {} /// \returns true if the scalars in VL are equal to this entry. bool isSame(ArrayRef<Value *> VL) const { @@ -477,14 +708,32 @@ private: return std::equal(VL.begin(), VL.end(), Scalars.begin()); } + /// \returns true if the scalars in VL are found in this tree entry. + bool isFoundJumbled(ArrayRef<Value *> VL, const DataLayout &DL, + ScalarEvolution &SE) const { + assert(VL.size() == Scalars.size() && "Invalid size"); + SmallVector<Value *, 8> List; + if (!sortLoadAccesses(VL, DL, SE, List)) + return false; + return std::equal(List.begin(), List.end(), Scalars.begin()); + } + /// A vector of scalars. ValueList Scalars; /// The Scalars are vectorized into this value. It is initialized to Null. - Value *VectorizedValue; + Value *VectorizedValue = nullptr; /// Do we need to gather this sequence ? - bool NeedToGather; + bool NeedToGather = false; + + /// Records optional shuffle mask for the uses of jumbled memory accesses. + /// For example, a non-empty ShuffleMask[1] represents the permutation of + /// lanes that operand #1 of this vectorized instruction should undergo + /// before feeding this vectorized instruction, whereas an empty + /// ShuffleMask[0] indicates that the lanes of operand #0 of this vectorized + /// instruction need not be permuted at all. + SmallVector<SmallVector<unsigned, 4>, 2> ShuffleMask; /// Points back to the VectorizableTree. /// @@ -501,12 +750,31 @@ private: /// Create a new VectorizableTree entry. TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, - int &UserTreeIdx) { + int &UserTreeIdx, const InstructionsState &S, + ArrayRef<unsigned> ShuffleMask = None, + int OpdNum = 0) { + assert((!Vectorized || S.Opcode != 0) && + "Vectorized TreeEntry without opcode"); VectorizableTree.emplace_back(VectorizableTree); + int idx = VectorizableTree.size() - 1; TreeEntry *Last = &VectorizableTree[idx]; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); Last->NeedToGather = !Vectorized; + + TreeEntry *UserTreeEntry = nullptr; + if (UserTreeIdx != -1) + UserTreeEntry = &VectorizableTree[UserTreeIdx]; + + if (UserTreeEntry && !ShuffleMask.empty()) { + if ((unsigned)OpdNum >= UserTreeEntry->ShuffleMask.size()) + UserTreeEntry->ShuffleMask.resize(OpdNum + 1); + assert(UserTreeEntry->ShuffleMask[OpdNum].empty() && + "Mask already present"); + using mask = SmallVector<unsigned, 4>; + mask tempMask(ShuffleMask.begin(), ShuffleMask.end()); + UserTreeEntry->ShuffleMask[OpdNum] = tempMask; + } if (Vectorized) { for (int i = 0, e = VL.size(); i != e; ++i) { assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); @@ -548,16 +816,19 @@ private: /// This POD struct describes one external user in the vectorized tree. struct ExternalUser { - ExternalUser (Value *S, llvm::User *U, int L) : - Scalar(S), User(U), Lane(L){} + ExternalUser(Value *S, llvm::User *U, int L) + : Scalar(S), User(U), Lane(L) {} + // Which scalar in our function. Value *Scalar; + // Which user that uses the scalar. llvm::User *User; + // Which lane does the scalar belong to. 
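The new TreeEntry::ShuffleMask above records, per operand, how that operand's vectorized lanes must be permuted before they feed this entry; an empty mask means the lanes are used as-is. A small standalone sketch of applying such a mask, assuming shufflevector-style semantics where Mask[I] names the source lane placed into lane I (that direction is an assumption made for illustration).

    #include <cstddef>
    #include <iostream>
    #include <vector>

    static std::vector<int> applyMask(const std::vector<int> &Lanes,
                                      const std::vector<unsigned> &Mask) {
      if (Mask.empty())
        return Lanes; // Empty mask: operand lanes need no permutation.
      std::vector<int> Shuffled(Lanes.size());
      for (std::size_t I = 0; I < Mask.size(); ++I)
        Shuffled[I] = Lanes[Mask[I]];
      return Shuffled;
    }

    int main() {
      for (int V : applyMask({10, 30, 20, 40}, {0, 2, 1, 3}))
        std::cout << V << ' '; // Prints: 10 20 30 40
      std::cout << '\n';
    }
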
int Lane; }; - typedef SmallVector<ExternalUser, 16> UserList; + using UserList = SmallVector<ExternalUser, 16>; /// Checks if two instructions may access the same memory. /// @@ -565,7 +836,6 @@ private: /// is invariant in the calling loop. bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1, Instruction *Inst2) { - // First check if the result is already in the cache. AliasCacheKey key = std::make_pair(Inst1, Inst2); Optional<bool> &result = AliasCache[key]; @@ -583,7 +853,7 @@ private: return aliased; } - typedef std::pair<Instruction *, Instruction *> AliasCacheKey; + using AliasCacheKey = std::pair<Instruction *, Instruction *>; /// Cache for alias results. /// TODO: consider moving this to the AliasAnalysis itself. @@ -616,6 +886,7 @@ private: /// Holds all of the instructions that we gathered. SetVector<Instruction *> GatherSeq; + /// A list of blocks that we are going to CSE. SetVector<BasicBlock *> CSEBlocks; @@ -624,18 +895,13 @@ private: /// instruction bundle (= a group of instructions which is combined into a /// vector instruction). struct ScheduleData { - // The initial value for the dependency counters. It means that the // dependencies are not calculated yet. enum { InvalidDeps = -1 }; - ScheduleData() - : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr), - NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0), - Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps), - UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {} + ScheduleData() = default; - void init(int BlockSchedulingRegionID) { + void init(int BlockSchedulingRegionID, Value *OpVal) { FirstInBundle = this; NextInBundle = nullptr; NextLoadStore = nullptr; @@ -643,6 +909,7 @@ private: SchedulingRegionID = BlockSchedulingRegionID; UnscheduledDepsInBundle = UnscheduledDeps; clearDependencies(); + OpValue = OpVal; } /// Returns true if the dependency information has been calculated. @@ -702,19 +969,19 @@ private: } } - Instruction *Inst; + Instruction *Inst = nullptr; /// Points to the head in an instruction bundle (and always to this for /// single instructions). - ScheduleData *FirstInBundle; + ScheduleData *FirstInBundle = nullptr; /// Single linked list of all instructions in a bundle. Null if it is a /// single instruction. - ScheduleData *NextInBundle; + ScheduleData *NextInBundle = nullptr; /// Single linked list of all memory instructions (e.g. load, store, call) /// in the block - until the end of the scheduling region. - ScheduleData *NextLoadStore; + ScheduleData *NextLoadStore = nullptr; /// The dependent memory instructions. /// This list is derived on demand in calculateDependencies(). @@ -722,31 +989,33 @@ private: /// This ScheduleData is in the current scheduling region if this matches /// the current SchedulingRegionID of BlockScheduling. - int SchedulingRegionID; + int SchedulingRegionID = 0; /// Used for getting a "good" final ordering of instructions. - int SchedulingPriority; + int SchedulingPriority = 0; /// The number of dependencies. Constitutes of the number of users of the /// instruction plus the number of dependent memory instructions (if any). /// This value is calculated on demand. /// If InvalidDeps, the number of dependencies is not calculated yet. - /// - int Dependencies; + int Dependencies = InvalidDeps; /// The number of dependencies minus the number of dependencies of scheduled /// instructions. As soon as this is zero, the instruction/bundle gets ready /// for scheduling. 
/// Note that this is negative as long as Dependencies is not calculated. - int UnscheduledDeps; + int UnscheduledDeps = InvalidDeps; /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for /// single instructions. - int UnscheduledDepsInBundle; + int UnscheduledDepsInBundle = InvalidDeps; /// True if this instruction is scheduled (or considered as scheduled in the /// dry-run). - bool IsScheduled; + bool IsScheduled = false; + + /// Opcode of the current instruction in the schedule data. + Value *OpValue = nullptr; }; #ifndef NDEBUG @@ -756,22 +1025,14 @@ private: return os; } #endif + friend struct GraphTraits<BoUpSLP *>; friend struct DOTGraphTraits<BoUpSLP *>; /// Contains all scheduling data for a basic block. - /// struct BlockScheduling { - BlockScheduling(BasicBlock *BB) - : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize), - ScheduleStart(nullptr), ScheduleEnd(nullptr), - FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr), - ScheduleRegionSize(0), - ScheduleRegionSizeLimit(ScheduleRegionSizeBudget), - // Make sure that the initial SchedulingRegionID is greater than the - // initial SchedulingRegionID in ScheduleData (which is 0). - SchedulingRegionID(1) {} + : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {} void clear() { ReadyInsts.clear(); @@ -799,6 +1060,18 @@ private: return nullptr; } + ScheduleData *getScheduleData(Value *V, Value *Key) { + if (V == Key) + return getScheduleData(V); + auto I = ExtraScheduleDataMap.find(V); + if (I != ExtraScheduleDataMap.end()) { + ScheduleData *SD = I->second[Key]; + if (SD && SD->SchedulingRegionID == SchedulingRegionID) + return SD; + } + return nullptr; + } + bool isInSchedulingRegion(ScheduleData *SD) { return SD->SchedulingRegionID == SchedulingRegionID; } @@ -812,19 +1085,29 @@ private: ScheduleData *BundleMember = SD; while (BundleMember) { + if (BundleMember->Inst != BundleMember->OpValue) { + BundleMember = BundleMember->NextInBundle; + continue; + } // Handle the def-use chain dependencies. for (Use &U : BundleMember->Inst->operands()) { - ScheduleData *OpDef = getScheduleData(U.get()); - if (OpDef && OpDef->hasValidDependencies() && - OpDef->incrementUnscheduledDeps(-1) == 0) { - // There are no more unscheduled dependencies after decrementing, - // so we can put the dependent instruction into the ready list. - ScheduleData *DepBundle = OpDef->FirstInBundle; - assert(!DepBundle->IsScheduled && - "already scheduled bundle gets ready"); - ReadyList.insert(DepBundle); - DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n"); - } + auto *I = dyn_cast<Instruction>(U.get()); + if (!I) + continue; + doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) { + if (OpDef && OpDef->hasValidDependencies() && + OpDef->incrementUnscheduledDeps(-1) == 0) { + // There are no more unscheduled dependencies after + // decrementing, so we can put the dependent instruction + // into the ready list. + ScheduleData *DepBundle = OpDef->FirstInBundle; + assert(!DepBundle->IsScheduled && + "already scheduled bundle gets ready"); + ReadyList.insert(DepBundle); + DEBUG(dbgs() + << "SLP: gets ready (def): " << *DepBundle << "\n"); + } + }); } // Handle the memory dependencies. 
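The schedule bookkeeping above keeps an unscheduled-dependency count per bundle; scheduling a bundle decrements the counts of its dependents, and any bundle whose count reaches zero is pushed onto the ready list. A generic standalone worklist sketch of that mechanism, not the BlockScheduling class itself.

    #include <iostream>
    #include <queue>
    #include <vector>

    struct Bundle {
      int UnscheduledDeps = 0;
      std::vector<int> Dependents; // Indices of bundles that depend on this one.
    };

    int main() {
      // b1 depends on b0, b2 depends on b1.
      std::vector<Bundle> Bundles(3);
      Bundles[0].Dependents = {1};
      Bundles[1].Dependents = {2};
      Bundles[1].UnscheduledDeps = 1;
      Bundles[2].UnscheduledDeps = 1;

      std::queue<int> Ready;
      Ready.push(0); // Only b0 has no unscheduled dependencies initially.
      while (!Ready.empty()) {
        int B = Ready.front();
        Ready.pop();
        std::cout << "scheduling bundle " << B << '\n';
        for (int Dep : Bundles[B].Dependents)
          if (--Bundles[Dep].UnscheduledDeps == 0)
            Ready.push(Dep); // No remaining unscheduled dependencies: ready.
      }
    }
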
for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { @@ -835,22 +1118,35 @@ private: assert(!DepBundle->IsScheduled && "already scheduled bundle gets ready"); ReadyList.insert(DepBundle); - DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n"); + DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle + << "\n"); } } BundleMember = BundleMember->NextInBundle; } } + void doForAllOpcodes(Value *V, + function_ref<void(ScheduleData *SD)> Action) { + if (ScheduleData *SD = getScheduleData(V)) + Action(SD); + auto I = ExtraScheduleDataMap.find(V); + if (I != ExtraScheduleDataMap.end()) + for (auto &P : I->second) + if (P.second->SchedulingRegionID == SchedulingRegionID) + Action(P.second); + } + /// Put all instructions into the ReadyList which are ready for scheduling. template <typename ReadyListType> void initialFillReadyList(ReadyListType &ReadyList) { for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { - ScheduleData *SD = getScheduleData(I); - if (SD->isSchedulingEntity() && SD->isReady()) { - ReadyList.insert(SD); - DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n"); - } + doForAllOpcodes(I, [&](ScheduleData *SD) { + if (SD->isSchedulingEntity() && SD->isReady()) { + ReadyList.insert(SD); + DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n"); + } + }); } } @@ -862,9 +1158,12 @@ private: /// Un-bundles a group of instructions. void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue); + /// Allocates schedule data chunk. + ScheduleData *allocateScheduleDataChunks(); + /// Extends the scheduling region so that V is inside the region. /// \returns true if the region size is within the limit. - bool extendSchedulingRegion(Value *V); + bool extendSchedulingRegion(Value *V, Value *OpValue); /// Initialize the ScheduleData structures for new instructions in the /// scheduling region. @@ -897,6 +1196,10 @@ private: /// ScheduleData structures are recycled. DenseMap<Value *, ScheduleData *> ScheduleDataMap; + /// Attaches ScheduleData to Instruction with the leading key. + DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>> + ExtraScheduleDataMap; + struct ReadyList : SmallVector<ScheduleData *, 8> { void insert(ScheduleData *SD) { push_back(SD); } }; @@ -905,28 +1208,30 @@ private: ReadyList ReadyInsts; /// The first instruction of the scheduling region. - Instruction *ScheduleStart; + Instruction *ScheduleStart = nullptr; /// The first instruction _after_ the scheduling region. - Instruction *ScheduleEnd; + Instruction *ScheduleEnd = nullptr; /// The first memory accessing instruction in the scheduling region /// (can be null). - ScheduleData *FirstLoadStoreInRegion; + ScheduleData *FirstLoadStoreInRegion = nullptr; /// The last memory accessing instruction in the scheduling region /// (can be null). - ScheduleData *LastLoadStoreInRegion; + ScheduleData *LastLoadStoreInRegion = nullptr; /// The current size of the scheduling region. - int ScheduleRegionSize; + int ScheduleRegionSize = 0; /// The maximum size allowed for the scheduling region. - int ScheduleRegionSizeLimit; + int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget; /// The ID of the scheduling region. For a new vectorization iteration this /// is incremented which "removes" all ScheduleData from the region. - int SchedulingRegionID; + // Make sure that the initial SchedulingRegionID is greater than the + // initial SchedulingRegionID in ScheduleData (which is 0). 
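doForAllOpcodes above runs an action over every ScheduleData attached to a value: the primary entry plus any extra entries kept in ExtraScheduleDataMap under alternate keys. A standalone sketch of that lookup pattern, with std::map and std::function standing in for the LLVM containers and function_ref; the types here are illustrative.

    #include <functional>
    #include <iostream>
    #include <map>
    #include <string>

    struct Entry {
      std::string Name;
    };

    static void doForAll(
        const std::string &V, const std::map<std::string, Entry> &Primary,
        const std::map<std::string, std::map<std::string, Entry>> &Extra,
        const std::function<void(const Entry &)> &Action) {
      auto P = Primary.find(V);
      if (P != Primary.end())
        Action(P->second); // The primary entry for V.
      auto E = Extra.find(V);
      if (E != Extra.end())
        for (const auto &KV : E->second)
          Action(KV.second); // Extra entries stored under alternate keys.
    }

    int main() {
      std::map<std::string, Entry> Primary;
      std::map<std::string, std::map<std::string, Entry>> Extra;
      Primary["%x"] = Entry{"primary"};
      Extra["%x"]["altkey"] = Entry{"extra"};
      doForAll("%x", Primary, Extra,
               [](const Entry &E) { std::cout << E.Name << '\n'; });
    }
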
+ int SchedulingRegionID = 1; }; /// Attaches the BlockScheduling structures to basic blocks. @@ -940,10 +1245,10 @@ private: ArrayRef<Value *> UserIgnoreList; // Number of load bundles that contain consecutive loads. - int NumLoadsWantToKeepOrder; + int NumLoadsWantToKeepOrder = 0; // Number of load bundles that contain consecutive loads in reversed order. - int NumLoadsWantToChangeOrder; + int NumLoadsWantToChangeOrder = 0; // Analysis and block reference. Function *F; @@ -960,6 +1265,7 @@ private: unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt. unsigned MinVecRegSize; // Set by cl::opt (default: 128). + /// Instruction builder to construct the vectorized tree. IRBuilder<> Builder; @@ -970,20 +1276,20 @@ private: /// original width. MapVector<Value *, std::pair<uint64_t, bool>> MinBWs; }; + } // end namespace slpvectorizer template <> struct GraphTraits<BoUpSLP *> { - typedef BoUpSLP::TreeEntry TreeEntry; + using TreeEntry = BoUpSLP::TreeEntry; /// NodeRef has to be a pointer per the GraphWriter. - typedef TreeEntry *NodeRef; + using NodeRef = TreeEntry *; /// \brief Add the VectorizableTree to the index iterator to be able to return /// TreeEntry pointers. struct ChildIteratorType : public iterator_adaptor_base<ChildIteratorType, SmallVector<int, 1>::iterator> { - std::vector<TreeEntry> &VectorizableTree; ChildIteratorType(SmallVector<int, 1>::iterator W, @@ -998,17 +1304,19 @@ template <> struct GraphTraits<BoUpSLP *> { static ChildIteratorType child_begin(NodeRef N) { return {N->UserTreeIndices.begin(), N->Container}; } + static ChildIteratorType child_end(NodeRef N) { return {N->UserTreeIndices.end(), N->Container}; } /// For the node iterator we just need to turn the TreeEntry iterator into a /// TreeEntry* iterator so that it dereferences to NodeRef. - typedef pointer_iterator<std::vector<TreeEntry>::iterator> nodes_iterator; + using nodes_iterator = pointer_iterator<std::vector<TreeEntry>::iterator>; static nodes_iterator nodes_begin(BoUpSLP *R) { return nodes_iterator(R->VectorizableTree.begin()); } + static nodes_iterator nodes_end(BoUpSLP *R) { return nodes_iterator(R->VectorizableTree.end()); } @@ -1017,7 +1325,7 @@ template <> struct GraphTraits<BoUpSLP *> { }; template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { - typedef BoUpSLP::TreeEntry TreeEntry; + using TreeEntry = BoUpSLP::TreeEntry; DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} @@ -1054,6 +1362,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ExtraValueToDebugLocsMap ExternallyUsedValues; buildTree(Roots, ExternallyUsedValues, UserIgnoreLst); } + void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ExtraValueToDebugLocsMap &ExternallyUsedValues, ArrayRef<Value *> UserIgnoreLst) { @@ -1118,44 +1427,34 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, } void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, - int UserTreeIdx) { - bool isAltShuffle = false; + int UserTreeIdx, int OpdNum) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); + InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } // Don't handle vectors. 
- if (VL[0]->getType()->isVectorTy()) { + if (S.OpValue->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } - if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) + if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } - unsigned Opcode = getSameOpcode(VL); - - // Check that this shuffle vector refers to the alternate - // sequence of opcodes. - if (Opcode == Instruction::ShuffleVector) { - Instruction *I0 = dyn_cast<Instruction>(VL[0]); - unsigned Op = I0->getOpcode(); - if (Op != Instruction::ShuffleVector) - isAltShuffle = true; - } // If all of the operands are identical or constant we have a simple solution. - if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) { + if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) { DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } @@ -1167,87 +1466,92 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (EphValues.count(VL[i])) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } // Check if this is a duplicate of another entry. - if (TreeEntry *E = getTreeEntry(VL[0])) { + if (TreeEntry *E = getTreeEntry(S.OpValue)) { for (unsigned i = 0, e = VL.size(); i != e; ++i) { DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); if (E->Scalars[i] != VL[i]) { DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } // Record the reuse of the tree node. FIXME, currently this is only used to // properly draw the graph rather than for the actual vectorization. E->UserTreeIndices.push_back(UserTreeIdx); - DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n"); + DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n"); return; } // Check that none of the instructions in the bundle are already in the tree. for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (ScalarToTreeEntry.count(VL[i])) { + auto *I = dyn_cast<Instruction>(VL[i]); + if (!I) + continue; + if (getTreeEntry(I)) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is already in tree.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } - // If any of the scalars is marked as a value that needs to stay scalar then + // If any of the scalars is marked as a value that needs to stay scalar, then // we need to gather the scalars. for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (MustGather.count(VL[i])) { DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } // Check that all of the users of the scalars that we want to vectorize are // schedulable. - Instruction *VL0 = cast<Instruction>(VL[0]); + auto *VL0 = cast<Instruction>(S.OpValue); BasicBlock *BB = VL0->getParent(); if (!DT->isReachableFromEntry(BB)) { // Don't go into unreachable blocks. 
They may contain instructions with // dependency cycles which confuse the final scheduling. DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } - // Check that every instructions appears once in this bundle. + // Check that every instruction appears once in this bundle. for (unsigned i = 0, e = VL.size(); i < e; ++i) - for (unsigned j = i+1; j < e; ++j) + for (unsigned j = i + 1; j < e; ++j) if (VL[i] == VL[j]) { DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } auto &BSRef = BlocksSchedules[BB]; - if (!BSRef) { + if (!BSRef) BSRef = llvm::make_unique<BlockScheduling>(BB); - } + BlockScheduling &BS = *BSRef.get(); - if (!BS.tryScheduleBundle(VL, this, VL0)) { + if (!BS.tryScheduleBundle(VL, this, S.OpValue)) { DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); - assert((!BS.getScheduleData(VL[0]) || - !BS.getScheduleData(VL[0])->isPartOfBundle()) && + assert((!BS.getScheduleData(VL0) || + !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); - switch (Opcode) { + unsigned ShuffleOrOp = S.IsAltShuffle ? + (unsigned) Instruction::ShuffleVector : S.Opcode; + switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast<PHINode>(VL0); @@ -1259,12 +1563,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (Term) { DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, S); DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1274,35 +1578,34 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock( PH->getIncomingBlock(i))); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, UserTreeIdx, i); } return; } case Instruction::ExtractValue: case Instruction::ExtractElement: { - bool Reuse = canReuseExtract(VL, Opcode); + bool Reuse = canReuseExtract(VL, VL0); if (Reuse) { DEBUG(dbgs() << "SLP: Reusing extract sequence.\n"); } else { BS.cancelScheduling(VL, VL0); } - newTreeEntry(VL, Reuse, UserTreeIdx); + newTreeEntry(VL, Reuse, UserTreeIdx, S); return; } case Instruction::Load: { // Check that a vectorized load would load the same memory as a scalar - // load. - // For example we don't want vectorize loads that are smaller than 8 bit. - // Even though we have a packed struct {<i2, i2, i2, i2>} LLVM treats - // loading/storing it as an i8 struct. If we vectorize loads/stores from - // such a struct we read/write packed bits disagreeing with the + // load. For example, we don't want to vectorize loads that are smaller + // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM + // treats loading/storing it as an i8 struct. If we vectorize loads/stores + // from such a struct, we read/write packed bits disagreeing with the // unvectorized version. 
- Type *ScalarTy = VL[0]->getType(); + Type *ScalarTy = VL0->getType(); if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -1313,15 +1616,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LoadInst *L = cast<LoadInst>(VL[i]); if (!L->isSimple()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } } // Check if the loads are consecutive, reversed, or neither. - // TODO: What we really want is to sort the loads, but for now, check - // the two likely directions. bool Consecutive = true; bool ReverseConsecutive = true; for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { @@ -1335,7 +1636,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (Consecutive) { ++NumLoadsWantToKeepOrder; - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, S); DEBUG(dbgs() << "SLP: added a vector of loads.\n"); return; } @@ -1349,15 +1650,41 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, break; } - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); - if (ReverseConsecutive) { - ++NumLoadsWantToChangeOrder; DEBUG(dbgs() << "SLP: Gathering reversed loads.\n"); - } else { - DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); + ++NumLoadsWantToChangeOrder; + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, false, UserTreeIdx, S); + return; + } + + if (VL.size() > 2) { + bool ShuffledLoads = true; + SmallVector<Value *, 8> Sorted; + SmallVector<unsigned, 4> Mask; + if (sortLoadAccesses(VL, *DL, *SE, Sorted, &Mask)) { + auto NewVL = makeArrayRef(Sorted.begin(), Sorted.end()); + for (unsigned i = 0, e = NewVL.size() - 1; i < e; ++i) { + if (!isConsecutiveAccess(NewVL[i], NewVL[i + 1], *DL, *SE)) { + ShuffledLoads = false; + break; + } + } + // TODO: Tracking how many load wants to have arbitrary shuffled order + // would be usefull. 
+ if (ShuffledLoads) { + DEBUG(dbgs() << "SLP: added a vector of loads which needs " + "permutation of loaded lanes.\n"); + newTreeEntry(NewVL, true, UserTreeIdx, S, + makeArrayRef(Mask.begin(), Mask.end()), OpdNum); + return; + } + } } + + DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, false, UserTreeIdx, S); return; } case Instruction::ZExt: @@ -1377,12 +1704,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, S); DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1391,7 +1718,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, UserTreeIdx, i); } return; } @@ -1399,19 +1726,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::FCmp: { // Check that all of the compares have the same predicate. CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); - Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType(); + Type *ComparedTy = VL0->getOperand(0)->getType(); for (unsigned i = 1, e = VL.size(); i < e; ++i) { CmpInst *Cmp = cast<CmpInst>(VL[i]); if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, S); DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1420,7 +1747,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, UserTreeIdx, i); } return; } @@ -1442,17 +1769,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::AShr: case Instruction::And: case Instruction::Or: - case Instruction::Xor: { - newTreeEntry(VL, true, UserTreeIdx); + case Instruction::Xor: + newTreeEntry(VL, true, UserTreeIdx, S); DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to // have the same opcode. 
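The new load handling above sorts a jumbled load bundle by address (sortLoadAccesses), re-checks consecutiveness on the sorted order, and records a mask of the original positions so the loaded lanes can be shuffled back into the order the users expect. A standalone sketch of that idea with integer offsets standing in for pointer SCEVs; the mask convention here is one plausible choice for illustration.

    #include <algorithm>
    #include <iostream>
    #include <numeric>
    #include <vector>

    // Returns true if the loads are consecutive once sorted; Mask then lists
    // the original positions in sorted order.
    static bool sortLoads(const std::vector<int> &Offsets,
                          std::vector<unsigned> &Mask) {
      Mask.resize(Offsets.size());
      std::iota(Mask.begin(), Mask.end(), 0u);
      std::sort(Mask.begin(), Mask.end(),
                [&](unsigned A, unsigned B) { return Offsets[A] < Offsets[B]; });
      for (unsigned I = 0; I + 1 < Mask.size(); ++I)
        if (Offsets[Mask[I + 1]] != Offsets[Mask[I]] + 1)
          return false; // Not consecutive even after sorting: gather instead.
      return true;
    }

    int main() {
      std::vector<unsigned> Mask;
      // Loads from a[1], a[3], a[0], a[2]: consecutive once sorted.
      if (sortLoads({1, 3, 0, 2}, Mask))
        for (unsigned M : Mask)
          std::cout << M << ' '; // Prints: 2 0 3 1
      std::cout << '\n';
    }
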
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right); + reorderInputsAccordingToOpcode(S.Opcode, VL, Left, Right); buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + buildTree_rec(Right, Depth + 1, UserTreeIdx, 1); return; } @@ -1462,30 +1789,30 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, UserTreeIdx, i); } return; - } + case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. for (unsigned j = 0; j < VL.size(); ++j) { if (cast<Instruction>(VL[j])->getNumOperands() != 2) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } // We can't combine several GEPs into one vector if they operate on // different types. - Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType(); + Type *Ty0 = VL0->getOperand(0)->getType(); for (unsigned j = 0; j < VL.size(); ++j) { Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType(); if (Ty0 != CurTy) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } @@ -1497,12 +1824,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, DEBUG( dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, S); DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1510,7 +1837,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, UserTreeIdx, i); } return; } @@ -1519,12 +1846,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, S); DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; @@ -1536,13 +1863,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } case Instruction::Call: { // Check if the calls are all to the same vectorizable intrinsic. 
- CallInst *CI = cast<CallInst>(VL[0]); + CallInst *CI = cast<CallInst>(VL0); // Check if this is an Intrinsic call or something that can be // represented by an intrinsic call Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1556,7 +1883,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; @@ -1567,7 +1894,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument "<< A1I<<"!=" << A1J << "\n"); @@ -1580,14 +1907,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, S); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -1595,28 +1922,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, CallInst *CI2 = dyn_cast<CallInst>(j); Operands.push_back(CI2->getArgOperand(i)); } - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, UserTreeIdx, i); } return; } - case Instruction::ShuffleVector: { + case Instruction::ShuffleVector: // If this is not an alternate sequence of opcode like add-sub // then do not vectorize this instruction. - if (!isAltShuffle) { + if (!S.IsAltShuffle) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, S); DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. 
if (isa<BinaryOperator>(VL0)) { ValueList Left, Right; - reorderAltShuffleOperands(VL, Left, Right); + reorderAltShuffleOperands(S.Opcode, VL, Left, Right); buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + buildTree_rec(Right, Depth + 1, UserTreeIdx, 1); return; } @@ -1626,13 +1953,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, UserTreeIdx, i); } return; - } + default: BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -1663,19 +1990,18 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { return N; } -bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const { - assert(Opcode == Instruction::ExtractElement || - Opcode == Instruction::ExtractValue); - assert(Opcode == getSameOpcode(VL) && "Invalid opcode"); +bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const { + Instruction *E0 = cast<Instruction>(OpValue); + assert(E0->getOpcode() == Instruction::ExtractElement || + E0->getOpcode() == Instruction::ExtractValue); + assert(E0->getOpcode() == getSameOpcode(VL).Opcode && "Invalid opcode"); // Check if all of the extracts come from the same vector and from the // correct offset. - Value *VL0 = VL[0]; - Instruction *E0 = cast<Instruction>(VL0); Value *Vec = E0->getOperand(0); // We have to extract from a vector/aggregate with the same number of elements. unsigned NElts; - if (Opcode == Instruction::ExtractValue) { + if (E0->getOpcode() == Instruction::ExtractValue) { const DataLayout &DL = E0->getModule()->getDataLayout(); NElts = canMapToVector(Vec->getType(), DL); if (!NElts) @@ -1692,20 +2018,24 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const { return false; // Check that all of the indices extract from the correct offset. - if (!matchExtractIndex(E0, 0, Opcode)) - return false; - - for (unsigned i = 1, e = VL.size(); i < e; ++i) { - Instruction *E = cast<Instruction>(VL[i]); - if (!matchExtractIndex(E, i, Opcode)) + for (unsigned I = 0, E = VL.size(); I < E; ++I) { + Instruction *Inst = cast<Instruction>(VL[I]); + if (!matchExtractIndex(Inst, I, Inst->getOpcode())) return false; - if (E->getOperand(0) != Vec) + if (Inst->getOperand(0) != Vec) return false; } return true; } +bool BoUpSLP::areAllUsersVectorized(Instruction *I) const { + return I->hasOneUse() || + std::all_of(I->user_begin(), I->user_end(), [this](User *U) { + return ScalarToTreeEntry.count(U) > 0; + }); +} + int BoUpSLP::getEntryCost(TreeEntry *E) { ArrayRef<Value*> VL = E->Scalars; @@ -1728,28 +2058,47 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { if (isSplat(VL)) { return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } + if (getSameOpcode(VL).Opcode == Instruction::ExtractElement) { + Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL); + if (ShuffleKind.hasValue()) { + int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); + for (auto *V : VL) { + // If all users of instruction are going to be vectorized and this + // instruction itself is not going to be vectorized, consider this + // instruction as dead and remove its cost from the final cost of the + // vectorized tree. 
+ if (areAllUsersVectorized(cast<Instruction>(V)) && + !ScalarToTreeEntry.count(V)) { + auto *IO = cast<ConstantInt>( + cast<ExtractElementInst>(V)->getIndexOperand()); + Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, + IO->getZExtValue()); + } + } + return Cost; + } + } return getGatherCost(E->Scalars); } - unsigned Opcode = getSameOpcode(VL); - assert(Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); - Instruction *VL0 = cast<Instruction>(VL[0]); - switch (Opcode) { - case Instruction::PHI: { + InstructionsState S = getSameOpcode(VL); + assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); + Instruction *VL0 = cast<Instruction>(S.OpValue); + unsigned ShuffleOrOp = S.IsAltShuffle ? + (unsigned) Instruction::ShuffleVector : S.Opcode; + switch (ShuffleOrOp) { + case Instruction::PHI: return 0; - } + case Instruction::ExtractValue: - case Instruction::ExtractElement: { - if (canReuseExtract(VL, Opcode)) { + case Instruction::ExtractElement: + if (canReuseExtract(VL, S.OpValue)) { int DeadCost = 0; for (unsigned i = 0, e = VL.size(); i < e; ++i) { Instruction *E = cast<Instruction>(VL[i]); // If all users are going to be vectorized, instruction can be // considered as dead. // The same, if have only one user, it will be vectorized for sure. - if (E->hasOneUse() || - std::all_of(E->user_begin(), E->user_end(), [this](User *U) { - return ScalarToTreeEntry.count(U) > 0; - })) + if (areAllUsersVectorized(E)) // Take credit for instruction that will become dead. DeadCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); @@ -1757,7 +2106,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { return -DeadCost; } return getGatherCost(VecTy); - } + case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: @@ -1786,8 +2135,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // Calculate the cost of this instruction. VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * - TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty(), VL0); - int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy, VL0); + TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0); + int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0); return VecCost - ScalarCost; } case Instruction::Add: @@ -1848,9 +2197,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { SmallVector<const Value *, 4> Operands(VL0->operand_values()); int ScalarCost = VecTy->getNumElements() * - TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, + TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); - int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK, + int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); return VecCost - ScalarCost; } @@ -1968,7 +2317,6 @@ bool BoUpSLP::isFullyVectorizableTinyTree() { } bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() { - // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. if (VectorizableTree.size() >= MinTreeSize) @@ -2139,13 +2487,18 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) { // load a[3] + load b[3] // Reordering the second load b[1] load a[1] would allow us to vectorize this // code. 
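The comment above spells out the jumbled-operand case; written as scalar source it looks roughly like the following (array names are illustrative, not taken from the patch):

    // Lane 1 is written as b[1] + a[1]; since the add is commutative, the
    // reordering swaps its operands so that one vector operand becomes the
    // consecutive loads a[0..3] and the other b[0..3].
    void addsub4(float *r, const float *a, const float *b) {
      r[0] = a[0] - b[0];
      r[1] = b[1] + a[1];
      r[2] = a[2] - b[2];
      r[3] = a[3] + b[3];
    }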
-void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL, +void BoUpSLP::reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left, SmallVectorImpl<Value *> &Right) { // Push left and right operands of binary operation into Left and Right - for (Value *i : VL) { - Left.push_back(cast<Instruction>(i)->getOperand(0)); - Right.push_back(cast<Instruction>(i)->getOperand(1)); + unsigned AltOpcode = getAltOpcode(Opcode); + (void)AltOpcode; + for (Value *V : VL) { + auto *I = cast<Instruction>(V); + assert(sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode()) && + "Incorrect instruction in vector"); + Left.push_back(I->getOperand(0)); + Right.push_back(I->getOperand(1)); } // Reorder if we have a commutative operation and consecutive access @@ -2190,14 +2543,12 @@ void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL, // The vectorizer is trying to either have all elements one side being // instruction with the same opcode to enable further vectorization, or having // a splat to lower the vectorizing cost. -static bool shouldReorderOperands(int i, Instruction &I, - SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right, - bool AllSameOpcodeLeft, - bool AllSameOpcodeRight, bool SplatLeft, - bool SplatRight) { - Value *VLeft = I.getOperand(0); - Value *VRight = I.getOperand(1); +static bool shouldReorderOperands( + int i, unsigned Opcode, Instruction &I, ArrayRef<Value *> Left, + ArrayRef<Value *> Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight, + bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) { + VLeft = I.getOperand(0); + VRight = I.getOperand(1); // If we have "SplatRight", try to see if commuting is needed to preserve it. if (SplatRight) { if (VRight == Right[i - 1]) @@ -2253,15 +2604,16 @@ static bool shouldReorderOperands(int i, Instruction &I, return false; } -void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, +void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode, + ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left, SmallVectorImpl<Value *> &Right) { - - if (VL.size()) { + if (!VL.empty()) { // Peel the first iteration out of the loop since there's nothing // interesting to do anyway and it simplifies the checks in the loop. - auto VLeft = cast<Instruction>(VL[0])->getOperand(0); - auto VRight = cast<Instruction>(VL[0])->getOperand(1); + auto *I = cast<Instruction>(VL[0]); + Value *VLeft = I->getOperand(0); + Value *VRight = I->getOperand(1); if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft)) // Favor having instruction to the right. FIXME: why? std::swap(VLeft, VRight); @@ -2278,16 +2630,21 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, for (unsigned i = 1, e = VL.size(); i != e; ++i) { Instruction *I = cast<Instruction>(VL[i]); - assert(I->isCommutative() && "Can only process commutative instruction"); + assert(((I->getOpcode() == Opcode && I->isCommutative()) || + (I->getOpcode() != Opcode && Instruction::isCommutative(Opcode))) && + "Can only process commutative instruction"); // Commute to favor either a splat or maximizing having the same opcodes on // one side. 
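As an illustration of the splat case this commuting is after (names invented for the sketch):

    // Keeping x on the same side in every lane yields a splat vector operand,
    // which is cheap to materialize; commuting the odd lanes makes that
    // possible because multiplication is commutative.
    void scale4(float *r, const float *a, float x) {
      r[0] = a[0] * x;
      r[1] = x * a[1];
      r[2] = a[2] * x;
      r[3] = x * a[3];
    }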
- if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft, - AllSameOpcodeRight, SplatLeft, SplatRight)) { - Left.push_back(I->getOperand(1)); - Right.push_back(I->getOperand(0)); + Value *VLeft; + Value *VRight; + if (shouldReorderOperands(i, Opcode, *I, Left, Right, AllSameOpcodeLeft, + AllSameOpcodeRight, SplatLeft, SplatRight, VLeft, + VRight)) { + Left.push_back(VRight); + Right.push_back(VLeft); } else { - Left.push_back(I->getOperand(0)); - Right.push_back(I->getOperand(1)); + Left.push_back(VLeft); + Right.push_back(VRight); } // Update Splat* and AllSameOpcode* after the insertion. SplatRight = SplatRight && (Right[i - 1] == Right[i]); @@ -2340,14 +2697,17 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, } } -void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) { - +void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) { // Get the basic block this bundle is in. All instructions in the bundle // should be in this block. - auto *Front = cast<Instruction>(VL.front()); + auto *Front = cast<Instruction>(OpValue); auto *BB = Front->getParent(); - assert(all_of(make_range(VL.begin(), VL.end()), [&](Value *V) -> bool { - return cast<Instruction>(V)->getParent() == BB; + const unsigned Opcode = cast<Instruction>(OpValue)->getOpcode(); + const unsigned AltOpcode = getAltOpcode(Opcode); + assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool { + return !sameOpcodeOrAlt(Opcode, AltOpcode, + cast<Instruction>(V)->getOpcode()) || + cast<Instruction>(V)->getParent() == BB; })); // The last instruction in the bundle in program order. @@ -2358,10 +2718,12 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) { // VL.back() and iterate over schedule data until we reach the end of the // bundle. The end of the bundle is marked by null ScheduleData. 
if (BlocksSchedules.count(BB)) { - auto *Bundle = BlocksSchedules[BB]->getScheduleData(VL.back()); + auto *Bundle = + BlocksSchedules[BB]->getScheduleData(isOneOf(OpValue, VL.back())); if (Bundle && Bundle->isPartOfBundle()) for (; Bundle; Bundle = Bundle->NextInBundle) - LastInst = Bundle->Inst; + if (Bundle->OpValue == Bundle->Inst) + LastInst = Bundle->Inst; } // LastInst can still be null at this point if there's either not an entry @@ -2385,7 +2747,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) { if (!LastInst) { SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end()); for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { - if (Bundle.erase(&I)) + if (Bundle.erase(&I) && sameOpcodeOrAlt(Opcode, AltOpcode, I.getOpcode())) LastInst = &I; if (Bundle.empty()) break; @@ -2435,27 +2797,41 @@ Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const { return nullptr; } -Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { - if (TreeEntry *E = getTreeEntry(VL[0])) - if (E->isSame(VL)) - return vectorizeTree(E); +Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int OpdNum, int UserIndx) { + InstructionsState S = getSameOpcode(VL); + if (S.Opcode) { + if (TreeEntry *E = getTreeEntry(S.OpValue)) { + TreeEntry *UserTreeEntry = nullptr; + if (UserIndx != -1) + UserTreeEntry = &VectorizableTree[UserIndx]; + + if (E->isSame(VL) || + (UserTreeEntry && + (unsigned)OpdNum < UserTreeEntry->ShuffleMask.size() && + !UserTreeEntry->ShuffleMask[OpdNum].empty() && + E->isFoundJumbled(VL, *DL, *SE))) + return vectorizeTree(E, OpdNum, UserIndx); + } + } - Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) + Type *ScalarTy = S.OpValue->getType(); + if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue)) ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); return Gather(VL, VecTy); } -Value *BoUpSLP::vectorizeTree(TreeEntry *E) { +Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) { IRBuilder<>::InsertPointGuard Guard(Builder); + TreeEntry *UserTreeEntry = nullptr; if (E->VectorizedValue) { DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); return E->VectorizedValue; } + InstructionsState S = getSameOpcode(E->Scalars); Instruction *VL0 = cast<Instruction>(E->Scalars[0]); Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL0)) @@ -2463,15 +2839,19 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); if (E->NeedToGather) { - setInsertPointAfterBundle(E->Scalars); + setInsertPointAfterBundle(E->Scalars, VL0); auto *V = Gather(E->Scalars, VecTy); E->VectorizedValue = V; return V; } - unsigned Opcode = getSameOpcode(E->Scalars); + assert(ScalarToTreeEntry.count(E->Scalars[0]) && + "Expected user tree entry, missing!"); + int CurrIndx = ScalarToTreeEntry[E->Scalars[0]]; - switch (Opcode) { + unsigned ShuffleOrOp = S.IsAltShuffle ? 
+ (unsigned) Instruction::ShuffleVector : S.Opcode; + switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast<PHINode>(VL0); Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); @@ -2498,7 +2878,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - Value *Vec = vectorizeTree(Operands); + Value *Vec = vectorizeTree(Operands, i, CurrIndx); NewPhi->addIncoming(Vec, IBB); } @@ -2508,18 +2888,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::ExtractElement: { - if (canReuseExtract(E->Scalars, Instruction::ExtractElement)) { + if (canReuseExtract(E->Scalars, VL0)) { Value *V = VL0->getOperand(0); E->VectorizedValue = V; return V; } - setInsertPointAfterBundle(E->Scalars); + setInsertPointAfterBundle(E->Scalars, VL0); auto *V = Gather(E->Scalars, VecTy); E->VectorizedValue = V; return V; } case Instruction::ExtractValue: { - if (canReuseExtract(E->Scalars, Instruction::ExtractValue)) { + if (canReuseExtract(E->Scalars, VL0)) { LoadInst *LI = cast<LoadInst>(VL0->getOperand(0)); Builder.SetInsertPoint(LI); PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); @@ -2528,7 +2908,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E->VectorizedValue = V; return propagateMetadata(V, E->Scalars); } - setInsertPointAfterBundle(E->Scalars); + setInsertPointAfterBundle(E->Scalars, VL0); auto *V = Gather(E->Scalars, VecTy); E->VectorizedValue = V; return V; @@ -2549,9 +2929,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { for (Value *V : E->Scalars) INVL.push_back(cast<Instruction>(V)->getOperand(0)); - setInsertPointAfterBundle(E->Scalars); + setInsertPointAfterBundle(E->Scalars, VL0); - Value *InVec = vectorizeTree(INVL); + Value *InVec = vectorizeTree(INVL, 0, CurrIndx); if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; @@ -2570,23 +2950,23 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { RHSV.push_back(cast<Instruction>(V)->getOperand(1)); } - setInsertPointAfterBundle(E->Scalars); + setInsertPointAfterBundle(E->Scalars, VL0); - Value *L = vectorizeTree(LHSV); - Value *R = vectorizeTree(RHSV); + Value *L = vectorizeTree(LHSV, 0, CurrIndx); + Value *R = vectorizeTree(RHSV, 1, CurrIndx); if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); Value *V; - if (Opcode == Instruction::FCmp) + if (S.Opcode == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); else V = Builder.CreateICmp(P0, L, R); E->VectorizedValue = V; - propagateIRFlags(E->VectorizedValue, E->Scalars); + propagateIRFlags(E->VectorizedValue, E->Scalars, VL0); ++NumVectorInstructions; return V; } @@ -2598,11 +2978,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { FalseVec.push_back(cast<Instruction>(V)->getOperand(2)); } - setInsertPointAfterBundle(E->Scalars); + setInsertPointAfterBundle(E->Scalars, VL0); - Value *Cond = vectorizeTree(CondVec); - Value *True = vectorizeTree(TrueVec); - Value *False = vectorizeTree(FalseVec); + Value *Cond = vectorizeTree(CondVec, 0, CurrIndx); + Value *True = vectorizeTree(TrueVec, 1, CurrIndx); + Value *False = vectorizeTree(FalseVec, 2, CurrIndx); if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; @@ -2632,25 +3012,27 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Xor: { ValueList LHSVL, RHSVL; if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) - reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL); + 
reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL, + RHSVL); else for (Value *V : E->Scalars) { - LHSVL.push_back(cast<Instruction>(V)->getOperand(0)); - RHSVL.push_back(cast<Instruction>(V)->getOperand(1)); + auto *I = cast<Instruction>(V); + LHSVL.push_back(I->getOperand(0)); + RHSVL.push_back(I->getOperand(1)); } - setInsertPointAfterBundle(E->Scalars); + setInsertPointAfterBundle(E->Scalars, VL0); - Value *LHS = vectorizeTree(LHSVL); - Value *RHS = vectorizeTree(RHSVL); + Value *LHS = vectorizeTree(LHSVL, 0, CurrIndx); + Value *RHS = vectorizeTree(RHSVL, 1, CurrIndx); if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; - BinaryOperator *BinOp = cast<BinaryOperator>(VL0); - Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS); + Value *V = Builder.CreateBinOp( + static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS); E->VectorizedValue = V; - propagateIRFlags(E->VectorizedValue, E->Scalars); + propagateIRFlags(E->VectorizedValue, E->Scalars, VL0); ++NumVectorInstructions; if (Instruction *I = dyn_cast<Instruction>(V)) @@ -2661,9 +3043,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Load: { // Loads are inserted at the head of the tree because we don't want to // sink them all the way down past store instructions. - setInsertPointAfterBundle(E->Scalars); + setInsertPointAfterBundle(E->Scalars, VL0); + + if (UserIndx != -1) + UserTreeEntry = &VectorizableTree[UserIndx]; + + bool isJumbled = false; + LoadInst *LI = NULL; + if (UserTreeEntry && + (unsigned)OpdNum < UserTreeEntry->ShuffleMask.size() && + !UserTreeEntry->ShuffleMask[OpdNum].empty()) { + isJumbled = true; + LI = cast<LoadInst>(E->Scalars[0]); + } else { + LI = cast<LoadInst>(VL0); + } - LoadInst *LI = cast<LoadInst>(VL0); Type *ScalarLoadTy = LI->getType(); unsigned AS = LI->getPointerAddressSpace(); @@ -2685,47 +3080,60 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { LI->setAlignment(Alignment); E->VectorizedValue = LI; ++NumVectorInstructions; - return propagateMetadata(LI, E->Scalars); + propagateMetadata(LI, E->Scalars); + + if (isJumbled) { + SmallVector<Constant *, 8> Mask; + for (unsigned LaneEntry : UserTreeEntry->ShuffleMask[OpdNum]) + Mask.push_back(Builder.getInt32(LaneEntry)); + // Generate shuffle for jumbled memory access + Value *Undef = UndefValue::get(VecTy); + Value *Shuf = Builder.CreateShuffleVector((Value *)LI, Undef, + ConstantVector::get(Mask)); + E->VectorizedValue = Shuf; + ++NumVectorInstructions; + return Shuf; + } + return LI; } case Instruction::Store: { StoreInst *SI = cast<StoreInst>(VL0); unsigned Alignment = SI->getAlignment(); unsigned AS = SI->getPointerAddressSpace(); - ValueList ValueOp; + ValueList ScalarStoreValues; for (Value *V : E->Scalars) - ValueOp.push_back(cast<StoreInst>(V)->getValueOperand()); + ScalarStoreValues.push_back(cast<StoreInst>(V)->getValueOperand()); - setInsertPointAfterBundle(E->Scalars); + setInsertPointAfterBundle(E->Scalars, VL0); - Value *VecValue = vectorizeTree(ValueOp); - Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), - VecTy->getPointerTo(AS)); + Value *VecValue = vectorizeTree(ScalarStoreValues, 0, CurrIndx); + Value *ScalarPtr = SI->getPointerOperand(); + Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); StoreInst *S = Builder.CreateStore(VecValue, VecPtr); - // The pointer operand uses an in-tree scalar so we add the new BitCast to - // ExternalUses list to make sure that an extract will be generated in the + // The pointer operand uses an in-tree scalar, so add 
the new BitCast to + // ExternalUses to make sure that an extract will be generated in the // future. - Value *PO = SI->getPointerOperand(); - if (getTreeEntry(PO)) - ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0)); + if (getTreeEntry(ScalarPtr)) + ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0)); - if (!Alignment) { + if (!Alignment) Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); - } + S->setAlignment(Alignment); E->VectorizedValue = S; ++NumVectorInstructions; return propagateMetadata(S, E->Scalars); } case Instruction::GetElementPtr: { - setInsertPointAfterBundle(E->Scalars); + setInsertPointAfterBundle(E->Scalars, VL0); ValueList Op0VL; for (Value *V : E->Scalars) Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0)); - Value *Op0 = vectorizeTree(Op0VL); + Value *Op0 = vectorizeTree(Op0VL, 0, CurrIndx); std::vector<Value *> OpVecs; for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; @@ -2734,7 +3142,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { for (Value *V : E->Scalars) OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j)); - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(OpVL, j, CurrIndx); OpVecs.push_back(OpVec); } @@ -2750,7 +3158,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::Call: { CallInst *CI = cast<CallInst>(VL0); - setInsertPointAfterBundle(E->Scalars); + setInsertPointAfterBundle(E->Scalars, VL0); Function *FI; Intrinsic::ID IID = Intrinsic::not_intrinsic; Value *ScalarArg = nullptr; @@ -2763,7 +3171,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // ctlz,cttz and powi are special intrinsics whose second argument is // a scalar. This argument should not be vectorized. if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) { - CallInst *CEI = cast<CallInst>(E->Scalars[0]); + CallInst *CEI = cast<CallInst>(VL0); ScalarArg = CEI->getArgOperand(j); OpVecs.push_back(CEI->getArgOperand(j)); continue; @@ -2773,7 +3181,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { OpVL.push_back(CEI->getArgOperand(j)); } - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(OpVL, j, CurrIndx); DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); } @@ -2793,30 +3201,31 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0)); E->VectorizedValue = V; - propagateIRFlags(E->VectorizedValue, E->Scalars); + propagateIRFlags(E->VectorizedValue, E->Scalars, VL0); ++NumVectorInstructions; return V; } case Instruction::ShuffleVector: { ValueList LHSVL, RHSVL; - assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand"); - reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL); - setInsertPointAfterBundle(E->Scalars); + assert(Instruction::isBinaryOp(S.Opcode) && + "Invalid Shuffle Vector Operand"); + reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL); + setInsertPointAfterBundle(E->Scalars, VL0); - Value *LHS = vectorizeTree(LHSVL); - Value *RHS = vectorizeTree(RHSVL); + Value *LHS = vectorizeTree(LHSVL, 0, CurrIndx); + Value *RHS = vectorizeTree(RHSVL, 1, CurrIndx); if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; // Create a vector of LHS op1 RHS - BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0); - Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS); + Value *V0 = Builder.CreateBinOp( + static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS); + unsigned AltOpcode = getAltOpcode(S.Opcode); // Create a vector 
of LHS op2 RHS - Instruction *VL1 = cast<Instruction>(E->Scalars[1]); - BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1); - Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS); + Value *V1 = Builder.CreateBinOp( + static_cast<Instruction::BinaryOps>(AltOpcode), LHS, RHS); // Create shuffle to take alternate operations from the vector. // Also, gather up odd and even scalar ops to propagate IR flags to @@ -2859,7 +3268,6 @@ Value *BoUpSLP::vectorizeTree() { Value * BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { - // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) { scheduleBlock(BSIter.second.get()); @@ -2905,9 +3313,14 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { continue; TreeEntry *E = getTreeEntry(Scalar); assert(E && "Invalid scalar"); - assert(!E->NeedToGather && "Extracting from a gather list"); + assert((!E->NeedToGather) && "Extracting from a gather list"); - Value *Vec = E->VectorizedValue; + Value *Vec = dyn_cast<ShuffleVectorInst>(E->VectorizedValue); + if (Vec && dyn_cast<LoadInst>(cast<Instruction>(Vec)->getOperand(0))) { + Vec = cast<Instruction>(E->VectorizedValue)->getOperand(0); + } else { + Vec = E->VectorizedValue; + } assert(Vec && "Can't find vectorizable value"); Value *Lane = Builder.getInt32(ExternalUse.Lane); @@ -2975,14 +3388,15 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { for (TreeEntry &EIdx : VectorizableTree) { TreeEntry *Entry = &EIdx; + // No need to handle users of gathered values. + if (Entry->NeedToGather) + continue; + + assert(Entry->VectorizedValue && "Can't find vectorizable value"); + // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; - // No need to handle users of gathered values. - if (Entry->NeedToGather) - continue; - - assert(Entry->VectorizedValue && "Can't find vectorizable value"); Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { @@ -2990,9 +3404,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { for (User *U : Scalar->users()) { DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); - assert((getTreeEntry(U) || - // It is legal to replace users in the ignorelist by undef. - is_contained(UserIgnoreList, U)) && + // It is legal to replace users in the ignorelist by undef. + assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && "Replacing out-of-tree value with undef"); } #endif @@ -3009,7 +3422,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { return VectorizableTree[0].VectorizedValue; } -void BoUpSLP::optimizeGatherSequence() { +void BoUpSLP::optimizeGatherSequence(Function &F) { DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() << " gather sequences instructions.\n"); // LICM InsertElementInst sequences. @@ -3043,30 +3456,16 @@ void BoUpSLP::optimizeGatherSequence() { Insert->moveBefore(PreHeader->getTerminator()); } - // Make a list of all reachable blocks in our CSE queue. - SmallVector<const DomTreeNode *, 8> CSEWorkList; - CSEWorkList.reserve(CSEBlocks.size()); - for (BasicBlock *BB : CSEBlocks) - if (DomTreeNode *N = DT->getNode(BB)) { - assert(DT->isReachableFromEntry(N)); - CSEWorkList.push_back(N); - } - - // Sort blocks by domination. This ensures we visit a block after all blocks - // dominating it are visited. 
- std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), - [this](const DomTreeNode *A, const DomTreeNode *B) { - return DT->properlyDominates(A, B); - }); - // Perform O(N^2) search over the gather sequences and merge identical // instructions. TODO: We can further optimize this scan if we split the // instructions into different buckets based on the insert lane. SmallVector<Instruction *, 16> Visited; - for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { - assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && - "Worklist not sorted properly!"); - BasicBlock *BB = (*I)->getBlock(); + ReversePostOrderTraversal<Function *> RPOT(&F); + for (auto BB : RPOT) { + // Traverse CSEBlocks by RPOT order. + if (!CSEBlocks.count(BB)) + continue; + // For all instructions in blocks containing gather sequences: for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { Instruction *In = &*it++; @@ -3111,7 +3510,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, // Make sure that the scheduling region contains all // instructions of the bundle. for (Value *V : VL) { - if (!extendSchedulingRegion(V)) + if (!extendSchedulingRegion(V, OpValue)) return false; } @@ -3148,8 +3547,9 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, // It is seldom that this needs to be done a second time after adding the // initial bundle to the region. for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { - ScheduleData *SD = getScheduleData(I); - SD->clearDependencies(); + doForAllOpcodes(I, [](ScheduleData *SD) { + SD->clearDependencies(); + }); } ReSchedule = true; } @@ -3210,17 +3610,43 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL, } } -bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { - if (getScheduleData(V)) +BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { + // Allocate a new ScheduleData for the instruction. + if (ChunkPos >= ChunkSize) { + ScheduleDataChunks.push_back(llvm::make_unique<ScheduleData[]>(ChunkSize)); + ChunkPos = 0; + } + return &(ScheduleDataChunks.back()[ChunkPos++]); +} + +bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, + Value *OpValue) { + if (getScheduleData(V, isOneOf(OpValue, V))) return true; Instruction *I = dyn_cast<Instruction>(V); assert(I && "bundle member must be an instruction"); assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled"); + auto &&CheckSheduleForI = [this, OpValue](Instruction *I) -> bool { + ScheduleData *ISD = getScheduleData(I); + if (!ISD) + return false; + assert(isInSchedulingRegion(ISD) && + "ScheduleData not in scheduling region"); + ScheduleData *SD = allocateScheduleDataChunks(); + SD->Inst = I; + SD->init(SchedulingRegionID, OpValue); + ExtraScheduleDataMap[I][OpValue] = SD; + return true; + }; + if (CheckSheduleForI(I)) + return true; if (!ScheduleStart) { // It's the first instruction in the new region. 
initScheduleData(I, I->getNextNode(), nullptr, nullptr); ScheduleStart = I; ScheduleEnd = I->getNextNode(); + if (isOneOf(OpValue, I) != I) + CheckSheduleForI(I); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); return true; @@ -3232,7 +3658,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { BasicBlock::reverse_iterator UpperEnd = BB->rend(); BasicBlock::iterator DownIter = ScheduleEnd->getIterator(); BasicBlock::iterator LowerEnd = BB->end(); - for (;;) { + while (true) { if (++ScheduleRegionSize > ScheduleRegionSizeLimit) { DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n"); return false; @@ -3242,6 +3668,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { if (&*UpIter == I) { initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); ScheduleStart = I; + if (isOneOf(OpValue, I) != I) + CheckSheduleForI(I); DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n"); return true; } @@ -3252,6 +3680,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, nullptr); ScheduleEnd = I->getNextNode(); + if (isOneOf(OpValue, I) != I) + CheckSheduleForI(I); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); return true; @@ -3272,21 +3702,17 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { ScheduleData *SD = ScheduleDataMap[I]; if (!SD) { - // Allocate a new ScheduleData for the instruction. - if (ChunkPos >= ChunkSize) { - ScheduleDataChunks.push_back( - llvm::make_unique<ScheduleData[]>(ChunkSize)); - ChunkPos = 0; - } - SD = &(ScheduleDataChunks.back()[ChunkPos++]); + SD = allocateScheduleDataChunks(); ScheduleDataMap[I] = SD; SD->Inst = I; } assert(!isInSchedulingRegion(SD) && "new ScheduleData already in scheduling region"); - SD->init(SchedulingRegionID); + SD->init(SchedulingRegionID, I); - if (I->mayReadOrWriteMemory()) { + if (I->mayReadOrWriteMemory() && + (!isa<IntrinsicInst>(I) || + cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect)) { // Update the linked list of memory accessing instructions. if (CurrentLoadStore) { CurrentLoadStore->NextLoadStore = SD; @@ -3326,23 +3752,35 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, BundleMember->resetUnscheduledDeps(); // Handle def-use chain dependencies. 
- for (User *U : BundleMember->Inst->users()) { - if (isa<Instruction>(U)) { - ScheduleData *UseSD = getScheduleData(U); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { + if (BundleMember->OpValue != BundleMember->Inst) { + ScheduleData *UseSD = getScheduleData(BundleMember->Inst); + if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { + BundleMember->Dependencies++; + ScheduleData *DestBundle = UseSD->FirstInBundle; + if (!DestBundle->IsScheduled) + BundleMember->incrementUnscheduledDeps(1); + if (!DestBundle->hasValidDependencies()) + WorkList.push_back(DestBundle); + } + } else { + for (User *U : BundleMember->Inst->users()) { + if (isa<Instruction>(U)) { + ScheduleData *UseSD = getScheduleData(U); + if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { + BundleMember->Dependencies++; + ScheduleData *DestBundle = UseSD->FirstInBundle; + if (!DestBundle->IsScheduled) + BundleMember->incrementUnscheduledDeps(1); + if (!DestBundle->hasValidDependencies()) + WorkList.push_back(DestBundle); + } + } else { + // I'm not sure if this can ever happen. But we need to be safe. + // This lets the instruction/bundle never be scheduled and + // eventually disable vectorization. BundleMember->Dependencies++; - ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) - BundleMember->incrementUnscheduledDeps(1); - if (!DestBundle->hasValidDependencies()) - WorkList.push_back(DestBundle); + BundleMember->incrementUnscheduledDeps(1); } - } else { - // I'm not sure if this can ever happen. But we need to be safe. - // This lets the instruction/bundle never be scheduled and - // eventually disable vectorization. - BundleMember->Dependencies++; - BundleMember->incrementUnscheduledDeps(1); } } @@ -3419,16 +3857,17 @@ void BoUpSLP::BlockScheduling::resetSchedule() { assert(ScheduleStart && "tried to reset schedule on block which has not been scheduled"); for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { - ScheduleData *SD = getScheduleData(I); - assert(isInSchedulingRegion(SD)); - SD->IsScheduled = false; - SD->resetUnscheduledDeps(); + doForAllOpcodes(I, [&](ScheduleData *SD) { + assert(isInSchedulingRegion(SD) && + "ScheduleData not in scheduling region"); + SD->IsScheduled = false; + SD->resetUnscheduledDeps(); + }); } ReadyInsts.clear(); } void BoUpSLP::scheduleBlock(BlockScheduling *BS) { - if (!BS->ScheduleStart) return; @@ -3452,15 +3891,16 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { int NumToSchedule = 0; for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { - ScheduleData *SD = BS->getScheduleData(I); - assert( - SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr) && - "scheduler and vectorizer have different opinion on what is a bundle"); - SD->FirstInBundle->SchedulingPriority = Idx++; - if (SD->isSchedulingEntity()) { - BS->calculateDependencies(SD, false, this); - NumToSchedule++; - } + BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { + assert(SD->isPartOfBundle() == + (getTreeEntry(SD->Inst) != nullptr) && + "scheduler and vectorizer bundle mismatch"); + SD->FirstInBundle->SchedulingPriority = Idx++; + if (SD->isSchedulingEntity()) { + BS->calculateDependencies(SD, false, this); + NumToSchedule++; + } + }); } BS->initialFillReadyList(ReadyInsts); @@ -3559,7 +3999,6 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) { static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr, SmallVectorImpl<Value *> &ToDemote, SmallVectorImpl<Value 
*> &Roots) { - // We can always demote constants. if (isa<Constant>(V)) { ToDemote.push_back(V); @@ -3702,7 +4141,7 @@ void BoUpSLP::computeMinimumValueSizes() { // Determine if the sign bit of all the roots is known to be zero. If not, // IsKnownPositive is set to False. - IsKnownPositive = all_of(TreeRoot, [&](Value *R) { + IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) { KnownBits Known = computeKnownBits(R, *DL); return Known.isNonNegative(); }); @@ -3710,7 +4149,7 @@ void BoUpSLP::computeMinimumValueSizes() { // Determine the maximum number of bits required to store the scalar // values. for (auto *Scalar : ToDemote) { - auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, 0, DT); + auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT); auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType()); MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth); } @@ -3755,6 +4194,7 @@ void BoUpSLP::computeMinimumValueSizes() { } namespace { + /// The SLPVectorizer Pass. struct SLPVectorizer : public FunctionPass { SLPVectorizerPass Impl; @@ -3766,7 +4206,6 @@ struct SLPVectorizer : public FunctionPass { initializeSLPVectorizerPass(*PassRegistry::getPassRegistry()); } - bool doInitialization(Module &M) override { return false; } @@ -3806,6 +4245,7 @@ struct SLPVectorizer : public FunctionPass { AU.setPreservesCFG(); } }; + } // end anonymous namespace PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { @@ -3893,7 +4333,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, } if (Changed) { - R.optimizeGatherSequence(); + R.optimizeGatherSequence(F); DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); DEBUG(verifyFunction(F)); } @@ -3952,7 +4392,9 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); if (Cost < -SLPCostThreshold) { DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + using namespace ore; + R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", cast<StoreInst>(Chain[i])) << "Stores SLP vectorized with cost " << NV("Cost", Cost) @@ -3972,7 +4414,8 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, BoUpSLP &R) { - SetVector<StoreInst *> Heads, Tails; + SetVector<StoreInst *> Heads; + SmallDenseSet<StoreInst *> Tails; SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain; // We may run into multiple chains that merge into a single chain. We mark the @@ -3980,45 +4423,51 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, BoUpSLP::ValueSet VectorizedStores; bool Changed = false; - // Do a quadratic search on all of the given stores and find + // Do a quadratic search on all of the given stores in reverse order and find // all of the pairs of stores that follow each other. SmallVector<unsigned, 16> IndexQueue; - for (unsigned i = 0, e = Stores.size(); i < e; ++i) { - IndexQueue.clear(); + unsigned E = Stores.size(); + IndexQueue.resize(E - 1); + for (unsigned I = E; I > 0; --I) { + unsigned Idx = I - 1; // If a store has multiple consecutive store candidates, search Stores - // array according to the sequence: from i+1 to e, then from i-1 to 0. + // array according to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ... 
// This is because usually pairing with immediate succeeding or preceding // candidate create the best chance to find slp vectorization opportunity. - unsigned j = 0; - for (j = i + 1; j < e; ++j) - IndexQueue.push_back(j); - for (j = i; j > 0; --j) - IndexQueue.push_back(j - 1); - - for (auto &k : IndexQueue) { - if (isConsecutiveAccess(Stores[i], Stores[k], *DL, *SE)) { - Tails.insert(Stores[k]); - Heads.insert(Stores[i]); - ConsecutiveChain[Stores[i]] = Stores[k]; + unsigned Offset = 1; + unsigned Cnt = 0; + for (unsigned J = 0; J < E - 1; ++J, ++Offset) { + if (Idx >= Offset) { + IndexQueue[Cnt] = Idx - Offset; + ++Cnt; + } + if (Idx + Offset < E) { + IndexQueue[Cnt] = Idx + Offset; + ++Cnt; + } + } + + for (auto K : IndexQueue) { + if (isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) { + Tails.insert(Stores[Idx]); + Heads.insert(Stores[K]); + ConsecutiveChain[Stores[K]] = Stores[Idx]; break; } } } // For stores that start but don't end a link in the chain: - for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end(); - it != e; ++it) { - if (Tails.count(*it)) + for (auto *SI : llvm::reverse(Heads)) { + if (Tails.count(SI)) continue; // We found a store instr that starts a chain. Now follow the chain and try // to vectorize it. BoUpSLP::ValueList Operands; - StoreInst *I = *it; + StoreInst *I = SI; // Collect the chain into a list. - while (Tails.count(I) || Heads.count(I)) { - if (VectorizedStores.count(I)) - break; + while ((Tails.count(I) || Heads.count(I)) && !VectorizedStores.count(I)) { Operands.push_back(I); // Move to the next value in the chain. I = ConsecutiveChain[I]; @@ -4041,7 +4490,6 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, } void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) { - // Initialize the collections. We will make a single pass over the block. Stores.clear(); GEPs.clear(); @@ -4050,7 +4498,6 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) { // Stores and GEPs according to the underlying objects of their pointer // operands. for (Instruction &I : *BB) { - // Ignore store instructions that are volatile or have a pointer operand // that doesn't point to a scalar type. 
if (auto *SI = dyn_cast<StoreInst>(&I)) { @@ -4086,7 +4533,8 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, ArrayRef<Value *> BuildVector, - bool AllowReorder) { + bool AllowReorder, + bool NeedExtraction) { if (VL.size() < 2) return false; @@ -4103,19 +4551,51 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, unsigned Sz = R.getVectorElementSize(I0); unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF); - if (MaxVF < 2) - return false; + if (MaxVF < 2) { + R.getORE()->emit([&]() { + return OptimizationRemarkMissed( + SV_NAME, "SmallVF", I0) + << "Cannot SLP vectorize list: vectorization factor " + << "less than 2 is not supported"; + }); + return false; + } for (Value *V : VL) { Type *Ty = V->getType(); - if (!isValidElementType(Ty)) + if (!isValidElementType(Ty)) { + // NOTE: the following will give user internal llvm type name, which may not be useful + R.getORE()->emit([&]() { + std::string type_str; + llvm::raw_string_ostream rso(type_str); + Ty->print(rso); + return OptimizationRemarkMissed( + SV_NAME, "UnsupportedType", I0) + << "Cannot SLP vectorize list: type " + << rso.str() + " is unsupported by vectorizer"; + }); return false; + } Instruction *Inst = dyn_cast<Instruction>(V); - if (!Inst || Inst->getOpcode() != Opcode0) + + if (!Inst) + return false; + if (Inst->getOpcode() != Opcode0) { + R.getORE()->emit([&]() { + return OptimizationRemarkMissed( + SV_NAME, "InequableTypes", I0) + << "Cannot SLP vectorize list: not all of the " + << "parts of scalar instructions are of the same type: " + << ore::NV("Instruction1Opcode", I0) << " and " + << ore::NV("Instruction2Opcode", Inst); + }); return false; + } } bool Changed = false; + bool CandidateFound = false; + int MinCost = SLPCostThreshold; // Keep track of values that were deleted by vectorizing in the loop below. SmallVector<WeakTrackingVH, 8> TrackValues(VL.begin(), VL.end()); @@ -4148,11 +4628,12 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, << "\n"); ArrayRef<Value *> Ops = VL.slice(I, OpsWidth); + ArrayRef<Value *> EmptyArray; ArrayRef<Value *> BuildVectorSlice; if (!BuildVector.empty()) BuildVectorSlice = BuildVector.slice(I, OpsWidth); - R.buildTree(Ops, BuildVectorSlice); + R.buildTree(Ops, NeedExtraction ? EmptyArray : BuildVectorSlice); // TODO: check if we can allow reordering for more cases. 
if (AllowReorder && R.shouldReorder()) { // Conceptually, there is nothing actually preventing us from trying to @@ -4169,14 +4650,16 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, R.computeMinimumValueSizes(); int Cost = R.getTreeCost(); + CandidateFound = true; + MinCost = std::min(MinCost, Cost); if (Cost < -SLPCostThreshold) { DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList", - cast<Instruction>(Ops[0])) - << "SLP vectorized with cost " << ore::NV("Cost", Cost) - << " and with tree size " - << ore::NV("TreeSize", R.getTreeSize())); + cast<Instruction>(Ops[0])) + << "SLP vectorized with cost " << ore::NV("Cost", Cost) + << " and with tree size " + << ore::NV("TreeSize", R.getTreeSize())); Value *VectorizedRoot = R.vectorizeTree(); @@ -4199,8 +4682,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, cast<Instruction>(Builder.CreateExtractElement( VectorizedRoot, Builder.getInt32(VecIdx++))); I->setOperand(1, Extract); - I->removeFromParent(); - I->insertAfter(Extract); + I->moveAfter(Extract); InsertAfter = I; } } @@ -4212,18 +4694,37 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, } } + if (!Changed && CandidateFound) { + R.getORE()->emit([&]() { + return OptimizationRemarkMissed( + SV_NAME, "NotBeneficial", I0) + << "List vectorization was possible but not beneficial with cost " + << ore::NV("Cost", MinCost) << " >= " + << ore::NV("Treshold", -SLPCostThreshold); + }); + } else if (!Changed) { + R.getORE()->emit([&]() { + return OptimizationRemarkMissed( + SV_NAME, "NotPossible", I0) + << "Cannot SLP vectorize list: vectorization was impossible" + << " with available vectorization factors"; + }); + } return Changed; } -bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { - if (!V) +bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { + if (!I) return false; - Value *P = V->getParent(); + if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I)) + return false; + + Value *P = I->getParent(); // Vectorize in current basic block only. - auto *Op0 = dyn_cast<Instruction>(V->getOperand(0)); - auto *Op1 = dyn_cast<Instruction>(V->getOperand(1)); + auto *Op0 = dyn_cast<Instruction>(I->getOperand(0)); + auto *Op1 = dyn_cast<Instruction>(I->getOperand(1)); if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P) return false; @@ -4286,6 +4787,7 @@ static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx, } namespace { + /// Model horizontal reductions. /// /// A horizontal reduction is a tree of reduction operations (currently add and @@ -4314,17 +4816,375 @@ namespace { /// *p = /// class HorizontalReduction { - SmallVector<Value *, 16> ReductionOps; + using ReductionOpsType = SmallVector<Value *, 16>; + using ReductionOpsListType = SmallVector<ReductionOpsType, 2>; + ReductionOpsListType ReductionOps; SmallVector<Value *, 32> ReducedVals; // Use map vector to make stable output. MapVector<Instruction *, Value *> ExtraArgs; - BinaryOperator *ReductionRoot = nullptr; + /// Kind of the reduction data. + enum ReductionKind { + RK_None, /// Not a reduction. + RK_Arithmetic, /// Binary reduction data. + RK_Min, /// Minimum reduction data. + RK_UMin, /// Unsigned minimum reduction data. + RK_Max, /// Maximum reduction data. + RK_UMax, /// Unsigned maximum reduction data. + }; + + /// Contains info about operation, like its opcode, left and right operands. 
+ class OperationData { + /// Opcode of the instruction. + unsigned Opcode = 0; + + /// Left operand of the reduction operation. + Value *LHS = nullptr; + + /// Right operand of the reduction operation. + Value *RHS = nullptr; + + /// Kind of the reduction operation. + ReductionKind Kind = RK_None; + + /// True if float point min/max reduction has no NaNs. + bool NoNaN = false; + + /// Checks if the reduction operation can be vectorized. + bool isVectorizable() const { + return LHS && RHS && + // We currently only support adds && min/max reductions. + ((Kind == RK_Arithmetic && + (Opcode == Instruction::Add || Opcode == Instruction::FAdd)) || + ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && + (Kind == RK_Min || Kind == RK_Max)) || + (Opcode == Instruction::ICmp && + (Kind == RK_UMin || Kind == RK_UMax))); + } + + /// Creates reduction operation with the current opcode. + Value *createOp(IRBuilder<> &Builder, const Twine &Name) const { + assert(isVectorizable() && + "Expected add|fadd or min/max reduction operation."); + Value *Cmp; + switch (Kind) { + case RK_Arithmetic: + return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS, + Name); + case RK_Min: + Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS) + : Builder.CreateFCmpOLT(LHS, RHS); + break; + case RK_Max: + Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS) + : Builder.CreateFCmpOGT(LHS, RHS); + break; + case RK_UMin: + assert(Opcode == Instruction::ICmp && "Expected integer types."); + Cmp = Builder.CreateICmpULT(LHS, RHS); + break; + case RK_UMax: + assert(Opcode == Instruction::ICmp && "Expected integer types."); + Cmp = Builder.CreateICmpUGT(LHS, RHS); + break; + case RK_None: + llvm_unreachable("Unknown reduction operation."); + } + return Builder.CreateSelect(Cmp, LHS, RHS, Name); + } + + public: + explicit OperationData() = default; + + /// Construction for reduced values. They are identified by opcode only and + /// don't have associated LHS/RHS values. + explicit OperationData(Value *V) { + if (auto *I = dyn_cast<Instruction>(V)) + Opcode = I->getOpcode(); + } + + /// Constructor for reduction operations with opcode and its left and + /// right operands. + OperationData(unsigned Opcode, Value *LHS, Value *RHS, ReductionKind Kind, + bool NoNaN = false) + : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind), NoNaN(NoNaN) { + assert(Kind != RK_None && "One of the reduction operations is expected."); + } + + explicit operator bool() const { return Opcode; } + + /// Get the index of the first operand. + unsigned getFirstOperandIndex() const { + assert(!!*this && "The opcode is not set."); + switch (Kind) { + case RK_Min: + case RK_UMin: + case RK_Max: + case RK_UMax: + return 1; + case RK_Arithmetic: + case RK_None: + break; + } + return 0; + } + + /// Total number of operands in the reduction operation. + unsigned getNumberOfOperands() const { + assert(Kind != RK_None && !!*this && LHS && RHS && + "Expected reduction operation."); + switch (Kind) { + case RK_Arithmetic: + return 2; + case RK_Min: + case RK_UMin: + case RK_Max: + case RK_UMax: + return 3; + case RK_None: + break; + } + llvm_unreachable("Reduction kind is not set"); + } + + /// Checks if the operation has the same parent as \p P. 
+ bool hasSameParent(Instruction *I, Value *P, bool IsRedOp) const { + assert(Kind != RK_None && !!*this && LHS && RHS && + "Expected reduction operation."); + if (!IsRedOp) + return I->getParent() == P; + switch (Kind) { + case RK_Arithmetic: + // Arithmetic reduction operation must be used once only. + return I->getParent() == P; + case RK_Min: + case RK_UMin: + case RK_Max: + case RK_UMax: { + // SelectInst must be used twice while the condition op must have single + // use only. + auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition()); + return I->getParent() == P && Cmp && Cmp->getParent() == P; + } + case RK_None: + break; + } + llvm_unreachable("Reduction kind is not set"); + } + /// Expected number of uses for reduction operations/reduced values. + bool hasRequiredNumberOfUses(Instruction *I, bool IsReductionOp) const { + assert(Kind != RK_None && !!*this && LHS && RHS && + "Expected reduction operation."); + switch (Kind) { + case RK_Arithmetic: + return I->hasOneUse(); + case RK_Min: + case RK_UMin: + case RK_Max: + case RK_UMax: + return I->hasNUses(2) && + (!IsReductionOp || + cast<SelectInst>(I)->getCondition()->hasOneUse()); + case RK_None: + break; + } + llvm_unreachable("Reduction kind is not set"); + } + + /// Initializes the list of reduction operations. + void initReductionOps(ReductionOpsListType &ReductionOps) { + assert(Kind != RK_None && !!*this && LHS && RHS && + "Expected reduction operation."); + switch (Kind) { + case RK_Arithmetic: + ReductionOps.assign(1, ReductionOpsType()); + break; + case RK_Min: + case RK_UMin: + case RK_Max: + case RK_UMax: + ReductionOps.assign(2, ReductionOpsType()); + break; + case RK_None: + llvm_unreachable("Reduction kind is not set"); + } + } + /// Add all reduction operations for the reduction instruction \p I. + void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) { + assert(Kind != RK_None && !!*this && LHS && RHS && + "Expected reduction operation."); + switch (Kind) { + case RK_Arithmetic: + ReductionOps[0].emplace_back(I); + break; + case RK_Min: + case RK_UMin: + case RK_Max: + case RK_UMax: + ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition()); + ReductionOps[1].emplace_back(I); + break; + case RK_None: + llvm_unreachable("Reduction kind is not set"); + } + } + + /// Checks if instruction is associative and can be vectorized. + bool isAssociative(Instruction *I) const { + assert(Kind != RK_None && *this && LHS && RHS && + "Expected reduction operation."); + switch (Kind) { + case RK_Arithmetic: + return I->isAssociative(); + case RK_Min: + case RK_Max: + return Opcode == Instruction::ICmp || + cast<Instruction>(I->getOperand(0))->isFast(); + case RK_UMin: + case RK_UMax: + assert(Opcode == Instruction::ICmp && + "Only integer compare operation is expected."); + return true; + case RK_None: + break; + } + llvm_unreachable("Reduction kind is not set"); + } + + /// Checks if the reduction operation can be vectorized. + bool isVectorizable(Instruction *I) const { + return isVectorizable() && isAssociative(I); + } + + /// Checks if two operation data are both a reduction op or both a reduced + /// value. 
+ bool operator==(const OperationData &OD) { + assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) && + "One of the comparing operations is incorrect."); + return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode); + } + bool operator!=(const OperationData &OD) { return !(*this == OD); } + void clear() { + Opcode = 0; + LHS = nullptr; + RHS = nullptr; + Kind = RK_None; + NoNaN = false; + } + + /// Get the opcode of the reduction operation. + unsigned getOpcode() const { + assert(isVectorizable() && "Expected vectorizable operation."); + return Opcode; + } + + /// Get kind of reduction data. + ReductionKind getKind() const { return Kind; } + Value *getLHS() const { return LHS; } + Value *getRHS() const { return RHS; } + Type *getConditionType() const { + switch (Kind) { + case RK_Arithmetic: + return nullptr; + case RK_Min: + case RK_Max: + case RK_UMin: + case RK_UMax: + return CmpInst::makeCmpResultType(LHS->getType()); + case RK_None: + break; + } + llvm_unreachable("Reduction kind is not set"); + } + + /// Creates reduction operation with the current opcode with the IR flags + /// from \p ReductionOps. + Value *createOp(IRBuilder<> &Builder, const Twine &Name, + const ReductionOpsListType &ReductionOps) const { + assert(isVectorizable() && + "Expected add|fadd or min/max reduction operation."); + auto *Op = createOp(Builder, Name); + switch (Kind) { + case RK_Arithmetic: + propagateIRFlags(Op, ReductionOps[0]); + return Op; + case RK_Min: + case RK_Max: + case RK_UMin: + case RK_UMax: + if (auto *SI = dyn_cast<SelectInst>(Op)) + propagateIRFlags(SI->getCondition(), ReductionOps[0]); + propagateIRFlags(Op, ReductionOps[1]); + return Op; + case RK_None: + break; + } + llvm_unreachable("Unknown reduction operation."); + } + /// Creates reduction operation with the current opcode with the IR flags + /// from \p I. + Value *createOp(IRBuilder<> &Builder, const Twine &Name, + Instruction *I) const { + assert(isVectorizable() && + "Expected add|fadd or min/max reduction operation."); + auto *Op = createOp(Builder, Name); + switch (Kind) { + case RK_Arithmetic: + propagateIRFlags(Op, I); + return Op; + case RK_Min: + case RK_Max: + case RK_UMin: + case RK_UMax: + if (auto *SI = dyn_cast<SelectInst>(Op)) { + propagateIRFlags(SI->getCondition(), + cast<SelectInst>(I)->getCondition()); + } + propagateIRFlags(Op, I); + return Op; + case RK_None: + break; + } + llvm_unreachable("Unknown reduction operation."); + } + + TargetTransformInfo::ReductionFlags getFlags() const { + TargetTransformInfo::ReductionFlags Flags; + Flags.NoNaN = NoNaN; + switch (Kind) { + case RK_Arithmetic: + break; + case RK_Min: + Flags.IsSigned = Opcode == Instruction::ICmp; + Flags.IsMaxOp = false; + break; + case RK_Max: + Flags.IsSigned = Opcode == Instruction::ICmp; + Flags.IsMaxOp = true; + break; + case RK_UMin: + Flags.IsSigned = false; + Flags.IsMaxOp = false; + break; + case RK_UMax: + Flags.IsSigned = false; + Flags.IsMaxOp = true; + break; + case RK_None: + llvm_unreachable("Reduction kind is not set"); + } + return Flags; + } + }; + + Instruction *ReductionRoot = nullptr; + + /// The operation data of the reduction operation. + OperationData ReductionData; + + /// The operation data of the values we perform a reduction on. + OperationData ReducedValueData; - /// The opcode of the reduction. - Instruction::BinaryOps ReductionOpcode = Instruction::BinaryOpsEnd; - /// The opcode of the values we perform a reduction on. 
- unsigned ReducedValueOpcode = 0; /// Should we model this reduction as a pairwise reduction tree or a tree that /// splits the vector in halves and adds those halves. bool IsPairwiseReduction = false; @@ -4349,55 +5209,89 @@ class HorizontalReduction { } } + static OperationData getOperationData(Value *V) { + if (!V) + return OperationData(); + + Value *LHS; + Value *RHS; + if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V)) { + return OperationData(cast<BinaryOperator>(V)->getOpcode(), LHS, RHS, + RK_Arithmetic); + } + if (auto *Select = dyn_cast<SelectInst>(V)) { + // Look for a min/max pattern. + if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select)) { + return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin); + } else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select)) { + return OperationData(Instruction::ICmp, LHS, RHS, RK_Min); + } else if (m_OrdFMin(m_Value(LHS), m_Value(RHS)).match(Select) || + m_UnordFMin(m_Value(LHS), m_Value(RHS)).match(Select)) { + return OperationData( + Instruction::FCmp, LHS, RHS, RK_Min, + cast<Instruction>(Select->getCondition())->hasNoNaNs()); + } else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select)) { + return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax); + } else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select)) { + return OperationData(Instruction::ICmp, LHS, RHS, RK_Max); + } else if (m_OrdFMax(m_Value(LHS), m_Value(RHS)).match(Select) || + m_UnordFMax(m_Value(LHS), m_Value(RHS)).match(Select)) { + return OperationData( + Instruction::FCmp, LHS, RHS, RK_Max, + cast<Instruction>(Select->getCondition())->hasNoNaNs()); + } + } + return OperationData(V); + } + public: HorizontalReduction() = default; /// \brief Try to find a reduction tree. - bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) { + bool matchAssociativeReduction(PHINode *Phi, Instruction *B) { assert((!Phi || is_contained(Phi->operands(), B)) && "Thi phi needs to use the binary operator"); + ReductionData = getOperationData(B); + // We could have a initial reductions that is not an add. // r *= v1 + v2 + v3 + v4 // In such a case start looking for a tree rooted in the first '+'. if (Phi) { - if (B->getOperand(0) == Phi) { + if (ReductionData.getLHS() == Phi) { Phi = nullptr; - B = dyn_cast<BinaryOperator>(B->getOperand(1)); - } else if (B->getOperand(1) == Phi) { + B = dyn_cast<Instruction>(ReductionData.getRHS()); + ReductionData = getOperationData(B); + } else if (ReductionData.getRHS() == Phi) { Phi = nullptr; - B = dyn_cast<BinaryOperator>(B->getOperand(0)); + B = dyn_cast<Instruction>(ReductionData.getLHS()); + ReductionData = getOperationData(B); } } - if (!B) + if (!ReductionData.isVectorizable(B)) return false; Type *Ty = B->getType(); if (!isValidElementType(Ty)) return false; - ReductionOpcode = B->getOpcode(); - ReducedValueOpcode = 0; + ReducedValueData.clear(); ReductionRoot = B; - // We currently only support adds. - if ((ReductionOpcode != Instruction::Add && - ReductionOpcode != Instruction::FAdd) || - !B->isAssociative()) - return false; - // Post order traverse the reduction tree starting at B. We only handle true - // trees containing only binary operators or selects. + // trees containing only binary operators. 
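// Illustrative aside (stand-alone C++ sketch by the editor, not part of the
// patch above): getOperationData recognizes min/max reductions from the
// canonical compare-plus-select idiom. The scalar identities being matched
// are simply:
#include <cstdint>

inline std::uint32_t sketch_umin(std::uint32_t A, std::uint32_t B) { return A < B ? A : B; } // icmp ult + select -> RK_UMin
inline std::int32_t  sketch_smin(std::int32_t A, std::int32_t B)   { return A < B ? A : B; } // icmp slt + select -> RK_Min
inline std::uint32_t sketch_umax(std::uint32_t A, std::uint32_t B) { return A > B ? A : B; } // icmp ugt + select -> RK_UMax
inline std::int32_t  sketch_smax(std::int32_t A, std::int32_t B)   { return A > B ? A : B; } // icmp sgt + select -> RK_Max
// The ordered/unordered floating-point forms map to RK_Min/RK_Max as well,
// with the compare's no-NaNs flag recorded for later use.
// (end of aside)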
SmallVector<std::pair<Instruction *, unsigned>, 32> Stack; - Stack.push_back(std::make_pair(B, 0)); + Stack.push_back(std::make_pair(B, ReductionData.getFirstOperandIndex())); + ReductionData.initReductionOps(ReductionOps); while (!Stack.empty()) { Instruction *TreeN = Stack.back().first; unsigned EdgeToVist = Stack.back().second++; - bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode; + OperationData OpData = getOperationData(TreeN); + bool IsReducedValue = OpData != ReductionData; // Postorder vist. - if (EdgeToVist == 2 || IsReducedValue) { + if (IsReducedValue || EdgeToVist == OpData.getNumberOfOperands()) { if (IsReducedValue) ReducedVals.push_back(TreeN); else { @@ -4415,7 +5309,7 @@ public: markExtraArg(Stack[Stack.size() - 2], TreeN); ExtraArgs.erase(TreeN); } else - ReductionOps.push_back(TreeN); + ReductionData.addReductionOps(TreeN, ReductionOps); } // Retract. Stack.pop_back(); @@ -4426,45 +5320,50 @@ public: Value *NextV = TreeN->getOperand(EdgeToVist); if (NextV != Phi) { auto *I = dyn_cast<Instruction>(NextV); + OpData = getOperationData(I); // Continue analysis if the next operand is a reduction operation or // (possibly) a reduced value. If the reduced value opcode is not set, // the first met operation != reduction operation is considered as the // reduced value class. - if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode || - I->getOpcode() == ReductionOpcode)) { + if (I && (!ReducedValueData || OpData == ReducedValueData || + OpData == ReductionData)) { + const bool IsReductionOperation = OpData == ReductionData; // Only handle trees in the current basic block. - if (I->getParent() != B->getParent()) { + if (!ReductionData.hasSameParent(I, B->getParent(), + IsReductionOperation)) { // I is an extra argument for TreeN (its parent operation). markExtraArg(Stack.back(), I); continue; } - // Each tree node needs to have one user except for the ultimate - // reduction. - if (!I->hasOneUse() && I != B) { + // Each tree node needs to have minimal number of users except for the + // ultimate reduction. + if (!ReductionData.hasRequiredNumberOfUses(I, + OpData == ReductionData) && + I != B) { // I is an extra argument for TreeN (its parent operation). markExtraArg(Stack.back(), I); continue; } - if (I->getOpcode() == ReductionOpcode) { + if (IsReductionOperation) { // We need to be able to reassociate the reduction operations. - if (!I->isAssociative()) { + if (!OpData.isAssociative(I)) { // I is an extra argument for TreeN (its parent operation). markExtraArg(Stack.back(), I); continue; } - } else if (ReducedValueOpcode && - ReducedValueOpcode != I->getOpcode()) { + } else if (ReducedValueData && + ReducedValueData != OpData) { // Make sure that the opcodes of the operations that we are going to // reduce match. // I is an extra argument for TreeN (its parent operation). markExtraArg(Stack.back(), I); continue; - } else if (!ReducedValueOpcode) - ReducedValueOpcode = I->getOpcode(); + } else if (!ReducedValueData) + ReducedValueData = OpData; - Stack.push_back(std::make_pair(I, 0)); + Stack.push_back(std::make_pair(I, OpData.getFirstOperandIndex())); continue; } } @@ -4492,7 +5391,7 @@ public: Value *VectorizedTree = nullptr; IRBuilder<> Builder(ReductionRoot); FastMathFlags Unsafe; - Unsafe.setUnsafeAlgebra(); + Unsafe.setFast(); Builder.setFastMathFlags(Unsafe); unsigned i = 0; @@ -4501,12 +5400,15 @@ public: // to use it. 
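// Illustrative aside (stand-alone C++ sketch by the editor, not part of the
// patch above): the explicit (instruction, next-operand-index) stack in
// matchAssociativeReduction performs an iterative post-order walk of the
// reduction tree. A minimal mirror on a hypothetical two-operand node type:
#include <cstddef>
#include <utility>
#include <vector>

struct SketchNode { int Opcode; SketchNode *Ops[2] = {nullptr, nullptr}; };

inline void sketchCollectReducedValues(SketchNode *Root,
                                       std::vector<SketchNode *> &ReducedVals) {
  std::vector<std::pair<SketchNode *, unsigned>> Stack{{Root, 0u}};
  while (!Stack.empty()) {
    SketchNode *N = Stack.back().first;
    unsigned EdgeToVisit = Stack.back().second++;
    bool IsReducedValue = N->Opcode != Root->Opcode;
    // Post-order visit: either a leaf of the reduction tree (a reduced value)
    // or a reduction operation whose operands have all been walked.
    if (IsReducedValue || EdgeToVisit == 2) {
      if (IsReducedValue)
        ReducedVals.push_back(N);
      Stack.pop_back();
      continue;
    }
    if (SketchNode *Next = N->Ops[EdgeToVisit])  // descend into the next operand
      Stack.emplace_back(Next, 0u);
  }
}
// (end of aside)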
for (auto &Pair : ExtraArgs) ExternallyUsedValues[Pair.second].push_back(Pair.first); + SmallVector<Value *, 16> IgnoreList; + for (auto &V : ReductionOps) + IgnoreList.append(V.begin(), V.end()); while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth); - V.buildTree(VL, ExternallyUsedValues, ReductionOps); + V.buildTree(VL, ExternallyUsedValues, IgnoreList); if (V.shouldReorder()) { SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend()); - V.buildTree(Reversed, ExternallyUsedValues, ReductionOps); + V.buildTree(Reversed, ExternallyUsedValues, IgnoreList); } if (V.isTreeTinyAndNotFullyVectorizable()) break; @@ -4516,17 +5418,27 @@ public: // Estimate cost. int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth); - if (Cost >= -SLPCostThreshold) - break; + if (Cost >= -SLPCostThreshold) { + V.getORE()->emit([&]() { + return OptimizationRemarkMissed( + SV_NAME, "HorSLPNotBeneficial", cast<Instruction>(VL[0])) + << "Vectorizing horizontal reduction is possible" + << "but not beneficial with cost " + << ore::NV("Cost", Cost) << " and threshold " + << ore::NV("Threshold", -SLPCostThreshold); + }); + break; + } DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost << ". (HorRdx)\n"); - auto *I0 = cast<Instruction>(VL[0]); - V.getORE()->emit( - OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0) + V.getORE()->emit([&]() { + return OptimizationRemark( + SV_NAME, "VectorizedHorizontalReduction", cast<Instruction>(VL[0])) << "Vectorized horizontal reduction with cost " << ore::NV("Cost", Cost) << " and with tree size " - << ore::NV("TreeSize", V.getTreeSize())); + << ore::NV("TreeSize", V.getTreeSize()); + }); // Vectorize a tree. DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); @@ -4534,12 +5446,14 @@ public: // Emit a reduction. Value *ReducedSubTree = - emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps, TTI); + emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); if (VectorizedTree) { Builder.SetCurrentDebugLocation(Loc); - VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree, - ReducedSubTree, "bin.rdx"); - propagateIRFlags(VectorizedTree, ReductionOps); + OperationData VectReductionData(ReductionData.getOpcode(), + VectorizedTree, ReducedSubTree, + ReductionData.getKind()); + VectorizedTree = + VectReductionData.createOp(Builder, "op.rdx", ReductionOps); } else VectorizedTree = ReducedSubTree; i += ReduxWidth; @@ -4551,9 +5465,10 @@ public: for (; i < NumReducedVals; ++i) { auto *I = cast<Instruction>(ReducedVals[i]); Builder.SetCurrentDebugLocation(I->getDebugLoc()); - VectorizedTree = - Builder.CreateBinOp(ReductionOpcode, VectorizedTree, I); - propagateIRFlags(VectorizedTree, ReductionOps); + OperationData VectReductionData(ReductionData.getOpcode(), + VectorizedTree, I, + ReductionData.getKind()); + VectorizedTree = VectReductionData.createOp(Builder, "", ReductionOps); } for (auto &Pair : ExternallyUsedValues) { assert(!Pair.second.empty() && @@ -4561,9 +5476,10 @@ public: // Add each externally used value to the final reduction. 
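// Illustrative aside (stand-alone C++ sketch by the editor, not part of the
// patch above): the driver loop in tryToReduce consumes ReduxWidth reduced
// values at a time, chains each vectorized partial result into the running
// reduction, and then folds any leftover scalars one by one. Plain ints stand
// in for IR values, and addition stands in for the actual reduction operation.
#include <cstddef>
#include <numeric>
#include <vector>

inline int sketchReduceInChunks(const std::vector<int> &ReducedVals,
                                std::size_t ReduxWidth) {
  int Result = 0;
  bool HaveResult = false;
  std::size_t I = 0;
  while (ReduxWidth > 2 && I + ReduxWidth <= ReducedVals.size()) {
    // Stand-in for building and costing an SLP tree over the slice and
    // emitting a horizontal reduction for it.
    int Partial = std::accumulate(ReducedVals.begin() + I,
                                  ReducedVals.begin() + I + ReduxWidth, 0);
    Result = HaveResult ? Result + Partial : Partial;   // "op.rdx"
    HaveResult = true;
    I += ReduxWidth;
  }
  for (; I < ReducedVals.size(); ++I) {                 // remaining scalar ops
    Result = HaveResult ? Result + ReducedVals[I] : ReducedVals[I];
    HaveResult = true;
  }
  return Result;
}
// (end of aside)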
for (auto *I : Pair.second) { Builder.SetCurrentDebugLocation(I->getDebugLoc()); - VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree, - Pair.first, "bin.extra"); - propagateIRFlags(VectorizedTree, I); + OperationData VectReductionData(ReductionData.getOpcode(), + VectorizedTree, Pair.first, + ReductionData.getKind()); + VectorizedTree = VectReductionData.createOp(Builder, "op.extra", I); } } // Update users. @@ -4583,15 +5499,58 @@ private: Type *ScalarTy = FirstReducedVal->getType(); Type *VecTy = VectorType::get(ScalarTy, ReduxWidth); - int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true); - int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false); + int PairwiseRdxCost; + int SplittingRdxCost; + switch (ReductionData.getKind()) { + case RK_Arithmetic: + PairwiseRdxCost = + TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy, + /*IsPairwiseForm=*/true); + SplittingRdxCost = + TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy, + /*IsPairwiseForm=*/false); + break; + case RK_Min: + case RK_Max: + case RK_UMin: + case RK_UMax: { + Type *VecCondTy = CmpInst::makeCmpResultType(VecTy); + bool IsUnsigned = ReductionData.getKind() == RK_UMin || + ReductionData.getKind() == RK_UMax; + PairwiseRdxCost = + TTI->getMinMaxReductionCost(VecTy, VecCondTy, + /*IsPairwiseForm=*/true, IsUnsigned); + SplittingRdxCost = + TTI->getMinMaxReductionCost(VecTy, VecCondTy, + /*IsPairwiseForm=*/false, IsUnsigned); + break; + } + case RK_None: + llvm_unreachable("Expected arithmetic or min/max reduction operation"); + } IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost; int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost; - int ScalarReduxCost = - (ReduxWidth - 1) * - TTI->getArithmeticInstrCost(ReductionOpcode, ScalarTy); + int ScalarReduxCost; + switch (ReductionData.getKind()) { + case RK_Arithmetic: + ScalarReduxCost = + TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy); + break; + case RK_Min: + case RK_Max: + case RK_UMin: + case RK_UMax: + ScalarReduxCost = + TTI->getCmpSelInstrCost(ReductionData.getOpcode(), ScalarTy) + + TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, + CmpInst::makeCmpResultType(ScalarTy)); + break; + case RK_None: + llvm_unreachable("Expected arithmetic or min/max reduction operation"); + } + ScalarReduxCost *= (ReduxWidth - 1); DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost << " for reduction that starts with " << *FirstReducedVal @@ -4604,16 +5563,15 @@ private: /// \brief Emit a horizontal reduction of the vectorized value. 
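// Illustrative aside (stand-alone C++ sketch by the editor, not part of the
// patch above): shape of the decision in getReductionCost. The scalar form
// needs ReduxWidth-1 reduction steps (for min/max: a compare plus a select
// each), compared against the cheaper of the pairwise and split-in-halves
// vector forms. Field names below are hypothetical.
#include <algorithm>

struct SketchRdxCosts {
  int PairwiseRdxCost;   // vector reduction, pairwise form
  int SplittingRdxCost;  // vector reduction, split-in-halves form
  int ScalarStepCost;    // one scalar reduction step
};

inline int sketchReductionCostDelta(const SketchRdxCosts &C, unsigned ReduxWidth,
                                    bool &IsPairwise) {
  IsPairwise = C.PairwiseRdxCost < C.SplittingRdxCost;
  int VecReduxCost = std::min(C.PairwiseRdxCost, C.SplittingRdxCost);
  int ScalarReduxCost = static_cast<int>(ReduxWidth - 1) * C.ScalarStepCost;
  return VecReduxCost - ScalarReduxCost;  // negative => vector form is cheaper
}
// (end of aside)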
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder, - unsigned ReduxWidth, ArrayRef<Value *> RedOps, - const TargetTransformInfo *TTI) { + unsigned ReduxWidth, const TargetTransformInfo *TTI) { assert(VectorizedValue && "Need to have a vectorized tree node"); assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); if (!IsPairwiseReduction) return createSimpleTargetReduction( - Builder, TTI, ReductionOpcode, VectorizedValue, - TargetTransformInfo::ReductionFlags(), RedOps); + Builder, TTI, ReductionData.getOpcode(), VectorizedValue, + ReductionData.getFlags(), ReductionOps.back()); Value *TmpVec = VectorizedValue; for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { @@ -4627,15 +5585,16 @@ private: Value *RightShuf = Builder.CreateShuffleVector( TmpVec, UndefValue::get(TmpVec->getType()), (RightMask), "rdx.shuf.r"); - TmpVec = - Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, "bin.rdx"); - propagateIRFlags(TmpVec, RedOps); + OperationData VectReductionData(ReductionData.getOpcode(), LeftShuf, + RightShuf, ReductionData.getKind()); + TmpVec = VectReductionData.createOp(Builder, "op.rdx", ReductionOps); } // The result is in the first element of the vector. return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); } }; + } // end anonymous namespace /// \brief Recognize construction of vectors like @@ -4643,39 +5602,29 @@ private: /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 +/// starting from the last insertelement instruction. /// /// Returns true if it matches -/// -static bool findBuildVector(InsertElementInst *FirstInsertElem, +static bool findBuildVector(InsertElementInst *LastInsertElem, SmallVectorImpl<Value *> &BuildVector, SmallVectorImpl<Value *> &BuildVectorOpds) { - if (!isa<UndefValue>(FirstInsertElem->getOperand(0))) - return false; - - InsertElementInst *IE = FirstInsertElem; - while (true) { - BuildVector.push_back(IE); - BuildVectorOpds.push_back(IE->getOperand(1)); - - if (IE->use_empty()) - return false; - - InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back()); - if (!NextUse) - return true; - - // If this isn't the final use, make sure the next insertelement is the only - // use. It's OK if the final constructed vector is used multiple times - if (!IE->hasOneUse()) + Value *V = nullptr; + do { + BuildVector.push_back(LastInsertElem); + BuildVectorOpds.push_back(LastInsertElem->getOperand(1)); + V = LastInsertElem->getOperand(0); + if (isa<UndefValue>(V)) + break; + LastInsertElem = dyn_cast<InsertElementInst>(V); + if (!LastInsertElem || !LastInsertElem->hasOneUse()) return false; - - IE = NextUse; - } - - return false; + } while (true); + std::reverse(BuildVector.begin(), BuildVector.end()); + std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); + return true; } -/// \brief Like findBuildVector, but looks backwards for construction of aggregate. +/// \brief Like findBuildVector, but looks for construction of aggregate. /// /// \return true if it matches. 
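// Illustrative aside (stand-alone C++ sketch by the editor, not part of the
// patch above): scalar model of the shuffle-based reduction emitted in
// emitReduction when the splitting form is chosen. Each step folds the upper
// half of the (power-of-two wide) vector into the lower half, so the whole
// reduction takes log2(ReduxWidth) combining steps and the result lands in
// lane 0, where it is extracted.
#include <cstddef>
#include <vector>

inline int sketchSplittingReduce(std::vector<int> Vec) {
  for (std::size_t Half = Vec.size() / 2; Half != 0; Half /= 2)
    for (std::size_t Lane = 0; Lane < Half; ++Lane)
      Vec[Lane] = Vec[Lane] + Vec[Lane + Half];   // rdx.shuf.l/.r + op.rdx
  return Vec[0];                                  // extractelement lane 0
}
// (end of aside)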
static bool findBuildAggregate(InsertValueInst *IV, @@ -4767,14 +5716,14 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P, static bool tryToVectorizeHorReductionOrInstOperands( PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI, - const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) { + const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) { if (!ShouldVectorizeHor) return false; if (!Root) return false; - if (Root->getParent() != BB) + if (Root->getParent() != BB || isa<PHINode>(Root)) return false; // Start analysis starting from Root instruction. If horizontal reduction is // found, try to vectorize it. If it is not a horizontal reduction or @@ -4795,11 +5744,13 @@ static bool tryToVectorizeHorReductionOrInstOperands( if (!V) continue; auto *Inst = dyn_cast<Instruction>(V); - if (!Inst || isa<PHINode>(Inst)) + if (!Inst) continue; - if (auto *BI = dyn_cast<BinaryOperator>(Inst)) { + auto *BI = dyn_cast<BinaryOperator>(Inst); + auto *SI = dyn_cast<SelectInst>(Inst); + if (BI || SI) { HorizontalReduction HorRdx; - if (HorRdx.matchAssociativeReduction(P, BI)) { + if (HorRdx.matchAssociativeReduction(P, Inst)) { if (HorRdx.tryToReduce(R, TTI)) { Res = true; // Set P to nullptr to avoid re-analysis of phi node in @@ -4808,7 +5759,7 @@ static bool tryToVectorizeHorReductionOrInstOperands( continue; } } - if (P) { + if (P && BI) { Inst = dyn_cast<Instruction>(BI->getOperand(0)); if (Inst == P) Inst = dyn_cast<Instruction>(BI->getOperand(1)); @@ -4823,15 +5774,20 @@ static bool tryToVectorizeHorReductionOrInstOperands( // Set P to nullptr to avoid re-analysis of phi node in // matchAssociativeReduction function unless this is the root node. P = nullptr; - if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) { + if (Vectorize(Inst, R)) { Res = true; continue; } // Try to vectorize operands. + // Continue analysis for the instruction from the same basic block only to + // save compile time. if (++Level < RecursionMaxDepth) for (auto *Op : Inst->operand_values()) - Stack.emplace_back(Op, Level); + if (VisitedInstrs.insert(Op).second) + if (auto *I = dyn_cast<Instruction>(Op)) + if (!isa<PHINode>(I) && I->getParent() == BB) + Stack.emplace_back(Op, Level); } return Res; } @@ -4848,10 +5804,71 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, if (!isa<BinaryOperator>(I)) P = nullptr; // Try to match and vectorize a horizontal reduction. - return tryToVectorizeHorReductionOrInstOperands( - P, I, BB, R, TTI, [this](BinaryOperator *BI, BoUpSLP &R) -> bool { - return tryToVectorize(BI, R); - }); + auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool { + return tryToVectorize(I, R); + }; + return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, + ExtraVectorization); +} + +bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, + BasicBlock *BB, BoUpSLP &R) { + const DataLayout &DL = BB->getModule()->getDataLayout(); + if (!R.canMapToVector(IVI->getType(), DL)) + return false; + + SmallVector<Value *, 16> BuildVector; + SmallVector<Value *, 16> BuildVectorOpds; + if (!findBuildAggregate(IVI, BuildVector, BuildVectorOpds)) + return false; + + DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); + // Aggregate value is unlikely to be processed in vector register, we need to + // extract scalars into scalar registers, so NeedExtraction is set true. 
+ return tryToVectorizeList(BuildVectorOpds, R, BuildVector, false, true); +} + +bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, + BasicBlock *BB, BoUpSLP &R) { + SmallVector<Value *, 16> BuildVector; + SmallVector<Value *, 16> BuildVectorOpds; + if (!findBuildVector(IEI, BuildVector, BuildVectorOpds)) + return false; + + // Vectorize starting with the build vector operands ignoring the BuildVector + // instructions for the purpose of scheduling and user extraction. + return tryToVectorizeList(BuildVectorOpds, R, BuildVector); +} + +bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB, + BoUpSLP &R) { + if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) + return true; + + bool OpsChanged = false; + for (int Idx = 0; Idx < 2; ++Idx) { + OpsChanged |= + vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI); + } + return OpsChanged; +} + +bool SLPVectorizerPass::vectorizeSimpleInstructions( + SmallVectorImpl<WeakVH> &Instructions, BasicBlock *BB, BoUpSLP &R) { + bool OpsChanged = false; + for (auto &VH : reverse(Instructions)) { + auto *I = dyn_cast_or_null<Instruction>(VH); + if (!I) + continue; + if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) + OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); + else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) + OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R); + else if (auto *CI = dyn_cast<CmpInst>(I)) + OpsChanged |= vectorizeCmpInst(CI, BB, R); + } + Instructions.clear(); + return OpsChanged; } bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { @@ -4913,10 +5930,21 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { VisitedInstrs.clear(); + SmallVector<WeakVH, 8> PostProcessInstructions; + SmallDenseSet<Instruction *, 4> KeyNodes; for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) { // We may go through BB multiple times so skip the one we have checked. - if (!VisitedInstrs.insert(&*it).second) + if (!VisitedInstrs.insert(&*it).second) { + if (it->use_empty() && KeyNodes.count(&*it) > 0 && + vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) { + // We would like to start over since some instructions are deleted + // and the iterator may become invalid value. + Changed = true; + it = BB->begin(); + e = BB->end(); + } continue; + } if (isa<DbgInfoIntrinsic>(it)) continue; @@ -4938,96 +5966,37 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { continue; } - if (ShouldStartVectorizeHorAtStore) { - if (StoreInst *SI = dyn_cast<StoreInst>(it)) { - // Try to match and vectorize a horizontal reduction. - if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R, - TTI)) { - Changed = true; - it = BB->begin(); - e = BB->end(); - continue; + // Ran into an instruction without users, like terminator, or function call + // with ignored return value, store. Ignore unused instructions (basing on + // instruction type, except for CallInst and InvokeInst). + if (it->use_empty() && (it->getType()->isVoidTy() || isa<CallInst>(it) || + isa<InvokeInst>(it))) { + KeyNodes.insert(&*it); + bool OpsChanged = false; + if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(it)) { + for (auto *V : it->operand_values()) { + // Try to match and vectorize a horizontal reduction. + OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI); } } - } - - // Try to vectorize horizontal reductions feeding into a return. 
- if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) { - if (RI->getNumOperands() != 0) { - // Try to match and vectorize a horizontal reduction. - if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) { - Changed = true; - it = BB->begin(); - e = BB->end(); - continue; - } - } - } - - // Try to vectorize trees that start at compare instructions. - if (CmpInst *CI = dyn_cast<CmpInst>(it)) { - if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) { - Changed = true; + // Start vectorization of post-process list of instructions from the + // top-tree instructions to try to vectorize as many instructions as + // possible. + OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R); + if (OpsChanged) { // We would like to start over since some instructions are deleted // and the iterator may become invalid value. - it = BB->begin(); - e = BB->end(); - continue; - } - - for (int I = 0; I < 2; ++I) { - if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) { - Changed = true; - // We would like to start over since some instructions are deleted - // and the iterator may become invalid value. - it = BB->begin(); - e = BB->end(); - break; - } - } - continue; - } - - // Try to vectorize trees that start at insertelement instructions. - if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) { - SmallVector<Value *, 16> BuildVector; - SmallVector<Value *, 16> BuildVectorOpds; - if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds)) - continue; - - // Vectorize starting with the build vector operands ignoring the - // BuildVector instructions for the purpose of scheduling and user - // extraction. - if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) { Changed = true; it = BB->begin(); e = BB->end(); + continue; } - - continue; } - // Try to vectorize trees that start at insertvalue instructions feeding into - // a store. - if (StoreInst *SI = dyn_cast<StoreInst>(it)) { - if (InsertValueInst *LastInsertValue = dyn_cast<InsertValueInst>(SI->getValueOperand())) { - const DataLayout &DL = BB->getModule()->getDataLayout(); - if (R.canMapToVector(SI->getValueOperand()->getType(), DL)) { - SmallVector<Value *, 16> BuildVector; - SmallVector<Value *, 16> BuildVectorOpds; - if (!findBuildAggregate(LastInsertValue, BuildVector, BuildVectorOpds)) - continue; + if (isa<InsertElementInst>(it) || isa<CmpInst>(it) || + isa<InsertValueInst>(it)) + PostProcessInstructions.push_back(&*it); - DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI << "\n"); - if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) { - Changed = true; - it = BB->begin(); - e = BB->end(); - } - continue; - } - } - } } return Changed; @@ -5036,7 +6005,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { auto Changed = false; for (auto &Entry : GEPs) { - // If the getelementptr list has fewer than two elements, there's nothing // to do. 
if (Entry.second.size() < 2) @@ -5141,7 +6109,9 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { } char SLPVectorizer::ID = 0; + static const char lv_name[] = "SLP Vectorizer"; + INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) @@ -5152,6 +6122,4 @@ INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) -namespace llvm { -Pass *createSLPVectorizerPass() { return new SLPVectorizer(); } -} +Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); } diff --git a/lib/Transforms/Vectorize/VPlan.cpp b/lib/Transforms/Vectorize/VPlan.cpp new file mode 100644 index 000000000000..4e54fc6db2a5 --- /dev/null +++ b/lib/Transforms/Vectorize/VPlan.cpp @@ -0,0 +1,557 @@ +//===- VPlan.cpp - Vectorizer Plan ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This is the LLVM vectorization plan. It represents a candidate for +/// vectorization, allowing to plan and optimize how to vectorize a given loop +/// before generating LLVM-IR. +/// The vectorizer uses vectorization plans to estimate the costs of potential +/// candidates and if profitable to execute the desired plan, generating vector +/// LLVM-IR code. +/// +//===----------------------------------------------------------------------===// + +#include "VPlan.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/GraphWriter.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include <cassert> +#include <iterator> +#include <string> +#include <vector> + +using namespace llvm; + +#define DEBUG_TYPE "vplan" + +raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) { + if (const VPInstruction *Instr = dyn_cast<VPInstruction>(&V)) + Instr->print(OS); + else + V.printAsOperand(OS); + return OS; +} + +/// \return the VPBasicBlock that is the entry of Block, possibly indirectly. +const VPBasicBlock *VPBlockBase::getEntryBasicBlock() const { + const VPBlockBase *Block = this; + while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) + Block = Region->getEntry(); + return cast<VPBasicBlock>(Block); +} + +VPBasicBlock *VPBlockBase::getEntryBasicBlock() { + VPBlockBase *Block = this; + while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) + Block = Region->getEntry(); + return cast<VPBasicBlock>(Block); +} + +/// \return the VPBasicBlock that is the exit of Block, possibly indirectly. 
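// Illustrative aside (stand-alone C++ sketch by the editor, not part of the
// patch above): the hierarchical-CFG lookups in VPlan.cpp keep descending
// through nested regions until a plain basic block is reached. A minimal
// mirror on a hypothetical block type:
struct SketchVPBlock {
  bool IsRegion = false;
  SketchVPBlock *Entry = nullptr;  // valid only when IsRegion
  SketchVPBlock *Exit = nullptr;   // valid only when IsRegion
};

inline SketchVPBlock *sketchEntryBasicBlock(SketchVPBlock *B) {
  while (B->IsRegion)
    B = B->Entry;
  return B;
}

inline SketchVPBlock *sketchExitBasicBlock(SketchVPBlock *B) {
  while (B->IsRegion)
    B = B->Exit;
  return B;
}
// (end of aside)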
+const VPBasicBlock *VPBlockBase::getExitBasicBlock() const { + const VPBlockBase *Block = this; + while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) + Block = Region->getExit(); + return cast<VPBasicBlock>(Block); +} + +VPBasicBlock *VPBlockBase::getExitBasicBlock() { + VPBlockBase *Block = this; + while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) + Block = Region->getExit(); + return cast<VPBasicBlock>(Block); +} + +VPBlockBase *VPBlockBase::getEnclosingBlockWithSuccessors() { + if (!Successors.empty() || !Parent) + return this; + assert(Parent->getExit() == this && + "Block w/o successors not the exit of its parent."); + return Parent->getEnclosingBlockWithSuccessors(); +} + +VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() { + if (!Predecessors.empty() || !Parent) + return this; + assert(Parent->getEntry() == this && + "Block w/o predecessors not the entry of its parent."); + return Parent->getEnclosingBlockWithPredecessors(); +} + +void VPBlockBase::deleteCFG(VPBlockBase *Entry) { + SmallVector<VPBlockBase *, 8> Blocks; + for (VPBlockBase *Block : depth_first(Entry)) + Blocks.push_back(Block); + + for (VPBlockBase *Block : Blocks) + delete Block; +} + +BasicBlock * +VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { + // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks. + // Pred stands for Predessor. Prev stands for Previous - last visited/created. + BasicBlock *PrevBB = CFG.PrevBB; + BasicBlock *NewBB = BasicBlock::Create(PrevBB->getContext(), getName(), + PrevBB->getParent(), CFG.LastBB); + DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n'); + + // Hook up the new basic block to its predecessors. + for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) { + VPBasicBlock *PredVPBB = PredVPBlock->getExitBasicBlock(); + auto &PredVPSuccessors = PredVPBB->getSuccessors(); + BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB]; + assert(PredBB && "Predecessor basic-block not found building successor."); + auto *PredBBTerminator = PredBB->getTerminator(); + DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n'); + if (isa<UnreachableInst>(PredBBTerminator)) { + assert(PredVPSuccessors.size() == 1 && + "Predecessor ending w/o branch must have single successor."); + PredBBTerminator->eraseFromParent(); + BranchInst::Create(NewBB, PredBB); + } else { + assert(PredVPSuccessors.size() == 2 && + "Predecessor ending with branch must have two successors."); + unsigned idx = PredVPSuccessors.front() == this ? 0 : 1; + assert(!PredBBTerminator->getSuccessor(idx) && + "Trying to reset an existing successor block."); + PredBBTerminator->setSuccessor(idx, NewBB); + } + } + return NewBB; +} + +void VPBasicBlock::execute(VPTransformState *State) { + bool Replica = State->Instance && + !(State->Instance->Part == 0 && State->Instance->Lane == 0); + VPBasicBlock *PrevVPBB = State->CFG.PrevVPBB; + VPBlockBase *SingleHPred = nullptr; + BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible. + + // 1. Create an IR basic block, or reuse the last one if possible. + // The last IR basic block is reused, as an optimization, in three cases: + // A. the first VPBB reuses the loop header BB - when PrevVPBB is null; + // B. when the current VPBB has a single (hierarchical) predecessor which + // is PrevVPBB and the latter has a single (hierarchical) successor; and + // C. 
when the current VPBB is an entry of a region replica - where PrevVPBB + // is the exit of this region from a previous instance, or the predecessor + // of this region. + if (PrevVPBB && /* A */ + !((SingleHPred = getSingleHierarchicalPredecessor()) && + SingleHPred->getExitBasicBlock() == PrevVPBB && + PrevVPBB->getSingleHierarchicalSuccessor()) && /* B */ + !(Replica && getPredecessors().empty())) { /* C */ + NewBB = createEmptyBasicBlock(State->CFG); + State->Builder.SetInsertPoint(NewBB); + // Temporarily terminate with unreachable until CFG is rewired. + UnreachableInst *Terminator = State->Builder.CreateUnreachable(); + State->Builder.SetInsertPoint(Terminator); + // Register NewBB in its loop. In innermost loops its the same for all BB's. + Loop *L = State->LI->getLoopFor(State->CFG.LastBB); + L->addBasicBlockToLoop(NewBB, *State->LI); + State->CFG.PrevBB = NewBB; + } + + // 2. Fill the IR basic block with IR instructions. + DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName() + << " in BB:" << NewBB->getName() << '\n'); + + State->CFG.VPBB2IRBB[this] = NewBB; + State->CFG.PrevVPBB = this; + + for (VPRecipeBase &Recipe : Recipes) + Recipe.execute(*State); + + DEBUG(dbgs() << "LV: filled BB:" << *NewBB); +} + +void VPRegionBlock::execute(VPTransformState *State) { + ReversePostOrderTraversal<VPBlockBase *> RPOT(Entry); + + if (!isReplicator()) { + // Visit the VPBlocks connected to "this", starting from it. + for (VPBlockBase *Block : RPOT) { + DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); + Block->execute(State); + } + return; + } + + assert(!State->Instance && "Replicating a Region with non-null instance."); + + // Enter replicating mode. + State->Instance = {0, 0}; + + for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) { + State->Instance->Part = Part; + for (unsigned Lane = 0, VF = State->VF; Lane < VF; ++Lane) { + State->Instance->Lane = Lane; + // Visit the VPBlocks connected to \p this, starting from it. + for (VPBlockBase *Block : RPOT) { + DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); + Block->execute(State); + } + } + } + + // Exit replicating mode. 
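// Illustrative aside (stand-alone C++ sketch by the editor, not part of the
// patch above): shape of the replication performed by VPRegionBlock::execute.
// A replicating region re-runs its blocks once per (Part, Lane) instance --
// UF unrolled parts times VF lanes -- while a non-replicating region runs
// them exactly once with no instance set.
#include <functional>

struct SketchInstance { unsigned Part, Lane; };

inline void sketchExecuteRegion(
    bool IsReplicator, unsigned UF, unsigned VF,
    const std::function<void(const SketchInstance *)> &RunBlocksInRPO) {
  if (!IsReplicator) {
    RunBlocksInRPO(nullptr);       // one pass over the blocks, no instance
    return;
  }
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane) {
      SketchInstance I{Part, Lane};
      RunBlocksInRPO(&I);          // generate one scalar instance per part/lane
    }
}
// (end of aside)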
+ State->Instance.reset(); +} + +void VPInstruction::generateInstruction(VPTransformState &State, + unsigned Part) { + IRBuilder<> &Builder = State.Builder; + + if (Instruction::isBinaryOp(getOpcode())) { + Value *A = State.get(getOperand(0), Part); + Value *B = State.get(getOperand(1), Part); + Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B); + State.set(this, V, Part); + return; + } + + switch (getOpcode()) { + case VPInstruction::Not: { + Value *A = State.get(getOperand(0), Part); + Value *V = Builder.CreateNot(A); + State.set(this, V, Part); + break; + } + default: + llvm_unreachable("Unsupported opcode for instruction"); + } +} + +void VPInstruction::execute(VPTransformState &State) { + assert(!State.Instance && "VPInstruction executing an Instance"); + for (unsigned Part = 0; Part < State.UF; ++Part) + generateInstruction(State, Part); +} + +void VPInstruction::print(raw_ostream &O, const Twine &Indent) const { + O << " +\n" << Indent << "\"EMIT "; + print(O); + O << "\\l\""; +} + +void VPInstruction::print(raw_ostream &O) const { + printAsOperand(O); + O << " = "; + + switch (getOpcode()) { + case VPInstruction::Not: + O << "not"; + break; + default: + O << Instruction::getOpcodeName(getOpcode()); + } + + for (const VPValue *Operand : operands()) { + O << " "; + Operand->printAsOperand(O); + } +} + +/// Generate the code inside the body of the vectorized loop. Assumes a single +/// LoopVectorBody basic-block was created for this. Introduce additional +/// basic-blocks as needed, and fill them all. +void VPlan::execute(VPTransformState *State) { + // 0. Set the reverse mapping from VPValues to Values for code generation. + for (auto &Entry : Value2VPValue) + State->VPValue2Value[Entry.second] = Entry.first; + + BasicBlock *VectorPreHeaderBB = State->CFG.PrevBB; + BasicBlock *VectorHeaderBB = VectorPreHeaderBB->getSingleSuccessor(); + assert(VectorHeaderBB && "Loop preheader does not have a single successor."); + BasicBlock *VectorLatchBB = VectorHeaderBB; + + // 1. Make room to generate basic-blocks inside loop body if needed. + VectorLatchBB = VectorHeaderBB->splitBasicBlock( + VectorHeaderBB->getFirstInsertionPt(), "vector.body.latch"); + Loop *L = State->LI->getLoopFor(VectorHeaderBB); + L->addBasicBlockToLoop(VectorLatchBB, *State->LI); + // Remove the edge between Header and Latch to allow other connections. + // Temporarily terminate with unreachable until CFG is rewired. + // Note: this asserts the generated code's assumption that + // getFirstInsertionPt() can be dereferenced into an Instruction. + VectorHeaderBB->getTerminator()->eraseFromParent(); + State->Builder.SetInsertPoint(VectorHeaderBB); + UnreachableInst *Terminator = State->Builder.CreateUnreachable(); + State->Builder.SetInsertPoint(Terminator); + + // 2. Generate code in loop body. + State->CFG.PrevVPBB = nullptr; + State->CFG.PrevBB = VectorHeaderBB; + State->CFG.LastBB = VectorLatchBB; + + for (VPBlockBase *Block : depth_first(Entry)) + Block->execute(State); + + // 3. Merge the temporary latch created with the last basic-block filled. + BasicBlock *LastBB = State->CFG.PrevBB; + // Connect LastBB to VectorLatchBB to facilitate their merge. + assert(isa<UnreachableInst>(LastBB->getTerminator()) && + "Expected VPlan CFG to terminate with unreachable"); + LastBB->getTerminator()->eraseFromParent(); + BranchInst::Create(VectorLatchBB, LastBB); + + // Merge LastBB with Latch. 
+ bool Merged = MergeBlockIntoPredecessor(VectorLatchBB, nullptr, State->LI); + (void)Merged; + assert(Merged && "Could not merge last basic block with latch."); + VectorLatchBB = LastBB; + + updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB); +} + +void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, + BasicBlock *LoopLatchBB) { + BasicBlock *LoopHeaderBB = LoopPreHeaderBB->getSingleSuccessor(); + assert(LoopHeaderBB && "Loop preheader does not have a single successor."); + DT->addNewBlock(LoopHeaderBB, LoopPreHeaderBB); + // The vector body may be more than a single basic-block by this point. + // Update the dominator tree information inside the vector body by propagating + // it from header to latch, expecting only triangular control-flow, if any. + BasicBlock *PostDomSucc = nullptr; + for (auto *BB = LoopHeaderBB; BB != LoopLatchBB; BB = PostDomSucc) { + // Get the list of successors of this block. + std::vector<BasicBlock *> Succs(succ_begin(BB), succ_end(BB)); + assert(Succs.size() <= 2 && + "Basic block in vector loop has more than 2 successors."); + PostDomSucc = Succs[0]; + if (Succs.size() == 1) { + assert(PostDomSucc->getSinglePredecessor() && + "PostDom successor has more than one predecessor."); + DT->addNewBlock(PostDomSucc, BB); + continue; + } + BasicBlock *InterimSucc = Succs[1]; + if (PostDomSucc->getSingleSuccessor() == InterimSucc) { + PostDomSucc = Succs[1]; + InterimSucc = Succs[0]; + } + assert(InterimSucc->getSingleSuccessor() == PostDomSucc && + "One successor of a basic block does not lead to the other."); + assert(InterimSucc->getSinglePredecessor() && + "Interim successor has more than one predecessor."); + assert(std::distance(pred_begin(PostDomSucc), pred_end(PostDomSucc)) == 2 && + "PostDom successor has more than two predecessors."); + DT->addNewBlock(InterimSucc, BB); + DT->addNewBlock(PostDomSucc, BB); + } +} + +const Twine VPlanPrinter::getUID(const VPBlockBase *Block) { + return (isa<VPRegionBlock>(Block) ? 
"cluster_N" : "N") + + Twine(getOrCreateBID(Block)); +} + +const Twine VPlanPrinter::getOrCreateName(const VPBlockBase *Block) { + const std::string &Name = Block->getName(); + if (!Name.empty()) + return Name; + return "VPB" + Twine(getOrCreateBID(Block)); +} + +void VPlanPrinter::dump() { + Depth = 1; + bumpIndent(0); + OS << "digraph VPlan {\n"; + OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan"; + if (!Plan.getName().empty()) + OS << "\\n" << DOT::EscapeString(Plan.getName()); + if (!Plan.Value2VPValue.empty()) { + OS << ", where:"; + for (auto Entry : Plan.Value2VPValue) { + OS << "\\n" << *Entry.second; + OS << DOT::EscapeString(" := "); + Entry.first->printAsOperand(OS, false); + } + } + OS << "\"]\n"; + OS << "node [shape=rect, fontname=Courier, fontsize=30]\n"; + OS << "edge [fontname=Courier, fontsize=30]\n"; + OS << "compound=true\n"; + + for (VPBlockBase *Block : depth_first(Plan.getEntry())) + dumpBlock(Block); + + OS << "}\n"; +} + +void VPlanPrinter::dumpBlock(const VPBlockBase *Block) { + if (const VPBasicBlock *BasicBlock = dyn_cast<VPBasicBlock>(Block)) + dumpBasicBlock(BasicBlock); + else if (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) + dumpRegion(Region); + else + llvm_unreachable("Unsupported kind of VPBlock."); +} + +void VPlanPrinter::drawEdge(const VPBlockBase *From, const VPBlockBase *To, + bool Hidden, const Twine &Label) { + // Due to "dot" we print an edge between two regions as an edge between the + // exit basic block and the entry basic of the respective regions. + const VPBlockBase *Tail = From->getExitBasicBlock(); + const VPBlockBase *Head = To->getEntryBasicBlock(); + OS << Indent << getUID(Tail) << " -> " << getUID(Head); + OS << " [ label=\"" << Label << '\"'; + if (Tail != From) + OS << " ltail=" << getUID(From); + if (Head != To) + OS << " lhead=" << getUID(To); + if (Hidden) + OS << "; splines=none"; + OS << "]\n"; +} + +void VPlanPrinter::dumpEdges(const VPBlockBase *Block) { + auto &Successors = Block->getSuccessors(); + if (Successors.size() == 1) + drawEdge(Block, Successors.front(), false, ""); + else if (Successors.size() == 2) { + drawEdge(Block, Successors.front(), false, "T"); + drawEdge(Block, Successors.back(), false, "F"); + } else { + unsigned SuccessorNumber = 0; + for (auto *Successor : Successors) + drawEdge(Block, Successor, false, Twine(SuccessorNumber++)); + } +} + +void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) { + OS << Indent << getUID(BasicBlock) << " [label =\n"; + bumpIndent(1); + OS << Indent << "\"" << DOT::EscapeString(BasicBlock->getName()) << ":\\n\""; + bumpIndent(1); + for (const VPRecipeBase &Recipe : *BasicBlock) + Recipe.print(OS, Indent); + bumpIndent(-2); + OS << "\n" << Indent << "]\n"; + dumpEdges(BasicBlock); +} + +void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) { + OS << Indent << "subgraph " << getUID(Region) << " {\n"; + bumpIndent(1); + OS << Indent << "fontname=Courier\n" + << Indent << "label=\"" + << DOT::EscapeString(Region->isReplicator() ? "<xVFxUF> " : "<x1> ") + << DOT::EscapeString(Region->getName()) << "\"\n"; + // Dump the blocks of the region. 
+ assert(Region->getEntry() && "Region contains no inner blocks."); + for (const VPBlockBase *Block : depth_first(Region->getEntry())) + dumpBlock(Block); + bumpIndent(-1); + OS << Indent << "}\n"; + dumpEdges(Region); +} + +void VPlanPrinter::printAsIngredient(raw_ostream &O, Value *V) { + std::string IngredientString; + raw_string_ostream RSO(IngredientString); + if (auto *Inst = dyn_cast<Instruction>(V)) { + if (!Inst->getType()->isVoidTy()) { + Inst->printAsOperand(RSO, false); + RSO << " = "; + } + RSO << Inst->getOpcodeName() << " "; + unsigned E = Inst->getNumOperands(); + if (E > 0) { + Inst->getOperand(0)->printAsOperand(RSO, false); + for (unsigned I = 1; I < E; ++I) + Inst->getOperand(I)->printAsOperand(RSO << ", ", false); + } + } else // !Inst + V->printAsOperand(RSO, false); + RSO.flush(); + O << DOT::EscapeString(IngredientString); +} + +void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent) const { + O << " +\n" << Indent << "\"WIDEN\\l\""; + for (auto &Instr : make_range(Begin, End)) + O << " +\n" << Indent << "\" " << VPlanIngredient(&Instr) << "\\l\""; +} + +void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, + const Twine &Indent) const { + O << " +\n" << Indent << "\"WIDEN-INDUCTION"; + if (Trunc) { + O << "\\l\""; + O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; + O << " +\n" << Indent << "\" " << VPlanIngredient(Trunc) << "\\l\""; + } else + O << " " << VPlanIngredient(IV) << "\\l\""; +} + +void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent) const { + O << " +\n" << Indent << "\"WIDEN-PHI " << VPlanIngredient(Phi) << "\\l\""; +} + +void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent) const { + O << " +\n" << Indent << "\"BLEND "; + Phi->printAsOperand(O, false); + O << " ="; + if (!User) { + // Not a User of any mask: not really blending, this is a + // single-predecessor phi. + O << " "; + Phi->getIncomingValue(0)->printAsOperand(O, false); + } else { + for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) { + O << " "; + Phi->getIncomingValue(I)->printAsOperand(O, false); + O << "/"; + User->getOperand(I)->printAsOperand(O); + } + } + O << "\\l\""; +} + +void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent) const { + O << " +\n" + << Indent << "\"" << (IsUniform ? "CLONE " : "REPLICATE ") + << VPlanIngredient(Ingredient); + if (AlsoPack) + O << " (S->V)"; + O << "\\l\""; +} + +void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent) const { + O << " +\n" + << Indent << "\"PHI-PREDICATED-INSTRUCTION " << VPlanIngredient(PredInst) + << "\\l\""; +} + +void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, + const Twine &Indent) const { + O << " +\n" << Indent << "\"WIDEN " << VPlanIngredient(&Instr); + if (User) { + O << ", "; + User->getOperand(0)->printAsOperand(O); + } + O << "\\l\""; +} diff --git a/lib/Transforms/Vectorize/VPlan.h b/lib/Transforms/Vectorize/VPlan.h new file mode 100644 index 000000000000..2ccabfd6af25 --- /dev/null +++ b/lib/Transforms/Vectorize/VPlan.h @@ -0,0 +1,1194 @@ +//===- VPlan.h - Represent A Vectorizer Plan --------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file contains the declarations of the Vectorization Plan base classes: +/// 1. 
VPBasicBlock and VPRegionBlock that inherit from a common pure virtual +/// VPBlockBase, together implementing a Hierarchical CFG; +/// 2. Specializations of GraphTraits that allow VPBlockBase graphs to be +/// treated as proper graphs for generic algorithms; +/// 3. Pure virtual VPRecipeBase serving as the base class for recipes contained +/// within VPBasicBlocks; +/// 4. VPInstruction, a concrete Recipe and VPUser modeling a single planned +/// instruction; +/// 5. The VPlan class holding a candidate for vectorization; +/// 6. The VPlanPrinter class providing a way to print a plan in dot format; +/// These are documented in docs/VectorizationPlan.rst. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H +#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H + +#include "VPlanValue.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Twine.h" +#include "llvm/ADT/ilist.h" +#include "llvm/ADT/ilist_node.h" +#include "llvm/IR/IRBuilder.h" +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <map> +#include <string> + +// The (re)use of existing LoopVectorize classes is subject to future VPlan +// refactoring. +namespace { +class LoopVectorizationLegality; +class LoopVectorizationCostModel; +} // namespace + +namespace llvm { + +class BasicBlock; +class DominatorTree; +class InnerLoopVectorizer; +class InterleaveGroup; +class LoopInfo; +class raw_ostream; +class Value; +class VPBasicBlock; +class VPRegionBlock; + +/// In what follows, the term "input IR" refers to code that is fed into the +/// vectorizer whereas the term "output IR" refers to code that is generated by +/// the vectorizer. + +/// VPIteration represents a single point in the iteration space of the output +/// (vectorized and/or unrolled) IR loop. +struct VPIteration { + /// in [0..UF) + unsigned Part; + + /// in [0..VF) + unsigned Lane; +}; + +/// This is a helper struct for maintaining vectorization state. It's used for +/// mapping values from the original loop to their corresponding values in +/// the new loop. Two mappings are maintained: one for vectorized values and +/// one for scalarized values. Vectorized values are represented with UF +/// vector values in the new loop, and scalarized values are represented with +/// UF x VF scalar values in the new loop. UF and VF are the unroll and +/// vectorization factors, respectively. +/// +/// Entries can be added to either map with setVectorValue and setScalarValue, +/// which assert that an entry was not already added before. If an entry is to +/// replace an existing one, call resetVectorValue and resetScalarValue. This is +/// currently needed to modify the mapped values during "fix-up" operations that +/// occur once the first phase of widening is complete. These operations include +/// type truncation and the second phase of recurrence widening. +/// +/// Entries from either map can be retrieved using the getVectorValue and +/// getScalarValue functions, which assert that the desired value exists. +struct VectorizerValueMap { + friend struct VPTransformState; + +private: + /// The unroll factor. Each entry in the vector map contains UF vector values. + unsigned UF; + + /// The vectorization factor. Each entry in the scalar map contains UF x VF + /// scalar values. + unsigned VF; + + /// The vector and scalar map storage. 
We use std::map and not DenseMap + /// because insertions to DenseMap invalidate its iterators. + using VectorParts = SmallVector<Value *, 2>; + using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; + std::map<Value *, VectorParts> VectorMapStorage; + std::map<Value *, ScalarParts> ScalarMapStorage; + +public: + /// Construct an empty map with the given unroll and vectorization factors. + VectorizerValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {} + + /// \return True if the map has any vector entry for \p Key. + bool hasAnyVectorValue(Value *Key) const { + return VectorMapStorage.count(Key); + } + + /// \return True if the map has a vector entry for \p Key and \p Part. + bool hasVectorValue(Value *Key, unsigned Part) const { + assert(Part < UF && "Queried Vector Part is too large."); + if (!hasAnyVectorValue(Key)) + return false; + const VectorParts &Entry = VectorMapStorage.find(Key)->second; + assert(Entry.size() == UF && "VectorParts has wrong dimensions."); + return Entry[Part] != nullptr; + } + + /// \return True if the map has any scalar entry for \p Key. + bool hasAnyScalarValue(Value *Key) const { + return ScalarMapStorage.count(Key); + } + + /// \return True if the map has a scalar entry for \p Key and \p Instance. + bool hasScalarValue(Value *Key, const VPIteration &Instance) const { + assert(Instance.Part < UF && "Queried Scalar Part is too large."); + assert(Instance.Lane < VF && "Queried Scalar Lane is too large."); + if (!hasAnyScalarValue(Key)) + return false; + const ScalarParts &Entry = ScalarMapStorage.find(Key)->second; + assert(Entry.size() == UF && "ScalarParts has wrong dimensions."); + assert(Entry[Instance.Part].size() == VF && + "ScalarParts has wrong dimensions."); + return Entry[Instance.Part][Instance.Lane] != nullptr; + } + + /// Retrieve the existing vector value that corresponds to \p Key and + /// \p Part. + Value *getVectorValue(Value *Key, unsigned Part) { + assert(hasVectorValue(Key, Part) && "Getting non-existent value."); + return VectorMapStorage[Key][Part]; + } + + /// Retrieve the existing scalar value that corresponds to \p Key and + /// \p Instance. + Value *getScalarValue(Value *Key, const VPIteration &Instance) { + assert(hasScalarValue(Key, Instance) && "Getting non-existent value."); + return ScalarMapStorage[Key][Instance.Part][Instance.Lane]; + } + + /// Set a vector value associated with \p Key and \p Part. Assumes such a + /// value is not already set. If it is, use resetVectorValue() instead. + void setVectorValue(Value *Key, unsigned Part, Value *Vector) { + assert(!hasVectorValue(Key, Part) && "Vector value already set for part"); + if (!VectorMapStorage.count(Key)) { + VectorParts Entry(UF); + VectorMapStorage[Key] = Entry; + } + VectorMapStorage[Key][Part] = Vector; + } + + /// Set a scalar value associated with \p Key and \p Instance. Assumes such a + /// value is not already set. + void setScalarValue(Value *Key, const VPIteration &Instance, Value *Scalar) { + assert(!hasScalarValue(Key, Instance) && "Scalar value already set"); + if (!ScalarMapStorage.count(Key)) { + ScalarParts Entry(UF); + // TODO: Consider storing uniform values only per-part, as they occupy + // lane 0 only, keeping the other VF-1 redundant entries null. + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part].resize(VF, nullptr); + ScalarMapStorage[Key] = Entry; + } + ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar; + } + + /// Reset the vector value associated with \p Key for the given \p Part. 
+ /// This function can be used to update values that have already been + /// vectorized. This is the case for "fix-up" operations including type + /// truncation and the second phase of recurrence vectorization. + void resetVectorValue(Value *Key, unsigned Part, Value *Vector) { + assert(hasVectorValue(Key, Part) && "Vector value not set for part"); + VectorMapStorage[Key][Part] = Vector; + } + + /// Reset the scalar value associated with \p Key for \p Part and \p Lane. + /// This function can be used to update values that have already been + /// scalarized. This is the case for "fix-up" operations including scalar phi + /// nodes for scalarized and predicated instructions. + void resetScalarValue(Value *Key, const VPIteration &Instance, + Value *Scalar) { + assert(hasScalarValue(Key, Instance) && + "Scalar value not set for part and lane"); + ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar; + } +}; + +/// This class is used to enable the VPlan to invoke a method of ILV. This is +/// needed until the method is refactored out of ILV and becomes reusable. +struct VPCallback { + virtual ~VPCallback() {} + virtual Value *getOrCreateVectorValues(Value *V, unsigned Part) = 0; +}; + +/// VPTransformState holds information passed down when "executing" a VPlan, +/// needed for generating the output IR. +struct VPTransformState { + VPTransformState(unsigned VF, unsigned UF, LoopInfo *LI, DominatorTree *DT, + IRBuilder<> &Builder, VectorizerValueMap &ValueMap, + InnerLoopVectorizer *ILV, VPCallback &Callback) + : VF(VF), UF(UF), Instance(), LI(LI), DT(DT), Builder(Builder), + ValueMap(ValueMap), ILV(ILV), Callback(Callback) {} + + /// The chosen Vectorization and Unroll Factors of the loop being vectorized. + unsigned VF; + unsigned UF; + + /// Hold the indices to generate specific scalar instructions. Null indicates + /// that all instances are to be generated, using either scalar or vector + /// instructions. + Optional<VPIteration> Instance; + + struct DataState { + /// A type for vectorized values in the new loop. Each value from the + /// original loop, when vectorized, is represented by UF vector values in + /// the new unrolled loop, where UF is the unroll factor. + typedef SmallVector<Value *, 2> PerPartValuesTy; + + DenseMap<VPValue *, PerPartValuesTy> PerPartOutput; + } Data; + + /// Get the generated Value for a given VPValue and a given Part. Note that + /// as some Defs are still created by ILV and managed in its ValueMap, this + /// method will delegate the call to ILV in such cases in order to provide + /// callers a consistent API. + /// \see set. + Value *get(VPValue *Def, unsigned Part) { + // If Values have been set for this Def return the one relevant for \p Part. + if (Data.PerPartOutput.count(Def)) + return Data.PerPartOutput[Def][Part]; + // Def is managed by ILV: bring the Values from ValueMap. + return Callback.getOrCreateVectorValues(VPValue2Value[Def], Part); + } + + /// Set the generated Value for a given VPValue and a given Part. + void set(VPValue *Def, Value *V, unsigned Part) { + if (!Data.PerPartOutput.count(Def)) { + DataState::PerPartValuesTy Entry(UF); + Data.PerPartOutput[Def] = Entry; + } + Data.PerPartOutput[Def][Part] = V; + } + + /// Hold state information used when constructing the CFG of the output IR, + /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks. + struct CFGState { + /// The previous VPBasicBlock visited. Initially set to null. 
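// Illustrative aside (stand-alone C++ sketch by the editor, not part of the
// patch above): shape of the per-part lookup in VPTransformState::get/set.
// Definitions produced by the plan itself hold UF values, one per unrolled
// part; anything still owned by the existing vectorizer is fetched through a
// callback instead, so callers see one consistent API. Plain ints and void*
// keys stand in for llvm::Value* and VPValue*.
#include <functional>
#include <map>
#include <vector>

struct SketchPerPartState {
  unsigned UF = 1;
  std::map<const void *, std::vector<int>> PerPartOutput;
  std::function<int(const void *, unsigned)> GetFromCallback;

  int get(const void *Def, unsigned Part) {
    auto It = PerPartOutput.find(Def);
    if (It != PerPartOutput.end())
      return It->second[Part];
    return GetFromCallback(Def, Part);  // value still managed elsewhere
  }

  void set(const void *Def, int V, unsigned Part) {
    auto &Entry = PerPartOutput[Def];
    if (Entry.empty())
      Entry.resize(UF);
    Entry[Part] = V;
  }
};
// (end of aside)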
+ VPBasicBlock *PrevVPBB = nullptr; + + /// The previous IR BasicBlock created or used. Initially set to the new + /// header BasicBlock. + BasicBlock *PrevBB = nullptr; + + /// The last IR BasicBlock in the output IR. Set to the new latch + /// BasicBlock, used for placing the newly created BasicBlocks. + BasicBlock *LastBB = nullptr; + + /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case + /// of replication, maps the BasicBlock of the last replica created. + SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB; + + CFGState() = default; + } CFG; + + /// Hold a pointer to LoopInfo to register new basic blocks in the loop. + LoopInfo *LI; + + /// Hold a pointer to Dominator Tree to register new basic blocks in the loop. + DominatorTree *DT; + + /// Hold a reference to the IRBuilder used to generate output IR code. + IRBuilder<> &Builder; + + /// Hold a reference to the Value state information used when generating the + /// Values of the output IR. + VectorizerValueMap &ValueMap; + + /// Hold a reference to a mapping between VPValues in VPlan and original + /// Values they correspond to. + VPValue2ValueTy VPValue2Value; + + /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods. + InnerLoopVectorizer *ILV; + + VPCallback &Callback; +}; + +/// VPBlockBase is the building block of the Hierarchical Control-Flow Graph. +/// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock. +class VPBlockBase { +private: + const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). + + /// An optional name for the block. + std::string Name; + + /// The immediate VPRegionBlock which this VPBlockBase belongs to, or null if + /// it is a topmost VPBlockBase. + VPRegionBlock *Parent = nullptr; + + /// List of predecessor blocks. + SmallVector<VPBlockBase *, 1> Predecessors; + + /// List of successor blocks. + SmallVector<VPBlockBase *, 1> Successors; + + /// Add \p Successor as the last successor to this block. + void appendSuccessor(VPBlockBase *Successor) { + assert(Successor && "Cannot add nullptr successor!"); + Successors.push_back(Successor); + } + + /// Add \p Predecessor as the last predecessor to this block. + void appendPredecessor(VPBlockBase *Predecessor) { + assert(Predecessor && "Cannot add nullptr predecessor!"); + Predecessors.push_back(Predecessor); + } + + /// Remove \p Predecessor from the predecessors of this block. + void removePredecessor(VPBlockBase *Predecessor) { + auto Pos = std::find(Predecessors.begin(), Predecessors.end(), Predecessor); + assert(Pos && "Predecessor does not exist"); + Predecessors.erase(Pos); + } + + /// Remove \p Successor from the successors of this block. + void removeSuccessor(VPBlockBase *Successor) { + auto Pos = std::find(Successors.begin(), Successors.end(), Successor); + assert(Pos && "Successor does not exist"); + Successors.erase(Pos); + } + +protected: + VPBlockBase(const unsigned char SC, const std::string &N) + : SubclassID(SC), Name(N) {} + +public: + /// An enumeration for keeping track of the concrete subclass of VPBlockBase + /// that are actually instantiated. Values of this enumeration are kept in the + /// SubclassID field of the VPBlockBase objects. They are used for concrete + /// type identification. 
+ using VPBlockTy = enum { VPBasicBlockSC, VPRegionBlockSC }; + + using VPBlocksTy = SmallVectorImpl<VPBlockBase *>; + + virtual ~VPBlockBase() = default; + + const std::string &getName() const { return Name; } + + void setName(const Twine &newName) { Name = newName.str(); } + + /// \return an ID for the concrete type of this object. + /// This is used to implement the classof checks. This should not be used + /// for any other purpose, as the values may change as LLVM evolves. + unsigned getVPBlockID() const { return SubclassID; } + + const VPRegionBlock *getParent() const { return Parent; } + + void setParent(VPRegionBlock *P) { Parent = P; } + + /// \return the VPBasicBlock that is the entry of this VPBlockBase, + /// recursively, if the latter is a VPRegionBlock. Otherwise, if this + /// VPBlockBase is a VPBasicBlock, it is returned. + const VPBasicBlock *getEntryBasicBlock() const; + VPBasicBlock *getEntryBasicBlock(); + + /// \return the VPBasicBlock that is the exit of this VPBlockBase, + /// recursively, if the latter is a VPRegionBlock. Otherwise, if this + /// VPBlockBase is a VPBasicBlock, it is returned. + const VPBasicBlock *getExitBasicBlock() const; + VPBasicBlock *getExitBasicBlock(); + + const VPBlocksTy &getSuccessors() const { return Successors; } + VPBlocksTy &getSuccessors() { return Successors; } + + const VPBlocksTy &getPredecessors() const { return Predecessors; } + VPBlocksTy &getPredecessors() { return Predecessors; } + + /// \return the successor of this VPBlockBase if it has a single successor. + /// Otherwise return a null pointer. + VPBlockBase *getSingleSuccessor() const { + return (Successors.size() == 1 ? *Successors.begin() : nullptr); + } + + /// \return the predecessor of this VPBlockBase if it has a single + /// predecessor. Otherwise return a null pointer. + VPBlockBase *getSinglePredecessor() const { + return (Predecessors.size() == 1 ? *Predecessors.begin() : nullptr); + } + + /// An Enclosing Block of a block B is any block containing B, including B + /// itself. \return the closest enclosing block starting from "this", which + /// has successors. \return the root enclosing block if all enclosing blocks + /// have no successors. + VPBlockBase *getEnclosingBlockWithSuccessors(); + + /// \return the closest enclosing block starting from "this", which has + /// predecessors. \return the root enclosing block if all enclosing blocks + /// have no predecessors. + VPBlockBase *getEnclosingBlockWithPredecessors(); + + /// \return the successors either attached directly to this VPBlockBase or, if + /// this VPBlockBase is the exit block of a VPRegionBlock and has no + /// successors of its own, search recursively for the first enclosing + /// VPRegionBlock that has successors and return them. If no such + /// VPRegionBlock exists, return the (empty) successors of the topmost + /// VPBlockBase reached. + const VPBlocksTy &getHierarchicalSuccessors() { + return getEnclosingBlockWithSuccessors()->getSuccessors(); + } + + /// \return the hierarchical successor of this VPBlockBase if it has a single + /// hierarchical successor. Otherwise return a null pointer. + VPBlockBase *getSingleHierarchicalSuccessor() { + return getEnclosingBlockWithSuccessors()->getSingleSuccessor(); + } + + /// \return the predecessors either attached directly to this VPBlockBase or, + /// if this VPBlockBase is the entry block of a VPRegionBlock and has no + /// predecessors of its own, search recursively for the first enclosing + /// VPRegionBlock that has predecessors and return them. 
If no such + /// VPRegionBlock exists, return the (empty) predecessors of the topmost + /// VPBlockBase reached. + const VPBlocksTy &getHierarchicalPredecessors() { + return getEnclosingBlockWithPredecessors()->getPredecessors(); + } + + /// \return the hierarchical predecessor of this VPBlockBase if it has a + /// single hierarchical predecessor. Otherwise return a null pointer. + VPBlockBase *getSingleHierarchicalPredecessor() { + return getEnclosingBlockWithPredecessors()->getSinglePredecessor(); + } + + /// Sets a given VPBlockBase \p Successor as the single successor and \return + /// \p Successor. The parent of this Block is copied to be the parent of + /// \p Successor. + VPBlockBase *setOneSuccessor(VPBlockBase *Successor) { + assert(Successors.empty() && "Setting one successor when others exist."); + appendSuccessor(Successor); + Successor->appendPredecessor(this); + Successor->Parent = Parent; + return Successor; + } + + /// Sets two given VPBlockBases \p IfTrue and \p IfFalse to be the two + /// successors. The parent of this Block is copied to be the parent of both + /// \p IfTrue and \p IfFalse. + void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse) { + assert(Successors.empty() && "Setting two successors when others exist."); + appendSuccessor(IfTrue); + appendSuccessor(IfFalse); + IfTrue->appendPredecessor(this); + IfFalse->appendPredecessor(this); + IfTrue->Parent = Parent; + IfFalse->Parent = Parent; + } + + void disconnectSuccessor(VPBlockBase *Successor) { + assert(Successor && "Successor to disconnect is null."); + removeSuccessor(Successor); + Successor->removePredecessor(this); + } + + /// The method which generates the output IR that correspond to this + /// VPBlockBase, thereby "executing" the VPlan. + virtual void execute(struct VPTransformState *State) = 0; + + /// Delete all blocks reachable from a given VPBlockBase, inclusive. + static void deleteCFG(VPBlockBase *Entry); +}; + +/// VPRecipeBase is a base class modeling a sequence of one or more output IR +/// instructions. +class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock> { + friend VPBasicBlock; + +private: + const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). + + /// Each VPRecipe belongs to a single VPBasicBlock. + VPBasicBlock *Parent = nullptr; + +public: + /// An enumeration for keeping track of the concrete subclass of VPRecipeBase + /// that is actually instantiated. Values of this enumeration are kept in the + /// SubclassID field of the VPRecipeBase objects. They are used for concrete + /// type identification. + using VPRecipeTy = enum { + VPBlendSC, + VPBranchOnMaskSC, + VPInstructionSC, + VPInterleaveSC, + VPPredInstPHISC, + VPReplicateSC, + VPWidenIntOrFpInductionSC, + VPWidenMemoryInstructionSC, + VPWidenPHISC, + VPWidenSC, + }; + + VPRecipeBase(const unsigned char SC) : SubclassID(SC) {} + virtual ~VPRecipeBase() = default; + + /// \return an ID for the concrete type of this object. + /// This is used to implement the classof checks. This should not be used + /// for any other purpose, as the values may change as LLVM evolves. + unsigned getVPRecipeID() const { return SubclassID; } + + /// \return the VPBasicBlock which this VPRecipe belongs to. + VPBasicBlock *getParent() { return Parent; } + const VPBasicBlock *getParent() const { return Parent; } + + /// The method which generates the output IR instructions that correspond to + /// this VPRecipe, thereby "executing" the VPlan. 
+ virtual void execute(struct VPTransformState &State) = 0; + + /// Each recipe prints itself. + virtual void print(raw_ostream &O, const Twine &Indent) const = 0; +}; + +/// This is a concrete Recipe that models a single VPlan-level instruction. +/// While as any Recipe it may generate a sequence of IR instructions when +/// executed, these instructions would always form a single-def expression as +/// the VPInstruction is also a single def-use vertex. +class VPInstruction : public VPUser, public VPRecipeBase { +public: + /// VPlan opcodes, extending LLVM IR with idiomatics instructions. + enum { Not = Instruction::OtherOpsEnd + 1 }; + +private: + typedef unsigned char OpcodeTy; + OpcodeTy Opcode; + + /// Utility method serving execute(): generates a single instance of the + /// modeled instruction. + void generateInstruction(VPTransformState &State, unsigned Part); + +public: + VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands) + : VPUser(VPValue::VPInstructionSC, Operands), + VPRecipeBase(VPRecipeBase::VPInstructionSC), Opcode(Opcode) {} + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPInstructionSC; + } + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *R) { + return R->getVPRecipeID() == VPRecipeBase::VPInstructionSC; + } + + unsigned getOpcode() const { return Opcode; } + + /// Generate the instruction. + /// TODO: We currently execute only per-part unless a specific instance is + /// provided. + void execute(VPTransformState &State) override; + + /// Print the Recipe. + void print(raw_ostream &O, const Twine &Indent) const override; + + /// Print the VPInstruction. + void print(raw_ostream &O) const; +}; + +/// VPWidenRecipe is a recipe for producing a copy of vector type for each +/// Instruction in its ingredients independently, in order. This recipe covers +/// most of the traditional vectorization cases where each ingredient transforms +/// into a vectorized version of itself. +class VPWidenRecipe : public VPRecipeBase { +private: + /// Hold the ingredients by pointing to their original BasicBlock location. + BasicBlock::iterator Begin; + BasicBlock::iterator End; + +public: + VPWidenRecipe(Instruction *I) : VPRecipeBase(VPWidenSC) { + End = I->getIterator(); + Begin = End++; + } + + ~VPWidenRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPWidenSC; + } + + /// Produce widened copies of all Ingredients. + void execute(VPTransformState &State) override; + + /// Augment the recipe to include Instr, if it lies at its End. + bool appendInstruction(Instruction *Instr) { + if (End != Instr->getIterator()) + return false; + End++; + return true; + } + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override; +}; + +/// A recipe for handling phi nodes of integer and floating-point inductions, +/// producing their vector and scalar values. 
+class VPWidenIntOrFpInductionRecipe : public VPRecipeBase { +private: + PHINode *IV; + TruncInst *Trunc; + +public: + VPWidenIntOrFpInductionRecipe(PHINode *IV, TruncInst *Trunc = nullptr) + : VPRecipeBase(VPWidenIntOrFpInductionSC), IV(IV), Trunc(Trunc) {} + ~VPWidenIntOrFpInductionRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPWidenIntOrFpInductionSC; + } + + /// Generate the vectorized and scalarized versions of the phi node as + /// needed by their users. + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override; +}; + +/// A recipe for handling all phi nodes except for integer and FP inductions. +class VPWidenPHIRecipe : public VPRecipeBase { +private: + PHINode *Phi; + +public: + VPWidenPHIRecipe(PHINode *Phi) : VPRecipeBase(VPWidenPHISC), Phi(Phi) {} + ~VPWidenPHIRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPWidenPHISC; + } + + /// Generate the phi/select nodes. + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override; +}; + +/// A recipe for vectorizing a phi-node as a sequence of mask-based select +/// instructions. +class VPBlendRecipe : public VPRecipeBase { +private: + PHINode *Phi; + + /// The blend operation is a User of a mask, if not null. + std::unique_ptr<VPUser> User; + +public: + VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Masks) + : VPRecipeBase(VPBlendSC), Phi(Phi) { + assert((Phi->getNumIncomingValues() == 1 || + Phi->getNumIncomingValues() == Masks.size()) && + "Expected the same number of incoming values and masks"); + if (!Masks.empty()) + User.reset(new VPUser(Masks)); + } + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPBlendSC; + } + + /// Generate the phi/select nodes. + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override; +}; + +/// VPInterleaveRecipe is a recipe for transforming an interleave group of load +/// or stores into one wide load/store and shuffles. +class VPInterleaveRecipe : public VPRecipeBase { +private: + const InterleaveGroup *IG; + +public: + VPInterleaveRecipe(const InterleaveGroup *IG) + : VPRecipeBase(VPInterleaveSC), IG(IG) {} + ~VPInterleaveRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPInterleaveSC; + } + + /// Generate the wide load or store, and shuffles. + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override; + + const InterleaveGroup *getInterleaveGroup() { return IG; } +}; + +/// VPReplicateRecipe replicates a given instruction producing multiple scalar +/// copies of the original scalar type, one per lane, instead of producing a +/// single copy of widened type for all lanes. If the instruction is known to be +/// uniform only one copy, per lane zero, will be generated. 
+class VPReplicateRecipe : public VPRecipeBase { +private: + /// The instruction being replicated. + Instruction *Ingredient; + + /// Indicator if only a single replica per lane is needed. + bool IsUniform; + + /// Indicator if the replicas are also predicated. + bool IsPredicated; + + /// Indicator if the scalar values should also be packed into a vector. + bool AlsoPack; + +public: + VPReplicateRecipe(Instruction *I, bool IsUniform, bool IsPredicated = false) + : VPRecipeBase(VPReplicateSC), Ingredient(I), IsUniform(IsUniform), + IsPredicated(IsPredicated) { + // Retain the previous behavior of predicateInstructions(), where an + // insert-element of a predicated instruction got hoisted into the + // predicated basic block iff it was its only user. This is achieved by + // having predicated instructions also pack their values into a vector by + // default unless they have a replicated user which uses their scalar value. + AlsoPack = IsPredicated && !I->use_empty(); + } + + ~VPReplicateRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPReplicateSC; + } + + /// Generate replicas of the desired Ingredient. Replicas will be generated + /// for all parts and lanes unless a specific part and lane are specified in + /// the \p State. + void execute(VPTransformState &State) override; + + void setAlsoPack(bool Pack) { AlsoPack = Pack; } + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override; +}; + +/// A recipe for generating conditional branches on the bits of a mask. +class VPBranchOnMaskRecipe : public VPRecipeBase { +private: + std::unique_ptr<VPUser> User; + +public: + VPBranchOnMaskRecipe(VPValue *BlockInMask) : VPRecipeBase(VPBranchOnMaskSC) { + if (BlockInMask) // nullptr means all-one mask. + User.reset(new VPUser({BlockInMask})); + } + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPBranchOnMaskSC; + } + + /// Generate the extraction of the appropriate bit from the block mask and the + /// conditional branch. + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override { + O << " +\n" << Indent << "\"BRANCH-ON-MASK "; + if (User) + O << *User->getOperand(0); + else + O << " All-One"; + O << "\\l\""; + } +}; + +/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when +/// control converges back from a Branch-on-Mask. The phi nodes are needed in +/// order to merge values that are set under such a branch and feed their uses. +/// The phi nodes can be scalar or vector depending on the users of the value. +/// This recipe works in concert with VPBranchOnMaskRecipe. +class VPPredInstPHIRecipe : public VPRecipeBase { +private: + Instruction *PredInst; + +public: + /// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs a phi + /// nodes after merging back from a Branch-on-Mask. + VPPredInstPHIRecipe(Instruction *PredInst) + : VPRecipeBase(VPPredInstPHISC), PredInst(PredInst) {} + ~VPPredInstPHIRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. 
+ static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPPredInstPHISC; + } + + /// Generates phi nodes for live-outs as needed to retain SSA form. + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override; +}; + +/// A Recipe for widening load/store operations. +/// TODO: We currently execute only per-part unless a specific instance is +/// provided. +class VPWidenMemoryInstructionRecipe : public VPRecipeBase { +private: + Instruction &Instr; + std::unique_ptr<VPUser> User; + +public: + VPWidenMemoryInstructionRecipe(Instruction &Instr, VPValue *Mask) + : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr) { + if (Mask) // Create a VPInstruction to register as a user of the mask. + User.reset(new VPUser({Mask})); + } + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPWidenMemoryInstructionSC; + } + + /// Generate the wide load/store. + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override; +}; + +/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It +/// holds a sequence of zero or more VPRecipe's each representing a sequence of +/// output IR instructions. +class VPBasicBlock : public VPBlockBase { +public: + using RecipeListTy = iplist<VPRecipeBase>; + +private: + /// The VPRecipes held in the order of output instructions to generate. + RecipeListTy Recipes; + +public: + VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr) + : VPBlockBase(VPBasicBlockSC, Name.str()) { + if (Recipe) + appendRecipe(Recipe); + } + + ~VPBasicBlock() override { Recipes.clear(); } + + /// Instruction iterators... + using iterator = RecipeListTy::iterator; + using const_iterator = RecipeListTy::const_iterator; + using reverse_iterator = RecipeListTy::reverse_iterator; + using const_reverse_iterator = RecipeListTy::const_reverse_iterator; + + //===--------------------------------------------------------------------===// + /// Recipe iterator methods + /// + inline iterator begin() { return Recipes.begin(); } + inline const_iterator begin() const { return Recipes.begin(); } + inline iterator end() { return Recipes.end(); } + inline const_iterator end() const { return Recipes.end(); } + + inline reverse_iterator rbegin() { return Recipes.rbegin(); } + inline const_reverse_iterator rbegin() const { return Recipes.rbegin(); } + inline reverse_iterator rend() { return Recipes.rend(); } + inline const_reverse_iterator rend() const { return Recipes.rend(); } + + inline size_t size() const { return Recipes.size(); } + inline bool empty() const { return Recipes.empty(); } + inline const VPRecipeBase &front() const { return Recipes.front(); } + inline VPRecipeBase &front() { return Recipes.front(); } + inline const VPRecipeBase &back() const { return Recipes.back(); } + inline VPRecipeBase &back() { return Recipes.back(); } + + /// \brief Returns a pointer to a member of the recipe list. + static RecipeListTy VPBasicBlock::*getSublistAccess(VPRecipeBase *) { + return &VPBasicBlock::Recipes; + } + + /// Method to support type inquiry through isa, cast, and dyn_cast. 
+ static inline bool classof(const VPBlockBase *V) { + return V->getVPBlockID() == VPBlockBase::VPBasicBlockSC; + } + + void insert(VPRecipeBase *Recipe, iterator InsertPt) { + assert(Recipe && "No recipe to append."); + assert(!Recipe->Parent && "Recipe already in VPlan"); + Recipe->Parent = this; + Recipes.insert(InsertPt, Recipe); + } + + /// Augment the existing recipes of a VPBasicBlock with an additional + /// \p Recipe as the last recipe. + void appendRecipe(VPRecipeBase *Recipe) { insert(Recipe, end()); } + + /// The method which generates the output IR instructions that correspond to + /// this VPBasicBlock, thereby "executing" the VPlan. + void execute(struct VPTransformState *State) override; + +private: + /// Create an IR BasicBlock to hold the output instructions generated by this + /// VPBasicBlock, and return it. Update the CFGState accordingly. + BasicBlock *createEmptyBasicBlock(VPTransformState::CFGState &CFG); +}; + +/// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks +/// which form a Single-Entry-Single-Exit subgraph of the output IR CFG. +/// A VPRegionBlock may indicate that its contents are to be replicated several +/// times. This is designed to support predicated scalarization, in which a +/// scalar if-then code structure needs to be generated VF * UF times. Having +/// this replication indicator helps to keep a single model for multiple +/// candidate VF's. The actual replication takes place only once the desired VF +/// and UF have been determined. +class VPRegionBlock : public VPBlockBase { +private: + /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock. + VPBlockBase *Entry; + + /// Hold the Single Exit of the SESE region modelled by the VPRegionBlock. + VPBlockBase *Exit; + + /// An indicator whether this region is to generate multiple replicated + /// instances of output IR corresponding to its VPBlockBases. + bool IsReplicator; + +public: + VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exit, + const std::string &Name = "", bool IsReplicator = false) + : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exit(Exit), + IsReplicator(IsReplicator) { + assert(Entry->getPredecessors().empty() && "Entry block has predecessors."); + assert(Exit->getSuccessors().empty() && "Exit block has successors."); + Entry->setParent(this); + Exit->setParent(this); + } + + ~VPRegionBlock() override { + if (Entry) + deleteCFG(Entry); + } + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPBlockBase *V) { + return V->getVPBlockID() == VPBlockBase::VPRegionBlockSC; + } + + const VPBlockBase *getEntry() const { return Entry; } + VPBlockBase *getEntry() { return Entry; } + + const VPBlockBase *getExit() const { return Exit; } + VPBlockBase *getExit() { return Exit; } + + /// An indicator whether this region is to generate multiple replicated + /// instances of output IR corresponding to its VPBlockBases. + bool isReplicator() const { return IsReplicator; } + + /// The method which generates the output IR instructions that correspond to + /// this VPRegionBlock, thereby "executing" the VPlan. + void execute(struct VPTransformState *State) override; +}; + +/// VPlan models a candidate for vectorization, encoding various decisions take +/// to produce efficient output IR, including which branches, basic-blocks and +/// output IR instructions to generate, and their cost. VPlan holds a +/// Hierarchical-CFG of VPBasicBlocks and VPRegionBlocks rooted at an Entry +/// VPBlock. 
+class VPlan { + friend class VPlanPrinter; + +private: + /// Hold the single entry to the Hierarchical CFG of the VPlan. + VPBlockBase *Entry; + + /// Holds the VFs applicable to this VPlan. + SmallSet<unsigned, 2> VFs; + + /// Holds the name of the VPlan, for printing. + std::string Name; + + /// Holds a mapping between Values and their corresponding VPValue inside + /// VPlan. + Value2VPValueTy Value2VPValue; + +public: + VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {} + + ~VPlan() { + if (Entry) + VPBlockBase::deleteCFG(Entry); + for (auto &MapEntry : Value2VPValue) + delete MapEntry.second; + } + + /// Generate the IR code for this VPlan. + void execute(struct VPTransformState *State); + + VPBlockBase *getEntry() { return Entry; } + const VPBlockBase *getEntry() const { return Entry; } + + VPBlockBase *setEntry(VPBlockBase *Block) { return Entry = Block; } + + void addVF(unsigned VF) { VFs.insert(VF); } + + bool hasVF(unsigned VF) { return VFs.count(VF); } + + const std::string &getName() const { return Name; } + + void setName(const Twine &newName) { Name = newName.str(); } + + void addVPValue(Value *V) { + assert(V && "Trying to add a null Value to VPlan"); + assert(!Value2VPValue.count(V) && "Value already exists in VPlan"); + Value2VPValue[V] = new VPValue(); + } + + VPValue *getVPValue(Value *V) { + assert(V && "Trying to get the VPValue of a null Value"); + assert(Value2VPValue.count(V) && "Value does not exist in VPlan"); + return Value2VPValue[V]; + } + +private: + /// Add to the given dominator tree the header block and every new basic block + /// that was created between it and the latch block, inclusive. + static void updateDominatorTree(DominatorTree *DT, + BasicBlock *LoopPreHeaderBB, + BasicBlock *LoopLatchBB); +}; + +/// VPlanPrinter prints a given VPlan to a given output stream. The printing is +/// indented and follows the dot format. +class VPlanPrinter { + friend inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan); + friend inline raw_ostream &operator<<(raw_ostream &OS, + const struct VPlanIngredient &I); + +private: + raw_ostream &OS; + VPlan &Plan; + unsigned Depth; + unsigned TabWidth = 2; + std::string Indent; + unsigned BID = 0; + SmallDenseMap<const VPBlockBase *, unsigned> BlockID; + + VPlanPrinter(raw_ostream &O, VPlan &P) : OS(O), Plan(P) {} + + /// Handle indentation. + void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); } + + /// Print a given \p Block of the Plan. + void dumpBlock(const VPBlockBase *Block); + + /// Print the information related to the CFG edges going out of a given + /// \p Block, followed by printing the successor blocks themselves. + void dumpEdges(const VPBlockBase *Block); + + /// Print a given \p BasicBlock, including its VPRecipes, followed by printing + /// its successor blocks. + void dumpBasicBlock(const VPBasicBlock *BasicBlock); + + /// Print a given \p Region of the Plan. + void dumpRegion(const VPRegionBlock *Region); + + unsigned getOrCreateBID(const VPBlockBase *Block) { + return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++; + } + + const Twine getOrCreateName(const VPBlockBase *Block); + + const Twine getUID(const VPBlockBase *Block); + + /// Print the information related to a CFG edge between two VPBlockBases. 
+ void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden, + const Twine &Label); + + void dump(); + + static void printAsIngredient(raw_ostream &O, Value *V); +}; + +struct VPlanIngredient { + Value *V; + + VPlanIngredient(Value *V) : V(V) {} +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) { + VPlanPrinter::printAsIngredient(OS, I.V); + return OS; +} + +inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan) { + VPlanPrinter Printer(OS, Plan); + Printer.dump(); + return OS; +} + +//===--------------------------------------------------------------------===// +// GraphTraits specializations for VPlan/VPRegionBlock Control-Flow Graphs // +//===--------------------------------------------------------------------===// + +// Provide specializations of GraphTraits to be able to treat a VPBlockBase as a +// graph of VPBlockBase nodes... + +template <> struct GraphTraits<VPBlockBase *> { + using NodeRef = VPBlockBase *; + using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator; + + static NodeRef getEntryNode(NodeRef N) { return N; } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getSuccessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getSuccessors().end(); + } +}; + +template <> struct GraphTraits<const VPBlockBase *> { + using NodeRef = const VPBlockBase *; + using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::const_iterator; + + static NodeRef getEntryNode(NodeRef N) { return N; } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getSuccessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getSuccessors().end(); + } +}; + +// Provide specializations of GraphTraits to be able to treat a VPBlockBase as a +// graph of VPBlockBase nodes... and to walk it in inverse order. Inverse order +// for a VPBlockBase is considered to be when traversing the predecessors of a +// VPBlockBase instead of its successors. +template <> struct GraphTraits<Inverse<VPBlockBase *>> { + using NodeRef = VPBlockBase *; + using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator; + + static Inverse<VPBlockBase *> getEntryNode(Inverse<VPBlockBase *> B) { + return B; + } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getPredecessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getPredecessors().end(); + } +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H diff --git a/lib/Transforms/Vectorize/VPlanBuilder.h b/lib/Transforms/Vectorize/VPlanBuilder.h new file mode 100644 index 000000000000..d6eb3397d044 --- /dev/null +++ b/lib/Transforms/Vectorize/VPlanBuilder.h @@ -0,0 +1,61 @@ +//===- VPlanBuilder.h - A VPlan utility for constructing VPInstructions ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file provides a VPlan-based builder utility analogous to IRBuilder. +/// It provides an instruction-level API for generating VPInstructions while +/// abstracting away the Recipe manipulation details. 
+//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_BUILDER_H +#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_BUILDER_H + +#include "VPlan.h" + +namespace llvm { + +class VPBuilder { +private: + VPBasicBlock *BB = nullptr; + VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator(); + + VPInstruction *createInstruction(unsigned Opcode, + std::initializer_list<VPValue *> Operands) { + VPInstruction *Instr = new VPInstruction(Opcode, Operands); + BB->insert(Instr, InsertPt); + return Instr; + } + +public: + VPBuilder() {} + + /// \brief This specifies that created VPInstructions should be appended to + /// the end of the specified block. + void setInsertPoint(VPBasicBlock *TheBB) { + assert(TheBB && "Attempting to set a null insert point"); + BB = TheBB; + InsertPt = BB->end(); + } + + VPValue *createNot(VPValue *Operand) { + return createInstruction(VPInstruction::Not, {Operand}); + } + + VPValue *createAnd(VPValue *LHS, VPValue *RHS) { + return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}); + } + + VPValue *createOr(VPValue *LHS, VPValue *RHS) { + return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}); + } +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_BUILDER_H diff --git a/lib/Transforms/Vectorize/VPlanValue.h b/lib/Transforms/Vectorize/VPlanValue.h new file mode 100644 index 000000000000..50966891e0eb --- /dev/null +++ b/lib/Transforms/Vectorize/VPlanValue.h @@ -0,0 +1,146 @@ +//===- VPlanValue.h - Represent Values in Vectorizer Plan -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the declarations of the entities induced by Vectorization +/// Plans, e.g. the instructions the VPlan intends to generate if executed. +/// VPlan models the following entities: +/// VPValue +/// |-- VPUser +/// | |-- VPInstruction +/// These are documented in docs/VectorizationPlan.rst. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H +#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +// Forward declarations. +class VPUser; + +// This is the base class of the VPlan Def/Use graph, used for modeling the data +// flow into, within and out of the VPlan. VPValues can stand for live-ins +// coming from the input IR, instructions which VPlan will generate if executed +// and live-outs which the VPlan will need to fix accordingly. +class VPValue { +private: + const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). + + SmallVector<VPUser *, 1> Users; + +protected: + VPValue(const unsigned char SC) : SubclassID(SC) {} + +public: + /// An enumeration for keeping track of the concrete subclass of VPValue that + /// are actually instantiated. Values of this enumeration are kept in the + /// SubclassID field of the VPValue objects. They are used for concrete + /// type identification. 
+ enum { VPValueSC, VPUserSC, VPInstructionSC }; + + VPValue() : SubclassID(VPValueSC) {} + VPValue(const VPValue &) = delete; + VPValue &operator=(const VPValue &) = delete; + + /// \return an ID for the concrete type of this object. + /// This is used to implement the classof checks. This should not be used + /// for any other purpose, as the values may change as LLVM evolves. + unsigned getVPValueID() const { return SubclassID; } + + void printAsOperand(raw_ostream &OS) const { + OS << "%vp" << (unsigned short)(unsigned long long)this; + } + + unsigned getNumUsers() const { return Users.size(); } + void addUser(VPUser &User) { Users.push_back(&User); } + + typedef SmallVectorImpl<VPUser *>::iterator user_iterator; + typedef SmallVectorImpl<VPUser *>::const_iterator const_user_iterator; + typedef iterator_range<user_iterator> user_range; + typedef iterator_range<const_user_iterator> const_user_range; + + user_iterator user_begin() { return Users.begin(); } + const_user_iterator user_begin() const { return Users.begin(); } + user_iterator user_end() { return Users.end(); } + const_user_iterator user_end() const { return Users.end(); } + user_range users() { return user_range(user_begin(), user_end()); } + const_user_range users() const { + return const_user_range(user_begin(), user_end()); + } +}; + +typedef DenseMap<Value *, VPValue *> Value2VPValueTy; +typedef DenseMap<VPValue *, Value *> VPValue2ValueTy; + +raw_ostream &operator<<(raw_ostream &OS, const VPValue &V); + +/// This class augments VPValue with operands which provide the inverse def-use +/// edges from VPValue's users to their defs. +class VPUser : public VPValue { +private: + SmallVector<VPValue *, 2> Operands; + + void addOperand(VPValue *Operand) { + Operands.push_back(Operand); + Operand->addUser(*this); + } + +protected: + VPUser(const unsigned char SC) : VPValue(SC) {} + VPUser(const unsigned char SC, ArrayRef<VPValue *> Operands) : VPValue(SC) { + for (VPValue *Operand : Operands) + addOperand(Operand); + } + +public: + VPUser() : VPValue(VPValue::VPUserSC) {} + VPUser(ArrayRef<VPValue *> Operands) : VPUser(VPValue::VPUserSC, Operands) {} + VPUser(std::initializer_list<VPValue *> Operands) + : VPUser(ArrayRef<VPValue *>(Operands)) {} + VPUser(const VPUser &) = delete; + VPUser &operator=(const VPUser &) = delete; + + /// Method to support type inquiry through isa, cast, and dyn_cast. 
+ static inline bool classof(const VPValue *V) { + return V->getVPValueID() >= VPUserSC && + V->getVPValueID() <= VPInstructionSC; + } + + unsigned getNumOperands() const { return Operands.size(); } + inline VPValue *getOperand(unsigned N) const { + assert(N < Operands.size() && "Operand index out of bounds"); + return Operands[N]; + } + + typedef SmallVectorImpl<VPValue *>::iterator operand_iterator; + typedef SmallVectorImpl<VPValue *>::const_iterator const_operand_iterator; + typedef iterator_range<operand_iterator> operand_range; + typedef iterator_range<const_operand_iterator> const_operand_range; + + operand_iterator op_begin() { return Operands.begin(); } + const_operand_iterator op_begin() const { return Operands.begin(); } + operand_iterator op_end() { return Operands.end(); } + const_operand_iterator op_end() const { return Operands.end(); } + operand_range operands() { return operand_range(op_begin(), op_end()); } + const_operand_range operands() const { + return const_operand_range(op_begin(), op_end()); + } +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp index fb2f509dcbaa..b04905bfc6fa 100644 --- a/lib/Transforms/Vectorize/Vectorize.cpp +++ b/lib/Transforms/Vectorize/Vectorize.cpp @@ -18,7 +18,6 @@ #include "llvm-c/Transforms/Vectorize.h" #include "llvm/Analysis/Passes.h" #include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" using namespace llvm; |
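The VPValue and VPUser classes introduced in VPlanValue.h above form the def-use graph that VPlan's data flow is built on: a VPUser records its operands and registers itself as a user of each of them. The snippet below is a minimal sketch, not part of the patch itself, assuming VPlanValue.h is on the include path; the function and variable names (demoVPlanDefUse, A, B, U) are hypothetical and chosen only for illustration.

  #include "VPlanValue.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  // Wire two live-in VPValues into a VPUser and inspect the resulting edges.
  static void demoVPlanDefUse(raw_ostream &OS) {
    VPValue A, B;          // e.g. stand-ins for live-ins coming from input IR.

    VPUser U({&A, &B});    // Adding operands also registers U as their user.

    OS << "A has " << A.getNumUsers() << " user(s)\n";   // prints "1"
    for (const VPValue *Op : U.operands()) {             // inverse edges: U -> {A, B}
      Op->printAsOperand(OS);                            // prints "%vp<id>"
      OS << " ";
    }
    OS << "\n";
  }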

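Similarly, the VPBuilder added in VPlanBuilder.h is meant to be used much like IRBuilder: pick a VPBasicBlock as the insertion point and append VPInstructions to it. A rough sketch of how the pieces in this patch could fit together is given below; it is illustrative only, assumes VPlan.h and VPlanBuilder.h are available, and the names demoVPlanBuild, X, Y and NotX are hypothetical.

  #include "VPlan.h"
  #include "VPlanBuilder.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  static void demoVPlanBuild(raw_ostream &OS) {
    VPValue X, Y;                              // placeholder live-in operands.

    // A single VPBasicBlock forms the entry of the hierarchical CFG; the
    // VPlan takes ownership and deletes the CFG in its destructor.
    VPBasicBlock *Entry = new VPBasicBlock("entry");
    VPlan Plan(Entry);
    Plan.addVF(4);                             // record a candidate VF.

    VPBuilder Builder;
    Builder.setInsertPoint(Entry);             // append to the end of Entry.
    VPValue *NotX = Builder.createNot(&X);     // VPInstruction::Not
    Builder.createAnd(NotX, &Y);               // Instruction::BinaryOps::And

    OS << Plan;                                // dot-format dump via VPlanPrinter.
  }

The Not/And/Or entry points suggest the builder is aimed at composing block and edge masks during VPlan construction; the sketch merely exercises the same API surface declared in the headers above.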